Skip to content

Instantly share code, notes, and snippets.

@visualjeff
Created November 23, 2025 23:11
Show Gist options
  • Select an option

  • Save visualjeff/3062959839719a9d771a2eb3b1ca5cd5 to your computer and use it in GitHub Desktop.

Select an option

Save visualjeff/3062959839719a9d771a2eb3b1ca5cd5 to your computer and use it in GitHub Desktop.
dup-scan.js
#!/usr/bin/env node
// @ts-check
/**
* Duplicate detector for TS/JS React projects.
*
* Heuristics:
* - Extracts functions, React components (PascalCase returning JSX), and const/config objects.
* - Normalizes tokens (identifiers/strings/numbers -> placeholders), hashes for exact clones.
* - Computes near-duplicate and semantic similarity via token Jaccard.
*
* Usage:
* npm run scan:dups -- src apps/packages
*/
import crypto from "node:crypto";
import fs from "node:fs";
import path from "node:path";
import ts from "typescript";
import { pathToFileURL } from "node:url";
/** @typedef {"component" | "function" | "const"} CodeKind */
/**
* @typedef {Object} CodeUnit
* @property {string} id
* @property {string} name
* @property {CodeKind} kind
* @property {string} file
* @property {number} start
* @property {number} end
* @property {[number, number]} lines
* @property {string[]} tokens
* @property {string} canonical
* @property {string} hash
* @property {number} length
*/
/** Directory base names that the file walker never descends into. */
const IGNORED_DIRS = new Set([
  "node_modules",
  "dist",
  "build",
  ".git",
  ".next",
  ".turbo",
]);

/** File extensions (including the leading dot) that are scanned. */
const EXTS = new Set([
  ".ts",
  ".tsx",
  ".js",
  ".jsx",
  ".mts",
  ".cts",
]);

/** Minimum Jaccard score for a pair to be labelled "near". */
const NEAR_DUP_THRESHOLD = 0.9;

/** Minimum Jaccard score (below the "near" cut-off) for a pair to be labelled "semantic". */
const SEMANTIC_DUP_THRESHOLD = 0.82;
/**
 * Recursively collects source files under `root`.
 *
 * Directories whose base name is in IGNORED_DIRS are skipped (the check also
 * applies to `root` itself) and only files whose extension is in EXTS are
 * returned. Symlinks are followed via `fs.statSync`, but every real directory
 * is expanded at most once, so a symlink cycle cannot make the walk run away.
 *
 * @param {string} root File or directory to start from.
 * @returns {string[]} Matching file paths, each built by joining onto `root`.
 * @throws {Error} If `root` itself cannot be stat'ed, or on any filesystem
 *   error other than a missing/looping entry below `root`.
 */
function walkFiles(root) {
  const results = [];
  const seenDirs = new Set();
  const pending = [root];
  while (pending.length) {
    const current = pending.pop();
    let stat;
    try {
      stat = fs.statSync(current);
    } catch (err) {
      // A dangling or self-referential symlink (or an entry removed while we
      // scan) must not abort the whole run. A bad root is still reported.
      const skippable = err?.code === "ENOENT" || err?.code === "ELOOP";
      if (current !== root && skippable) continue;
      throw err;
    }
    if (stat.isDirectory()) {
      if (IGNORED_DIRS.has(path.basename(current))) continue;
      // Key on the resolved path so a directory reached again through a
      // symlink is not expanded a second time.
      const realDir = fs.realpathSync(current);
      if (seenDirs.has(realDir)) continue;
      seenDirs.add(realDir);
      for (const entry of fs.readdirSync(current)) {
        pending.push(path.join(current, entry));
      }
    } else if (EXTS.has(path.extname(current))) {
      results.push(current);
    }
  }
  return results;
}
/**
 * Chooses the `ts.ScriptKind` used to parse `file`, based on its extension.
 *
 * @param {string} file Path of the file about to be parsed.
 * @returns {number} A `ts.ScriptKind` value: TSX for `.tsx`/`.jsx`, JS for
 *   `.js`, and TS for everything else (`.ts`, `.mts`, `.cts`).
 */
function scriptKindFor(file) {
  switch (path.extname(file)) {
    case ".tsx":
    case ".jsx":
      return ts.ScriptKind.TSX;
    case ".js":
      // React code bases often keep JSX in plain .js files; the JS script kind
      // is parsed with the JSX language variant, whereas TS would read `<div>`
      // as a type assertion.
      return ts.ScriptKind.JS;
    default:
      // ScriptKind has no MTS/CTS members (those names only exist on
      // ts.Extension), so .mts/.cts are parsed as ordinary TypeScript.
      return ts.ScriptKind.TS;
  }
}
/**
 * Turns source text into a normalized token list for clone comparison.
 *
 * Identifiers become "ID", string/template pieces become "STR", numeric and
 * BigInt literals become "NUM"; every other token (keywords, punctuation)
 * keeps its source text. Whitespace and comments are dropped so that two
 * snippets differing only in formatting or commentary normalize identically.
 *
 * NOTE(review): the raw scanner is used without the parser's rescans, so the
 * text after a `${...}` substitution and regex literal bodies are presumably
 * scanned as ordinary tokens — confirm whether that matters for your corpus.
 *
 * @param {string} text Source text of a single code unit.
 * @returns {string[]} Normalized tokens in source order.
 */
function tokenize(text) {
  // skipTrivia=true keeps whitespace, newlines and comments out of the token
  // stream; previously comment text leaked into the canonical form and hash.
  const scanner = ts.createScanner(ts.ScriptTarget.Latest, true, ts.LanguageVariant.Standard, text);
  const tokens = [];
  for (let kind = scanner.scan(); kind !== ts.SyntaxKind.EndOfFileToken; kind = scanner.scan()) {
    switch (kind) {
      case ts.SyntaxKind.Identifier:
      case ts.SyntaxKind.PrivateIdentifier:
        tokens.push("ID");
        break;
      case ts.SyntaxKind.StringLiteral:
      case ts.SyntaxKind.NoSubstitutionTemplateLiteral:
      case ts.SyntaxKind.TemplateHead:
      case ts.SyntaxKind.TemplateMiddle:
      case ts.SyntaxKind.TemplateTail:
        tokens.push("STR");
        break;
      case ts.SyntaxKind.NumericLiteral:
      case ts.SyntaxKind.BigIntLiteral:
        tokens.push("NUM");
        break;
      default:
        tokens.push(scanner.getTokenText());
    }
  }
  return tokens;
}
/**
 * Produces the normalized token list for a node together with its canonical
 * string form (the tokens joined by single spaces).
 *
 * @param {{ getText(): string }} node Node whose own text is tokenized.
 * @param {string} sourceText Accepted for the caller's convenience; not read here.
 * @returns {{ tokens: string[], canonical: string }}
 */
function canonicalize(node, sourceText) {
  const tokens = tokenize(node.getText());
  const canonical = tokens.join(" ");
  return { tokens, canonical };
}
/**
 * Fingerprints a canonical token string.
 *
 * @param {string} canonical Space-joined normalized tokens.
 * @returns {string} Hex-encoded SHA-1 digest of `canonical`.
 */
function hashCanonical(canonical) {
  const hasher = crypto.createHash("sha1");
  hasher.update(canonical);
  return hasher.digest("hex");
}
/**
 * Converts a pair of character offsets into 1-based line numbers.
 *
 * @param {{ getLineAndCharacterOfPosition(pos: number): { line: number } }} sf
 *   Source file used to map offsets to 0-based lines.
 * @param {number} start Offset of the first character.
 * @param {number} end Offset just past the last character.
 * @returns {[number, number]} `[startLine, endLine]`, both 1-based.
 */
function lineRange(sf, start, end) {
  const toOneBasedLine = (position) => sf.getLineAndCharacterOfPosition(position).line + 1;
  return [toOneBasedLine(start), toOneBasedLine(end)];
}
/**
 * Reports whether any descendant of `node` is a JSX element, a self-closing
 * JSX element, or a JSX fragment. `node` itself is not tested.
 *
 * @param {object} node Root of the subtree to inspect.
 * @returns {boolean} `true` when JSX occurs somewhere below `node`.
 */
function hasJsx(node) {
  // forEachChild hands back the first truthy callback result, which lets the
  // search report success through return values instead of a shared flag.
  const containsJsx = (candidate) =>
    ts.isJsxElement(candidate) ||
    ts.isJsxSelfClosingElement(candidate) ||
    ts.isJsxFragment(candidate) ||
    Boolean(ts.forEachChild(candidate, containsJsx));
  return Boolean(ts.forEachChild(node, containsJsx));
}
/**
 * Checks whether `name` starts with an ASCII uppercase letter and otherwise
 * contains only ASCII letters and digits.
 *
 * @param {string} name Identifier text to test.
 * @returns {boolean}
 */
function isPascalCase(name) {
  const pascalPattern = /^[A-Z][A-Za-z0-9]*$/;
  return pascalPattern.test(name);
}
/**
 * Parses one file and extracts the code units that take part in duplicate
 * detection:
 *  - named function declarations that have a body,
 *  - arrow functions / function expressions assigned in a variable statement,
 *  - object, array and literal initializers in a variable statement ("const").
 *
 * A callable is classified as "component" when its name is PascalCase and its
 * subtree contains JSX, otherwise as "function". Nested definitions are
 * visited too, so an inner function yields its own unit in addition to the
 * enclosing one.
 *
 * @param {string} file Path of the file to read and parse.
 * @returns {CodeUnit[]} Units in traversal order.
 */
function collectUnits(file) {
  const sourceText = fs.readFileSync(file, "utf8");
  const sourceFile = ts.createSourceFile(file, sourceText, ts.ScriptTarget.Latest, true, scriptKindFor(file));
  /** @type {CodeUnit[]} */
  const units = [];

  /**
   * @param {ts.Node} node
   * @param {string} name
   * @returns {CodeKind}
   */
  const callableKind = (node, name) =>
    // Cheap name test first; the JSX search walks the whole subtree.
    isPascalCase(name) && hasJsx(node) ? "component" : "function";

  /** @param {ts.Node} node @param {CodeKind} kind @param {string} name */
  const addUnit = (node, kind, name) => {
    const { tokens, canonical } = canonicalize(node, sourceText);
    const start = node.getStart();
    const end = node.getEnd();
    const [startLine, endLine] = lineRange(sourceFile, start, end);
    units.push({
      id: `${file}:${startLine}-${endLine}`,
      name,
      kind,
      file,
      start,
      end,
      lines: [startLine, endLine],
      tokens,
      canonical,
      hash: hashCanonical(canonical),
      length: tokens.length,
    });
  };

  const visit = (node) => {
    // Overload signatures and ambient `declare function` have no body; they
    // are type declarations, not implementations that can be duplicated.
    if (ts.isFunctionDeclaration(node) && node.name && node.body) {
      const name = node.name.getText();
      addUnit(node, callableKind(node, name), name);
    }
    if (ts.isVariableStatement(node)) {
      for (const decl of node.declarationList.declarations) {
        const initializer = decl.initializer;
        if (!initializer) continue;
        const name = decl.name.getText(sourceFile);
        if (ts.isArrowFunction(initializer) || ts.isFunctionExpression(initializer)) {
          addUnit(initializer, callableKind(initializer, name), name);
        } else if (
          ts.isObjectLiteralExpression(initializer) ||
          ts.isArrayLiteralExpression(initializer) ||
          ts.isLiteralExpression(initializer)
        ) {
          addUnit(initializer, "const", name);
        }
      }
    }
    ts.forEachChild(node, visit);
  };
  visit(sourceFile);
  return units;
}
/**
 * Jaccard similarity of two token lists, each treated as a set (duplicates
 * and order are ignored): |A ∩ B| / |A ∪ B|.
 *
 * @param {Iterable<string>} a First token list.
 * @param {Iterable<string>} b Second token list.
 * @returns {number} Score in [0, 1]; 0 when both inputs are empty.
 */
function jaccard(a, b) {
  const left = new Set(a);
  const right = new Set(b);
  const shared = [...left].filter((token) => right.has(token)).length;
  const unionSize = left.size + right.size - shared;
  if (unionSize === 0) return 0;
  return shared / unionSize;
}
/**
 * Groups units by their structural hash and keeps the groups that contain
 * more than one unit.
 *
 * @param {{ hash: string }[]} units Units to group.
 * @returns {{ hash: string, units: object[] }[]} Clone groups, ordered by the
 *   first appearance of each hash; members keep their input order.
 */
function findExact(units) {
  const buckets = new Map();
  for (const unit of units) {
    const bucket = buckets.get(unit.hash);
    if (bucket) {
      bucket.push(unit);
    } else {
      buckets.set(unit.hash, [unit]);
    }
  }
  const groups = [];
  for (const [hash, members] of buckets) {
    if (members.length > 1) {
      groups.push({ hash, units: members });
    }
  }
  return groups;
}
/**
 * Finds pairs of units whose token sets are similar but whose canonical forms
 * differ.
 *
 * A pair is labelled "near" when its Jaccard score reaches NEAR_DUP_THRESHOLD
 * and "semantic" when it reaches SEMANTIC_DUP_THRESHOLD. Pairs where the
 * shorter unit has less than half the tokens of the longer one are not
 * compared. Pairs with identical hashes are left out: they are exact clones,
 * which findExact already groups, and listing them again would count them
 * twice.
 *
 * @param {CodeUnit[]} units Units to compare pairwise.
 * @returns {{ a: CodeUnit; b: CodeUnit; score: number; label: "near" | "semantic" }[]}
 *   Matches with `a` never longer than `b`.
 */
function findSimilar(units) {
  /** @type {{ a: CodeUnit; b: CodeUnit; score: number; label: "near" | "semantic" }[]} */
  const pairs = [];
  const sorted = [...units].sort((a, b) => a.length - b.length);
  for (let i = 0; i < sorted.length; i++) {
    const left = sorted[i];
    for (let j = i + 1; j < sorted.length; j++) {
      const right = sorted[j];
      // Ascending sort means left is never longer than right and the ratio
      // only shrinks as j grows, so the first too-large partner ends the row.
      if (left.length / right.length < 0.5) break;
      if (left.hash === right.hash) continue;
      const score = jaccard(left.tokens, right.tokens);
      if (score >= NEAR_DUP_THRESHOLD) {
        pairs.push({ a: left, b: right, score, label: "near" });
      } else if (score >= SEMANTIC_DUP_THRESHOLD) {
        pairs.push({ a: left, b: right, score, label: "semantic" });
      }
    }
  }
  return pairs;
}
/**
 * Prints the scan results to stdout: exact clone groups first, then "near"
 * pairs, then "semantic" pairs, and finally a total.
 *
 * The total counts every member of an exact group beyond the first, plus one
 * per printed pair. Sections with nothing to show are omitted entirely.
 *
 * @param {{ hash: string, units: CodeUnit[] }[]} exacts Groups from findExact.
 * @param {{ a: CodeUnit, b: CodeUnit, score: number, label: string }[]} pairs
 *   Matches from findSimilar.
 * @returns {void}
 */
function report(exacts, pairs) {
  const where = (unit) => `${unit.file}:${unit.lines[0]}-${unit.lines[1]}`;

  // Prints one titled pair section and returns how many findings it added.
  const printPairSection = (title, matches) => {
    if (!matches.length) return 0;
    console.log(title);
    for (const { a, b, score } of matches) {
      console.log(
        ` - ${a.kind}/${b.kind} ${score.toFixed(2)} :: ${a.name} (${where(a)}) <=> ${b.name} (${where(b)})`
      );
    }
    console.log("");
    return matches.length;
  };

  let findingCount = 0;

  if (exacts.length) {
    console.log("=== Exact structural duplicates ===");
    for (const { hash, units } of exacts) {
      const first = units[0];
      console.log(`Hash ${hash} (${first.kind}, size ${first.length} tokens)`);
      for (const unit of units) {
        console.log(` - ${unit.kind.padEnd(9)} ${unit.name.padEnd(20)} ${where(unit)}`);
      }
      findingCount += units.length - 1;
    }
    console.log("");
  }

  const nearPairs = pairs.filter((pair) => pair.label === "near");
  const semanticPairs = pairs.filter((pair) => pair.label === "semantic");
  findingCount += printPairSection("=== Near duplicates (structurally similar) ===", nearPairs);
  findingCount += printPairSection("=== Semantic/shape duplicates (identifier-agnostic) ===", semanticPairs);

  console.log(`Total findings: ${findingCount}`);
}
/**
 * Scans the given roots for duplicated code and prints a report.
 *
 * Roots are resolved to absolute paths before de-duplication, and the
 * collected file list is de-duplicated as well, so spelling one directory two
 * ways (`src` and `./src`) or passing overlapping roots (`src` and
 * `src/components`) does not scan a file twice and report it as a clone of
 * itself.
 *
 * @param {string[]} roots Files or directories to scan; `["src"]` when empty.
 * @returns {Promise<void>} Settles once the report has been printed.
 */
export async function run(roots) {
  const requested = roots.length ? roots : ["src"];
  const targets = [...new Set(requested.map((root) => path.resolve(root)))];
  const files = [...new Set(targets.flatMap((root) => walkFiles(root)))];
  const units = files.flatMap((file) => collectUnits(file));
  const exacts = findExact(units);
  const pairs = findSimilar(units);
  report(exacts, pairs);
}
// Run the CLI only when this module is the process entry point, so importing
// it for `run` has no side effects. process.argv[1] is undefined when there is
// no script file (for example under `node -e`), and pathToFileURL would throw
// on it, so it is checked first.
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  run(process.argv.slice(2)).catch((err) => {
    // Surface scan failures (such as a missing root) and signal them through
    // the exit code instead of leaving the promise unhandled.
    console.error(err);
    process.exitCode = 1;
  });
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment