Created
November 23, 2025 23:11
-
-
Save visualjeff/3062959839719a9d771a2eb3b1ca5cd5 to your computer and use it in GitHub Desktop.
dup-scan.js
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env node | |
| // @ts-check | |
| /** | |
| * Duplicate detector for TS/JS React projects. | |
| * | |
| * Heuristics: | |
| * - Extracts functions, React components (PascalCase returning JSX), and const/config objects. | |
| * - Normalizes tokens (identifiers/strings/numbers -> placeholders), hashes for exact clones. | |
| * - Computes near-duplicate and semantic similarity via token Jaccard. | |
| * | |
| * Usage: | |
| * npm run scan:dups -- src apps/packages | |
| */ | |
| import crypto from "node:crypto"; | |
| import fs from "node:fs"; | |
| import path from "node:path"; | |
| import ts from "typescript"; | |
| import { pathToFileURL } from "node:url"; | |
| /** @typedef {"component" | "function" | "const"} CodeKind */ | |
| /** | |
| * @typedef {Object} CodeUnit | |
| * @property {string} id | |
| * @property {string} name | |
| * @property {CodeKind} kind | |
| * @property {string} file | |
| * @property {number} start | |
| * @property {number} end | |
| * @property {[number, number]} lines | |
| * @property {string[]} tokens | |
| * @property {string} canonical | |
| * @property {string} hash | |
| * @property {number} length | |
| */ | |
// Directory basenames that are never descended into during the scan.
const IGNORED_DIRS = new Set(["node_modules", "dist", "build", ".git", ".next", ".turbo"]);
// File extensions considered scannable TS/JS source.
const EXTS = new Set([".ts", ".tsx", ".js", ".jsx", ".mts", ".cts"]);
// Token-Jaccard score at or above which a pair is reported as a near duplicate.
const NEAR_DUP_THRESHOLD = 0.9;
// Lower score bound for the weaker "semantic/shape" duplicate report.
const SEMANTIC_DUP_THRESHOLD = 0.82;
/**
 * Iteratively walk `root`, collecting every file whose extension is in EXTS
 * while skipping any directory whose basename is in IGNORED_DIRS.
 *
 * Unreadable entries (broken symlinks, files deleted mid-scan, permission
 * errors, or a missing root) are skipped instead of aborting the whole scan.
 *
 * @param {string} root - directory (or single file) to start from
 * @returns {string[]} paths of matching source files
 */
function walkFiles(root) {
  const results = [];
  const queue = [root];
  while (queue.length) {
    const current = queue.pop();
    let stat;
    try {
      // statSync (not lstatSync) deliberately follows symlinks.
      stat = fs.statSync(current);
    } catch {
      continue; // broken symlink, vanished entry, or unreadable path
    }
    if (stat.isDirectory()) {
      if (IGNORED_DIRS.has(path.basename(current))) continue;
      for (const entry of fs.readdirSync(current)) {
        queue.push(path.join(current, entry));
      }
    } else if (EXTS.has(path.extname(current))) {
      results.push(current);
    }
  }
  return results;
}
/**
 * Choose the TypeScript ScriptKind used to parse a file, from its extension.
 * JSX-capable extensions get TSX; everything else is parsed as TS (a superset
 * of plain JS for the constructs this scanner cares about).
 * @param {string} file
 */
function scriptKindFor(file) {
  switch (path.extname(file)) {
    case ".tsx":
    case ".jsx":
      return ts.ScriptKind.TSX;
    case ".mts":
      return ts.ScriptKind.MTS;
    case ".cts":
      return ts.ScriptKind.CTS;
    default:
      return ts.ScriptKind.TS;
  }
}
/**
 * Lex `text` with the TypeScript scanner and return a normalized token list:
 * identifiers -> "ID", string/template literals -> "STR", numbers -> "NUM",
 * every other token kept verbatim. Whitespace/trivia tokens are dropped.
 * @param {string} text
 * @returns {string[]}
 */
function tokenize(text) {
  const scanner = ts.createScanner(ts.ScriptTarget.Latest, false, ts.LanguageVariant.Standard, text);
  const tokens = [];
  for (let kind = scanner.scan(); kind !== ts.SyntaxKind.EndOfFileToken; kind = scanner.scan()) {
    if (kind === ts.SyntaxKind.Identifier || kind === ts.SyntaxKind.PrivateIdentifier) {
      tokens.push("ID");
    } else if (
      kind === ts.SyntaxKind.StringLiteral ||
      kind === ts.SyntaxKind.NoSubstitutionTemplateLiteral ||
      kind === ts.SyntaxKind.TemplateHead ||
      kind === ts.SyntaxKind.TemplateMiddle ||
      kind === ts.SyntaxKind.TemplateTail
    ) {
      tokens.push("STR");
    } else if (kind === ts.SyntaxKind.NumericLiteral) {
      tokens.push("NUM");
    } else {
      tokens.push(scanner.getTokenText());
    }
  }
  // Trivia was not skipped above, so filter out whitespace-only token text.
  return tokens.filter((t) => t.trim().length);
}
/**
 * Build the normalized token stream for a node plus its joined canonical
 * form (the string that gets hashed for exact-clone detection).
 * `sourceText` is accepted for interface stability; the node's own text is used.
 * @returns {{ tokens: string[]; canonical: string }}
 */
function canonicalize(node, sourceText) {
  const tokens = tokenize(node.getText());
  return { tokens, canonical: tokens.join(" ") };
}
/**
 * Stable fingerprint of a canonical token string.
 * sha1 is fine here: this is dedup bucketing, not a security boundary.
 * @param {string} canonical
 * @returns {string} 40-char hex digest
 */
function hashCanonical(canonical) {
  const digest = crypto.createHash("sha1");
  digest.update(canonical);
  return digest.digest("hex");
}
/**
 * Convert a [start, end) character-position pair into 1-based line numbers
 * using the source file's position map.
 * @returns {[number, number]} [startLine, endLine], both 1-based
 */
function lineRange(sf, start, end) {
  const toLine = (pos) => sf.getLineAndCharacterOfPosition(pos).line + 1;
  return [toLine(start), toLine(end)];
}
/**
 * True if any descendant of `node` is a JSX element, self-closing element,
 * or fragment (the node itself is not checked, matching forEachChild).
 *
 * Uses ts.forEachChild's short-circuit: the traversal stops as soon as the
 * callback returns a truthy value, instead of walking the entire subtree
 * after a match has already been found.
 * @returns {boolean}
 */
function hasJsx(node) {
  const containsJsx = (n) =>
    ts.isJsxElement(n) || ts.isJsxSelfClosingElement(n) || ts.isJsxFragment(n)
      ? true
      : ts.forEachChild(n, containsJsx);
  return ts.forEachChild(node, containsJsx) === true;
}
// A PascalCase identifier: one leading capital, then letters/digits only.
const PASCAL_CASE_RE = /^[A-Z][A-Za-z0-9]*$/;

/** @param {string} name @returns {boolean} */
function isPascalCase(name) {
  return PASCAL_CASE_RE.test(name);
}
/**
 * Parse one source file and extract its CodeUnits:
 * - named function declarations (kind "component" if PascalCase + contains JSX),
 * - const/let/var initializers that are arrow/function expressions (same rule),
 * - const/let/var initializers that are object/array/other literals (kind "const").
 * @param {string} file
 * @returns {CodeUnit[]}
 */
function collectUnits(file) {
  const text = fs.readFileSync(file, "utf8");
  const sf = ts.createSourceFile(file, text, ts.ScriptTarget.Latest, true, scriptKindFor(file));
  /** @type {CodeUnit[]} */
  const units = [];

  /** @param {ts.Node} node @param {CodeKind} kind @param {string} name */
  const record = (node, kind, name) => {
    const { tokens, canonical } = canonicalize(node, text);
    const [startLine, endLine] = lineRange(sf, node.getStart(), node.getEnd());
    units.push({
      id: `${file}:${startLine}-${endLine}`,
      name,
      kind,
      file,
      start: node.getStart(),
      end: node.getEnd(),
      lines: [startLine, endLine],
      tokens,
      canonical,
      hash: hashCanonical(canonical),
      length: tokens.length,
    });
  };

  const walk = (node) => {
    if (ts.isFunctionDeclaration(node) && node.name) {
      const name = node.name.getText();
      record(node, hasJsx(node) && isPascalCase(name) ? "component" : "function", name);
    }
    if (ts.isVariableStatement(node)) {
      for (const decl of node.declarationList.declarations) {
        const init = decl.initializer;
        if (!init) continue;
        const name = decl.name.getText(sf);
        if (ts.isArrowFunction(init) || ts.isFunctionExpression(init)) {
          record(init, hasJsx(init) && isPascalCase(name) ? "component" : "function", name);
        } else if (
          ts.isObjectLiteralExpression(init) ||
          ts.isArrayLiteralExpression(init) ||
          ts.isLiteralExpression(init)
        ) {
          record(init, "const", name);
        }
      }
    }
    ts.forEachChild(node, walk);
  };

  walk(sf);
  return units;
}
/**
 * Jaccard similarity of two token lists treated as sets:
 * |A ∩ B| / |A ∪ B|, defined as 0 when both sets are empty.
 * @param {string[]} a
 * @param {string[]} b
 * @returns {number} in [0, 1]
 */
function jaccard(a, b) {
  const left = new Set(a);
  const right = new Set(b);
  let shared = 0;
  right.forEach((token) => {
    if (left.has(token)) shared += 1;
  });
  const union = left.size + right.size - shared;
  return union === 0 ? 0 : shared / union;
}
/**
 * Bucket units by canonical hash and return only the buckets holding two or
 * more units — i.e. groups of exact structural clones.
 * @param {CodeUnit[]} units
 * @returns {{ hash: string; units: CodeUnit[] }[]}
 */
function findExact(units) {
  const byHash = new Map();
  for (const unit of units) {
    if (!byHash.has(unit.hash)) byHash.set(unit.hash, []);
    byHash.get(unit.hash).push(unit);
  }
  const groups = [];
  for (const [hash, items] of byHash) {
    if (items.length > 1) groups.push({ hash, units: items });
  }
  return groups;
}
/**
 * All-pairs comparison (O(n²)) scoring token-set Jaccard similarity.
 * Pairs scoring >= NEAR_DUP_THRESHOLD are labeled "near"; otherwise pairs
 * scoring >= SEMANTIC_DUP_THRESHOLD are labeled "semantic".
 * @param {CodeUnit[]} units
 */
function findSimilar(units) {
  /** @type {{ a: CodeUnit; b: CodeUnit; score: number; label: "near" | "semantic" }[]} */
  const pairs = [];
  // Ascending sort by token count guarantees left.length <= right.length for
  // every pair below, so only the "left too small" half of a size-ratio guard
  // can ever fire (the former `ratio > 2` check was unreachable).
  const sorted = [...units].sort((a, b) => a.length - b.length);
  for (let i = 0; i < sorted.length; i++) {
    for (let j = i + 1; j < sorted.length; j++) {
      const left = sorted[i];
      const right = sorted[j];
      if (left.length < right.length * 0.5) continue; // skip wildly different sizes
      const score = jaccard(left.tokens, right.tokens);
      if (score >= NEAR_DUP_THRESHOLD) {
        pairs.push({ a: left, b: right, score, label: "near" });
      } else if (score >= SEMANTIC_DUP_THRESHOLD) {
        pairs.push({ a: left, b: right, score, label: "semantic" });
      }
    }
  }
  return pairs;
}
/**
 * Print the findings to stdout: exact-clone groups first, then near and
 * semantic pairs, ending with a total count. One finding = one redundant
 * copy (group size minus one) or one similar pair.
 * @param {{ hash: string; units: CodeUnit[] }[]} exacts
 * @param {{ a: CodeUnit; b: CodeUnit; score: number; label: string }[]} pairs
 */
function report(exacts, pairs) {
  let findingCount = 0;
  // Shared line format for both pair sections.
  const logPair = (pair) => {
    console.log(
      ` - ${pair.a.kind}/${pair.b.kind} ${pair.score.toFixed(2)} :: ${pair.a.name} (${pair.a.file}:${pair.a.lines[0]}-${pair.a.lines[1]}) <=> ${pair.b.name} (${pair.b.file}:${pair.b.lines[0]}-${pair.b.lines[1]})`
    );
    findingCount += 1;
  };

  if (exacts.length) {
    console.log("=== Exact structural duplicates ===");
    for (const group of exacts) {
      const [first] = group.units;
      console.log(`Hash ${group.hash} (${first.kind}, size ${first.length} tokens)`);
      group.units.forEach((unit) => {
        console.log(
          ` - ${unit.kind.padEnd(9)} ${unit.name.padEnd(20)} ${unit.file}:${unit.lines[0]}-${unit.lines[1]}`
        );
      });
      findingCount += group.units.length - 1;
    }
    console.log("");
  }

  const near = pairs.filter((p) => p.label === "near");
  const semantic = pairs.filter((p) => p.label === "semantic");
  if (near.length) {
    console.log("=== Near duplicates (structurally similar) ===");
    near.forEach(logPair);
    console.log("");
  }
  if (semantic.length) {
    console.log("=== Semantic/shape duplicates (identifier-agnostic) ===");
    semantic.forEach(logPair);
    console.log("");
  }
  console.log(`Total findings: ${findingCount}`);
}
/**
 * Entry point: scan the given root directories (default ["src"]), extract
 * code units, and print exact/near/semantic duplicate findings.
 * @param {string[]} roots - directories to scan, as passed on the CLI
 */
export async function run(roots) {
  const targets = roots.length ? roots : ["src"];
  // Dedupe AFTER resolving, so "src" and "./src" count as one root...
  const resolvedRoots = Array.from(new Set(targets.map((root) => path.resolve(root))));
  // ...and dedupe file paths too, so nested roots (e.g. "src" and "src/app")
  // don't double-count the same units.
  const files = Array.from(new Set(resolvedRoots.flatMap((root) => walkFiles(root))));
  const units = files.flatMap((file) => collectUnits(file));
  report(findExact(units), findSimilar(units));
}
// Execute only when invoked directly (`node dup-scan.js ...`), not when
// imported as a module. The argv[1] guard avoids pathToFileURL(undefined)
// throwing under `node -e` / the REPL.
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  // Surface failures and set a non-zero exit code instead of leaving an
  // unhandled promise rejection.
  run(process.argv.slice(2)).catch((err) => {
    console.error(err);
    process.exitCode = 1;
  });
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment