Created
November 23, 2025 23:11
-
-
Save visualjeff/3062959839719a9d771a2eb3b1ca5cd5 to your computer and use it in GitHub Desktop.
dup-scan.js
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env node | |
| // @ts-check | |
| /** | |
| * Duplicate detector for TS/JS React projects. | |
| * | |
| * Heuristics: | |
| * - Extracts functions, React components (PascalCase returning JSX), and const/config objects. | |
| * - Normalizes tokens (identifiers/strings/numbers -> placeholders), hashes for exact clones. | |
| * - Computes near-duplicate and semantic similarity via token Jaccard. | |
| * | |
| * Usage: | |
| * npm run scan:dups -- src apps/packages | |
| */ | |
| import crypto from "node:crypto"; | |
| import fs from "node:fs"; | |
| import path from "node:path"; | |
| import ts from "typescript"; | |
| import { pathToFileURL } from "node:url"; | |
| /** @typedef {"component" | "function" | "const"} CodeKind */ | |
| /** | |
| * @typedef {Object} CodeUnit | |
| * @property {string} id | |
| * @property {string} name | |
| * @property {CodeKind} kind | |
| * @property {string} file | |
| * @property {number} start | |
| * @property {number} end | |
| * @property {[number, number]} lines | |
| * @property {string[]} tokens | |
| * @property {string} canonical | |
| * @property {string} hash | |
| * @property {number} length | |
| */ | |
// Directory basenames that are never descended into during the scan.
const IGNORED_DIRS = new Set(["node_modules", "dist", "build", ".git", ".next", ".turbo"]);
// File extensions considered scannable TS/JS source.
const EXTS = new Set([".ts", ".tsx", ".js", ".jsx", ".mts", ".cts"]);
// Token-Jaccard score at or above which a pair is reported as a near duplicate.
const NEAR_DUP_THRESHOLD = 0.9;
// Lower score bound for the weaker "semantic/shape" duplicate report.
const SEMANTIC_DUP_THRESHOLD = 0.82;
/**
 * Iteratively walk `root`, collecting every file whose extension is in EXTS
 * while skipping any directory whose basename is in IGNORED_DIRS.
 *
 * Unreadable entries (broken symlinks, files deleted mid-scan, permission
 * errors, or a missing root) are skipped instead of aborting the whole scan.
 *
 * @param {string} root - directory (or single file) to start from
 * @returns {string[]} paths of matching source files
 */
function walkFiles(root) {
  const results = [];
  const queue = [root];
  while (queue.length) {
    const current = queue.pop();
    let stat;
    try {
      // statSync (not lstatSync) deliberately follows symlinks.
      stat = fs.statSync(current);
    } catch {
      continue; // broken symlink, vanished entry, or unreadable path
    }
    if (stat.isDirectory()) {
      if (IGNORED_DIRS.has(path.basename(current))) continue;
      for (const entry of fs.readdirSync(current)) {
        queue.push(path.join(current, entry));
      }
    } else if (EXTS.has(path.extname(current))) {
      results.push(current);
    }
  }
  return results;
}
/**
 * Choose the TypeScript ScriptKind used to parse a file, from its extension.
 * JSX-capable extensions get TSX; everything else is parsed as TS (a superset
 * of plain JS for the constructs this scanner cares about).
 * @param {string} file
 */
function scriptKindFor(file) {
  switch (path.extname(file)) {
    case ".tsx":
    case ".jsx":
      return ts.ScriptKind.TSX;
    case ".mts":
      return ts.ScriptKind.MTS;
    case ".cts":
      return ts.ScriptKind.CTS;
    default:
      return ts.ScriptKind.TS;
  }
}
/**
 * Lex `text` with the TypeScript scanner and return a normalized token list:
 * identifiers -> "ID", string/template literals -> "STR", numbers -> "NUM",
 * every other token kept verbatim. Whitespace/trivia tokens are dropped.
 * @param {string} text
 * @returns {string[]}
 */
function tokenize(text) {
  const scanner = ts.createScanner(ts.ScriptTarget.Latest, false, ts.LanguageVariant.Standard, text);
  const tokens = [];
  for (let kind = scanner.scan(); kind !== ts.SyntaxKind.EndOfFileToken; kind = scanner.scan()) {
    if (kind === ts.SyntaxKind.Identifier || kind === ts.SyntaxKind.PrivateIdentifier) {
      tokens.push("ID");
    } else if (
      kind === ts.SyntaxKind.StringLiteral ||
      kind === ts.SyntaxKind.NoSubstitutionTemplateLiteral ||
      kind === ts.SyntaxKind.TemplateHead ||
      kind === ts.SyntaxKind.TemplateMiddle ||
      kind === ts.SyntaxKind.TemplateTail
    ) {
      tokens.push("STR");
    } else if (kind === ts.SyntaxKind.NumericLiteral) {
      tokens.push("NUM");
    } else {
      tokens.push(scanner.getTokenText());
    }
  }
  // Trivia was not skipped above, so filter out whitespace-only token text.
  return tokens.filter((t) => t.trim().length);
}
/**
 * Build the normalized token stream for a node plus its joined canonical
 * form (the string that gets hashed for exact-clone detection).
 * `sourceText` is accepted for interface stability; the node's own text is used.
 * @returns {{ tokens: string[]; canonical: string }}
 */
function canonicalize(node, sourceText) {
  const tokens = tokenize(node.getText());
  return { tokens, canonical: tokens.join(" ") };
}
/**
 * Stable fingerprint of a canonical token string.
 * sha1 is fine here: this is dedup bucketing, not a security boundary.
 * @param {string} canonical
 * @returns {string} 40-char hex digest
 */
function hashCanonical(canonical) {
  const digest = crypto.createHash("sha1");
  digest.update(canonical);
  return digest.digest("hex");
}
/**
 * Convert a [start, end) character-position pair into 1-based line numbers
 * using the source file's position map.
 * @returns {[number, number]} [startLine, endLine], both 1-based
 */
function lineRange(sf, start, end) {
  const toLine = (pos) => sf.getLineAndCharacterOfPosition(pos).line + 1;
  return [toLine(start), toLine(end)];
}
/**
 * True if any descendant of `node` is a JSX element, self-closing element,
 * or fragment (the node itself is not checked, matching forEachChild).
 *
 * Uses ts.forEachChild's short-circuit: the traversal stops as soon as the
 * callback returns a truthy value, instead of walking the entire subtree
 * after a match has already been found.
 * @returns {boolean}
 */
function hasJsx(node) {
  const containsJsx = (n) =>
    ts.isJsxElement(n) || ts.isJsxSelfClosingElement(n) || ts.isJsxFragment(n)
      ? true
      : ts.forEachChild(n, containsJsx);
  return ts.forEachChild(node, containsJsx) === true;
}
// A PascalCase identifier: one leading capital, then letters/digits only.
const PASCAL_CASE_RE = /^[A-Z][A-Za-z0-9]*$/;

/** @param {string} name @returns {boolean} */
function isPascalCase(name) {
  return PASCAL_CASE_RE.test(name);
}
/**
 * Parse one source file and extract its CodeUnits:
 * - named function declarations (kind "component" if PascalCase + contains JSX),
 * - const/let/var initializers that are arrow/function expressions (same rule),
 * - const/let/var initializers that are object/array/other literals (kind "const").
 * @param {string} file
 * @returns {CodeUnit[]}
 */
function collectUnits(file) {
  const text = fs.readFileSync(file, "utf8");
  const sf = ts.createSourceFile(file, text, ts.ScriptTarget.Latest, true, scriptKindFor(file));
  /** @type {CodeUnit[]} */
  const units = [];

  /** @param {ts.Node} node @param {CodeKind} kind @param {string} name */
  const record = (node, kind, name) => {
    const { tokens, canonical } = canonicalize(node, text);
    const [startLine, endLine] = lineRange(sf, node.getStart(), node.getEnd());
    units.push({
      id: `${file}:${startLine}-${endLine}`,
      name,
      kind,
      file,
      start: node.getStart(),
      end: node.getEnd(),
      lines: [startLine, endLine],
      tokens,
      canonical,
      hash: hashCanonical(canonical),
      length: tokens.length,
    });
  };

  const walk = (node) => {
    if (ts.isFunctionDeclaration(node) && node.name) {
      const name = node.name.getText();
      record(node, hasJsx(node) && isPascalCase(name) ? "component" : "function", name);
    }
    if (ts.isVariableStatement(node)) {
      for (const decl of node.declarationList.declarations) {
        const init = decl.initializer;
        if (!init) continue;
        const name = decl.name.getText(sf);
        if (ts.isArrowFunction(init) || ts.isFunctionExpression(init)) {
          record(init, hasJsx(init) && isPascalCase(name) ? "component" : "function", name);
        } else if (
          ts.isObjectLiteralExpression(init) ||
          ts.isArrayLiteralExpression(init) ||
          ts.isLiteralExpression(init)
        ) {
          record(init, "const", name);
        }
      }
    }
    ts.forEachChild(node, walk);
  };

  walk(sf);
  return units;
}
/**
 * Jaccard similarity of two token lists treated as sets:
 * |A ∩ B| / |A ∪ B|, defined as 0 when both sets are empty.
 * @param {string[]} a
 * @param {string[]} b
 * @returns {number} in [0, 1]
 */
function jaccard(a, b) {
  const left = new Set(a);
  const right = new Set(b);
  let shared = 0;
  right.forEach((token) => {
    if (left.has(token)) shared += 1;
  });
  const union = left.size + right.size - shared;
  return union === 0 ? 0 : shared / union;
}
/**
 * Bucket units by canonical hash and return only the buckets holding two or
 * more units — i.e. groups of exact structural clones.
 * @param {CodeUnit[]} units
 * @returns {{ hash: string; units: CodeUnit[] }[]}
 */
function findExact(units) {
  const byHash = new Map();
  for (const unit of units) {
    if (!byHash.has(unit.hash)) byHash.set(unit.hash, []);
    byHash.get(unit.hash).push(unit);
  }
  const groups = [];
  for (const [hash, items] of byHash) {
    if (items.length > 1) groups.push({ hash, units: items });
  }
  return groups;
}
/**
 * All-pairs comparison (O(n²)) scoring token-set Jaccard similarity.
 * Pairs scoring >= NEAR_DUP_THRESHOLD are labeled "near"; otherwise pairs
 * scoring >= SEMANTIC_DUP_THRESHOLD are labeled "semantic".
 * @param {CodeUnit[]} units
 */
function findSimilar(units) {
  /** @type {{ a: CodeUnit; b: CodeUnit; score: number; label: "near" | "semantic" }[]} */
  const pairs = [];
  // Ascending sort by token count guarantees left.length <= right.length for
  // every pair below, so only the "left too small" half of a size-ratio guard
  // can ever fire (the former `ratio > 2` check was unreachable).
  const sorted = [...units].sort((a, b) => a.length - b.length);
  for (let i = 0; i < sorted.length; i++) {
    for (let j = i + 1; j < sorted.length; j++) {
      const left = sorted[i];
      const right = sorted[j];
      if (left.length < right.length * 0.5) continue; // skip wildly different sizes
      const score = jaccard(left.tokens, right.tokens);
      if (score >= NEAR_DUP_THRESHOLD) {
        pairs.push({ a: left, b: right, score, label: "near" });
      } else if (score >= SEMANTIC_DUP_THRESHOLD) {
        pairs.push({ a: left, b: right, score, label: "semantic" });
      }
    }
  }
  return pairs;
}
/**
 * Print the findings to stdout: exact-clone groups first, then near and
 * semantic pairs, ending with a total count. One finding = one redundant
 * copy (group size minus one) or one similar pair.
 * @param {{ hash: string; units: CodeUnit[] }[]} exacts
 * @param {{ a: CodeUnit; b: CodeUnit; score: number; label: string }[]} pairs
 */
function report(exacts, pairs) {
  let findingCount = 0;
  // Shared line format for both pair sections.
  const logPair = (pair) => {
    console.log(
      ` - ${pair.a.kind}/${pair.b.kind} ${pair.score.toFixed(2)} :: ${pair.a.name} (${pair.a.file}:${pair.a.lines[0]}-${pair.a.lines[1]}) <=> ${pair.b.name} (${pair.b.file}:${pair.b.lines[0]}-${pair.b.lines[1]})`
    );
    findingCount += 1;
  };

  if (exacts.length) {
    console.log("=== Exact structural duplicates ===");
    for (const group of exacts) {
      const [first] = group.units;
      console.log(`Hash ${group.hash} (${first.kind}, size ${first.length} tokens)`);
      group.units.forEach((unit) => {
        console.log(
          ` - ${unit.kind.padEnd(9)} ${unit.name.padEnd(20)} ${unit.file}:${unit.lines[0]}-${unit.lines[1]}`
        );
      });
      findingCount += group.units.length - 1;
    }
    console.log("");
  }

  const near = pairs.filter((p) => p.label === "near");
  const semantic = pairs.filter((p) => p.label === "semantic");
  if (near.length) {
    console.log("=== Near duplicates (structurally similar) ===");
    near.forEach(logPair);
    console.log("");
  }
  if (semantic.length) {
    console.log("=== Semantic/shape duplicates (identifier-agnostic) ===");
    semantic.forEach(logPair);
    console.log("");
  }
  console.log(`Total findings: ${findingCount}`);
}
/**
 * Entry point: scan the given root directories (default ["src"]), extract
 * code units, and print exact/near/semantic duplicate findings.
 * @param {string[]} roots - directories to scan, as passed on the CLI
 */
export async function run(roots) {
  const targets = roots.length ? roots : ["src"];
  // Dedupe AFTER resolving, so "src" and "./src" count as one root...
  const resolvedRoots = Array.from(new Set(targets.map((root) => path.resolve(root))));
  // ...and dedupe file paths too, so nested roots (e.g. "src" and "src/app")
  // don't double-count the same units.
  const files = Array.from(new Set(resolvedRoots.flatMap((root) => walkFiles(root))));
  const units = files.flatMap((file) => collectUnits(file));
  report(findExact(units), findSimilar(units));
}
// Execute only when invoked directly (`node dup-scan.js ...`), not when
// imported as a module. The argv[1] guard avoids pathToFileURL(undefined)
// throwing under `node -e` / the REPL.
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  // Surface failures and set a non-zero exit code instead of leaving an
  // unhandled promise rejection.
  run(process.argv.slice(2)).catch((err) => {
    console.error(err);
    process.exitCode = 1;
  });
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment