schmichri · March 14, 2025 07:17 · schmichri · Mar 14, 2025
diff --git a/deadLinkFixer.ts b/deadLinkFixer.ts
 import {promises as fs} from 'fs';
 import * as path from 'path';
 import puppeteer, {Browser} from 'puppeteer';
 import pLimit from "p-limit";
 import {chunkArray} from "../../common/arrayHelper";
 import {sleep} from "../../common/sleep";

 /** Outcome of probing a single URL, as produced by checkUrl. */
 interface URLResult {
    /** The URL exactly as it was handed to the checker. */
    originalUrl: string;
    /** URL reported by the navigation response; equals originalUrl when no response was obtained. */
    finalUrl: string;
    /** HTTP status of the navigation response, or 0 when navigation failed or produced no response. */
    status: number;
    /** True when status is in the range 200-399. */
    ok: boolean;
 }

 /**
 * Collects every http/https URL that appears inside any string of a JSON-like value.
 * Strings are scanned with a simple pattern, arrays and plain objects are walked
 * depth-first in their natural order, and all other values contribute nothing.
 * Repeated URLs are reported once per occurrence.
 * @param data The JSON value to scan.
 * @returns All URLs found, in traversal order.
 */
 function parseUrls(data: unknown): string[] {
    // A URL runs until whitespace, a quote character or an angle bracket.
    const httpPattern = /https?:\/\/[^\s"'<>]+/g;

    const collect = (node: unknown): string[] => {
        if (typeof node === 'string') {
            return node.match(httpPattern) ?? [];
        }
        if (Array.isArray(node)) {
            return node.flatMap((child) => collect(child));
        }
        if (node !== null && typeof node === 'object') {
            return Object.values(node).flatMap((child) => collect(child));
        }
        return [];
    };

    return collect(data);
 }

 /**
 * Drops URLs that belong to an excluded domain (the domain itself or any of its subdomains).
 * Matching is done on a label boundary, so `notlicense-token.com` is NOT treated as part of
 * `license-token.com`.
 * @param urls The array of URLs to filter.
 * @param endsWith The domain to exclude; defaults to `license-token.com`.
 * @returns A new array containing only parseable URLs outside the excluded domain.
 */
 function filterUrls(urls: string[], endsWith: string = 'license-token.com'): string[] {
    // URL hostnames are lower-cased by the parser, so normalise the domain the same way.
    const domain = endsWith.toLowerCase().replace(/^\.+/, '');
    const subdomainSuffix = `.${domain}`;

    return urls.filter((url) => {
        let hostname: string;
        try {
            hostname = new URL(url).hostname;
        } catch {
            // Exclude any URL that cannot be parsed.
            return false;
        }
        return hostname !== domain && !hostname.endsWith(subdomainSuffix);
    });
 }

 /**
 * Probes one URL with Puppeteer: opens a fresh page, sets a desktop Chrome user agent,
 * navigates to the URL and records the status and URL of the navigation response.
 * Navigation errors (including the 5 second timeout) are logged and reported as status 0.
 * The page is closed on every path, including when configuring the page fails.
 * @param url The URL to check.
 * @param browser The Puppeteer browser instance.
 * @returns The original URL, final URL, HTTP status code, and whether the status is in 200-399.
 */
 async function checkUrl(url: string, browser: Browser): Promise<URLResult> {
    const page = await browser.newPage();

    let status = 0;
    let finalUrl = url;

    try {
        // Emulate a typical browser by setting a standard Chrome user agent.
        // This sits inside the outer try so the page is still closed if it throws.
        await page.setUserAgent(
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
        );

        try {
            console.log(`Checking URL: ${url}`);
            const response = await page.goto(url, {
                waitUntil: 'domcontentloaded',
                timeout: 5000,
            });
            if (response) {
                status = response.status();
                finalUrl = response.url();
            }
        } catch (error) {
            // A failed navigation leaves status at 0, which is reported as not ok.
            console.error(`Error processing ${url}:`, error);
        }
    } finally {
        try {
            await page.close();
        } catch (closeError) {
            // A failed close must not discard the result or mask an earlier error.
            console.error(`Error closing page for ${url}:`, closeError);
        }
    }

    const ok = status >= 200 && status < 400;
    return {originalUrl: url, finalUrl, status, ok};
 }

 /**
 * Returns a copy of a JSON structure with dead URLs removed.
 * • An array element that is exactly a dead URL is dropped from the array.
 * • A property whose value is exactly a dead URL is dropped from its object.
 * • Strings that merely contain a dead URL are kept unchanged.
 * Objects are rebuilt with Object.fromEntries so that every key, including `__proto__`,
 * is kept as an own property instead of being lost through plain assignment.
 * @param data The JSON data to clean.
 * @param deadUrls A Set of dead URLs.
 * @returns The cleaned JSON data, or undefined when `data` itself is a dead URL string.
 */
 function cleanJsonData(data: unknown, deadUrls: Set<string>): unknown {
    if (typeof data === 'string') {
        // undefined is the "remove me" signal for the enclosing array or object.
        return deadUrls.has(data) ? undefined : data;
    }

    if (Array.isArray(data)) {
        return data
            .map((item) => cleanJsonData(item, deadUrls))
            .filter((item) => item !== undefined);
    }

    if (data !== null && typeof data === 'object') {
        const keptEntries = Object.entries(data)
            .map(([key, value]): [string, unknown] => [key, cleanJsonData(value, deadUrls)])
            .filter(([, value]) => value !== undefined);
        return Object.fromEntries(keptEntries);
    }

    return data;
 }

 /**
 * Checks every external URL found in already-parsed JSON and returns a cleaned copy.
 * Steps: extract URLs, drop license-token.com URLs and duplicates, probe the rest with
 * Puppeteer in batches, then remove values that are exactly a dead URL.
 * This function performs no file I/O.
 * @param jsonData The parsed JSON value to inspect.
 * @returns The cleaned JSON data, or undefined when processing failed (the error is logged).
 */
 async function processUrls(jsonData: unknown): Promise<unknown> {
    let browser: Browser | null = null;
    try {
        const allUrls = parseUrls(jsonData);
        const urlsToCheck = [...new Set(filterUrls(allUrls))]; // dedup and filter license-token.com
        console.log('URLs to check:', urlsToCheck.length);

        // Nothing to probe: skip the browser launch entirely.
        if (urlsToCheck.length === 0) {
            return cleanJsonData(jsonData, new Set<string>());
        }

        // Launch Puppeteer's browser instance.
        const activeBrowser = await puppeteer.launch({
            headless: true,
            protocolTimeout: 20_000
        });
        browser = activeBrowser;

        const limit = 12;
        // One limiter shared by all checks; a limiter created per URL would not limit anything.
        const limiter = pLimit(limit);

        const results: URLResult[] = [];

        for (const chunk of chunkArray(urlsToCheck, limit)) {
            const chunkResults = await Promise.all(
                chunk.map((url) => limiter(() => checkUrl(url, activeBrowser)))
            );
            results.push(...chunkResults);
            // Log only this batch; logging the cumulative list repeats earlier output every round.
            console.log('Results:', chunkResults);
            // Pause between batches.
            await sleep(1000);
        }

        // Build a set of dead URLs (where ok is false).
        const deadUrls = new Set<string>(
            results.filter((result) => !result.ok).map((result) => result.originalUrl)
        );

        console.log('Dead URLs found:', [...deadUrls]);

        // Clean the JSON data by removing dead URL entries.
        return cleanJsonData(jsonData, deadUrls);
    } catch (error) {
        console.error('Error processing the JSON file:', error);
        return undefined;
    } finally {
        if (browser) {
            await browser.close();
        }
    }
 }


 /**
 * Entry point: reads JSON from the given file, removes values that are dead URLs
 * (via processUrls) and overwrites the same file with the cleaned, pretty-printed JSON.
 * The file is left untouched when link checking yields no data.
 * Errors are logged rather than thrown.
 * @param filePath The path to the JSON file; it is read and then overwritten in place.
 * @returns The cleaned data, or undefined when reading, parsing, checking or writing failed.
 */
 export async function checkJsonFileForDeadLinks(filePath: string): Promise<unknown> {
    try {
        // Resolve once so the read and the write target the same absolute path.
        const resolvedPath = path.resolve(filePath);
        const fileData = await fs.readFile(resolvedPath, 'utf-8');

        const jsonData: unknown = JSON.parse(fileData);

        const cleanedData = await processUrls(jsonData);

        // JSON.stringify(undefined) is undefined, which fs.writeFile rejects; bail out explicitly.
        if (cleanedData === undefined) {
            console.error(`Link check produced no data; leaving ${filePath} unchanged.`);
            return undefined;
        }

        // Overwrite the input file with the cleaned JSON data.
        await fs.writeFile(resolvedPath, JSON.stringify(cleanedData, null, 2), 'utf-8');
        console.log(`Cleaned JSON has been overwritten to: ${filePath}`);

        return cleanedData;
    } catch (error) {
        console.error('Error processing the JSON file:', error);
        return undefined;
    }
 }
	import {promises as fs} from 'fs';
	import * as path from 'path';
	import puppeteer, {Browser} from 'puppeteer';
	import pLimit from "p-limit";
	import {chunkArray} from "../../common/arrayHelper";
	import {sleep} from "../../common/sleep";

	/** Outcome of probing a single URL, as produced by checkUrl. */
	interface URLResult {
	/** The URL exactly as it was handed to the checker. */
	originalUrl: string;
	/** URL reported by the navigation response; equals originalUrl when no response was obtained. */
	finalUrl: string;
	/** HTTP status of the navigation response, or 0 when navigation failed or produced no response. */
	status: number;
	/** True when status is in the range 200-399. */
	ok: boolean;
	}

	/**
	 * Collects every http/https URL that appears inside any string of a JSON-like value.
	 * Strings are scanned with a simple pattern, arrays and plain objects are walked
	 * depth-first in their natural order, and all other values contribute nothing.
	 * Repeated URLs are reported once per occurrence.
	 * @param data The JSON value to scan.
	 * @returns All URLs found, in traversal order.
	 */
	function parseUrls(data: unknown): string[] {
	    // A URL runs until whitespace, a quote character or an angle bracket.
	    const httpPattern = /https?:\/\/[^\s"'<>]+/g;

	    const collect = (node: unknown): string[] => {
	        if (typeof node === 'string') {
	            return node.match(httpPattern) ?? [];
	        }
	        if (Array.isArray(node)) {
	            return node.flatMap((child) => collect(child));
	        }
	        if (node !== null && typeof node === 'object') {
	            return Object.values(node).flatMap((child) => collect(child));
	        }
	        return [];
	    };

	    return collect(data);
	}

	/**
	 * Drops URLs that belong to an excluded domain (the domain itself or any of its subdomains).
	 * Matching is done on a label boundary, so `notlicense-token.com` is NOT treated as part of
	 * `license-token.com`.
	 * @param urls The array of URLs to filter.
	 * @param endsWith The domain to exclude; defaults to `license-token.com`.
	 * @returns A new array containing only parseable URLs outside the excluded domain.
	 */
	function filterUrls(urls: string[], endsWith: string = 'license-token.com'): string[] {
	    // URL hostnames are lower-cased by the parser, so normalise the domain the same way.
	    const domain = endsWith.toLowerCase().replace(/^\.+/, '');
	    const subdomainSuffix = `.${domain}`;

	    return urls.filter((url) => {
	        let hostname: string;
	        try {
	            hostname = new URL(url).hostname;
	        } catch {
	            // Exclude any URL that cannot be parsed.
	            return false;
	        }
	        return hostname !== domain && !hostname.endsWith(subdomainSuffix);
	    });
	}

	/**
	 * Probes one URL with Puppeteer: opens a fresh page, sets a desktop Chrome user agent,
	 * navigates to the URL and records the status and URL of the navigation response.
	 * Navigation errors (including the 5 second timeout) are logged and reported as status 0.
	 * The page is closed on every path, including when configuring the page fails.
	 * @param url The URL to check.
	 * @param browser The Puppeteer browser instance.
	 * @returns The original URL, final URL, HTTP status code, and whether the status is in 200-399.
	 */
	async function checkUrl(url: string, browser: Browser): Promise<URLResult> {
	    const page = await browser.newPage();

	    let status = 0;
	    let finalUrl = url;

	    try {
	        // Emulate a typical browser by setting a standard Chrome user agent.
	        // This sits inside the outer try so the page is still closed if it throws.
	        await page.setUserAgent(
	            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
	        );

	        try {
	            console.log(`Checking URL: ${url}`);
	            const response = await page.goto(url, {
	                waitUntil: 'domcontentloaded',
	                timeout: 5000,
	            });
	            if (response) {
	                status = response.status();
	                finalUrl = response.url();
	            }
	        } catch (error) {
	            // A failed navigation leaves status at 0, which is reported as not ok.
	            console.error(`Error processing ${url}:`, error);
	        }
	    } finally {
	        try {
	            await page.close();
	        } catch (closeError) {
	            // A failed close must not discard the result or mask an earlier error.
	            console.error(`Error closing page for ${url}:`, closeError);
	        }
	    }

	    const ok = status >= 200 && status < 400;
	    return {originalUrl: url, finalUrl, status, ok};
	}

	/**
	 * Returns a copy of a JSON structure with dead URLs removed.
	 * • An array element that is exactly a dead URL is dropped from the array.
	 * • A property whose value is exactly a dead URL is dropped from its object.
	 * • Strings that merely contain a dead URL are kept unchanged.
	 * Objects are rebuilt with Object.fromEntries so that every key, including `__proto__`,
	 * is kept as an own property instead of being lost through plain assignment.
	 * @param data The JSON data to clean.
	 * @param deadUrls A Set of dead URLs.
	 * @returns The cleaned JSON data, or undefined when `data` itself is a dead URL string.
	 */
	function cleanJsonData(data: unknown, deadUrls: Set<string>): unknown {
	    if (typeof data === 'string') {
	        // undefined is the "remove me" signal for the enclosing array or object.
	        return deadUrls.has(data) ? undefined : data;
	    }

	    if (Array.isArray(data)) {
	        return data
	            .map((item) => cleanJsonData(item, deadUrls))
	            .filter((item) => item !== undefined);
	    }

	    if (data !== null && typeof data === 'object') {
	        const keptEntries = Object.entries(data)
	            .map(([key, value]): [string, unknown] => [key, cleanJsonData(value, deadUrls)])
	            .filter(([, value]) => value !== undefined);
	        return Object.fromEntries(keptEntries);
	    }

	    return data;
	}

	/**
	 * Checks every external URL found in already-parsed JSON and returns a cleaned copy.
	 * Steps: extract URLs, drop license-token.com URLs and duplicates, probe the rest with
	 * Puppeteer in batches, then remove values that are exactly a dead URL.
	 * This function performs no file I/O.
	 * @param jsonData The parsed JSON value to inspect.
	 * @returns The cleaned JSON data, or undefined when processing failed (the error is logged).
	 */
	async function processUrls(jsonData: unknown): Promise<unknown> {
	    let browser: Browser | null = null;
	    try {
	        const allUrls = parseUrls(jsonData);
	        const urlsToCheck = [...new Set(filterUrls(allUrls))]; // dedup and filter license-token.com
	        console.log('URLs to check:', urlsToCheck.length);

	        // Nothing to probe: skip the browser launch entirely.
	        if (urlsToCheck.length === 0) {
	            return cleanJsonData(jsonData, new Set<string>());
	        }

	        // Launch Puppeteer's browser instance.
	        const activeBrowser = await puppeteer.launch({
	            headless: true,
	            protocolTimeout: 20_000
	        });
	        browser = activeBrowser;

	        const limit = 12;
	        // One limiter shared by all checks; a limiter created per URL would not limit anything.
	        const limiter = pLimit(limit);

	        const results: URLResult[] = [];

	        for (const chunk of chunkArray(urlsToCheck, limit)) {
	            const chunkResults = await Promise.all(
	                chunk.map((url) => limiter(() => checkUrl(url, activeBrowser)))
	            );
	            results.push(...chunkResults);
	            // Log only this batch; logging the cumulative list repeats earlier output every round.
	            console.log('Results:', chunkResults);
	            // Pause between batches.
	            await sleep(1000);
	        }

	        // Build a set of dead URLs (where ok is false).
	        const deadUrls = new Set<string>(
	            results.filter((result) => !result.ok).map((result) => result.originalUrl)
	        );

	        console.log('Dead URLs found:', [...deadUrls]);

	        // Clean the JSON data by removing dead URL entries.
	        return cleanJsonData(jsonData, deadUrls);
	    } catch (error) {
	        console.error('Error processing the JSON file:', error);
	        return undefined;
	    } finally {
	        if (browser) {
	            await browser.close();
	        }
	    }
	}


	/**
	 * Entry point: reads JSON from the given file, removes values that are dead URLs
	 * (via processUrls) and overwrites the same file with the cleaned, pretty-printed JSON.
	 * The file is left untouched when link checking yields no data.
	 * Errors are logged rather than thrown.
	 * @param filePath The path to the JSON file; it is read and then overwritten in place.
	 * @returns The cleaned data, or undefined when reading, parsing, checking or writing failed.
	 */
	export async function checkJsonFileForDeadLinks(filePath: string): Promise<unknown> {
	    try {
	        // Resolve once so the read and the write target the same absolute path.
	        const resolvedPath = path.resolve(filePath);
	        const fileData = await fs.readFile(resolvedPath, 'utf-8');

	        const jsonData: unknown = JSON.parse(fileData);

	        const cleanedData = await processUrls(jsonData);

	        // JSON.stringify(undefined) is undefined, which fs.writeFile rejects; bail out explicitly.
	        if (cleanedData === undefined) {
	            console.error(`Link check produced no data; leaving ${filePath} unchanged.`);
	            return undefined;
	        }

	        // Overwrite the input file with the cleaned JSON data.
	        await fs.writeFile(resolvedPath, JSON.stringify(cleanedData, null, 2), 'utf-8');
	        console.log(`Cleaned JSON has been overwritten to: ${filePath}`);

	        return cleanedData;
	    } catch (error) {
	        console.error('Error processing the JSON file:', error);
	        return undefined;
	    }
	}
No results found