Last active
March 14, 2025 07:17
-
-
Save schmichri/bb38f454bd74abff870703706ce4740b to your computer and use it in GitHub Desktop.
Reads all URLs from a JSON file and uses Puppeteer to check whether each link is dead. Dead links are removed from the JSON file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import {promises as fs} from 'fs'; | |
| import * as path from 'path'; | |
| import puppeteer, {Browser} from 'puppeteer'; | |
| import pLimit from "p-limit"; | |
| import {chunkArray} from "../../common/arrayHelper"; | |
| import {sleep} from "../../common/sleep"; | |
/** Outcome of checking a single URL with the browser. */
interface URLResult {
  /** The URL exactly as it was handed to the checker. */
  originalUrl: string;
  /** URL reported by the navigation response; equals `originalUrl` when no response was obtained. */
  finalUrl: string;
  /** HTTP status code of the navigation response; 0 when no response was obtained. */
  status: number;
  /** True when `status` lies in the range 200–399. */
  ok: boolean;
}
/**
 * Collects every HTTP/HTTPS URL that appears inside any string of a JSON value.
 *
 * Strings are scanned with a simple regex, arrays and plain objects are walked
 * depth-first in their natural order, and every other value is ignored.
 * Object keys are not scanned. Duplicates are kept.
 *
 * @param data A JSON value to traverse.
 * @returns All URLs found, in traversal order.
 */
function parseUrls(data: unknown): string[] {
  // A URL runs until the first whitespace, quote or angle bracket.
  const urlPattern = /https?:\/\/[^\s"'<>]+/g;

  const collect = (node: unknown): string[] => {
    if (typeof node === 'string') {
      return node.match(urlPattern) ?? [];
    }
    if (Array.isArray(node)) {
      return node.flatMap((child) => collect(child));
    }
    if (node !== null && typeof node === 'object') {
      return Object.values(node).flatMap((child) => collect(child));
    }
    // Numbers, booleans, null and undefined cannot contain URLs.
    return [];
  };

  return collect(data);
}
/**
 * Removes URLs that belong to a given domain (the domain itself or any of its
 * subdomains), as well as URLs that cannot be parsed.
 *
 * The host is matched on label boundaries: with the default domain,
 * `license-token.com` and `www.license-token.com` are excluded, while an
 * unrelated host such as `notlicense-token.com` is kept. Matching is
 * case-insensitive and ignores a trailing dot on the host name. An empty
 * domain excludes nothing (unparseable URLs are still dropped).
 *
 * @param urls The array of URLs to filter.
 * @param endsWith The domain to exclude; defaults to `license-token.com`.
 * @returns A new array containing only parseable URLs outside that domain.
 */
function filterUrls(urls: string[], endsWith: string = 'license-token.com'): string[] {
  // Normalise the domain once: lower-case, without leading/trailing dots.
  const domain = endsWith.trim().toLowerCase().replace(/^\.+/, '').replace(/\.+$/, '');

  return urls.filter((url) => {
    let hostname: string;
    try {
      hostname = new URL(url).hostname.toLowerCase().replace(/\.$/, '');
    } catch {
      // Exclude any URL that cannot be parsed.
      return false;
    }
    // A plain suffix test would also match "notlicense-token.com";
    // require an exact match or a dot-delimited subdomain instead.
    const belongsToDomain = hostname === domain || hostname.endsWith(`.${domain}`);
    return !belongsToDomain;
  });
}
/**
 * Uses Puppeteer to check a single URL.
 *
 * Opens a new page, sets a desktop Chrome user agent, navigates to the URL and
 * records the status code and URL of the navigation response. Navigation
 * errors (including timeouts) are logged and reported as `status: 0`,
 * `ok: false`; they do not reject.
 *
 * The page is always closed, even when setting the user agent fails. A failure
 * while closing the page is logged instead of discarding the result.
 *
 * @param url The URL to check.
 * @param browser The Puppeteer browser instance.
 * @param timeoutMs Navigation timeout in milliseconds (default 5000).
 * @returns The original URL, the response URL, the HTTP status code and whether
 *          the status is in the 200–399 range.
 * @throws If the page cannot be created or the user agent cannot be set.
 */
async function checkUrl(url: string, browser: Browser, timeoutMs: number = 5000): Promise<URLResult> {
  const page = await browser.newPage();
  let status = 0;
  let finalUrl = url;

  try {
    // Emulate a typical browser by setting a standard Chrome user agent.
    // Done inside the try so the page is closed if this call fails.
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
    );

    try {
      console.log(`Checking URL: ${url}`);
      const response = await page.goto(url, {
        waitUntil: 'domcontentloaded',
        timeout: timeoutMs,
      });
      if (response) {
        status = response.status();
        finalUrl = response.url();
      }
    } catch (error) {
      // Navigation failures leave status at 0, which marks the URL as not ok.
      console.error(`Error processing ${url}:`, error);
    }
  } finally {
    try {
      await page.close();
    } catch (closeError) {
      // A page that fails to close must not throw away the result.
      console.error(`Error closing page for ${url}:`, closeError);
    }
  }

  const ok = status >= 200 && status < 400;
  return {originalUrl: url, finalUrl, status, ok};
}
/**
 * Recursively builds a copy of a JSON structure with dead URLs removed.
 *  • A string that is exactly a dead URL is removed (the function returns `undefined` for it).
 *  • An array element that is exactly a dead URL is dropped from the array.
 *  • A property whose value is exactly a dead URL is dropped from the object.
 * Strings that merely contain a dead URL inside longer text are left untouched.
 *
 * Objects are rebuilt with `Object.fromEntries`, so every surviving key —
 * including a JSON key literally named `__proto__` — stays an own property of
 * the result instead of being interpreted as a prototype assignment.
 *
 * @param data The JSON data to clean.
 * @param deadUrls A Set of dead URLs.
 * @returns The cleaned JSON data, or `undefined` when `data` itself is a dead URL.
 */
function cleanJsonData(data: unknown, deadUrls: Set<string>): unknown {
  if (typeof data === 'string') {
    // Signal removal of an exact dead URL by returning undefined.
    return deadUrls.has(data) ? undefined : data;
  }

  if (Array.isArray(data)) {
    return data
      .map((item) => cleanJsonData(item, deadUrls))
      .filter((item) => item !== undefined); // Remove elements that became undefined.
  }

  if (data !== null && typeof data === 'object') {
    const keptEntries: [string, unknown][] = [];
    for (const [key, value] of Object.entries(data)) {
      const cleanedValue = cleanJsonData(value, deadUrls);
      // Only keep properties that were not removed.
      if (cleanedValue !== undefined) {
        keptEntries.push([key, cleanedValue]);
      }
    }
    return Object.fromEntries(keptEntries);
  }

  // Numbers, booleans and null pass through unchanged.
  return data;
}
/**
 * Checks every outgoing link found in already-parsed JSON data and returns a
 * cleaned copy of that data.
 *
 * Steps: extract URLs, drop license-token.com URLs and duplicates, check the
 * remaining URLs with Puppeteer in batches, then remove every entry that is
 * exactly a dead URL. This function performs no file I/O.
 *
 * When there is nothing to check, no browser is launched. The browser is
 * always closed once launched. Errors (for example a failed browser launch)
 * propagate to the caller instead of being turned into an `undefined` result.
 *
 * @param jsonData The parsed JSON value to process.
 * @returns The cleaned JSON data.
 * @throws If the browser cannot be launched or a check fails unexpectedly.
 */
async function processUrls(jsonData: unknown): Promise<unknown> {
  const allUrls = parseUrls(jsonData);
  const urlsToCheck = [...new Set(filterUrls(allUrls))]; // dedup and filter license-token.com
  console.log('URLs to check:', urlsToCheck.length);

  if (urlsToCheck.length === 0) {
    // Nothing to verify: skip the browser and return the data with nothing removed.
    return cleanJsonData(jsonData, new Set<string>());
  }

  const batchSize = 12;
  // One shared limiter; creating a new limiter per URL would not cap anything.
  const limiter = pLimit(batchSize);

  // Launch Puppeteer's browser instance.
  const browser = await puppeteer.launch({
    headless: true,
    protocolTimeout: 20_000
  });

  try {
    const results: URLResult[] = [];
    let isFirstBatch = true;
    for (const chunk of chunkArray(urlsToCheck, batchSize)) {
      // Pause between batches only, not after the final one.
      if (!isFirstBatch) {
        await sleep(1000);
      }
      isFirstBatch = false;

      const batchResults = await Promise.all(
        chunk.map((url) => limiter(() => checkUrl(url, browser)))
      );
      results.push(...batchResults);
      console.log('Results:', results);
    }

    // Build a set of dead URLs (where ok is false).
    const deadUrls = new Set<string>(
      results.filter((result) => !result.ok).map((result) => result.originalUrl)
    );
    console.log('Dead URLs found:', [...deadUrls]);

    // Clean the JSON data by removing dead URL entries.
    return cleanJsonData(jsonData, deadUrls);
  } finally {
    await browser.close();
  }
}
/**
 * Reads a JSON file, removes every entry that is exactly a dead outgoing link
 * and overwrites the same file with the cleaned JSON (2-space indentation).
 *
 * Any failure (unreadable file, invalid JSON, link checking error, or a
 * cleaning result that cannot be serialised) is logged; in that case the file
 * is left untouched and `undefined` is returned.
 *
 * @param filePath The path to the JSON file; it is read and then overwritten in place.
 * @returns The cleaned JSON data, or `undefined` when processing failed.
 */
export async function checkJsonFileForDeadLinks(filePath: string): Promise<unknown> {
  // Resolve once so that reading and writing target the same absolute path.
  const resolvedPath = path.resolve(filePath);
  try {
    const fileData = await fs.readFile(resolvedPath, 'utf-8');
    const jsonData: unknown = JSON.parse(fileData);
    const cleanedData = await processUrls(jsonData);

    // JSON.stringify(undefined) yields undefined, which must never be written to disk.
    if (cleanedData === undefined) {
      throw new Error(`Link check produced no data for ${filePath}; file left unchanged`);
    }

    // Overwrite the original file with the cleaned JSON data.
    await fs.writeFile(resolvedPath, JSON.stringify(cleanedData, null, 2), 'utf-8');
    console.log(`Cleaned JSON has been overwritten to: ${filePath}`);
    return cleanedData;
  } catch (error) {
    console.error('Error processing the JSON file:', error);
    return undefined;
  }
}
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
We use this to check outgoing links on https://www.license-token.com/wiki.