Created
January 5, 2026 14:01
-
-
Save mekarpeles/b4b35306ccb0d70ad1ebc9d7892ca839 to your computer and use it in GitHub Desktop.
Research: Goodreads top book extraction
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Extracts key information from a single Goodreads BookListItem HTML element.
 *
 * Only the title link is mandatory. Every other field stays `null` (or `[]`
 * for authors) when its element cannot be found inside the article.
 *
 * @param {HTMLElement} bookArticleElement The <article class="BookListItem"> element.
 * @returns {object|null} An object with `rank`, `title`, `url`, `authors`,
 *   `averageRating`, `ratingsCount`, `shelvingsCount` and `coverUrl`, or null
 *   if the argument is not an <article> element or the title link is missing.
 */
function extractBookData(bookArticleElement) {
  // Guard against null/undefined AND against objects that carry no tagName,
  // so an invalid argument yields null instead of a TypeError.
  const tagName = bookArticleElement && bookArticleElement.tagName;
  if (typeof tagName !== 'string' || tagName.toLowerCase() !== 'article') {
    console.error("Invalid element provided. Must be an <article> element.");
    return null;
  }

  // --- Title and URL (essential: bail out early when absent) ---
  const titleLink = bookArticleElement.querySelector('a[data-testid="bookTitle"]');
  if (!titleLink) {
    console.warn("Could not find book title/URL for an item.");
    return null;
  }

  // Key order is kept stable because it determines the JSON output layout.
  const bookData = {
    rank: null,
    title: titleLink.textContent.trim(),
    url: titleLink.href,
    authors: [],
    averageRating: null, // numeric, e.g. 3.84
    ratingsCount: null, // abbreviated string, e.g. "44.6k"
    shelvingsCount: null, // abbreviated string, e.g. "156k"
    coverUrl: null,
  };

  // --- Rank: "#200" -> "200" (kept as a string) ---
  const rankElement = bookArticleElement.querySelector('.BookListItemRank h2');
  if (rankElement) {
    bookData.rank = rankElement.textContent.trim().replace('#', '');
  }

  // --- Author(s): one entry per '.ContributorLink__name' element ---
  bookData.authors = Array.from(
    bookArticleElement.querySelectorAll('.ContributorLink__name'),
    (nameEl) => nameEl.textContent.trim(),
  );

  // --- Average rating (e.g. 3.84) ---
  const averageRatingValue = bookArticleElement.querySelector('[data-testid="ratingValue"] .Text__semibold');
  if (averageRatingValue) {
    const parsed = Number.parseFloat(averageRatingValue.textContent.trim());
    // Unparseable text stays null rather than storing NaN.
    bookData.averageRating = Number.isNaN(parsed) ? null : parsed;
  }

  // --- Ratings count: "44.6k ratings" -> "44.6k", "1 rating" -> "1" ---
  const ratingsCountElement = bookArticleElement.querySelector('[data-testid="ratingsCount"] .Text__subdued');
  if (ratingsCountElement) {
    bookData.ratingsCount = ratingsCountElement.textContent.trim().replace(/\s*ratings?$/i, '');
  }

  // --- Shelvings count (e.g. "156k", "1.2m", "532") ---
  // Matched against the whole rating container text. The k/m suffix is
  // optional so counts below one thousand are captured too, and the singular
  // "shelving" is accepted.
  const shelvingsContainer = bookArticleElement.querySelector('.BookListItemRating');
  if (shelvingsContainer) {
    const match = shelvingsContainer.textContent.match(/(\d[\d.,]*[km]?)\s*shelvings?/i);
    if (match) {
      bookData.shelvingsCount = match[1];
    }
  }

  // --- Cover URL: first srcset candidate, else the plain src attribute ---
  const coverImage = bookArticleElement.querySelector('.BookCover__image img.ResponsiveImage');
  if (coverImage) {
    const srcset = coverImage.getAttribute('srcset');
    // Trim before splitting: a srcset with leading whitespace would otherwise
    // produce an empty first token.
    const firstCandidate = srcset ? srcset.split(',')[0].trim().split(/\s+/)[0] : '';
    bookData.coverUrl = firstCandidate || coverImage.src || null;
  }

  return bookData;
}
/**
 * Main function to run the extraction on the entire page (or a container).
 *
 * Every `article.BookListItem` found under `root` is passed to
 * `extractBookData`; items for which it returns a falsy value are skipped.
 *
 * @param {Document|Element} [root=document] Node whose descendants are
 *   searched. Defaults to the whole page.
 * @returns {object[]} Extracted book records, in the order
 *   `querySelectorAll` returned their elements.
 */
function extractAllBookData(root = document) {
  const results = [];
  root.querySelectorAll('article.BookListItem').forEach((element) => {
    const data = extractBookData(element);
    if (data) {
      results.push(data);
    }
  });
  return results;
}
/**
 * Executes the book data extraction and automatically triggers a JSON file download.
 *
 * Does nothing beyond logging a warning when the extraction yields no records.
 *
 * @param {string} filename The name for the downloaded file (e.g., 'goodreads_data.json').
 * @returns {void}
 */
function downloadBookDataAsJson(filename = 'goodreads_extracted_data.json') {
  // 1. Extract the data
  const dataArray = extractAllBookData();
  if (dataArray.length === 0) {
    console.warn("No book data was extracted. Download cancelled.");
    return;
  }

  // 2. Serialize as pretty-printed JSON and wrap it in a Blob
  const jsonString = JSON.stringify(dataArray, null, 2);
  const blob = new Blob([jsonString], { type: 'application/json' });

  // 3. Point a temporary link at the Blob
  const url = URL.createObjectURL(blob);
  const a = document.createElement('a');
  a.href = url;
  a.download = filename;

  // 4. Click the link while it is attached to the document
  //    (appending to the body is necessary for some browsers)
  document.body.appendChild(a);
  try {
    a.click();
  } finally {
    // Always detach the link, even if click() throws.
    document.body.removeChild(a);
    // Revoke the object URL a little later instead of synchronously, so the
    // browser has time to start reading the Blob for the download.
    setTimeout(() => URL.revokeObjectURL(url), 1000);
  }

  console.log(`✅ Success! Download of '${filename}' should have started.`);
  console.log(`Total books extracted: ${dataArray.length}`);
}
// EXECUTION: with the functions above defined, run the extraction and download.
// downloadBookDataAsJson() calls extractAllBookData() itself, so a separate
// call whose result is discarded would only repeat the work and its warnings.
downloadBookDataAsJson();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment