Skip to content

Instantly share code, notes, and snippets.

@mekarpeles
Created January 5, 2026 14:01
Show Gist options
  • Select an option

  • Save mekarpeles/b4b35306ccb0d70ad1ebc9d7892ca839 to your computer and use it in GitHub Desktop.

Select an option

Save mekarpeles/b4b35306ccb0d70ad1ebc9d7892ca839 to your computer and use it in GitHub Desktop.
Research: goodreads top book extraction
/**
 * Extracts key information from a single Goodreads BookListItem HTML element.
 *
 * The title link is the only mandatory piece of data: when it is missing the
 * function returns null. Every other field is optional and stays null (or an
 * empty array for `authors`) when its element cannot be found.
 *
 * @param {HTMLElement} bookArticleElement The <article class="BookListItem"> element.
 * @returns {object|null} An object with `rank`, `title`, `url`, `authors`,
 *   `averageRating`, `ratingsCount`, `shelvingsCount` and `coverUrl`, or null
 *   if the argument is not an <article> element or the title link is missing.
 */
function extractBookData(bookArticleElement) {
  // Check that tagName is a string before lower-casing it, so a non-element
  // argument is reported and rejected instead of throwing a TypeError.
  const isArticle =
    Boolean(bookArticleElement) &&
    typeof bookArticleElement.tagName === 'string' &&
    bookArticleElement.tagName.toLowerCase() === 'article';
  if (!isArticle) {
    console.error("Invalid element provided. Must be an <article> element.");
    return null;
  }

  const bookData = {
    rank: null,
    title: null,
    url: null,
    authors: [],
    averageRating: null, // numeric average, e.g. 3.84
    ratingsCount: null, // abbreviated count string, e.g. "44.6k"
    shelvingsCount: null, // abbreviated count string, e.g. "156k"
    coverUrl: null
  };

  // --- 1. Rank (e.g. "#200" -> "200"; kept as a string) ---
  const rankElement = bookArticleElement.querySelector('.BookListItemRank h2');
  if (rankElement) {
    bookData.rank = rankElement.textContent.trim().replace('#', '');
  }

  // --- 2. Title and URL (essential: bail out when missing) ---
  const titleLink = bookArticleElement.querySelector('a[data-testid="bookTitle"]');
  if (!titleLink) {
    console.warn("Could not find book title/URL for an item.");
    return null;
  }
  bookData.title = titleLink.textContent.trim();
  bookData.url = titleLink.href;

  // --- 3. Author(s): every '.ContributorLink__name' inside the item ---
  bookData.authors = Array.from(
    bookArticleElement.querySelectorAll('.ContributorLink__name'),
    (nameEl) => nameEl.textContent.trim()
  );

  // --- 4. Average rating (e.g. 3.84) ---
  const averageRatingValue = bookArticleElement.querySelector('[data-testid="ratingValue"] .Text__semibold');
  if (averageRatingValue) {
    const parsedRating = Number.parseFloat(averageRatingValue.textContent.trim());
    // Keep the "null means unknown" convention instead of storing NaN.
    bookData.averageRating = Number.isNaN(parsedRating) ? null : parsedRating;
  }

  // --- 5. Ratings count (e.g. "44.6k ratings" -> "44.6k") ---
  const ratingsCountElement = bookArticleElement.querySelector('[data-testid="ratingsCount"] .Text__subdued');
  if (ratingsCountElement) {
    // Strip the trailing label in both plural and singular form ("1 rating").
    bookData.ratingsCount = ratingsCountElement.textContent
      .trim()
      .replace(/\s*ratings?\s*$/i, '');
  }

  // --- 6. Shelvings count (e.g. "156k") ---
  // Matched against the whole rating container text rather than a dedicated element.
  const shelvingsContainer = bookArticleElement.querySelector('.BookListItemRating');
  if (shelvingsContainer) {
    // A number with optional separators and an optional k/m suffix, followed
    // by "shelving(s)". The suffix is optional so that counts below one
    // thousand (e.g. "532 shelvings") are captured too.
    const match = shelvingsContainer.textContent.match(/(\d[\d.,]*[km]?)\s*shelvings?/i);
    if (match) {
      bookData.shelvingsCount = match[1];
    }
  }

  // --- 7. Cover URL: first srcset candidate, falling back to src ---
  const coverImage = bookArticleElement.querySelector('.BookCover__image img.ResponsiveImage');
  if (coverImage) {
    const srcset = coverImage.getAttribute('srcset');
    // Trim before splitting on whitespace: a candidate with leading
    // whitespace would otherwise produce an empty URL.
    const firstCandidateUrl = srcset ? srcset.split(',')[0].trim().split(/\s+/)[0] : '';
    bookData.coverUrl = firstCandidateUrl || coverImage.src || null;
  }

  return bookData;
}
/**
 * Runs the extraction on every book record found on the page, or inside a
 * given container.
 *
 * Items for which extractBookData() returns null are left out of the result.
 *
 * @param {Document|Element} [root=document] Node whose descendants are searched
 *   for `article.BookListItem` elements. Defaults to the whole document.
 * @returns {object[]} The extracted book objects, in document order.
 */
function extractAllBookData(root = document) {
  const bookElements = Array.from(root.querySelectorAll('article.BookListItem'));
  return bookElements
    .map((element) => extractBookData(element))
    .filter(Boolean);
}
/**
 * Executes the book data extraction and automatically triggers a JSON file download.
 *
 * Nothing is downloaded (and a warning is logged) when the extraction yields
 * no books.
 *
 * @param {string} filename The name for the downloaded file (e.g., 'goodreads_data.json').
 * @returns {void}
 */
function downloadBookDataAsJson(filename = 'goodreads_extracted_data.json') {
  // Delay before releasing the object URL. Revoking it synchronously right
  // after click() can cancel the download in browsers that start fetching the
  // blob asynchronously.
  const REVOKE_DELAY_MS = 1000;

  // 1. Extract the data
  const dataArray = extractAllBookData();
  if (dataArray.length === 0) {
    console.warn("No book data was extracted. Download cancelled.");
    return;
  }

  // 2. Serialize as pretty-printed JSON and wrap it in a Blob
  const jsonString = JSON.stringify(dataArray, null, 2);
  const blob = new Blob([jsonString], { type: 'application/json' });

  // 3. Point a temporary link at the Blob and configure the download
  const url = URL.createObjectURL(blob);
  const a = document.createElement('a');
  a.href = url;
  a.download = filename;

  // 4. Simulate a click on the link to trigger the download
  document.body.appendChild(a); // Append to the body (necessary for some browsers)
  try {
    a.click();
  } finally {
    // 5. Always clean up, even if click() throws
    document.body.removeChild(a);
    setTimeout(() => URL.revokeObjectURL(url), REVOKE_DELAY_MS);
  }

  console.log(`✅ Success! Download of '${filename}' should have started.`);
  console.log(`Total books extracted: ${dataArray.length}`);
}
// EXECUTION: run the extraction and trigger the JSON download.
// downloadBookDataAsJson() calls extractAllBookData() itself, so a separate
// extractAllBookData() call here would only repeat the work (and any warnings)
// and discard the result.
downloadBookDataAsJson();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment