Created: November 20, 2025 23:11
Light Novel Web Scraper (from ncode.syosetu.com)
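The script below downloads a serialized novel from ncode.syosetu.com chapter by chapter, following the site's "Next" pager link, and appends everything to a single UTF-8 text file. As a quick orientation before the full listing, here is a minimal sketch of how the BeautifulSoup selectors it relies on behave; the HTML snippet is illustrative only (the class names are taken from the script itself, not from an actual downloaded page):

from bs4 import BeautifulSoup

# Illustrative markup only -- a real syosetu chapter page is much larger.
sample_html = """
<h1 class="p-novel__title p-novel__title--rensai">Chapter 1</h1>
<div class="js-novel-text"><p>First paragraph.</p><p>Second paragraph.</p></div>
<a class="c-pager__item--next" href="/n9629ex/2/">Next</a>
"""

soup = BeautifulSoup(sample_html, 'html.parser')
print(soup.find('h1', class_='p-novel__title--rensai').get_text(strip=True))  # Chapter 1
print([p.get_text(strip=True) for p in soup.find('div', class_='js-novel-text').find_all('p')])
print(soup.find('a', class_='c-pager__item--next')['href'])  # /n9629ex/2/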
import requests
from bs4 import BeautifulSoup
import time


def sanitize_filename(filename):
    """Removes characters that are invalid for filenames."""
    return "".join(c for c in filename if c.isalnum() or c in (' ', '-', '_')).rstrip()


def scrape_novel(novel_code):
    """
    Scrapes a novel from ncode.syosetu.com chapter by chapter.

    Args:
        novel_code (str): The unique code for the novel (e.g., 'n9629ex').
    """
    base_url = "https://ncode.syosetu.com"
    start_url = f"{base_url}/{novel_code}/1/"

    # Use a session object to be more efficient and simulate a real user
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    })

    print(f"Starting scrape for novel code: {novel_code}")

    # --- Step 1: Get the main novel title for the filename ---
    try:
        main_page_url = f"{base_url}/{novel_code}/"
        print(f"Fetching novel title from: {main_page_url}")
        response = session.get(main_page_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        novel_title_element = soup.find('h1', class_='p-novel__title')
        if novel_title_element:
            novel_title = novel_title_element.get_text(strip=True)
        else:
            # Fallback title if the main one can't be found
            novel_title = novel_code

        output_filename = f"{sanitize_filename(novel_title)}.txt"
        print(f"Novel Title: '{novel_title}'")
        print(f"Saving to: '{output_filename}'")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching novel title: {e}")
        # Use the novel code as a fallback filename
        output_filename = f"{novel_code}.txt"
        novel_title = novel_code

    # --- Step 2: Scrape all chapters ---
    current_url = start_url
    chapter_count = 0

    with open(output_filename, "w", encoding="utf-8") as file:
        file.write(f"Title: {novel_title}\n")
        file.write("=" * (len(novel_title) + 7) + "\n\n")

        while current_url:
            try:
                print(f"Scraping chapter from: {current_url}")
                response = session.get(current_url)
                # Check if the request was successful
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')

                # Extract the chapter title
                chapter_title_element = soup.find('h1', class_='p-novel__title--rensai')
                chapter_title = chapter_title_element.get_text(strip=True) if chapter_title_element else "Untitled Chapter"

                # Extract the chapter body
                novel_body = soup.find('div', class_='js-novel-text')
                if not novel_body:
                    print(f"Warning: Could not find chapter text on {current_url}. Skipping.")
                    # Attempt to find the next link anyway
                else:
                    # Write title and a separator
                    file.write(f"## {chapter_title} ##\n\n")

                    # Write each paragraph from the chapter body
                    paragraphs = novel_body.find_all('p')
                    for p in paragraphs:
                        # get_text with strip=True cleans up whitespace
                        line = p.get_text(strip=True)
                        if line:  # Avoid writing empty lines
                            file.write(line + '\n')

                    # Add a clear separator between chapters
                    file.write("\n\n---\n\n")
                    chapter_count += 1

                # Find the 'Next' button to continue to the next chapter
                next_link_element = soup.find('a', class_='c-pager__item--next')
                if next_link_element and next_link_element.get('href'):
                    # The href is relative (e.g., '/n9629ex/2/'), so we join it with the base URL
                    next_path = next_link_element['href']
                    current_url = f"{base_url}{next_path}"
                else:
                    # If there's no 'Next' button, we've reached the end
                    current_url = None

                # Be a polite scraper and wait a second between requests
                time.sleep(1)

            except requests.exceptions.HTTPError as e:
                print(f"HTTP Error on {current_url}: {e}")
                print("This might be the end of the novel or a temporary issue. Stopping.")
                break
            except requests.exceptions.RequestException as e:
                print(f"A network error occurred: {e}")
                print("Pausing for 10 seconds before retrying...")
                time.sleep(10)  # Wait longer if there's a network issue

    print("\nScraping complete!")
    print(f"Successfully saved {chapter_count} chapters to '{output_filename}'")


# --- Main execution block ---
if __name__ == "__main__":
    # The code for the novel from the URL (https://ncode.syosetu.com/n9629ex/)
    NOVEL_ID = "n9629ex"
    scrape_novel(NOVEL_ID)
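For reference, one way the function could be reused from another script, assuming the file above is saved as scrape_syosetu.py in the same directory (the second novel code below is a hypothetical placeholder, not a real work):

from scrape_syosetu import scrape_novel

# Download several novels in sequence; replace the placeholder codes
# with the ncode identifiers of the works you actually want.
for code in ["n9629ex", "n0000aa"]:  # 'n0000aa' is a placeholder
    scrape_novel(code)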