@yukiarimo · Created November 20, 2025
Light Novel Web Scraper (from ncode.syosetu.com)
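A single-file Python script that walks a serialized novel on ncode.syosetu.com chapter by chapter, following each page's 'Next' pager link, and writes the text to one UTF-8 file. It requires the requests and beautifulsoup4 packages (pip install requests beautifulsoup4).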
import requests
from bs4 import BeautifulSoup
import time


def sanitize_filename(filename):
    """Removes characters that are invalid for filenames."""
    return "".join(c for c in filename if c.isalnum() or c in (' ', '-', '_')).rstrip()


def scrape_novel(novel_code):
    """
    Scrapes a novel from ncode.syosetu.com chapter by chapter.

    Args:
        novel_code (str): The unique code for the novel (e.g., 'n9629ex').
    """
    base_url = "https://ncode.syosetu.com"
    start_url = f"{base_url}/{novel_code}/1/"

    # Use a session object to be more efficient and simulate a real user
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    })

    print(f"Starting scrape for novel code: {novel_code}")

    # --- Step 1: Get the main novel title for the filename ---
    try:
        main_page_url = f"{base_url}/{novel_code}/"
        print(f"Fetching novel title from: {main_page_url}")
        response = session.get(main_page_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        novel_title_element = soup.find('h1', class_='p-novel__title')
        if novel_title_element:
            novel_title = novel_title_element.get_text(strip=True)
        else:
            # Fallback title if the main one can't be found
            novel_title = novel_code
        output_filename = f"{sanitize_filename(novel_title)}.txt"
        print(f"Novel Title: '{novel_title}'")
        print(f"Saving to: '{output_filename}'")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching novel title: {e}")
        # Use the novel code as a fallback filename
        output_filename = f"{novel_code}.txt"
        novel_title = novel_code

    # --- Step 2: Scrape all chapters ---
    current_url = start_url
    chapter_count = 0

    with open(output_filename, "w", encoding="utf-8") as file:
        file.write(f"Title: {novel_title}\n")
        file.write("=" * (len(novel_title) + 7) + "\n\n")

        while current_url:
            try:
                print(f"Scraping chapter from: {current_url}")
                response = session.get(current_url)
                # Check if the request was successful
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')

                # Extract the chapter title
                chapter_title_element = soup.find('h1', class_='p-novel__title--rensai')
                chapter_title = chapter_title_element.get_text(strip=True) if chapter_title_element else "Untitled Chapter"

                # Extract the chapter body
                novel_body = soup.find('div', class_='js-novel-text')
                if not novel_body:
                    print(f"Warning: Could not find chapter text on {current_url}. Skipping.")
                    # Attempt to find the next link anyway
                else:
                    # Write title and a separator
                    file.write(f"## {chapter_title} ##\n\n")

                    # Write each paragraph from the chapter body
                    paragraphs = novel_body.find_all('p')
                    for p in paragraphs:
                        # get_text with strip=True cleans up whitespace
                        line = p.get_text(strip=True)
                        if line:  # Avoid writing empty lines
                            file.write(line + '\n')

                    # Add a clear separator between chapters
                    file.write("\n\n---\n\n")
                    chapter_count += 1

                # Find the 'Next' button to continue to the next chapter
                next_link_element = soup.find('a', class_='c-pager__item--next')
                if next_link_element and next_link_element.get('href'):
                    # The href is relative (e.g., '/n9629ex/2/'), so we join it with the base URL
                    next_path = next_link_element['href']
                    current_url = f"{base_url}{next_path}"
                else:
                    # If there's no 'Next' button, we've reached the end
                    current_url = None

                # Be a polite scraper and wait a second between requests
                time.sleep(1)
            except requests.exceptions.HTTPError as e:
                print(f"HTTP Error on {current_url}: {e}")
                print("This might be the end of the novel or a temporary issue. Stopping.")
                break
            except requests.exceptions.RequestException as e:
                print(f"A network error occurred: {e}")
                print("Pausing for 10 seconds before retrying...")
                time.sleep(10)  # Wait longer if there's a network issue

    print("\nScraping complete!")
    print(f"Successfully saved {chapter_count} chapters to '{output_filename}'")


# --- Main execution block ---
if __name__ == "__main__":
    # The code for the novel from the URL (https://ncode.syosetu.com/n9629ex/)
    NOVEL_ID = "n9629ex"
    scrape_novel(NOVEL_ID)
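If you would rather pass the novel code on the command line than edit NOVEL_ID, here is a minimal sketch of an alternative entry point. The argument handling and the scraper.py filename are assumptions for illustration, not part of the original script:

import sys

if __name__ == "__main__":
    # Hypothetical CLI wrapper (not in the original gist): take the novel
    # code from the first argument, falling back to the hard-coded default.
    # Usage: python scraper.py n9629ex
    novel_id = sys.argv[1] if len(sys.argv) > 1 else "n9629ex"
    scrape_novel(novel_id)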