Created: November 20, 2025 23:11
Light Novel Web Scraper (from ncode.syosetu.com)
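The script below downloads a serialized novel from ncode.syosetu.com chapter by chapter, following the site's "Next" pager link, and appends everything to a single UTF-8 text file. As a quick orientation before the full listing, here is a minimal sketch of how the BeautifulSoup selectors it relies on behave; the HTML snippet is illustrative only (the class names are taken from the script itself, not from an actual downloaded page):

from bs4 import BeautifulSoup

# Illustrative markup only -- a real syosetu chapter page is much larger.
sample_html = """
<h1 class="p-novel__title p-novel__title--rensai">Chapter 1</h1>
<div class="js-novel-text"><p>First paragraph.</p><p>Second paragraph.</p></div>
<a class="c-pager__item--next" href="/n9629ex/2/">Next</a>
"""

soup = BeautifulSoup(sample_html, 'html.parser')
print(soup.find('h1', class_='p-novel__title--rensai').get_text(strip=True))  # Chapter 1
print([p.get_text(strip=True) for p in soup.find('div', class_='js-novel-text').find_all('p')])
print(soup.find('a', class_='c-pager__item--next')['href'])  # /n9629ex/2/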
import requests
from bs4 import BeautifulSoup
import time


def sanitize_filename(filename):
    """Removes characters that are invalid for filenames."""
    return "".join(c for c in filename if c.isalnum() or c in (' ', '-', '_')).rstrip()


def scrape_novel(novel_code):
    """
    Scrapes a novel from ncode.syosetu.com chapter by chapter.

    Args:
        novel_code (str): The unique code for the novel (e.g., 'n9629ex').
    """
    base_url = "https://ncode.syosetu.com"
    start_url = f"{base_url}/{novel_code}/1/"

    # Use a session object to be more efficient and simulate a real user
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    })

    print(f"Starting scrape for novel code: {novel_code}")

    # --- Step 1: Get the main novel title for the filename ---
    try:
        main_page_url = f"{base_url}/{novel_code}/"
        print(f"Fetching novel title from: {main_page_url}")
        response = session.get(main_page_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        novel_title_element = soup.find('h1', class_='p-novel__title')
        if novel_title_element:
            novel_title = novel_title_element.get_text(strip=True)
        else:
            # Fallback title if the main one can't be found
            novel_title = novel_code

        output_filename = f"{sanitize_filename(novel_title)}.txt"
        print(f"Novel Title: '{novel_title}'")
        print(f"Saving to: '{output_filename}'")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching novel title: {e}")
        # Use the novel code as a fallback filename
        output_filename = f"{novel_code}.txt"
        novel_title = novel_code

    # --- Step 2: Scrape all chapters ---
    current_url = start_url
    chapter_count = 0

    with open(output_filename, "w", encoding="utf-8") as file:
        file.write(f"Title: {novel_title}\n")
        file.write("=" * (len(novel_title) + 7) + "\n\n")

        while current_url:
            try:
                print(f"Scraping chapter from: {current_url}")
                response = session.get(current_url)
                # Check if the request was successful
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')

                # Extract the chapter title
                chapter_title_element = soup.find('h1', class_='p-novel__title--rensai')
                chapter_title = chapter_title_element.get_text(strip=True) if chapter_title_element else "Untitled Chapter"

                # Extract the chapter body
                novel_body = soup.find('div', class_='js-novel-text')
                if not novel_body:
                    print(f"Warning: Could not find chapter text on {current_url}. Skipping.")
                    # Attempt to find the next link anyway
                else:
                    # Write title and a separator
                    file.write(f"## {chapter_title} ##\n\n")

                    # Write each paragraph from the chapter body
                    paragraphs = novel_body.find_all('p')
                    for p in paragraphs:
                        # get_text with strip=True cleans up whitespace
                        line = p.get_text(strip=True)
                        if line:  # Avoid writing empty lines
                            file.write(line + '\n')

                    # Add a clear separator between chapters
                    file.write("\n\n---\n\n")
                    chapter_count += 1

                # Find the 'Next' button to continue to the next chapter
                next_link_element = soup.find('a', class_='c-pager__item--next')
                if next_link_element and next_link_element.get('href'):
                    # The href is relative (e.g., '/n9629ex/2/'), so we join it with the base URL
                    next_path = next_link_element['href']
                    current_url = f"{base_url}{next_path}"
                else:
                    # If there's no 'Next' button, we've reached the end
                    current_url = None

                # Be a polite scraper and wait a second between requests
                time.sleep(1)

            except requests.exceptions.HTTPError as e:
                print(f"HTTP Error on {current_url}: {e}")
                print("This might be the end of the novel or a temporary issue. Stopping.")
                break
            except requests.exceptions.RequestException as e:
                print(f"A network error occurred: {e}")
                print("Pausing for 10 seconds before retrying...")
                time.sleep(10)  # Wait longer if there's a network issue

    print("\nScraping complete!")
    print(f"Successfully saved {chapter_count} chapters to '{output_filename}'")


# --- Main execution block ---
if __name__ == "__main__":
    # The code for the novel from the URL (https://ncode.syosetu.com/n9629ex/)
    NOVEL_ID = "n9629ex"
    scrape_novel(NOVEL_ID)
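For reference, one way the function could be reused from another script, assuming the file above is saved as scrape_syosetu.py in the same directory (the second novel code below is a hypothetical placeholder, not a real work):

from scrape_syosetu import scrape_novel

# Download several novels in sequence; replace the placeholder codes
# with the ncode identifiers of the works you actually want.
for code in ["n9629ex", "n0000aa"]:  # 'n0000aa' is a placeholder
    scrape_novel(code)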