This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| from bs4 import BeautifulSoup | |
| import time | |
| import os | |
def sanitize_filename(filename: str) -> str:
    """Return *filename* with characters unsuitable for file names removed.

    Only alphanumeric characters (as judged by ``str.isalnum``, so non-ASCII
    letters and digits are kept) plus space, hyphen and underscore survive.
    Trailing whitespace is stripped from the result; leading spaces are kept.

    Args:
        filename: The raw name to clean up.

    Returns:
        The filtered name. This may be an empty string when no character of
        the input is allowed.
    """
    allowed_punctuation = (" ", "-", "_")
    kept = [ch for ch in filename if ch.isalnum() or ch in allowed_punctuation]
    # Only the right-hand side is trimmed, matching the original behavior.
    return "".join(kept).rstrip()
| def scrape_novel(novel_code): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import json | |
| import re | |
| import sys | |
| from tqdm import tqdm | |
| from html import unescape | |
| from bs4 import BeautifulSoup | |
| def strip_html_tags(html: str) -> str: | |
| """Remove HTML tags for plain text extraction.""" | |
| soup = BeautifulSoup(html, "html.parser") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| let htmlString = document.body.innerHTML | |
| const parser = new DOMParser(); | |
| const doc = parser.parseFromString(htmlString, 'text/html'); | |
| const messages = []; | |
| const divs = doc.querySelectorAll('.group.relative.max-w-3xl.min-max-w-3xl.m-auto.w-full.p-2'); | |
| divs.forEach(div => { | |
| const nameElement = div.querySelector('.text-small'); | |
| if (nameElement) { |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| function extractAndDownloadAllChapters() { | |
| // Find all containers that hold chapter text | |
| const chapterContainers = document.querySelectorAll('.cha-words'); | |
| // Initialize an array to hold all chapter texts | |
| let allChaptersText = []; | |
| // Iterate over each chapter container | |
| chapterContainers.forEach(container => { | |
| // Get all paragraph elements within the container |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import torch | |
| from torch.utils.data import Dataset, DataLoader | |
| from transformers import BartForConditionalGeneration, BartTokenizer | |
| from torch.optim import AdamW | |
| import os | |
| os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0" | |
| # Define your dataset class |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import json | |
| from bs4 import BeautifulSoup | |
| with open("Main - Rushia Uruha.html") as fp: | |
| soup = BeautifulSoup(fp, "html.parser") | |
| items = [] | |
| main_text = None | |
| target_text = None | |
| for tag in soup.find_all("span", class_="s1"): | |
| text = tag.get_text().strip() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Format a Unix timestamp as a short, hour-resolution date string.
 *
 * The timestamp is interpreted as seconds (it is multiplied by 1000 before
 * being handed to `Date`) and rendered using the local-time getters, so the
 * output depends on the runtime's time zone.
 *
 * @param {number} UNIX_timestamp - Seconds since the Unix epoch.
 * @returns {string} Text of the form "D Mon YYYY H:00", e.g. "5 Mar 2021 14:00".
 */
function timeConverter(UNIX_timestamp) {
  const MONTH_NAMES = [
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
  ];

  // Date works in milliseconds; the input is in seconds.
  const moment = new Date(UNIX_timestamp * 1000);

  const year = moment.getFullYear();
  const month = MONTH_NAMES[moment.getMonth()];
  const day = moment.getDate();
  const hour = moment.getHours();

  // Minutes are always rendered as "00": the result is truncated to the hour.
  return day + ' ' + month + ' ' + year + ' ' + hour + ':' + '00';
}