Last active
December 24, 2023 08:07
-
-
Save yukiarimo/f3eea376e96f79e1c537bd2fad93328b to your computer and use it in GitHub Desktop.
Character AI Dialog Extractor
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json

from bs4 import BeautifulSoup
# Speaker labels exactly as they appear in the exported chat, and the files
# this script reads and writes.
USER_NAME = "Yuki Arimo"
CHARACTER_NAME = "Rushia Uruha"
INPUT_PATH = "Main - Rushia Uruha.html"
OUTPUT_PATH = "output.json"


def collect_turns(texts, user_name=USER_NAME, character_name=CHARACTER_NAME):
    """Group a flat sequence of span texts into speaker turns.

    A text equal to ``user_name`` or ``character_name`` is a speaker label
    that starts a new turn; every other text is appended to the turn of the
    most recently seen label.

    Args:
        texts: Iterable of already-stripped strings, in document order.
        user_name: Label that opens a user ("main_text") turn.
        character_name: Label that opens a character ("target_text") turn.

    Returns:
        A list of ``(key, text)`` tuples in document order, where ``key`` is
        ``"main_text"`` or ``"target_text"``. Turns without any text are
        omitted.
    """
    labels = {user_name: "main_text", character_name: "target_text"}
    turns = []
    speaker = None  # key of the turn currently being collected
    parts = []      # message fragments gathered for that turn

    for text in texts:
        key = labels.get(text)
        if key is None:
            # Ordinary message text. Anything before the first speaker label
            # belongs to nobody and is ignored.
            if speaker is not None:
                parts.append(text)
            continue

        # Tracking the current speaker explicitly (instead of testing which
        # buffer is not None) keeps text from being credited to the previous
        # speaker when that speaker's turn was empty.
        if key != speaker and parts:
            turns.append((speaker, _join_parts(parts)))
        # A repeated label for the same speaker restarts the turn and drops
        # what was gathered so far, matching the script's long-standing output.
        speaker = key
        parts = []

    if parts:
        turns.append((speaker, _join_parts(parts)))
    return turns


def _join_parts(parts):
    """Join message fragments, each preceded by one space (leading space kept
    so the output stays identical to previously generated files)."""
    return " " + " ".join(parts)


def pair_turns(turns):
    """Build prompt/reply records from adjacent turns.

    Args:
        turns: Sequence of ``(key, text)`` tuples as returned by
            ``collect_turns``.

    Returns:
        A list of ``{"main_text": ..., "target_text": ...}`` dicts, one for
        each user turn that is immediately followed by a character turn.
    """
    return [
        {"main_text": prompt, "target_text": reply}
        for (who, prompt), (next_who, reply) in zip(turns, turns[1:])
        if who == "main_text" and next_who == "target_text"
    ]


def extract_dialog(texts, user_name=USER_NAME, character_name=CHARACTER_NAME):
    """Return the prompt/reply records found in a sequence of span texts."""
    return pair_turns(collect_turns(texts, user_name, character_name))


def main():
    """Read the exported chat HTML and write the extracted pairs as JSON."""
    # Open as bytes so BeautifulSoup detects the document encoding itself
    # rather than depending on the platform's locale encoding.
    with open(INPUT_PATH, "rb") as fp:
        soup = BeautifulSoup(fp, "html.parser")

    texts = (
        tag.get_text().strip()
        for tag in soup.find_all("span", class_="s1")
    )
    pairs = extract_dialog(texts)

    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # written as UTF-8 explicitly; the with-block also guarantees it is
    # flushed and closed.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as out:
        json.dump(pairs, out, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Uh oh!
There was an error while loading. Please reload this page.