Last active
October 12, 2024 14:08
-
-
Save aschiavon91/b75efcbc5342df443dee777a77f8cd7a to your computer and use it in GitHub Desktop.
Extract questions and answers in Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!pip install markdownify | |
| #!pip install beautifulsoup4 | |
| from os import listdir, remove, makedirs, path | |
| from bs4 import BeautifulSoup | |
| import shutil | |
| from markdownify import MarkdownConverter | |
| from datetime import datetime | |
| HTML_PATH = "./htmls" | |
| MARKDOWN_FILENAME = "responses.md" | |
| CWD = path.dirname(path.realpath(__file__)) | |
def sort_by_date(array):
    """Return a new list holding the items of ``array`` ordered by ``item["date"]``.

    The input is not modified. Items are ordered ascending, and items whose
    dates compare equal keep their original relative order.
    """

    def date_of(entry):
        return entry["date"]

    ordered = list(array)
    ordered.sort(key=date_of)
    return ordered
def soup2md(soup, **options):
    """Convert a parsed HTML node to Markdown text.

    All keyword arguments are forwarded unchanged to ``MarkdownConverter``;
    the result is whatever its ``convert_soup`` method returns for ``soup``.
    """
    converter = MarkdownConverter(**options)
    return converter.convert_soup(soup)
def parse_htmls(base_path):
    """Parse every file in ``base_path`` into a question/answer record.

    The display name and the date are taken from the file name: the text after
    the last ``"_ "`` is split on ``"-"``; the first piece is the name and the
    second piece (extension dropped, ``_`` turned into ``-``) is the date in
    ``dd-mm-YYYY`` form. Files whose name contains ``"Módulo Introdutório"``
    always get the fixed date ``01-02-2024``, whether or not their name carries
    a date of its own.

    Args:
        base_path: Directory holding the saved HTML pages. It is created if it
            does not exist yet.

    Returns:
        A list of dicts with the keys ``"filename"`` (str), ``"date"``
        (``datetime``) and ``"responses"`` (the value returned by
        ``get_question_and_answers_from_file``). Files that cannot be parsed
        are reported on stdout and left out.
    """
    qnas = []
    makedirs(base_path, exist_ok=True)
    for file in get_files(base_path):
        try:
            without_prefix = file.split("_ ")[-1]
            parts = without_prefix.split("-")
            filename = parts[0].strip()
            if "Módulo Introdutório" in file:
                filedate = "01-02-2024"
            else:
                # Only look for the date piece when it is actually needed, so
                # an introductory file without a "-" is not rejected.
                if len(parts) < 2:
                    raise ValueError("file name has no '-<date>' part")
                filedate = parts[1].strip().split(".")[0].replace("_", "-")
            parsed_date = datetime.strptime(filedate, "%d-%m-%Y")
            # Explicit encoding: the platform default is not UTF-8 everywhere,
            # which would garble accented text in the pages.
            with open(path.join(base_path, file), "r", encoding="utf-8") as f:
                responses = get_question_and_answers_from_file(f)
        except Exception as e:
            # Best effort per file: one bad page must not abort the whole run,
            # but say which file was skipped and why.
            print(f"Skipping {file!r}: {e}")
            continue
        qnas.append(
            {
                "filename": filename,
                "date": parsed_date,
                "responses": responses,
            }
        )
    return qnas
def create_markdown(markdown_file, database):
    """Write ``database`` to ``markdown_file`` as a Markdown document.

    The document starts with a fixed title, then one ``##`` section per item
    (date and file name) and one ``###`` entry per response with the user's
    answer, the correct answer and whether they match. Every image referenced
    by a response is copied into an ``images`` folder next to the Markdown
    file and linked from the document with a relative path.

    Args:
        markdown_file: Destination path. Its directory is created if needed and
            an existing file is overwritten.
        database: Iterable of dicts with the keys ``"filename"``, ``"date"``
            (an object with ``strftime``) and ``"responses"``; each response is
            a dict with ``"question"``, ``"images"``, ``"user_answer"``,
            ``"correct_answer"`` and ``"is_correct"``.

    Raises:
        OSError: If an image cannot be copied or the file cannot be written.
    """
    # Keep the images beside the Markdown file so the relative links below
    # resolve no matter which directory the script is started from. Creating
    # the image folder also creates the output directory itself.
    image_folder = path.join(path.dirname(markdown_file), "images")
    makedirs(image_folder, exist_ok=True)

    markdown = ["# Banco de dados de perguntas e respostas"]
    for item in database:
        filename = item["filename"]
        # Formatted outside the f-string: reusing double quotes inside a
        # replacement field is a syntax error before Python 3.12.
        date_label = item["date"].strftime("%d-%m-%Y")
        markdown.append(f"## **[{date_label}]** {filename}")
        for response in item["responses"]:
            question = response["question"]
            user_answer = response["user_answer"]
            correct_answer = response["correct_answer"]
            is_correct = response["is_correct"]
            markdown.append(f"### {question}")
            for img in response["images"]:
                baseimg = path.basename(img)
                # Image sources are relative to the saved page, which lives in
                # ./htmls (relative to the current working directory).
                img_path = img.replace("./", "./htmls/")
                img_source = path.abspath(img_path)
                markdown.append(f"![{baseimg}](./images/{baseimg})")
                shutil.copyfile(img_source, path.join(image_folder, baseimg))
            markdown.append(f"- [user answer] {user_answer}")
            markdown.append(f"- [correct answer] {correct_answer}")
            markdown.append(f"- [is correct?] {is_correct}")

    # "w" truncates an existing file, so no separate removal is required.
    with open(markdown_file, "w", newline="\n", encoding="utf-8") as file:
        file.write("\n\n\n".join(markdown))
def get_question_and_answers_from_file(file):
    """Extract the questions and answers found in one saved HTML page.

    Every ``<div class="jarviswidget">`` is treated as one question. The
    question text is the Markdown rendering of the widget's ``<h2>``. When the
    widget contains the "revised" marker span, the correct answer is the
    ``<label>`` preceding that span and the user's answer is the ``<label>``
    following the first element that has a ``checked`` attribute.

    Args:
        file: An open text-mode file object positioned at the start of the
            HTML document.

    Returns:
        A list of dicts with the keys ``"question"``, ``"images"`` (list of
        ``src`` values), ``"user_answer"``, ``"correct_answer"`` (both ``None``
        when not available) and ``"is_correct"`` (bool).
    """
    question_and_answers = []
    # The content is already text, so hand it to BeautifulSoup as is instead
    # of encoding it to bytes and making the parser guess the encoding again.
    soup = BeautifulSoup(file.read(), "html.parser")
    for question_el in soup.find_all("div", attrs={"class": "jarviswidget"}):
        question = soup2md(question_el.h2)
        # src=True skips <img> tags without a src instead of raising KeyError.
        imgs = [img["src"] for img in question_el.find_all("img", src=True)]

        # check if already revised
        # NOTE(review): a list of classes matches a span carrying either
        # class, not necessarily both - confirm that is the intent.
        revised_finder = {"class": ["fa-arrow-left", "txt-color-green"]}
        revised_check = question_el.find("span", revised_finder)

        correct_answer = None
        user_answer = None
        if revised_check is not None:
            correct_label = revised_check.find_previous_sibling("label")
            if correct_label is not None:
                correct_answer = soup2md(correct_label)
            # A revised question may have been left unanswered, in which case
            # there is no checked element to start from.
            checked_box = question_el.find(is_el_checked)
            if checked_box is not None:
                answer_label = checked_box.find_next_sibling("label")
                if answer_label is not None:
                    user_answer = soup2md(answer_label)

        # Without a user answer there is nothing to grade; comparing None with
        # None must not count as a correct answer.
        is_correct = user_answer is not None and user_answer == correct_answer
        question_and_answers.append(
            {
                "question": question,
                "images": imgs,
                "user_answer": user_answer,
                "correct_answer": correct_answer,
                "is_correct": is_correct,
            }
        )
    return question_and_answers
def get_files(base_path):
    """List the names of the regular files located directly in ``base_path``.

    Sub-directories are left out; the names are returned without the
    directory prefix, in the order ``listdir`` reports them.
    """
    names = []
    for entry in listdir(base_path):
        full_path = path.join(base_path, entry)
        if path.isfile(full_path):
            names.append(entry)
    return names
def is_el_checked(tag):
    """Report whether ``tag`` has a ``checked`` attribute.

    Used as a filter function for element searches; ``tag`` only needs to
    provide ``has_attr``.
    """
    has_checked = tag.has_attr("checked")
    return has_checked
def run():
    """Parse the saved HTML pages and export them, sorted by date, to Markdown.

    Pages are read from ``HTML_PATH`` and the result is written to
    ``output/<MARKDOWN_FILENAME>`` under the script's own directory.
    """
    parsed = parse_htmls(HTML_PATH)
    ordered = sort_by_date(parsed)
    destination = f"{CWD}/output/{MARKDOWN_FILENAME}"
    create_markdown(destination, ordered)
    print("DONE!")


if __name__ == "__main__":
    run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment