Skip to content

Instantly share code, notes, and snippets.

@aschiavon91
Last active October 12, 2024 14:08
Show Gist options
  • Select an option

  • Save aschiavon91/b75efcbc5342df443dee777a77f8cd7a to your computer and use it in GitHub Desktop.

Select an option

Save aschiavon91/b75efcbc5342df443dee777a77f8cd7a to your computer and use it in GitHub Desktop.
Extract questions and answers from HTML exports in Python.
#!pip install markdownify
#!pip install beautifulsoup4
from os import listdir, remove, makedirs, path
from bs4 import BeautifulSoup
import shutil
from markdownify import MarkdownConverter
from datetime import datetime
# Folder containing the exported HTML files to parse (relative to the cwd).
HTML_PATH = "./htmls"
# Name of the generated markdown output file.
MARKDOWN_FILENAME = "responses.md"
# Absolute directory of this script; anchors the output path in run().
CWD = path.dirname(path.realpath(__file__))
def sort_by_date(array):
    """Return a new list with the items of *array* ordered by their "date" value."""
    def _date_of(item):
        return item["date"]

    return sorted(array, key=_date_of)
def soup2md(soup, **options):
    """Render a BeautifulSoup node as markdown.

    *options* are forwarded verbatim to ``MarkdownConverter``.
    """
    converter = MarkdownConverter(**options)
    return converter.convert_soup(soup)
def parse_htmls(base_path):
    """Parse every HTML export found in *base_path* into Q&A records.

    Filenames are expected to look like ``<prefix>_ <name> - <dd_mm_yyyy>.<ext>``;
    the date portion (underscore-separated) is normalized to ``dd-mm-yyyy``.
    Files that fail to parse are skipped with a printed error so one bad
    export does not abort the whole run.

    Returns a list of dicts with keys ``filename``, ``date`` (datetime) and
    ``responses``.
    """
    qnas = []
    # Ensure the input folder exists so listing it below cannot fail.
    makedirs(base_path, exist_ok=True)
    for file in get_files(base_path):
        try:
            # Strip the export prefix, then split "<name> - <date>".
            without_prefix = file.split("_ ")[-1]
            splitted = without_prefix.split("-")
            filename = splitted[0].strip()
            filedate = splitted[1].strip()
            if "Módulo Introdutório" in file:
                # The intro module carries no date in its name; pin a fixed one.
                filedate = "01-02-2024"
            else:
                # Drop the extension and normalize "dd_mm_yyyy" -> "dd-mm-yyyy".
                filedate = filedate.split(".")[0].split("-")[0].replace("_", "-")
            # Explicit encoding: the exports are UTF-8 HTML; relying on the
            # platform default encoding breaks on e.g. Windows (cp1252).
            with open(f"{base_path}/{file}", "r", encoding="utf-8") as f:
                qnas.append(
                    {
                        "filename": filename,
                        "date": datetime.strptime(filedate, "%d-%m-%Y"),
                        "responses": get_question_and_answers_from_file(f),
                    }
                )
        except Exception as e:
            # Best effort: report which file failed and keep processing.
            print(f"failed to parse {file!r}: {e}")
    return qnas
def create_markdown(markdown_file, database):
    """Render the parsed Q&A *database* into a single markdown file.

    Any pre-existing *markdown_file* is removed first. Question images are
    copied from ``./htmls`` into ``./output/images`` and referenced
    relatively from the markdown.

    :param markdown_file: destination path for the generated markdown.
    :param database: list of dicts with ``date`` (datetime), ``responses``
        (list of response dicts) and ``filename`` keys, as produced by
        ``parse_htmls``.
    """
    if path.isfile(markdown_file):
        remove(markdown_file)
    image_folder = "./output/images"
    makedirs(image_folder, exist_ok=True)
    markdown = ["# Banco de dados de perguntas e respostas"]
    for i, item in enumerate(database):
        responses = item["responses"]
        date = item["date"]
        # NOTE(review): item["filename"] is never rendered — the heading shows
        # the literal "(unknown)"; confirm whether it should show the filename.
        # Single quotes inside the f-string keep this valid on Python < 3.12
        # (nested same-quote f-strings require PEP 701 / 3.12+).
        markdown.append(f"## **[{date.strftime('%d-%m-%Y')}]** (unknown)")
        for response in responses:
            question = response["question"]
            images = response["images"]
            user_answer = response["user_answer"]
            correct_answer = response["correct_answer"]
            is_correct = response["is_correct"]
            markdown.append(f"### {question}")
            for j, img in enumerate(images):
                baseimg = path.basename(img)
                # Image srcs are relative to the HTML folder, not the cwd.
                img_path = img.replace("./", "./htmls/")
                img_source = path.abspath(img_path)
                markdown.append(f"![imagem {i} {j}](images/{baseimg})")
                shutil.copyfile(img_source, f"{image_folder}/{baseimg}")
            markdown.append(f"- [user answer] {user_answer}")
            markdown.append(f"- [correct answer] {correct_answer}")
            markdown.append(f"- [is correct?] {is_correct}")
    with open(markdown_file, "w+", newline="\n", encoding="utf-8") as file:
        file.write("\n\n\n".join(markdown))
def get_question_and_answers_from_file(file):
    """Extract every question widget from an open HTML file handle.

    Returns a list of dicts with keys ``question`` (markdown), ``images``
    (list of src strings), ``user_answer``, ``correct_answer`` (markdown or
    None when the question was not revised) and ``is_correct``.
    """
    question_and_answers = []
    # BeautifulSoup accepts str directly; the original encode("utf-8")
    # round-trip was unnecessary.
    file_content = file.read()
    soup = BeautifulSoup(file_content, "html.parser")
    questions_elements = soup.find_all("div", attrs={"class": "jarviswidget"})
    for question_el in questions_elements:
        question = soup2md(question_el.h2)
        imgs = [i["src"] for i in question_el.find_all("img")]
        # A green left-arrow span marks a question that was already revised.
        revised_finder = {"class": ["fa-arrow-left", "txt-color-green"]}
        revised_check = question_el.find("span", revised_finder)
        correct_answer = None
        user_answer = None
        if revised_check:
            correct_answer = soup2md(revised_check.find_previous_sibling("label"))
            checked_box = question_el.find(is_el_checked)
            # Guard: an unanswered question has no checked input; the
            # original crashed here with AttributeError on None.
            if checked_box is not None:
                user_answer = soup2md(checked_box.find_next_sibling("label"))
        # Only a revised question with a recorded answer can be correct.
        # The original computed None == None -> True for unrevised
        # questions, falsely marking them as correct.
        is_correct = user_answer is not None and user_answer == correct_answer
        question_and_answers.append(
            {
                "question": question,
                "images": imgs,
                "user_answer": user_answer,
                "correct_answer": correct_answer,
                "is_correct": is_correct,
            }
        )
    return question_and_answers
def get_files(base_path):
    """List the names of regular files directly inside *base_path* (directories excluded)."""
    def _is_regular_file(name):
        return path.isfile(path.join(base_path, name))

    return list(filter(_is_regular_file, listdir(base_path)))
def is_el_checked(tag):
    """bs4 find() predicate: True when *tag* carries a ``checked`` attribute."""
    return bool(tag.has_attr("checked"))
def run():
    """Entry point: parse the HTML exports and write the markdown database."""
    output_file = f"{CWD}/output/{MARKDOWN_FILENAME}"
    parsed = parse_htmls(HTML_PATH)
    create_markdown(output_file, sort_by_date(parsed))
    print("DONE!")


if __name__ == "__main__":
    run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment