Last active
October 10, 2024 11:20
-
-
Save espenmn/5d952eaf7939e5e091a0c9b6d16e118e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import difflib | |
| from markdown import markdown | |
| from bs4 import BeautifulSoup | |
# Default file locations used by the script -- replace with your actual paths.
A_path = 'document_A.txt'       # baseline text that document B is diffed against
B_path = 'document_B.txt'       # revised text; its changes are carried over to C
C_path = 'document_C.txt'       # document that receives the A -> B changes
output_path = 'document_D.txt'  # where the marked-up result is written
log_path = 'log.txt'            # where the processing log is written
def remove_markdown(text):
    """Return *text* with its Markdown formatting removed.

    The text is rendered to HTML with ``markdown`` and the visible text is
    then extracted with BeautifulSoup. Leading and trailing newline
    characters are dropped from the result.
    """
    soup = BeautifulSoup(markdown(text), "html.parser")
    return soup.get_text().strip('\n')
# Read a document, optionally strip its markdown, and split it into paragraphs
def read_document(filepath, remove_md=False):
    """Load a UTF-8 text file and return its content as a list of lines.

    Args:
        filepath: Path of the file to read.
        remove_md: When true, the whole text is passed through
            ``remove_markdown`` before it is split.

    Returns:
        The text split on ``'\\n'``; each paragraph is assumed to occupy
        one line.
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        text = handle.read()
    text = remove_markdown(text) if remove_md else text
    return text.split('\n')
# Write a list of paragraphs back to disk
def write_document(filepath, paragraphs):
    """Write *paragraphs* to *filepath* as UTF-8, one paragraph per line.

    The paragraphs are joined with ``'\\n'``; no trailing newline is added.
    An existing file is overwritten.
    """
    with open(filepath, mode='w', encoding='utf-8') as handle:
        joined = '\n'.join(paragraphs)
        handle.write(joined)
def _map_chars_to(A, C):
    """Return a list giving, for each character index of A, its index in C.

    Characters of A that take part in no matching block between A and C
    map to ``None``.
    """
    positions = [None] * len(A)
    aligner = difflib.SequenceMatcher(None, A, C, autojunk=False)
    for a_start, c_start, size in aligner.get_matching_blocks():
        for offset in range(size):
            positions[a_start + offset] = c_start + offset
    return positions


# Function to replace differences in strings
def replace_differences(A, B, C):
    """Apply the character-level changes that turn A into B onto C.

    C is expected to be a variant of A (for example the same sentence with
    markdown markup around or inside it). A is aligned with C so that every
    change is applied at the position in C that corresponds to the changed
    characters of A, instead of at the first place where a (possibly one
    character long) fragment happens to occur.

    Args:
        A: Original text.
        B: Revised text.
        C: Text to update; it should contain the text of A.

    Returns:
        C with each changed stretch replaced by the matching stretch of B.
        A change is skipped, leaving that part of C untouched, when the
        affected characters of A cannot be located in C.
    """
    positions = _map_chars_to(A, C)
    # autojunk is disabled so paragraphs of 200+ characters still get a
    # fine-grained diff (difflib otherwise ignores "popular" characters).
    matcher = difflib.SequenceMatcher(None, A, B, autojunk=False)

    edits = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'equal':
            continue
        if i1 < i2:
            # 'replace' / 'delete': both ends of the A segment must be
            # located in C; everything between them is replaced.
            first, last = positions[i1], positions[i2 - 1]
            if first is None or last is None:
                continue
            start, end = first, last + 1
        else:
            # 'insert': A contributes no characters, so anchor the new text
            # right after the preceding character of A, or failing that
            # right before the following one.
            if i1 > 0 and positions[i1 - 1] is not None:
                start = positions[i1 - 1] + 1
            elif i1 < len(A) and positions[i1] is not None:
                start = positions[i1]
            else:
                continue
            end = start
        edits.append((start, end, B[j1:j2]))

    # Apply from right to left so earlier offsets stay valid.
    for start, end, new_text in reversed(edits):
        C = C[:start] + new_text + C[end:]
    return C
def _span(css_class, tag, text):
    """Wrap *text* in the review ``<span>`` used to mark changed paragraphs."""
    return f"<span class='red {css_class} tag-{tag}'>{text}</span>\n"


def _mark_deleted(tag, a_seq, paragraphs_C, paragraphs_markdown_C, log):
    """Mark paragraphs of C that were deleted between A and B."""
    for a_line in a_seq:
        count = paragraphs_C.count(a_line)
        log.append(f"{count} instance of {a_line} found in Document C")
        if count != 1:
            continue
        # The markdown and the markdown-free version of C do not share
        # indexes, so the paragraph is looked up again in the markdown list.
        hits = [index for index, item in enumerate(paragraphs_markdown_C) if a_line in item]
        if len(hits) != 1:
            log.append(f"********** >>>>>>>>>>>>> Warning something might be wrong at line {count} script line [149]\n\n\n")
        if not hits:
            # Nothing to mark; continuing would raise IndexError.
            continue
        d_index = hits[0]
        log.append(f"Found at {d_index}")
        log.append(f"{paragraphs_markdown_C[d_index]} should be deleted")
        # Mark the paragraph with css class 'deleted' so it can be found and
        # removed in the editor.
        paragraphs_markdown_C[d_index] = _span('deleted', tag, paragraphs_markdown_C[d_index])


def _replace_line_by_line(tag, i1, j1, a_seq, b_seq, paragraphs_C, paragraphs_markdown_C, log):
    """Handle a 'replace' opcode whose A and B sides have the same length."""
    log.append(">> Equal lenght of sequences <<")
    for linecounter, (a_line1, b_line1) in enumerate(zip(a_seq, b_seq), start=1):
        a_line = remove_markdown(a_line1)
        # TODO: find out if we want to keep markdown from line b
        b_line = remove_markdown(b_line1)
        log.append(f"Line Counter: {linecounter}")
        log.append(f"Replacing: A line: {a_line} with B line: {b_line}")
        count = paragraphs_C.count(a_line)
        log.append(f"{count} instance of {a_line} found in Document C")
        if count != 1:
            log.append('**********************')
            log.append('Found none or several places, could not replace')
            log.append('----------------------')
            log.append(f"A: {a_line} B: {b_line} A seq: {a_seq} A line: {i1} B seq: {b_seq} B line: {j1}")
            log.append('----------------------')
            continue
        hits = [index for index, item in enumerate(paragraphs_markdown_C) if a_line in remove_markdown(item)]
        if len(hits) != 1:
            # Zero or several candidate paragraphs: the target is unknown,
            # so only warn instead of writing to an unrelated index.
            log.append(f"********** >>>>>>>>>>>>> Warning something might be wrong at line {count} script line [151]\n\n\n\n")
            continue
        d_index = hits[0]
        log.append(f"Found at {d_index}")
        log.append(f"{paragraphs_markdown_C[d_index]}")
        replacetext = replace_differences(a_line, b_line, paragraphs_markdown_C[d_index])
        paragraphs_markdown_C[d_index] = _span('replaced', tag, replacetext)


def _replace_uneven_block(tag, i1, i2, j1, j2, a_seq, b_seq, paragraphs_C, paragraphs_markdown_C, log):
    """Handle a 'replace' opcode whose A and B sides differ in length.

    The first A line is replaced by all B lines joined with newlines; the
    paragraphs following it in C (one per remaining A line) are marked with
    the 'deleteme' class.
    """
    b_is_shorter = len(a_seq) > len(b_seq)
    sec_1 = remove_markdown(a_seq[0])
    count = paragraphs_C.count(sec_1)
    if b_is_shorter:
        log.append("****** B is shorter than A ******")
    else:
        log.append("****** B is longer than A ******")
    log.append(f"{count} instance of {sec_1} found in Document C")

    if count != 1:
        log.append('************************')
        if b_is_shorter:
            log.append('Found several places, could not replace 2')
            logline = f"A: {a_seq} A line: {i1}:{i2} B seq: {b_seq}, B line: {j1}:{j2}"
        else:
            log.append('Found several places, could not replace 3')
            logline = f"A: {sec_1}, B: {b_seq}, A seq: {a_seq}, A line: {i1}:{i2}, B seq: {b_seq}, B line: {j1}:{j2}"
        log.append('----------------------')
        log.append(logline)
        log.append('------------------------')
        return

    # Look up the line that was just counted (sec_1).
    c_index = paragraphs_C.index(sec_1)
    log.append(f"Trying to replace: A: {a_seq} with B line: {b_seq}")
    hits = [index for index, item in enumerate(paragraphs_markdown_C) if sec_1 in remove_markdown(item)]
    b_text = "\n".join(b_seq)
    if not hits:
        # No markdown paragraph could be located, so there is no safe index
        # to write to.
        log.append("********** >>>>>>>>>>>>> Warning something might be wrong [187]\n\n\n")
        return

    d_index = hits[0]
    log.append(f"First line of A found in C line sec {c_index if b_is_shorter else d_index}")
    # TODO: this replace will not keep markdown code inside paragraphs
    log.append(f"Replace text from A: {sec_1} to B: {b_text}")
    log.append(f"Content of paragraphs_markdown_C[{d_index}] was {paragraphs_markdown_C[d_index]}")
    paragraphs_markdown_C[d_index] = _span(
        'replaced', tag, paragraphs_markdown_C[d_index].replace(sec_1, b_text)
    )
    log.append(f"D[{d_index}] should now contain {b_text}")
    log.append(f"D[{d_index}] now contain {paragraphs_markdown_C[d_index]}")

    len_a = len(a_seq)
    log.append(f"Lenght of A is: {len_a}")
    if len_a > 1:
        log.append(f"A contained several lines {len_a}")
        log.append("Trying to remove other lines")
        for counter, a_line1 in enumerate(a_seq[1:], start=1):
            target = d_index + counter
            if target >= len(paragraphs_markdown_C):
                log.append(f"D[{target}] is outside Document C, could not mark it for removal")
                break
            if not b_is_shorter:
                log.append(f"D[{c_index+counter}] was {a_line1}")
            # Wrap the markdown paragraph itself so its content is kept.
            paragraphs_markdown_C[target] = _span('deleteme', tag, paragraphs_markdown_C[target])


# Function to compare documents and replace paragraphs in C
# TODO: use replace_differences for all cases below, not just equal sequences
def replace_and_log(A_path, B_path, C_path, output_path, log_path):
    """Carry the paragraph changes between documents A and B over to C.

    A and B are compared line by line with ``difflib.SequenceMatcher``.
    For 'delete' and 'replace' opcodes the affected lines are looked up in
    C and, when they can be located unambiguously, the matching markdown
    paragraph of C is wrapped in a ``<span class='red ...'>`` marker
    (``deleted``, ``replaced`` or ``deleteme``). 'equal' and 'insert'
    opcodes are only logged.

    Args:
        A_path: Path of the baseline document.
        B_path: Path of the revised document.
        C_path: Path of the document to update; it is read once with and
            once without markdown removed.
        output_path: Path the marked-up version of C is written to.
        log_path: Path the processing log is written to.

    The log is also printed to standard output.
    """
    paragraphs_A = read_document(A_path, remove_md=False)
    paragraphs_B = read_document(B_path, remove_md=False)
    # Markdown-free copy of C, used to count/find lines coming from A.
    paragraphs_C = read_document(C_path, remove_md=True)
    # Raw copy of C; this is the list that gets modified and written out.
    paragraphs_markdown_C = read_document(C_path, remove_md=False)

    log = ['**** Starting *****']
    matcher = difflib.SequenceMatcher(None, paragraphs_A, paragraphs_B)

    for tagcounter, (tag, i1, i2, j1, j2) in enumerate(matcher.get_opcodes(), start=1):
        log.append("\n")
        log.append(f"------ tagcounter: {tagcounter} ------")
        log.append(f"tag: {tag}")
        a_seq = paragraphs_A[i1:i2]
        b_seq = paragraphs_B[j1:j2]

        if tag == 'delete':
            _mark_deleted(tag, a_seq, paragraphs_C, paragraphs_markdown_C, log)
        elif tag == 'replace':
            log.append(f"A seq: {a_seq}")
            log.append(f"B seq: {b_seq}")
            # difflib only emits 'replace' when both sides are non-empty,
            # so equal / unequal length covers every case.
            if len(a_seq) == len(b_seq):
                _replace_line_by_line(
                    tag, i1, j1, a_seq, b_seq, paragraphs_C, paragraphs_markdown_C, log
                )
            else:
                _replace_uneven_block(
                    tag, i1, i2, j1, j2, a_seq, b_seq, paragraphs_C, paragraphs_markdown_C, log
                )

    write_document(output_path, paragraphs_markdown_C)
    print("\n".join(log))
    # Write the log
    with open(log_path, 'w', encoding='utf-8') as log_file:
        log_file.write('\n'.join(log))
# Run the comparison only when executed as a script, so importing this
# module does not read or overwrite any files.
if __name__ == "__main__":
    replace_and_log(A_path, B_path, C_path, output_path, log_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment