Skip to content

Instantly share code, notes, and snippets.

@espenmn
Last active October 10, 2024 11:20
Show Gist options
  • Select an option

  • Save espenmn/5d952eaf7939e5e091a0c9b6d16e118e to your computer and use it in GitHub Desktop.

Select an option

Save espenmn/5d952eaf7939e5e091a0c9b6d16e118e to your computer and use it in GitHub Desktop.
import difflib
from markdown import markdown
from bs4 import BeautifulSoup
# Default file locations used by the script (replace with your actual paths).
# Documents A and B are compared with each other, the differences are applied
# to document C, and the result is written to ``output_path``.
A_path = "document_A.txt"
B_path = "document_B.txt"
C_path = "document_C.txt"
output_path = "document_D.txt"  # merged result
log_path = "log.txt"  # human-readable trace of every decision
def remove_markdown(text):
    """Return *text* with its markdown formatting removed.

    The text is rendered to HTML with ``markdown`` and the visible text is
    then extracted with BeautifulSoup. Newlines at the very start and end of
    the result are stripped; other whitespace is left as extracted.
    """
    rendered = markdown(text)
    soup = BeautifulSoup(rendered, "html.parser")
    return soup.get_text().strip('\n')
# Read a document, optionally strip its markdown, and split it into lines.
def read_document(filepath, remove_md=False):
    """Load a UTF-8 text file and return its content split on newlines.

    Args:
        filepath: Path of the file to read.
        remove_md: When true, the whole content is passed through
            ``remove_markdown`` before it is split.

    Returns:
        A list of strings, one per ``'\\n'``-separated line (paragraphs are
        assumed to be separated by newlines).
    """
    with open(filepath, 'r', encoding='utf-8') as source:
        text = source.read()
    text = remove_markdown(text) if remove_md else text
    return text.split('\n')
# Write a list of paragraphs back to disk.
def write_document(filepath, paragraphs):
    """Write *paragraphs* to *filepath* as UTF-8, joined by newlines.

    An existing file is overwritten. No newline is added after the last
    paragraph.
    """
    with open(filepath, mode='w', encoding='utf-8') as target:
        body = '\n'.join(paragraphs)
        target.write(body)
# Function to replace differences in strings
def _advance_past(haystack, text, start):
    """Return the index in *haystack* just after *text*, searching from *start*.

    If *text* does not occur verbatim (for example because *haystack* has
    inline markup between its characters), the characters of *text* are
    matched in order, skipping whatever lies between them. When the text
    cannot be aligned at all, *start* is returned unchanged.
    """
    pos = haystack.find(text, start)
    if pos != -1:
        return pos + len(text)
    cursor = start
    for char in text:
        pos = haystack.find(char, cursor)
        if pos == -1:
            return start
        cursor = pos + 1
    return cursor


def replace_differences(A, B, C):
    """Apply the character-level edits that turn *A* into *B* onto *C*.

    *C* is expected to contain the text of *A*, possibly with extra markup
    around or inside it. The opcodes of ``difflib.SequenceMatcher(None, A, B)``
    are replayed left to right while a cursor tracks the current position in
    *C*:

    * ``equal`` segments only move the cursor forward,
    * ``insert`` segments are inserted at the cursor,
    * ``replace`` / ``delete`` segments are searched from the cursor onwards
      and swapped for the matching part of *B*; a segment that cannot be
      found is left alone.

    Args:
        A: Original plain text.
        B: Changed plain text.
        C: Text to patch.

    Returns:
        The patched copy of *C*.
    """
    matcher = difflib.SequenceMatcher(None, A, B)
    cursor = 0
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        old, new = A[i1:i2], B[j1:j2]
        if tag == 'equal':
            # Unchanged text anchors the position of the next edit.
            cursor = _advance_past(C, old, cursor)
            continue
        if tag == 'insert':
            # Nothing to search for: an empty needle would match at index 0
            # and push every insertion to the start of C.
            C = C[:cursor] + new + C[cursor:]
            cursor += len(new)
            continue
        # Search from the cursor so a short fragment cannot match an earlier,
        # unrelated occurrence.
        pos = C.find(old, cursor)
        if pos == -1:
            # Be lenient about surrounding whitespace, but then cut exactly
            # the span that was matched.
            old = old.strip()
            pos = C.find(old, cursor) if old else -1
        if pos == -1:
            continue
        C = C[:pos] + new + C[pos + len(old):]
        cursor = pos + len(new)
    return C
# Function to compare documents and replace paragraphs in C
# TODO: use replace_differences for every branch, not just for sequences of
# equal length.
def _wrap_span(css_class, tag, text):
    """Wrap *text* in the span that marks a change for review in the editor."""
    return f"<span class='red {css_class} tag-{tag}'>{text}</span>\n"


def _find_unique_line(plain_C, text, log):
    """Log how often *text* occurs in *plain_C*; return its index if unique, else None."""
    count = plain_C.count(text)
    log.append(f"{count} instance of {text} found in Document C")
    return plain_C.index(text) if count == 1 else None


def _replace_first_and_flag_rest(tag, index, sec_1, a_seq, b_seq, raw_C, log):
    """Put all of *b_seq* on line *index* of *raw_C* and flag the following lines.

    *raw_C* is modified in place. The lines after *index* that correspond to
    the remaining lines of *a_seq* are wrapped in a ``deleteme`` span.
    """
    b_text = "\n".join(b_seq)
    log.append(f"Trying to replace: A: {a_seq} with B line: {b_seq}")
    log.append(f"First line of A found in C line sec {index}")
    # TODO: this replace will not keep markdown code inside paragraphs
    log.append(f"Replace text from A: {sec_1} to B: {b_text}")
    log.append(f"Content of paragraphs_markdown_C[{index}] was {raw_C[index]}")
    raw_line = raw_C[index]
    if sec_1 and sec_1 in raw_line:
        # Keeps markup that surrounds the text (e.g. a heading prefix).
        swapped = raw_line.replace(sec_1, b_text)
    else:
        # Inline markup splits the plain text inside the raw line, so a
        # str.replace would silently do nothing; the plain line equals
        # sec_1, so the replacement is simply the B text.
        swapped = b_text
    raw_C[index] = _wrap_span('replaced', tag, swapped)
    log.append(f"D[{index}] should now contain {b_text}")
    log.append(f"D[{index}] now contain {raw_C[index]}")
    len_a = len(a_seq)
    log.append(f"Lenght of A is: {len_a}")
    if len_a > 1:
        log.append(f"A contained several lines {len_a}")
        log.append("Trying to remove other lines")
        for counter, a_line1 in enumerate(a_seq[1:], start=1):
            target = index + counter
            if target >= len(raw_C):
                log.append(f"D[{target}] is past the end of Document C, could not mark {a_line1}")
                break
            log.append(f"D[{target}] was {a_line1}")
            raw_C[target] = _wrap_span('deleteme', tag, raw_C[target])


def replace_and_log(A_path, B_path, C_path, output_path, log_path):
    """Replay the line-level differences between documents A and B onto C.

    A and B are compared line by line with ``difflib.SequenceMatcher``. For
    every ``delete`` or ``replace`` opcode the affected A lines are looked up
    in C by their markdown-free text. A line is only touched when that text
    occurs exactly once in C; otherwise the situation is logged and C is
    left alone. Changed lines are wrapped in ``<span class='red ...'>``
    markers so they can be reviewed later. ``equal`` and ``insert`` opcodes
    are only logged.

    Args:
        A_path: Path of the original document.
        B_path: Path of the changed document.
        C_path: Path of the document the changes are applied to.
        output_path: Where the patched version of C is written.
        log_path: Where the log is written (it is also printed).
    """
    paragraphs_A = read_document(A_path, remove_md=False)
    paragraphs_B = read_document(B_path, remove_md=False)
    paragraphs_markdown_C = read_document(C_path, remove_md=False)
    # Strip markdown line by line (the same way the A lines are treated
    # below) so the plain and the raw list always share their indices.
    paragraphs_C = [remove_markdown(line) for line in paragraphs_markdown_C]

    log = ['**** Starting *****']
    matcher = difflib.SequenceMatcher(None, paragraphs_A, paragraphs_B)
    for tagcounter, (tag, i1, i2, j1, j2) in enumerate(matcher.get_opcodes(), start=1):
        log.append("\n")
        log.append(f"------ tagcounter: {tagcounter} ------")
        log.append(f"tag: {tag}")
        # a_seq / b_seq are the parts of A and B this opcode covers.
        a_seq = paragraphs_A[i1:i2]
        b_seq = paragraphs_B[j1:j2]

        if tag == 'delete':
            for a_line1 in a_seq:
                a_line = remove_markdown(a_line1)
                d_index = _find_unique_line(paragraphs_C, a_line, log)
                if d_index is None:
                    continue
                log.append(f"Found at {d_index}")
                log.append(f"{paragraphs_markdown_C[d_index]} should be deleted")
                # Mark the paragraph with the css class "deleted" so it can
                # be reviewed and removed in the editor.
                paragraphs_markdown_C[d_index] = _wrap_span(
                    'deleted', tag, paragraphs_markdown_C[d_index])

        elif tag == 'replace':
            log.append(f"A seq: {a_seq}")
            log.append(f"B seq: {b_seq}")
            # For a 'replace' opcode both sequences are non-empty.
            a_len = len(a_seq)
            b_len = len(b_seq)

            if a_len == b_len:
                log.append(">> Equal lenght of sequences <<")
                for linecounter, (a_line1, b_line1) in enumerate(zip(a_seq, b_seq), start=1):
                    a_line = remove_markdown(a_line1)
                    # TODO: find out if we want to keep markdown from line B
                    b_line = remove_markdown(b_line1)
                    log.append(f"Line Counter: {linecounter}")
                    log.append(f"Replacing: A line: {a_line} with B line: {b_line}")
                    d_index = _find_unique_line(paragraphs_C, a_line, log)
                    if d_index is not None:
                        log.append(f"Found at {d_index}")
                        log.append(f"{paragraphs_markdown_C[d_index]}")
                        # Patch only the characters that differ so the
                        # markdown of the C line survives.
                        replacetext = replace_differences(
                            a_line, b_line, paragraphs_markdown_C[d_index])
                        paragraphs_markdown_C[d_index] = _wrap_span('replaced', tag, replacetext)
                    else:
                        log.append('**********************')
                        log.append('Found none or several places, could not replace')
                        log.append('----------------------')
                        logline = f"A: {a_line} B: {b_line} A seq: {a_seq} A line: {i1} B seq: {b_seq} B line: {j1}"
                        log.append(logline)
                        log.append('----------------------')

            elif a_len > b_len:
                sec_1 = remove_markdown(a_seq[0])
                log.append("****** B is shorter than A ******")
                d_index = _find_unique_line(paragraphs_C, sec_1, log)
                if d_index is not None:
                    _replace_first_and_flag_rest(
                        tag, d_index, sec_1, a_seq, b_seq, paragraphs_markdown_C, log)
                else:
                    log.append('************************')
                    log.append('Found several places, could not replace 2')
                    log.append('----------------------')
                    logline = f"A: {a_seq} A line: {i1}:{i2} B seq: {b_seq}, B line: {j1}:{j2}"
                    log.append(logline)
                    log.append('------------------------')

            else:
                sec_1 = remove_markdown(a_seq[0])
                log.append("****** B is longer than A ******")
                d_index = _find_unique_line(paragraphs_C, sec_1, log)
                if d_index is not None:
                    _replace_first_and_flag_rest(
                        tag, d_index, sec_1, a_seq, b_seq, paragraphs_markdown_C, log)
                else:
                    log.append('************************')
                    log.append('Found several places, could not replace 3')
                    log.append('----------------------')
                    logline = f"A: {sec_1}, B: {b_seq}, A seq: {a_seq}, A line: {i1}:{i2}, B seq: {b_seq}, B line: {j1}:{j2}"
                    log.append(logline)
                    log.append('------------------------')

    write_document(output_path, paragraphs_markdown_C)
    print("\n".join(log))
    # Write the log
    with open(log_path, 'w', encoding='utf-8') as log_file:
        log_file.write('\n'.join(log))
# Run the comparison with the module-level default paths.
replace_and_log(
    A_path=A_path,
    B_path=B_path,
    C_path=C_path,
    output_path=output_path,
    log_path=log_path,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment