Skip to content

Instantly share code, notes, and snippets.

@masta-g3
Created January 11, 2025 01:22
Show Gist options
  • Select an option

  • Save masta-g3/3c25a2bb5c28aeec70f9c75cafcae938 to your computer and use it in GitHub Desktop.

Select an option

Save masta-g3/3c25a2bb5c28aeec70f9c75cafcae938 to your computer and use it in GitHub Desktop.
split markdown by headers
def split_markdown_document(text, target_min=500, target_max=700):
    """Split a Markdown document into chunks aligned with its headers.

    The document is cut into sections at its top-level headers (the smallest
    ATX header level that occurs, e.g. ``#`` or ``##``). Inside a section the
    header blocks are packed greedily: a chunk is closed before adding the
    next block only if adding it would exceed ``target_max`` words *and* the
    chunk already holds at least ``target_min`` words. Chunks never span two
    top-level sections.

    Text in front of the first top-level header (a preamble and/or deeper
    headers) is kept as a section of its own rather than being discarded.
    Lines starting with ``#`` inside fenced code blocks (``` or ~~~) are not
    treated as headers.

    When the document has no headers at all it is returned as a single chunk
    if it has at most ``target_max`` words; otherwise it is sliced every
    ``target_max`` words, keeping the original punctuation and inner
    whitespace of each slice.

    Args:
        text: The Markdown source.
        target_min: Minimum word count a chunk must reach before it may be
            closed in favour of starting a new one.
        target_max: Soft upper bound on words per chunk. A single header
            block larger than this is still emitted unsplit.

    Returns:
        A list of stripped chunk strings in document order. Empty input
        yields ``[""]``.
    """
    # Function-scope import: the snippet ships without a module-level one.
    import re

    word_pattern = re.compile(r'\b\w+\b')

    def word_count(s):
        return len(word_pattern.findall(s))

    # One pass over line starts that matter: fence delimiters and ATX headers.
    # ``[ \t]+`` (not ``\s+``) keeps a header match on its own line.
    line_pattern = re.compile(
        r'^(?: {0,3}(?P<fence>`{3,}|~{3,})(?P<info>.*)'
        r'|(?P<hashes>#+)[ \t]+(?P<title>.*))',
        re.MULTILINE,
    )
    headers = []  # (start offset, level, title)
    open_fence = None  # delimiter of the code fence we are inside, if any
    for m in line_pattern.finditer(text):
        fence = m.group('fence')
        if fence is not None:
            if open_fence is None:
                open_fence = fence
            elif (
                fence[0] == open_fence[0]
                and len(fence) >= len(open_fence)
                and not m.group('info').strip()
            ):
                # A closing fence uses the same character, is at least as
                # long as the opener, and carries no info string.
                open_fence = None
        elif open_fence is None:
            headers.append(
                (m.start(), len(m.group('hashes')), m.group('title'))
            )

    if not headers:
        word_starts = [m.start() for m in word_pattern.finditer(text)]
        if len(word_starts) <= target_max:
            return [text.strip()]
        # Cut in front of every ``target_max``-th word and slice the original
        # text so punctuation and formatting survive.
        step = max(1, target_max)
        cuts = [0] + word_starts[step::step] + [len(text)]
        return [text[a:b].strip() for a, b in zip(cuts, cuts[1:])]

    top_level = min(level for _, level, _ in headers)

    # Each section is a list of rendered blocks ("<hashes> <title>\n\n<body>").
    sections = []
    current = []
    preamble = text[:headers[0][0]].strip()
    if preamble:
        current.append(preamble)

    next_starts = [start for start, _, _ in headers[1:]] + [len(text)]
    for (start, level, title), next_start in zip(headers, next_starts):
        line_end = text.find('\n', start, next_start)
        if line_end == -1:
            # Header is the last line of the document: it has no body.
            line_end = next_start
        body = text[line_end:next_start].strip()
        rendered = f"{'#' * level} {title}\n\n{body}".strip()
        if level == top_level and current:
            sections.append(current)
            current = []
        current.append(rendered)
    if current:
        sections.append(current)

    chunks = []
    for blocks in sections:
        parts = []
        count = 0
        for block in blocks:
            block_words = word_count(block)
            # ``parts`` must be non-empty so target_min == 0 cannot flush an
            # empty chunk.
            if (
                parts
                and count + block_words > target_max
                and count >= target_min
            ):
                chunks.append("\n\n".join(parts))
                parts = [block]
                count = block_words
            else:
                parts.append(block)
                count += block_words
        chunks.append("\n\n".join(parts))
    return chunks
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment