Created
January 11, 2025 01:22
-
-
Save masta-g3/3c25a2bb5c28aeec70f9c75cafcae938 to your computer and use it in GitHub Desktop.
split markdown by headers
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def split_markdown_document(text: str, target_min: int = 500, target_max: int = 700) -> list:
    """Split a markdown document into chunks of roughly ``target_min``-``target_max`` words.

    The document is cut at ATX headers (``#``, ``##``, ...). A new section starts at
    every header whose level equals the smallest level found in the document, and
    chunks never span two sections. Inside a section, header blocks are packed
    greedily: the pending chunk is closed when adding the next block would exceed
    ``target_max`` and the pending chunk already holds at least ``target_min`` words.
    A single block larger than ``target_max`` is kept whole, and the last chunk of a
    section may be shorter than ``target_min``.

    Text that precedes the first header is returned as its own leading chunk, and
    sub-level headers that appear before the first top-level header start a section
    of their own, so no input text is discarded.

    If the document has no headers, it is returned as a single stripped chunk when
    it holds at most ``target_max`` words; otherwise it is sliced every
    ``target_max`` words, keeping the original punctuation and spacing inside
    each slice.

    Note: header detection is purely line based, so a line inside a fenced code
    block that starts with ``#`` followed by a space or tab is also treated as a header.

    Args:
        text: Markdown source to split.
        target_min: Minimum word count a pending chunk must reach before it may be
            closed in favour of starting a new one.
        target_max: Word count a chunk should not exceed when blocks are combined.

    Returns:
        A list of chunk strings in document order.
    """
    # Function-scope import: the visible file has no module-level ``import re``.
    import re

    word_pattern = re.compile(r'\b\w+\b')
    # ``[ \t]+`` (not ``\s+``) so the separator after the hashes cannot run across a
    # newline and swallow the following line as header text.
    header_pattern = re.compile(r'^(#+)[ \t]+(.*)', re.MULTILINE)

    def word_count(s):
        return len(word_pattern.findall(s))

    headers = [
        (m.start(), len(m.group(1)), m.group(2).strip())
        for m in header_pattern.finditer(text)
    ]

    if not headers:
        word_starts = [m.start() for m in word_pattern.finditer(text)]
        # If text fits in one chunk, return it as is
        if len(word_starts) <= target_max:
            return [text.strip()]
        # Otherwise cut the original text at the start of every target_max-th word,
        # so punctuation and formatting between words are preserved.
        step = max(target_max, 1)
        bounds = [0, *word_starts[step::step], len(text)]
        return [text[a:b].strip() for a, b in zip(bounds, bounds[1:])]

    # The smallest header level present delimits the top-level sections.
    top_level = min(level for _, level, _ in headers)

    # Each section is a list of rendered block strings.
    sections = []

    # Keep any text that appears before the first header.
    preamble = text[:headers[0][0]].strip()
    if preamble:
        sections.append([preamble])

    block_ends = [start for start, _, _ in headers[1:]] + [len(text)]
    current_section = None
    for (start, level, header_text), next_start in zip(headers, block_ends):
        # Content runs from the end of the header line to the next header.
        header_line_end = text.find('\n', start)
        if header_line_end == -1:
            header_line_end = next_start
        content = text[header_line_end:next_start].strip()
        block_text = f"{'#' * level} {header_text}\n\n{content}".strip()

        if level == top_level or current_section is None:
            # A top-level header, or a sub-level header seen before any top-level
            # one, opens a new section.
            current_section = [block_text]
            sections.append(current_section)
        else:
            current_section.append(block_text)

    chunks = []
    for blocks in sections:
        pending = []
        pending_words = 0
        for block_text in blocks:
            block_words = word_count(block_text)
            # Close the pending chunk only if it is non-empty, already large enough,
            # and adding this block would overshoot the target.
            if pending and pending_words >= target_min and pending_words + block_words > target_max:
                chunks.append("\n\n".join(pending))
                pending = [block_text]
                pending_words = block_words
            else:
                pending.append(block_text)
                pending_words += block_words
        # Close out remaining chunk in this section
        if pending:
            chunks.append("\n\n".join(pending))
    return chunks
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment