masta-g3 · January 11, 2025 01:22
diff --git a/md_splitter b/md_splitter
 def split_markdown_document(text, target_min=500, target_max=700):
    """Split a Markdown document into chunks of roughly target_min..target_max words.

    The document is divided into sections at its shallowest header level. Each
    section is made of blocks (one header line plus the text up to the next
    header), and consecutive blocks of a section are packed into chunks.
    Chunks never span two sections. Words are counted as ``\\b\\w+\\b`` matches,
    header words included.

    Text found before the first header, and deeper headers that appear before
    the first top-level header, are kept as a leading section instead of being
    discarded.

    Args:
        text: Markdown source text.
        target_min: A chunk is closed before adding a block only once it
            already holds at least this many words.
        target_max: Adding a block that would push the chunk past this many
            words closes the chunk first (subject to ``target_min``). A single
            block larger than ``target_max`` is still emitted whole.

    Returns:
        A list of chunk strings in document order. A document without headers
        that has at most ``target_max`` words is returned as one stripped
        string (so ``""`` yields ``[""]``); a longer header-less document is
        sliced at whitespace into pieces of about ``target_max`` words with
        its punctuation and markup left intact.
    """
    word_pattern = re.compile(r'\b\w+\b')
    # [ \t]+ rather than \s+: \s also matches newlines, which let a bare "#"
    # line capture the following line as its header text.
    header_pattern = re.compile(r'^(#+)[ \t]+(.*)', re.MULTILINE)

    def word_count(s):
        return len(word_pattern.findall(s))

    def render(level, header_text, content):
        # Level 0 marks header-less preamble text: emit it without a "#" line.
        if level == 0:
            return content
        return f"{'#' * level} {header_text}\n\n{content}".strip()

    # (start offset, header level, header text); rstrip drops a trailing "\r"
    # or trailing spaces captured by ".*".
    headers = [
        (m.start(), len(m.group(1)), m.group(2).rstrip())
        for m in header_pattern.finditer(text)
    ]

    if not headers:
        word_matches = list(word_pattern.finditer(text))
        # If text fits in one chunk, return it as is
        if len(word_matches) <= target_max:
            return [text.strip()]

        # Otherwise slice the original text every `step` words so punctuation,
        # newlines and inline markup survive (joining bare words loses them).
        step = max(target_max, 1)  # guard against a zero/negative range step
        chunks = []
        chunk_start = 0
        for i in range(step, len(word_matches), step):
            cut = word_matches[i].start()
            # Prefer the whitespace before the token holding this word, so
            # tokens such as "**bold**" or "well-known" are not torn apart.
            boundary = cut
            while boundary > chunk_start and not text[boundary - 1].isspace():
                boundary -= 1
            if boundary > chunk_start:
                cut = boundary
            piece = text[chunk_start:cut].strip()
            if piece:
                chunks.append(piece)
            chunk_start = cut
        tail = text[chunk_start:].strip()
        if tail:
            chunks.append(tail)
        return chunks

    # Determine the top-level header level (smallest header level encountered)
    top_level = min(level for _, level, _ in headers)

    # Each section is a list of (level, header_text, content) blocks.
    sections = []
    current_section = None

    # Keep whatever precedes the first header as the start of a leading section.
    preamble = text[:headers[0][0]].strip()
    if preamble:
        current_section = [(0, "", preamble)]

    # A sentinel entry at len(text) gives the last header an end offset.
    spans = headers + [(len(text), 0, "")]
    for (start_idx, level, header_text), following in zip(spans, spans[1:]):
        next_start_idx = following[0]

        # Find where the current header line ends
        header_line_end = text.find('\n', start_idx)
        if header_line_end == -1:
            header_line_end = next_start_idx
        block = (level, header_text, text[header_line_end:next_start_idx].strip())

        if level == top_level:
            # A top-level header closes the previous section and opens a new one.
            if current_section is not None:
                sections.append(current_section)
            current_section = [block]
        else:
            # Deeper header: attach to the open section, opening one if the
            # document starts with a sub-header before any top-level header.
            if current_section is None:
                current_section = []
            current_section.append(block)

    # headers is non-empty here, so a section is always open at this point.
    sections.append(current_section)

    chunks = []
    # Create chunks from each top-level section
    for section in sections:
        current_parts = []
        current_count = 0

        for level, header_text, content in section:
            block_text = render(level, header_text, content)
            block_words = word_count(block_text)

            # Close the current chunk first when this block would overflow it
            # and it is already large enough. Requiring a non-empty chunk
            # avoids emitting "" when target_min is 0.
            if (
                current_parts
                and current_count + block_words > target_max
                and current_count >= target_min
            ):
                chunks.append("\n\n".join(current_parts).strip())
                current_parts = [block_text]
                current_count = block_words
            else:
                current_parts.append(block_text)
                current_count += block_words

        # Close out remaining chunk in this section
        remaining = "\n\n".join(current_parts).strip()
        if remaining:
            chunks.append(remaining)

    return chunks
	def split_markdown_document(text, target_min=500, target_max=700):
	    """Split a Markdown document into chunks of roughly target_min..target_max words.

	    The document is divided into sections at its shallowest header level. Each
	    section is made of blocks (one header line plus the text up to the next
	    header), and consecutive blocks of a section are packed into chunks.
	    Chunks never span two sections. Words are counted as ``\\b\\w+\\b`` matches,
	    header words included.

	    Text found before the first header, and deeper headers that appear before
	    the first top-level header, are kept as a leading section instead of being
	    discarded.

	    Args:
	        text: Markdown source text.
	        target_min: A chunk is closed before adding a block only once it
	            already holds at least this many words.
	        target_max: Adding a block that would push the chunk past this many
	            words closes the chunk first (subject to ``target_min``). A single
	            block larger than ``target_max`` is still emitted whole.

	    Returns:
	        A list of chunk strings in document order. A document without headers
	        that has at most ``target_max`` words is returned as one stripped
	        string (so ``""`` yields ``[""]``); a longer header-less document is
	        sliced at whitespace into pieces of about ``target_max`` words with
	        its punctuation and markup left intact.
	    """
	    word_pattern = re.compile(r'\b\w+\b')
	    # [ \t]+ rather than \s+: \s also matches newlines, which let a bare "#"
	    # line capture the following line as its header text.
	    header_pattern = re.compile(r'^(#+)[ \t]+(.*)', re.MULTILINE)

	    def word_count(s):
	        return len(word_pattern.findall(s))

	    def render(level, header_text, content):
	        # Level 0 marks header-less preamble text: emit it without a "#" line.
	        if level == 0:
	            return content
	        return f"{'#' * level} {header_text}\n\n{content}".strip()

	    # (start offset, header level, header text); rstrip drops a trailing "\r"
	    # or trailing spaces captured by ".*".
	    headers = [
	        (m.start(), len(m.group(1)), m.group(2).rstrip())
	        for m in header_pattern.finditer(text)
	    ]

	    if not headers:
	        word_matches = list(word_pattern.finditer(text))
	        # If text fits in one chunk, return it as is
	        if len(word_matches) <= target_max:
	            return [text.strip()]

	        # Otherwise slice the original text every `step` words so punctuation,
	        # newlines and inline markup survive (joining bare words loses them).
	        step = max(target_max, 1)  # guard against a zero/negative range step
	        chunks = []
	        chunk_start = 0
	        for i in range(step, len(word_matches), step):
	            cut = word_matches[i].start()
	            # Prefer the whitespace before the token holding this word, so
	            # tokens such as "**bold**" or "well-known" are not torn apart.
	            boundary = cut
	            while boundary > chunk_start and not text[boundary - 1].isspace():
	                boundary -= 1
	            if boundary > chunk_start:
	                cut = boundary
	            piece = text[chunk_start:cut].strip()
	            if piece:
	                chunks.append(piece)
	            chunk_start = cut
	        tail = text[chunk_start:].strip()
	        if tail:
	            chunks.append(tail)
	        return chunks

	    # Determine the top-level header level (smallest header level encountered)
	    top_level = min(level for _, level, _ in headers)

	    # Each section is a list of (level, header_text, content) blocks.
	    sections = []
	    current_section = None

	    # Keep whatever precedes the first header as the start of a leading section.
	    preamble = text[:headers[0][0]].strip()
	    if preamble:
	        current_section = [(0, "", preamble)]

	    # A sentinel entry at len(text) gives the last header an end offset.
	    spans = headers + [(len(text), 0, "")]
	    for (start_idx, level, header_text), following in zip(spans, spans[1:]):
	        next_start_idx = following[0]

	        # Find where the current header line ends
	        header_line_end = text.find('\n', start_idx)
	        if header_line_end == -1:
	            header_line_end = next_start_idx
	        block = (level, header_text, text[header_line_end:next_start_idx].strip())

	        if level == top_level:
	            # A top-level header closes the previous section and opens a new one.
	            if current_section is not None:
	                sections.append(current_section)
	            current_section = [block]
	        else:
	            # Deeper header: attach to the open section, opening one if the
	            # document starts with a sub-header before any top-level header.
	            if current_section is None:
	                current_section = []
	            current_section.append(block)

	    # headers is non-empty here, so a section is always open at this point.
	    sections.append(current_section)

	    chunks = []
	    # Create chunks from each top-level section
	    for section in sections:
	        current_parts = []
	        current_count = 0

	        for level, header_text, content in section:
	            block_text = render(level, header_text, content)
	            block_words = word_count(block_text)

	            # Close the current chunk first when this block would overflow it
	            # and it is already large enough. Requiring a non-empty chunk
	            # avoids emitting "" when target_min is 0.
	            if (
	                current_parts
	                and current_count + block_words > target_max
	                and current_count >= target_min
	            ):
	                chunks.append("\n\n".join(current_parts).strip())
	                current_parts = [block_text]
	                current_count = block_words
	            else:
	                current_parts.append(block_text)
	                current_count += block_words

	        # Close out remaining chunk in this section
	        remaining = "\n\n".join(current_parts).strip()
	        if remaining:
	            chunks.append(remaining)

	    return chunks
No results found