Skip to content

Instantly share code, notes, and snippets.

@richardhundt
Created August 24, 2025 20:26
Show Gist options
  • Select an option

  • Save richardhundt/8a6b200e9f062e21aee6f4ee7c51c4c3 to your computer and use it in GitHub Desktop.

Select an option

Save richardhundt/8a6b200e9f062e21aee6f4ee7c51c4c3 to your computer and use it in GitHub Desktop.
glm-4.5-codec
class GLM_45_Codec:
    """
    GLM-4.5 codec that implements the ProviderCodec protocol.

    This codec includes the GLM-4.5 jinja template inlined and handles
    the specific response format used by GLM models. It includes stateful
    parsing for streaming to handle tool calls intelligently.
    """

    def __init__(self):
        """Compile the inlined prompt template and start with clean stream state."""
        # Compile the chat template once per codec instance.
        self._template = jinja2.Template(self.TEMPLATE_STR)
        # Initialize the streaming tool-call parser state
        # (same attributes reset_state manages between streams).
        self.reset_state()
# GLM-4.5 chat template (Jinja2), inlined so the codec carries no external
# template-file dependency. It renders: an optional <|system|> block listing
# tool signatures inside <tools></tools>, each conversation turn by role,
# <think>...</think> reasoning sections for assistant turns, and tool calls
# as <tool_call>name + <arg_key>/<arg_value> pairs. The string must match
# the model's expected prompt format exactly -- do not reflow or re-indent.
TEMPLATE_STR = """[gMASK]<sop>
{%- if tools -%}
<|system|>
# Tools
You may call one or more functions to assist with the user query.
You are provided with function signatures within <tools></tools> XML tags:
<tools>
{% for tool in tools %}
{{ tool | tojson|string }}
{% endfor %}
</tools>
For each function call, output the function name and arguments within the following XML format:
<tool_call>{function-name}
<arg_key>{arg-key-1}</arg_key>
<arg_value>{arg-value-1}</arg_value>
<arg_key>{arg-key-2}</arg_key>
<arg_value>{arg-value-2}</arg_value>
...
</tool_call>{%- endif -%}
{%- macro visible_text(content) -%}
{%- if content is string -%}
{{- content }}
{%- elif content is iterable and content is not mapping -%}
{%- for item in content -%}
{%- if item is mapping and item.type == 'text' -%}
{{- item.text }}
{%- elif item is string -%}
{{- item }}
{%- endif -%}
{%- endfor -%}
{%- else -%}
{{- content }}
{%- endif -%}
{%- endmacro -%}
{%- set ns = namespace(last_user_index=-1) %}
{%- for m in messages %}
{%- if m.role == 'user' %}
{% set ns.last_user_index = loop.index0 -%}
{%- endif %}
{%- endfor %}
{% for m in messages %}
{%- if m.role == 'user' -%}<|user|>
{% set content = visible_text(m.content) %}{{ content }}
{{- '/nothink' if (enable_thinking is defined and not enable_thinking) else '' -}}
{%- elif m.role == 'assistant' -%}
<|assistant|>
{%- set reasoning_content = '' %}
{%- set content = visible_text(m.content) or '' %}
{%- if m.reasoning_content is string %}
{%- set reasoning_content = m.reasoning_content %}
{%- endif %}
{%- if loop.index0 > ns.last_user_index and reasoning_content -%}
{{ '\n<think>' + reasoning_content + '</think>'}}
{%- else -%}
{{ '\n<think></think>' }}
{%- endif -%}
{%- if content -%}
{{ '\n' + content }}
{%- endif -%}
{% if m.tool_calls %}
{% for tc in m.tool_calls %}
{%- if tc.function %}
{%- set tc = tc.function %}
{%- endif %}
{{ '\n<tool_call>' + tc.name }}
{% set _args = tc.arguments %}
{% for k, v in _args.items() %}
<arg_key>{{ k }}</arg_key>
<arg_value>{{ v | tojson|string if v is not string else v }}</arg_value>
{% endfor %}
</tool_call>{% endfor %}
{% endif %}
{%- elif m.role == 'tool' -%}
{%- if m.content is string -%}
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
{{- '<|observation|>' }}
{%- endif %}
{{- '\n<tool_response>\n' }}
{{- m.content }}
{{- '\n</tool_response>' }}
{%- else -%}
<|observation|>{% for tr in m.content %}
<tool_response>
{{ tr.output if tr.output is defined else tr }}
</tool_response>{% endfor -%}
{% endif -%}
{%- elif m.role == 'system' -%}
<|system|>
{{ visible_text(m.content) }}
{%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt -%}
<|assistant|>{{- '\n<think></think>' if (enable_thinking is defined and not enable_thinking) else '' -}}
{%- endif -%}"""
def reset_state(self):
"""Reset the streaming state for a new conversation."""
self._in_tool_call = False
self._tool_call_buffer = ""
self._accumulated_content = ""
def process_streaming_chunk(self, chunk: str) -> tuple[str, bool]:
"""
Process a streaming chunk with state machine logic for tool calls.
Args:
chunk: The streaming JSON chunk from GLM API
Returns:
tuple of (content_to_emit, is_complete_tool_call)
"""
try:
# Parse the GLM streaming chunk format
chunk_data = json.loads(chunk)
# Extract text content from GLM format
content = ""
if "choices" in chunk_data and chunk_data["choices"]:
choice = chunk_data["choices"][0]
if "text" in choice:
content = choice["text"]
if not content:
return "", False
except (json.JSONDecodeError, KeyError):
# If it's not valid JSON, treat as plain text
content = chunk
# Accumulate all content for final parsing
self._accumulated_content += content
# Check if we're entering a tool call
if not self._in_tool_call and "<tool_call>" in content:
# Split the content at the tool call start
parts = content.split("<tool_call>", 1)
content_before = parts[0]
tool_call_start = "<tool_call>" + parts[1] if len(parts) > 1 else "<tool_call>"
self._in_tool_call = True
self._tool_call_buffer = tool_call_start
# Return only the content before the tool call
return content_before, False
# If we're in a tool call, accumulate in buffer
elif self._in_tool_call:
self._tool_call_buffer += content
# Check if we've completed the tool call
if "</tool_call>" in self._tool_call_buffer:
# Split at the closing tag
parts = self._tool_call_buffer.split("</tool_call>", 1)
complete_tool_call = parts[0] + "</tool_call>"
content_after = parts[1] if len(parts) > 1 else ""
self._in_tool_call = False
self._tool_call_buffer = ""
# Return the content after the tool call (if any)
return content_after, True
else:
# Still accumulating tool call, don't emit anything
return "", False
# Normal content, not in a tool call
else:
return content, False
def get_accumulated_content(self) -> str:
"""Get the accumulated content from streaming."""
return self._accumulated_content
def encode(self, messages: list[Message], options: dict[str, Any] | None = None) -> str:
    """
    Encode messages and options into a GLM-4.5 formatted prompt string.

    Args:
        messages: List of Message objects to encode
        options: Optional parameters for template rendering

    Returns:
        Formatted prompt string ready for GLM-4.5 model
    """
    opts = options or {}

    # Convert Message objects into plain dicts the jinja template can walk.
    rendered_messages = []
    for message in messages:
        entry: dict[str, Any] = {
            "role": message.role,
            "content": message.content,
        }
        # Tool calls are re-wrapped in the {"function": {...}} shape the
        # template's tc.function branch expects.
        if message.tool_calls:
            entry["tool_calls"] = [
                {
                    "function": {
                        "name": call.function.name,
                        "arguments": call.function.arguments,
                    }
                }
                for call in message.tool_calls
            ]
        # Tool responses keep their originating call id.
        if message.tool_call_id:
            entry["tool_call_id"] = message.tool_call_id
        rendered_messages.append(entry)

    # Base template variables: always request a generation prompt with
    # thinking enabled.
    template_vars: dict[str, Any] = {
        "messages": rendered_messages,
        "add_generation_prompt": True,
        "enable_thinking": True,
    }
    if "tools" in opts:
        template_vars["tools"] = opts["tools"]
    # Options prefixed with "template_" are forwarded as template variables
    # with the prefix stripped.
    prefix = "template_"
    for key, value in opts.items():
        if key.startswith(prefix):
            template_vars[key[len(prefix):]] = value

    return self._template.render(**template_vars)
def decode(self, raw_response: str, model: str, start_time: datetime) -> Response:
    """
    Decode raw GLM-4.5 response string into Response object.

    Unwraps several possible JSON envelopes, extracts <tool_call> blocks
    into ToolCall objects, strips tool-call markup and <think> sections
    from the visible content, and packages the result as a Response.

    Args:
        raw_response: Raw response string from GLM-4.5 model
        model: Model name for the response
        start_time: Start time of the request

    Returns:
        Response object with parsed content and tool calls
    """
    content = raw_response.strip()
    try:
        # Try to parse as JSON first (common for API responses)
        # NOTE(review): a top-level JSON array or number would raise
        # TypeError on the `in` checks below (only JSONDecodeError is
        # caught) -- confirm upstream always sends JSON objects.
        data = json.loads(content)
        # Handle different response formats: completions-style
        # choices[0].text, chat-style choices[0].message.content, or a
        # flat content/text/response key.
        if "choices" in data and data["choices"]:
            choice = data["choices"][0]
            if "text" in choice:
                content = choice["text"]
            elif "message" in choice and "content" in choice["message"]:
                content = choice["message"]["content"]
        elif "content" in data:
            content = data["content"]
        elif "text" in data:
            content = data["text"]
        elif "response" in data:
            content = data["response"]
        else:
            # Fallback to raw response
            content = raw_response
    except json.JSONDecodeError:
        # Not JSON, treat as plain text
        content = raw_response
    # Parse tool calls if present (GLM format)
    tool_calls: list[ToolCall] = []
    if "<tool_call>" in content:
        # Extract tool calls from GLM format: the name runs to the first
        # whitespace/'<', followed by zero or more <arg_key>/<arg_value>
        # pairs (matching the inlined template's output shape).
        tool_call_pattern = r'<tool_call>([^\s<\n]+)\s*((?:<arg_key>.*?</arg_key>\s*<arg_value>.*?</arg_value>\s*)*)</tool_call>'
        matches = re.findall(tool_call_pattern, content, re.DOTALL)
        for tool_name, args_str in matches:
            # Parse arguments out of the key/value pair block
            arg_pattern = r'<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>'
            arg_matches = re.findall(arg_pattern, args_str, re.DOTALL)
            arguments = {}
            for key, value in arg_matches:
                key = key.strip()
                value = value.strip()
                # Try to parse JSON values; non-JSON stays a plain string
                try:
                    arguments[key] = json.loads(value)
                except json.JSONDecodeError:
                    arguments[key] = value
            tool_calls.append(ToolCall(
                function=ToolCallFunction(
                    name=tool_name.strip(),
                    arguments=arguments
                )
            ))
        # Remove tool calls from content for cleaner display
        content = re.sub(tool_call_pattern, '', content, flags=re.DOTALL).strip()
    # Clean up GLM thinking tags -- the pass order matters: empty pairs
    # first, then collapse doubled closers, then any remaining pairs.
    if "<think>" in content:
        # First pass: remove empty think tags
        content = re.sub(r'<think>\s*</think>', '', content, flags=re.DOTALL)
        # Second pass: collapse doubled/malformed closers like
        # "</think>\n</think>" down to a single "</think>"
        content = re.sub(r'</think>\s*</think>', '</think>', content, flags=re.DOTALL)
        # Third pass: remove any remaining whitespace-only think tags
        content = re.sub(r'<think>\s+</think>', '', content, flags=re.DOTALL)
        # Final cleanup: drop any remaining think sections entirely
        # (reasoning text is not part of the visible reply)
        content = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL)
        content = content.strip()
    return Response(
        model=model,
        created_at=start_time,
        done=True,
        message=Message(
            role="assistant",
            content=content,
            tool_calls=tool_calls if tool_calls else None
        ),
        # NOTE(review): computed in milliseconds; confirm that
        # Response.total_duration expects ms (Ollama's wire format,
        # which this shape resembles, uses nanoseconds).
        total_duration=int((datetime.now() - start_time).total_seconds() * 1000)
    )
@richardhundt
Copy link
Author

richardhundt commented Aug 24, 2025

And here's the streaming dispatch loop:

            # ...
            encoded = codec.encode(messages, options) # send it through the jinja template
            request_data = {
                "prompt": encoded, 
                "stream": True,
            }
            # ...
            async with client.stream("POST", endpoint, json=request_data) as response:
                response.raise_for_status()

                model_name = request_data["model"]

                # Reset codec state for new stream
                self._codec.reset_state()

                async for line in response.aiter_lines():
                    logger.debug(f"Received streaming line: {repr(line)}")

                    if not line.strip():
                        continue

                    assert line.startswith("data: ")
                    data_str = line[6:]  # Remove "data: " prefix
                    logger.debug(f"SSE data: {repr(data_str)}")

                    if data_str.strip() == "[DONE]":
                        # Stream is complete, parse accumulated content for tool calls
                        break

                    content_to_emit, is_complete_tool_call = self._codec.process_streaming_chunk(data_str)
                    if content_to_emit:
                        logger.debug(f"Yielding content from codec: {repr(content_to_emit)}")
                        yield Response(
                            model=model_name,
                            created_at=start_time,
                            done=False,
                            message=Message(
                                role="assistant",
                                content=content_to_emit,
                                tool_calls=[]
                            ),
                            total_duration=None
                        )

                # Parse the complete accumulated content using the codec to extract tool calls
                final_accumulated_content = self._codec.get_accumulated_content()

                if final_accumulated_content:
                    final_response = self._codec.decode(final_accumulated_content, model_name, start_time)
                    yield final_response

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment