Last active
October 7, 2025 16:57
-
-
Save gvergnaud/72a2fb4978ee1c302f066a926178bab8 to your computer and use it in GitHub Desktop.
A partial JSON parser that supports incremental parsing and access to the work-in-progress JSON structure.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /** | |
| * Why? | |
| * - With LLMs we often need to parse partial JSON strings incrementally, | |
| * as they are being generated. | |
| * - Open-source partial JSON parsers all have problems: | |
| * - They are inefficient: re-parsing the full JSON on each update | |
| * - They don't stream updates of string values. | |
| * - They don't guarantee that a given position in a JSON will remain of the same type. | |
| * | |
| * What is this? | |
| * - This is a JSON stream parser that's: | |
| * - efficient: Incremental parsing, never do the same work twice. | |
| * - stream updates of string values. | |
| * - provides strong typing guarantees: types never change. | |
| * | |
| * What is missing? | |
| * - Error handling when the JSON is invalid | |
| */ | |
/** Any value the parser can hand back: a container or a JSON scalar. */
export type JSONValue = object | unknown[] | null | boolean | number | string;

/**
 * Where the parser currently is in the document.
 * - "global": outside any container; `current` holds a root-level scalar, if any.
 * - "array":  inside an array; `current` is the array being filled.
 * - "object": inside an object; `key` is the member currently being written.
 */
type ParserScope =
  | { type: "global"; current?: null | boolean | number | string }
  | { type: "array"; current: unknown[] }
  | { type: "object"; key: string; current: Record<string, unknown> };

/**
 * Incremental JSON parser.
 *
 * Feed it arbitrary slices of a JSON document with `write()`; at any point
 * `getCurrentValue()` returns a snapshot of everything parsed so far
 * (including partially received string values). Call `end()` once the input
 * is exhausted to flush the last pending value and get the final result.
 */
export class JSONStreamParser {
  private tokenizer = new JSONStreamTokenizer();
  /** Enclosing scopes of `currentScope`; index 0 is the root container. */
  private stack: ParserScope[] = [];
  private currentScope: ParserScope = { type: "global" };

  /**
   * Consume the next slice of JSON text.
   * Errors thrown by the tokenizer (e.g. on an unknown literal) propagate.
   */
  write(jsonStr: string) {
    const events = this.tokenizer.write(jsonStr);
    this.processEvents(events);
  }

  /** Flush any value still pending in the tokenizer and return the result. */
  end(): JSONValue {
    const events = this.tokenizer.end();
    this.processEvents(events);
    return this.getCurrentValue();
  }

  /**
   * Deep copy of the value parsed so far, or `null` when nothing has been
   * parsed yet. A copy is returned so callers can keep or mutate snapshots
   * without affecting the parser's internal structure.
   */
  getCurrentValue(): JSONValue {
    // While nested, the root container is the bottom of the stack; otherwise
    // the current scope itself is the root (or the global scalar holder).
    const root = this.stack.length > 0 ? this.stack[0] : this.currentScope;
    return structuredClone(root.current ?? null);
  }

  /** Apply tokenizer events, in order, to the value under construction. */
  private processEvents(events: JSONParseToken[]) {
    for (const event of events) {
      switch (event.type) {
        case "startObject": {
          const newObj: Record<string, unknown> = {};
          this.attachContainer(newObj);
          this.currentScope = { type: "object", key: "", current: newObj };
          break;
        }
        case "startArray": {
          const newArr: unknown[] = [];
          this.attachContainer(newArr);
          this.currentScope = { type: "array", current: newArr };
          break;
        }
        case "endObject":
        case "endArray": {
          // Closing the root container leaves the scope on that container.
          this.currentScope = this.stack.pop() ?? this.currentScope;
          break;
        }
        case "key": {
          if (this.currentScope.type === "object") {
            this.currentScope.key = event.value;
          }
          break;
        }
        case "value_start": {
          // A new scalar: arrays grow by one slot, objects get/overwrite the
          // member named by the current key.
          if (this.currentScope.type === "array") {
            this.currentScope.current.push(event.value);
          } else if (this.currentScope.type === "object") {
            JSONStreamParser.setOwnProperty(
              this.currentScope.current,
              this.currentScope.key,
              event.value
            );
          } else if (this.currentScope.type === "global") {
            this.currentScope.current = event.value;
          }
          break;
        }
        case "value_update": {
          // Same scalar, more content: overwrite the slot created at start.
          if (this.currentScope.type === "array") {
            setLast(this.currentScope.current, event.value);
          } else if (this.currentScope.type === "object") {
            JSONStreamParser.setOwnProperty(
              this.currentScope.current,
              this.currentScope.key,
              event.value
            );
          } else if (this.currentScope.type === "global") {
            this.currentScope.current = event.value;
          }
          break;
        }
        case "value_complete": {
          if (this.currentScope.type === "array") {
            setLast(this.currentScope.current, event.value);
          } else if (this.currentScope.type === "object") {
            JSONStreamParser.setOwnProperty(
              this.currentScope.current,
              this.currentScope.key,
              event.value
            );
            // The member is finished; forget its key.
            this.currentScope.key = "";
          } else if (this.currentScope.type === "global") {
            this.currentScope.current = event.value;
          }
          break;
        }
      }
    }
  }

  /**
   * Insert a freshly opened container into the current scope and remember
   * that scope so it can be restored when the container closes. At global
   * scope there is no parent to attach to and nothing is pushed.
   */
  private attachContainer(container: Record<string, unknown> | unknown[]): void {
    const parent = this.currentScope;
    if (parent.type === "array") {
      parent.current.push(container);
    } else if (parent.type === "object") {
      JSONStreamParser.setOwnProperty(parent.current, parent.key, container);
    } else {
      return;
    }
    this.stack.push(parent);
  }

  /**
   * Write `value` as an own property of `target`.
   *
   * Plain assignment with the key "__proto__" goes through the prototype
   * setter instead of creating a property, so the member would be lost (or
   * would replace the object's prototype). `JSON.parse` keeps such a key as a
   * regular own property; defining it explicitly gives the same result.
   */
  private static setOwnProperty(
    target: Record<string, unknown>,
    key: string,
    value: unknown
  ): void {
    if (key === "__proto__") {
      Object.defineProperty(target, key, {
        value,
        writable: true,
        enumerable: true,
        configurable: true,
      });
    } else {
      target[key] = value;
    }
  }
}
/**
 * Overwrite the final element of `array` with `value`.
 * An empty array has no final element, so the value is stored at index 0.
 */
const setLast = <T>(array: T[], value: T) => {
  const lastIndex = Math.max(array.length - 1, 0);
  array[lastIndex] = value;
};
/** Scalar payload carried by value events (strings, numbers, booleans, null). */
type JSONScalar = null | boolean | number | string;

/**
 * Events produced by the tokenizer and consumed by the parser.
 * - start/end tokens delimit containers.
 * - "key" announces the member name of the next value inside an object.
 * - "value_start" / "value_update" / "value_complete" describe the life of
 *   one scalar: first sighting, growth (streamed strings), and final value.
 */
type JSONParseToken =
  | { type: "startObject" }
  | { type: "endObject" }
  | { type: "startArray" }
  | { type: "endArray" }
  | { type: "key"; value: string }
  | { type: "value_start"; value: JSONScalar }
  | { type: "value_update"; value: JSONScalar }
  | { type: "value_complete"; value: JSONScalar };

// Payload-free tokens are shared singletons to avoid allocating one per bracket.
const tokenStartObject = { type: "startObject" } satisfies JSONParseToken;
const tokenEndObject = { type: "endObject" } satisfies JSONParseToken;
const tokenStartArray = { type: "startArray" } satisfies JSONParseToken;
const tokenEndArray = { type: "endArray" } satisfies JSONParseToken;
/** Container the tokenizer is currently inside; objects track key vs. value position. */
type TokenizerScope =
  | { type: "global" | "array" }
  | { type: "object"; inValue: boolean };

/**
 * The scalar currently being read.
 * - "string": decoded incrementally by `parser`; `isComplete` flips on the closing quote.
 * - "literal": raw characters of a number / true / false / null, parsed when terminated.
 */
type ValueContext =
  | {
      type: "string";
      buffer: string;
      parser: UnicodeStreamParser;
      isComplete: boolean;
    }
  | { type: "literal"; buffer: string };

/**
 * Character-level tokenizer that turns slices of JSON text into
 * `JSONParseToken` events. State is kept between `write()` calls, so the input
 * may be split at any position.
 */
class JSONStreamTokenizer {
  /** Insignificant whitespace between tokens: space, tab, line feed, carriage return. */
  private static readonly WHITESPACE: ReadonlySet<string> = new Set([
    " ",
    "\t",
    "\n",
    "\r",
  ]);

  private valueContext: ValueContext | null = null;
  /** Last string content reported through value_start/value_update, or null if none yet. */
  private lastEmittedStringValueUpdate: string | null = null;
  private stack: TokenizerScope[] = [];
  private currentScope: TokenizerScope = { type: "global" };

  /**
   * True when the string being read is a value (array element, root value, or
   * object member value) rather than an object key.
   */
  get isInStringValue(): boolean {
    return (
      this.valueContext?.type === "string" &&
      ((this.currentScope.type === "object" && this.currentScope.inValue) ||
        this.currentScope.type === "array" ||
        this.currentScope.type === "global")
    );
  }

  /**
   * Tokenize the next slice of input.
   *
   * @returns the events produced by this slice, with consecutive string
   *   updates collapsed by `appendEvent`.
   * @throws Error when ":" is not preceded by a string key, or when a
   *   terminated literal is not true/false/null/a number.
   */
  write(jsonStr: string): JSONParseToken[] {
    const events: JSONParseToken[] = [];
    for (const char of jsonStr) {
      // Inside an open string every character belongs to the string, except
      // an unescaped closing quote.
      if (
        this.valueContext?.type === "string" &&
        !this.valueContext.isComplete
      ) {
        const isEndOfString =
          !this.valueContext.parser.isEscaping && char === '"';
        if (isEndOfString) {
          this.valueContext.buffer = this.valueContext.parser.end();
          this.valueContext.isComplete = true;
        } else {
          this.valueContext.buffer = this.valueContext.parser.write(char);
        }
        const maybeEvent = this.maybeEmitValueUpdate(this.valueContext.buffer);
        if (maybeEvent) appendEvent(events, maybeEvent);
        continue;
      }

      switch (char) {
        case "{": {
          appendEvent(events, tokenStartObject);
          this.stack.push(this.currentScope);
          this.currentScope = { type: "object", inValue: false };
          break;
        }
        case "}": {
          // A pending scalar is terminated by the closing brace.
          if (this.valueContext) {
            appendEvent(events, ...this.emitValueComplete());
          }
          appendEvent(events, tokenEndObject);
          this.currentScope = this.stack.pop() || { type: "global" };
          break;
        }
        case "[": {
          appendEvent(events, tokenStartArray);
          this.stack.push(this.currentScope);
          this.currentScope = { type: "array" };
          break;
        }
        case "]": {
          if (this.valueContext) {
            appendEvent(events, ...this.emitValueComplete());
          }
          appendEvent(events, tokenEndArray);
          this.currentScope = this.stack.pop() || { type: "global" };
          break;
        }
        case ":": {
          // The string read just before ":" is the member key.
          if (this.valueContext?.type !== "string") {
            throw new Error(
              `Invalid JSON: expected a key, but got ${
                this.valueContext
                  ? ` unknown literal ${this.valueContext.buffer}`
                  : ' ":"'
              }`
            );
          }
          appendEvent(events, {
            type: "key",
            value: this.valueContext.buffer,
          });
          this.currentScope = {
            type: "object",
            inValue: true,
          };
          this.valueContext = null;
          break;
        }
        case ",": {
          if (this.valueContext) {
            appendEvent(events, ...this.emitValueComplete());
          }
          // After a comma inside an object the next string is a key again.
          if (this.currentScope.type === "object") {
            this.currentScope = {
              type: "object",
              inValue: false,
            };
          }
          break;
        }
        case '"': {
          this.valueContext = {
            type: "string",
            buffer: "",
            parser: new UnicodeStreamParser(),
            isComplete: false,
          };
          break;
        }
        default: {
          // Whitespace between tokens is skipped. "\r" must be included:
          // otherwise CRLF input glues a carriage return onto literals
          // ("true\r" is rejected) and `[\r\n]` produces a spurious 0.
          if (!JSONStreamTokenizer.WHITESPACE.has(char)) {
            if (this.valueContext) {
              this.valueContext.buffer += char;
            } else {
              this.valueContext = { type: "literal", buffer: char };
            }
          }
          break;
        }
      }
    }
    return events;
  }

  /** Signal end of input: flush the scalar still being read, if any. */
  end(): JSONParseToken[] {
    return this.emitValueComplete();
  }

  /**
   * Report progress of a string value: `value_start` the first time,
   * `value_update` when the content changed, nothing for keys or when the
   * content is unchanged.
   */
  private maybeEmitValueUpdate(value: string): JSONParseToken | null {
    if (!this.isInStringValue) return null;
    if (this.lastEmittedStringValueUpdate === null) {
      this.lastEmittedStringValueUpdate = value;
      return {
        type: "value_start",
        value: value,
      };
    }
    if (this.lastEmittedStringValueUpdate === value) return null;
    this.lastEmittedStringValueUpdate = value;
    return {
      type: "value_update",
      value,
    };
  }

  /**
   * Finish the scalar being read and reset the value state.
   * Emits `value_start` + `value_complete` when nothing was reported for this
   * value yet (literals), or just `value_complete` for a streamed string.
   */
  private emitValueComplete(): JSONParseToken[] {
    const events: JSONParseToken[] = [];
    // Closing string in case the JSON is unfinished.
    if (this.valueContext?.type === "string" && !this.valueContext.isComplete) {
      this.valueContext.buffer = this.valueContext.parser.end();
      this.valueContext.isComplete = true;
    }
    if (this.valueContext === null) return events;

    const finalValue = this.parseValue(this.valueContext);
    if (this.lastEmittedStringValueUpdate === null) {
      appendEvent(events, { type: "value_start", value: finalValue });
    }
    appendEvent(events, { type: "value_complete", value: finalValue });

    this.lastEmittedStringValueUpdate = null;
    this.valueContext = null;
    return events;
  }

  /**
   * Convert a finished value context into its JavaScript value.
   * NOTE: numbers are recognised with `Number()`, which also accepts some
   * non-JSON spellings (e.g. hexadecimal or "Infinity").
   *
   * @throws Error when a literal is neither true/false/null nor numeric.
   */
  private parseValue(
    valueContext: ValueContext
  ): null | boolean | number | string {
    switch (valueContext.type) {
      case "string": {
        return valueContext.buffer;
      }
      case "literal": {
        if (valueContext.buffer === "true") return true;
        if (valueContext.buffer === "false") return false;
        if (valueContext.buffer === "null") return null;
        if (!Number.isNaN(Number(valueContext.buffer)))
          return Number(valueContext.buffer);
        throw new Error(
          `Invalid JSON: Unexpected literal: ${valueContext.buffer}`
        );
      }
    }
  }
}
/**
 * Append events to `events`, collapsing redundant string progress:
 * - a `value_update` that directly follows a `value_start`/`value_update`
 *   only refreshes that previous event's value;
 * - a `value_complete` that directly follows a `value_update` takes its place.
 * Every other event is appended unchanged.
 */
const appendEvent = (
  events: JSONParseToken[],
  ...eventsToAppend: JSONParseToken[]
) => {
  for (const incoming of eventsToAppend) {
    const tail = events[events.length - 1];

    if (
      incoming.type === "value_update" &&
      tail &&
      (tail.type === "value_start" || tail.type === "value_update")
    ) {
      // Keep the earlier event (and its type), just carry the newer content.
      tail.value = incoming.value;
      continue;
    }

    if (
      incoming.type === "value_complete" &&
      tail &&
      tail.type === "value_update"
    ) {
      events[events.length - 1] = incoming;
      continue;
    }

    events.push(incoming);
  }
};
/** Collecting the four hex digits that follow `\u`. */
type UnicodeEscapeState = { type: "unicode_escape"; digits: string[] };

/** Decoder position: plain text, just after a backslash, or inside `\uXXXX`. */
type UnicodeStreamState =
  | { type: "unescaped" }
  | { type: "escape" }
  | UnicodeEscapeState;

/**
 * Incremental decoder for the contents of a JSON string (the text between
 * the quotes). Input may be split anywhere, including inside an escape
 * sequence or between the two halves of a surrogate pair.
 */
class UnicodeStreamParser {
  /** Single-character escapes; any other escaped character stands for itself. */
  private static readonly SIMPLE_ESCAPES: Readonly<Record<string, string>> = {
    '"': '"',
    "\\": "\\",
    "/": "/",
    b: "\b",
    f: "\f",
    n: "\n",
    r: "\r",
    t: "\t",
  };

  /** A `\u` escape must be followed by exactly four hexadecimal digits. */
  private static readonly FOUR_HEX_DIGITS = /^[0-9a-fA-F]{4}$/;

  private state: UnicodeStreamState = { type: "unescaped" };
  /** Set after a high surrogate escape while waiting for its low surrogate. */
  private lowSurrogateState?: { high: number };
  private stringOutput: string = "";

  /** True right after a backslash, i.e. the next character is escaped. */
  get isEscaping(): boolean {
    return this.state.type === "escape";
  }

  /**
   * @param onError called for malformed escapes; the default handler throws.
   */
  constructor(
    private onError: (error: Error) => void = (err) => {
      throw err;
    }
  ) {}

  /** Decode the next slice and return the whole string decoded so far. */
  write(chunk: string): string {
    this.processBuffer(chunk);
    return this.stringOutput;
  }

  /**
   * Finish decoding and return the decoded string. An escape sequence that is
   * still incomplete is discarded.
   */
  end(): string {
    if (this.state.type === "unicode_escape" && this.lowSurrogateState) {
      this.onError(new Error("Unterminated surrogate pair"));
    }
    this.state = { type: "unescaped" };
    return this.stringOutput;
  }

  /** Run the state machine over every character of `chunk`. */
  private processBuffer(chunk: string): void {
    for (const char of chunk) {
      switch (this.state.type) {
        case "unescaped":
          if (char === "\\") {
            this.state = { type: "escape" };
          } else {
            this.stringOutput += char;
          }
          break;
        case "escape":
          if (char === "u") {
            this.state = { type: "unicode_escape", digits: [] };
          } else {
            this.handleEscapeChar(char);
            this.state = { type: "unescaped" };
          }
          break;
        case "unicode_escape":
          this.state.digits.push(char);
          if (this.state.digits.length === 4) {
            this.handleUnicodeEscape(this.state);
            this.state = { type: "unescaped" };
          }
          break;
      }
    }
  }

  /** Append the character denoted by a single-character escape. */
  private handleEscapeChar(char: string): void {
    this.stringOutput += UnicodeStreamParser.SIMPLE_ESCAPES[char] ?? char;
  }

  /**
   * Decode a completed `\uXXXX` escape, pairing surrogates when needed.
   * Malformed input is reported through `onError` and produces no output.
   */
  private handleUnicodeEscape(state: UnicodeEscapeState): void {
    const hexStr = state.digits.join("");
    // Validate before parsing: parseInt stops at the first non-hex character,
    // so "12XY" would otherwise be accepted and decoded as U+0012.
    if (!UnicodeStreamParser.FOUR_HEX_DIGITS.test(hexStr)) {
      this.onError(new Error(`Invalid Unicode escape: \\u${hexStr}`));
      return;
    }
    const codeUnit = parseInt(hexStr, 16);

    if (this.lowSurrogateState) {
      // Handle low surrogate
      if (codeUnit >= 0xdc00 && codeUnit <= 0xdfff) {
        const high = this.lowSurrogateState.high;
        const codePoint =
          ((high - 0xd800) << 10) + (codeUnit - 0xdc00) + 0x10000;
        this.stringOutput += String.fromCodePoint(codePoint);
      } else {
        this.onError(new Error(`Invalid low surrogate: \\u${hexStr}`));
      }
      this.lowSurrogateState = undefined;
    } else if (codeUnit >= 0xd800 && codeUnit <= 0xdbff) {
      // High surrogate, expect low surrogate next
      this.lowSurrogateState = { high: codeUnit };
    } else if (codeUnit >= 0xdc00 && codeUnit <= 0xdfff) {
      // Lone low surrogate
      this.onError(new Error(`Lone low surrogate: \\u${hexStr}`));
    } else {
      // Regular Unicode character
      this.stringOutput += String.fromCharCode(codeUnit);
      this.lowSurrogateState = undefined;
    }
  }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import jsonpatch, { Operation } from "fast-json-patch"; | |
| import { JSONStreamParser, JSONValue } from "./json-stream-parser"; | |
/**
 * Messages sent to the consumer:
 * - "initialization": first non-null snapshot of the value.
 * - "update": JSON Patch operations turning the previous snapshot into the new one.
 * - "complete": the final value once the input is exhausted.
 */
type ServerStreamChunk =
  | { type: "initialization"; value: JSONValue }
  | { type: "update"; patch: Operation[] }
  | { type: "complete"; value: JSONValue };

/**
 * Turn stream of incomplete JSON into JSON patches.
 *
 * @param jsonChunkStream slices of one JSON document, in order.
 * @returns a generator of initialization / update / complete messages.
 */
const serverStream = function* (
  jsonChunkStream: Iterable<string>
): Generator<ServerStreamChunk> {
  const parser = new JSONStreamParser();
  let previousServerResult: JSONValue = null;

  for (const chunk of jsonChunkStream) {
    // `write()` returns nothing; the snapshot has to be read explicitly.
    parser.write(chunk);
    const serverResult = parser.getCurrentValue();

    // Nothing parsed yet (e.g. only whitespace so far).
    if (serverResult === null) {
      continue;
    }

    if (previousServerResult === null) {
      yield { type: "initialization", value: serverResult };
      previousServerResult = serverResult;
      continue;
    }

    const patch: Operation[] = jsonpatch.compare(
      previousServerResult,
      serverResult
    );
    previousServerResult = serverResult;
    // A slice that did not change the visible value produces no message.
    if (patch.length) {
      yield { type: "update", patch };
    }
  }

  const finalResult = parser.end();
  yield { type: "complete", value: finalResult };
};
/**
 * Simulate an LLM emitting `jsonString` piece by piece: yields the randomly
 * sized fragments produced by `splitStringRandomly`, in order.
 */
const createLlmJsonStream = function* (
  jsonString: string
): Generator<string> {
  const fragments = splitStringRandomly(jsonString);
  for (const fragment of fragments) {
    yield fragment;
  }
};
/**
 * Cut `str` into consecutive fragments of 1 to 4 UTF-16 code units each
 * (fewer only when less input remains). Joining the fragments gives back `str`.
 */
const splitStringRandomly = (str: string) => {
  const fragments: string[] = [];
  let offset = 0;
  while (offset < str.length) {
    const remaining = str.length - offset;
    const size = numberBetween(1, Math.min(4, remaining));
    fragments.push(str.slice(offset, offset + size));
    offset += size;
  }
  return fragments;
};
/** Random integer in the inclusive range [min, max], drawn with Math.random(). */
const numberBetween = (min: number, max: number) => {
  const span = max - min + 1;
  const offset = Math.floor(Math.random() * span);
  return offset + min;
};
// Stress-test fixture: one JSON document, held in a template literal, that is
// streamed through the parser and then compared against JSON.parse.
// It covers deep nesting, empty containers, unicode keys and values, escape
// sequences and surrogate pairs, extreme numbers, and a 100-element array.
// Because this is a template literal, each `\\` below is a single backslash
// in the resulting JSON text.
| const jsonString = `{ | |
| "🧪 _test": "🚀 Stress-test JSON parser 🚀", | |
| "nested": { | |
| "level1": { | |
| "level2": { | |
| "level3": { | |
| "level4": { | |
| "level5": { | |
| "level6": { | |
| "level7": { | |
| "level8": { | |
| "level9": { | |
| "level10": { | |
| "deep": true, | |
| "array": [ | |
| [ | |
| [ | |
| [ | |
| [1, 2, 3, {"a": "b"}] | |
| ] | |
| ] | |
| ] | |
| ], | |
| "empty": {}, | |
| "nullValue": null, | |
| "unicodeKey🌍": "unicodeValue🌍", | |
| "escapes": "\\\\\\"\\\\\\\\\\\\/\\\\b\\\\f\\\\n\\\\r\\\\t\\u00A9\\uD83D\\uDE00\\uD83D\\uDC35", | |
| "surrogatePair": "\\uD83D\\uDE0A", | |
| "mixed": [ | |
| 42, | |
| -42, | |
| 3.14159265359, | |
| -3.14159265359e+10, | |
| 1.7976931348623157e+308, | |
| -1.7976931348623157e+308, | |
| 2.2250738585072014e-308, | |
| true, | |
| false, | |
| null, | |
| "", | |
| " ", | |
| "\\u0000", | |
| "\\uFFFF", | |
| "\\uD83D\\uDE00", | |
| "\\uD83D\\uDC35", | |
| {"": ""}, | |
| {"\\uD83D\\uDE00": "\\uD83D\\uDC35"}, | |
| [], | |
| [null], | |
| [1, "2", true, false, null, {}], | |
| {"\\uD83D\\uDE00": ["\\uD83D\\uDC35", {"nested": {}}]} | |
| ], | |
| "objectWithAllTypes": { | |
| "string": "Hello\\\\nWorld\\\\u00A9", | |
| "number": 1234567890.123456789, | |
| "boolean": true, | |
| "null": null, | |
| "array": [1, "2", true, false, null, {}], | |
| "emptyObject": {}, | |
| "emptyArray": [], | |
| "scientificNotation": 1.23e-45, | |
| "negativeZero": -0, | |
| "infinityPlaceholder": "Infinity", | |
| "naNPlaceholder": "NaN" | |
| }, | |
| "trailingCommaObject": { | |
| "valid": true | |
| }, | |
| "trailingCommaArray": [1, 2, 3] | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "specialNumbers": { | |
| "maxInt": 9007199254740991, | |
| "minInt": -9007199254740991, | |
| "maxSafeInteger": 9007199254740991, | |
| "minSafeInteger": -9007199254740991, | |
| "maxFloat": 1.7976931348623157e+308, | |
| "minFloat": -1.7976931348623157e+308, | |
| "epsilon": 2.2250738585072014e-308 | |
| }, | |
| "edgeCases": { | |
| "emptyString": "", | |
| "whitespaceString": " \\t\\n\\r", | |
| "controlChars": "\\u0000\\u0001\\u0002\\u0003\\u0004\\u0005\\u0006\\u0007\\b\\t\\n\\u000B\\f\\r\\u000E\\u000F\\u0010\\u0011\\u0012\\u0013\\u0014\\u0015\\u0016\\u0017\\u0018\\u0019\\u001A\\u001B\\u001C\\u001D\\u001E\\u001F", | |
| "invalidUnicodeEscape": "\\\\uXYZ", | |
| "unclosed": { | |
| "object": "{", | |
| "array": "[", | |
| "string": "\\"unclosed" | |
| }, | |
| "circularReferencePlaceholder": "[Circular]", | |
| "commentsPlaceholder": "// This is not valid JSON, but some parsers might choke on it" | |
| }, | |
| "largeArray": [ | |
| 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, | |
| 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, | |
| 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, | |
| 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, | |
| 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100 | |
| ], | |
| "mixedWhitespace": { | |
| "tabKey\\t": "tabValue\\t", | |
| "newlineKey\\n": "newlineValue\\n", | |
| "carriageReturnKey\\r": "carriageReturnValue\\r" | |
| }, | |
| "unicodeKeys": { | |
| "😊": "smile", | |
| "❤️": "heart", | |
| "🎉": "party", | |
| "🐶": "dog", | |
| "🍣": "sushi", | |
| "🚀": "rocket" | |
| }, | |
| "nestedArrays": [ | |
| [], | |
| [[]], | |
| [[[]]], | |
| [[[[]]]], | |
| [[[[[]]]]], | |
| [[[[[{}]]]]], | |
| [[[[[{"a": "b"}]]]]] | |
| ], | |
| "finalTest": { | |
| "validJSON": true, | |
| "butDidItCrash?": false | |
| } | |
| }` | |
// End-to-end check: stream the fixture in random fragments, rebuild the value
// on the consumer side from the emitted messages, and compare the result with
// what JSON.parse produces for the full text.
const llmJsonStream = createLlmJsonStream(jsonString);

let wipValue: unknown = {};

for (const chunk of serverStream(llmJsonStream)) {
  if (chunk.type === "update") {
    // Patches are applied to the work-in-progress value.
    jsonpatch.applyPatch(wipValue, chunk.patch);
  } else {
    // "initialization" and "complete" both carry a full value.
    wipValue = chunk.value;
  }
}

expect(wipValue).toEqual(JSON.parse(jsonString));
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment