Created
January 18, 2026 22:31
-
-
Save nathan130200/66dbba9164eae46d599764994d7778a8 to your computer and use it in GitHub Desktop.
Basic XML parser using SAX-like events.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>Handles the start of an element (also raised for empty-element tags such as <c>&lt;a/&gt;</c>).</summary>
/// <param name="name">Element name exactly as written in the tag (no namespace processing).</param>
/// <param name="attributes">Attributes of the tag with entity references already decoded.</param>
public delegate void StartElementHandler(string name, IReadOnlyDictionary<string, string> attributes);

/// <summary>Handles the end of an element (also raised right after the start event of an empty-element tag).</summary>
/// <param name="name">Element name exactly as written in the tag.</param>
public delegate void EndElementHandler(string name);

/// <summary>Handles character data: either decoded text content or the raw body of a CDATA section.</summary>
/// <param name="content">The character data.</param>
public delegate void CharacterDataHandler(string content);

/// <summary>Handles a comment.</summary>
/// <param name="data">Everything between <c>&lt;!--</c> and <c>--&gt;</c>, untrimmed.</param>
public delegate void CommentHandler(string data);

/// <summary>Handles a processing instruction such as <c>&lt;?xml version="1.0"?&gt;</c>.</summary>
/// <param name="target">The instruction target (first token after <c>&lt;?</c>).</param>
/// <param name="data">The remainder of the instruction, trimmed; empty when there is none.</param>
public delegate void ProcessingInstructionHandler(string target, string data);

/// <summary>
/// Minimal incremental (push) XML parser that reports markup through SAX-like events.
/// </summary>
/// <remarks>
/// Input is supplied in arbitrary byte chunks through <see cref="Parse"/>. Markup boundaries are
/// located by scanning for the ASCII bytes '&lt;' and '&gt;', so the configured encoding must be
/// ASCII-compatible (UTF-8, Latin-1, ...). Text content is trimmed and whitespace-only text is not
/// reported. DOCTYPE and other <c>&lt;!...&gt;</c> declarations are skipped; no namespace or DTD
/// processing is performed.
/// </remarks>
/// <param name="encoding">Encoding used to decode tags and text; strict UTF-8 when omitted.</param>
public class XmlParser(Encoding? encoding = default)
{
    // Upper bound on the characters between '&' and ';' that are considered an entity reference.
    // Keeps decoding linear on text that contains many stray ampersands.
    const int MaxEntityLength = 16;

    // No BOM, throw on invalid byte sequences.
    static readonly UTF8Encoding s_DefaultEncoding = new(false, true);

    /// <summary>Raised for every start tag and empty-element tag.</summary>
    public event StartElementHandler? OnStartElement;

    /// <summary>Raised for every end tag and empty-element tag.</summary>
    public event EndElementHandler? OnEndElement;

    /// <summary>Raised for non-blank text content (trimmed, entities decoded).</summary>
    public event CharacterDataHandler? OnText;

    /// <summary>Raised for CDATA sections with their body passed through verbatim.</summary>
    public event CharacterDataHandler? OnCdata;

    /// <summary>Raised for comments.</summary>
    public event CommentHandler? OnComment;

    /// <summary>Raised for processing instructions, including the XML declaration.</summary>
    public event ProcessingInstructionHandler? OnProcessingInstruction;

    // Bytes received but not consumed yet (at most one incomplete piece of markup after a Parse call).
    readonly List<byte> _buffer = [];

    // Names of the currently open elements, innermost on top.
    readonly Stack<string> _elementStack = new();

    // Raw bytes of the pending text run. Kept as bytes so multi-byte characters that straddle
    // chunk boundaries are decoded correctly once the run is complete.
    readonly List<byte> _textBytes = [];

    readonly Encoding _encoding = encoding ?? s_DefaultEncoding;

    bool _finished;

    /// <summary>Feeds the next chunk of the document to the parser and raises events for all complete markup.</summary>
    /// <param name="data">Buffer holding the chunk; may be null when <paramref name="len"/> is 0.</param>
    /// <param name="len">Number of bytes to consume from the start of <paramref name="data"/>.</param>
    /// <param name="isFinal">True when no more data follows; pending text is flushed and the document must be complete.</param>
    /// <exception cref="XmlException">
    /// The parser already received its final chunk, end tags do not match, or the document ends
    /// inside a tag or with unclosed elements.
    /// </exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="len"/> exceeds the length of <paramref name="data"/>.</exception>
    public void Parse(byte[] data, int len, bool isFinal)
    {
        if (_finished)
        {
            throw new XmlException("Parser finished.");
        }

        if (data != null && len > 0)
        {
            if (len > data.Length)
            {
                throw new ArgumentOutOfRangeException(nameof(len), len, "Length exceeds the size of the data buffer.");
            }

            _buffer.AddRange(new ArraySegment<byte>(data, 0, len));
        }

        ProcessBuffer(isFinal);

        if (isFinal)
        {
            FlushText();
            _finished = true;

            if (_elementStack.Count > 0)
            {
                throw new XmlException("Unclosed token.");
            }
        }
    }

    /// <summary>Discards all buffered input and parser state so the instance can parse a new document.</summary>
    public void Reset()
    {
        _buffer.Clear();
        _elementStack.Clear();
        _textBytes.Clear();
        _finished = false;
    }

    // Consumes as much of the buffer as possible: text runs are moved to the text buffer, complete
    // markup is dispatched. An incomplete trailing tag stays buffered unless this is the final chunk.
    private void ProcessBuffer(bool isFinal)
    {
        int pos = 0;

        try
        {
            while (pos < _buffer.Count)
            {
                if (_buffer[pos] != (byte)'<')
                {
                    // Text run: everything up to the next '<' (or the end of the buffered data).
                    int next = _buffer.IndexOf((byte)'<', pos);
                    int end = next < 0 ? _buffer.Count : next;
                    _textBytes.AddRange(CopyBytes(pos, end - pos));
                    pos = end;
                    continue;
                }

                FlushText();

                int tagEnd = FindTagEnd(pos);

                if (tagEnd == -1)
                {
                    if (!isFinal)
                    {
                        break; // wait for the rest of the tag
                    }

                    throw new XmlException("Unclosed tag.");
                }

                string tag = _encoding.GetString(CopyBytes(pos, tagEnd - pos + 1));
                pos = tagEnd + 1;
                ParseTag(tag);
            }
        }
        finally
        {
            // Drop consumed bytes once per call instead of after every tag. The clamp covers a
            // handler that called Reset() while we were iterating.
            int consumed = Math.Min(pos, _buffer.Count);

            if (consumed > 0)
            {
                _buffer.RemoveRange(0, consumed);
            }
        }
    }

    // Returns a copy of count buffered bytes starting at start.
    private byte[] CopyBytes(int start, int count)
    {
        var bytes = new byte[count];
        _buffer.CopyTo(start, bytes, 0, count);
        return bytes;
    }

    // Returns the index of the '>' that closes the markup starting at start, or -1 when the
    // buffer does not contain the complete markup yet.
    private int FindTagEnd(int start)
    {
        // Comments, CDATA sections and processing instructions may contain '>' and unbalanced
        // quotes, so they are delimited by their dedicated terminators.
        if (HasPrefixAt(start, "<!--"u8))
        {
            return FindTerminator(start + 4, "-->"u8);
        }

        if (HasPrefixAt(start, "<![CDATA["u8))
        {
            return FindTerminator(start + 9, "]]>"u8);
        }

        if (HasPrefixAt(start, "<?"u8))
        {
            return FindTerminator(start + 2, "?>"u8);
        }

        // Regular tag: the first '>' outside of a quoted attribute value ends it.
        byte quote = 0;

        for (int i = start + 1; i < _buffer.Count; i++)
        {
            byte b = _buffer[i];

            if (quote != 0)
            {
                if (b == quote)
                {
                    quote = 0;
                }
            }
            else if (b == (byte)'"' || b == (byte)'\'')
            {
                quote = b;
            }
            else if (b == (byte)'>')
            {
                return i;
            }
        }

        return -1;
    }

    // True when the buffered bytes at start begin with prefix.
    private bool HasPrefixAt(int start, ReadOnlySpan<byte> prefix)
    {
        if (_buffer.Count - start < prefix.Length)
        {
            return false;
        }

        for (int i = 0; i < prefix.Length; i++)
        {
            if (_buffer[start + i] != prefix[i])
            {
                return false;
            }
        }

        return true;
    }

    // Returns the index of the last byte of the first occurrence of terminator at or after from, or -1.
    private int FindTerminator(int from, ReadOnlySpan<byte> terminator)
    {
        for (int i = from; i <= _buffer.Count - terminator.Length; i++)
        {
            if (HasPrefixAt(i, terminator))
            {
                return i + terminator.Length - 1;
            }
        }

        return -1;
    }

    // Classifies one complete piece of markup ("<...>") and dispatches it to the matching parser.
    private void ParseTag(string tag)
    {
        tag = tag.Trim();

        if (tag.Length < 2)
        {
            return;
        }

        // Strip the surrounding '<' and '>'.
        string content = tag.Substring(1, tag.Length - 2).Trim();

        if (content.StartsWith('?'))
        {
            ParseProcessingInstruction(content);
        }
        else if (content.StartsWith("!--", StringComparison.Ordinal))
        {
            ParseComment(content);
        }
        else if (content.StartsWith("![CDATA[", StringComparison.Ordinal))
        {
            ParseCdata(content);
        }
        else if (content.StartsWith('!'))
        {
            // DOCTYPE and other declarations: there is no event for them and they must not be
            // mistaken for elements, so they are skipped.
        }
        else if (content.StartsWith('/'))
        {
            ParseEndTag(content);
        }
        else
        {
            ParseStartTag(content);
        }
    }

    // Parses "name attr='v' ..." (optionally ending in '/') and raises the start/end events.
    private void ParseStartTag(ReadOnlySpan<char> content)
    {
        bool selfClosing = content.Length > 0 && content[^1] == '/';

        if (selfClosing)
        {
            content = content.Slice(0, content.Length - 1).Trim();
        }

        int spaceIdx = content.IndexOfAny([' ', '\t', '\n', '\r']);
        string tagName;
        ReadOnlySpan<char> attrString = "";

        if (spaceIdx > 0)
        {
            tagName = content.Slice(0, spaceIdx).ToString();
            attrString = content.Slice(spaceIdx).Trim();
        }
        else
        {
            tagName = content.ToString();
        }

        var attrs = ParseAttributes(attrString);

        _elementStack.Push(tagName);
        OnStartElement?.Invoke(tagName, attrs);

        if (selfClosing)
        {
            _elementStack.Pop();
            OnEndElement?.Invoke(tagName);
        }
    }

    // Parses "/name", checks it against the innermost open element and raises the end event.
    // An end tag that arrives while no element is open is reported without validation.
    private void ParseEndTag(string content)
    {
        string tagName = content.Substring(1).Trim();

        if (_elementStack.Count > 0)
        {
            string expected = _elementStack.Pop();

            if (expected != tagName)
            {
                throw new XmlException($"Tag mismatch: expected '{expected}', got '{tagName}'");
            }
        }

        OnEndElement?.Invoke(tagName);
    }

    // Parses a sequence of name="value" / name='value' pairs. Values are entity-decoded; a name
    // without a quoted value maps to the empty string; a repeated name keeps its last value.
    // Throws XmlException (via XmlConvert.VerifyName) for names that are not valid XML names.
    static Dictionary<string, string> ParseAttributes(ReadOnlySpan<char> attrString)
    {
        var attrs = new Dictionary<string, string>();
        int length = attrString.Length;
        int i = 0;

        while (true)
        {
            while (i < length && char.IsWhiteSpace(attrString[i]))
            {
                i++;
            }

            if (i >= length)
            {
                break;
            }

            // Attribute name: up to whitespace or '='.
            int nameStart = i;

            while (i < length && !char.IsWhiteSpace(attrString[i]) && attrString[i] != '=')
            {
                i++;
            }

            ReadOnlySpan<char> name = attrString.Slice(nameStart, i - nameStart);

            // Skip the '=' together with any whitespace around it.
            while (i < length && (char.IsWhiteSpace(attrString[i]) || attrString[i] == '='))
            {
                i++;
            }

            // Quoted attribute value; a missing closing quote extends the value to the end.
            ReadOnlySpan<char> value = default;

            if (i < length && (attrString[i] == '"' || attrString[i] == '\''))
            {
                char quote = attrString[i];
                int valueStart = ++i;

                while (i < length && attrString[i] != quote)
                {
                    i++;
                }

                value = attrString.Slice(valueStart, i - valueStart);

                if (i < length)
                {
                    i++; // closing quote
                }
            }

            if (name.Length > 0)
            {
                attrs[XmlConvert.VerifyName(name.ToString())] = DecodeEntities(value.ToString());
            }
        }

        return attrs;
    }

    // Raises OnComment with the text between "!--" and "--".
    private void ParseComment(string content)
    {
        // The length guard rejects degenerate markup such as "<!-->" where the delimiters overlap.
        if (content.Length >= 5
            && content.StartsWith("!--", StringComparison.Ordinal)
            && content.EndsWith("--", StringComparison.Ordinal))
        {
            string comment = content.Substring(3, content.Length - 5);
            OnComment?.Invoke(comment);
        }
    }

    // Raises OnCdata with the text between "![CDATA[" and "]]" (no entity decoding).
    private void ParseCdata(ReadOnlySpan<char> content)
    {
        if (content.Length >= 10 && content.StartsWith("![CDATA[") && content.EndsWith("]]"))
        {
            string data = content.Slice(8, content.Length - 10).ToString();
            OnCdata?.Invoke(data);
        }
    }

    // Splits "?target data?" into target and data and raises OnProcessingInstruction.
    private void ParseProcessingInstruction(ReadOnlySpan<char> content)
    {
        content = content.Slice(1).Trim(); // leading '?'

        if (content.Length > 0 && content[^1] == '?')
        {
            content = content.Slice(0, content.Length - 1).Trim();
        }

        // Same separator set as start tags, so "<?xml\tversion=...?>" is split correctly.
        int spaceIdx = content.IndexOfAny([' ', '\t', '\n', '\r']);
        string target, data;

        if (spaceIdx > 0)
        {
            target = content.Slice(0, spaceIdx).ToString();
            data = content.Slice(spaceIdx).Trim().ToString();
        }
        else
        {
            target = content.ToString();
            data = "";
        }

        OnProcessingInstruction?.Invoke(target, data);
    }

    // Decodes the pending text run and raises OnText unless it is blank.
    private void FlushText()
    {
        if (_textBytes.Count == 0)
        {
            return;
        }

        byte[] raw = _textBytes.ToArray();
        _textBytes.Clear(); // cleared first so a throwing handler cannot cause the run to be reported twice

        string text = DecodeEntities(_encoding.GetString(raw).Trim());

        if (!string.IsNullOrWhiteSpace(text))
        {
            OnText?.Invoke(text);
        }
    }

    // Replaces the predefined entities (&lt; &gt; &amp; &quot; &apos;) and numeric character
    // references (&#NN; / &#xHH;) in a single left-to-right pass, so the result of one
    // replacement is never decoded again ("&amp;quot;" yields "&quot;"). Unknown or malformed
    // references are kept verbatim.
    static string DecodeEntities(string text)
    {
        int amp = text.IndexOf('&');

        if (amp < 0)
        {
            return text;
        }

        var result = new StringBuilder(text.Length);
        int pos = 0;

        while (amp >= 0)
        {
            result.Append(text, pos, amp - pos);

            int window = Math.Min(MaxEntityLength + 1, text.Length - amp - 1);
            int semi = text.IndexOf(';', amp + 1, window);
            string? replacement = semi < 0 ? null : ResolveEntity(text.Substring(amp + 1, semi - amp - 1));

            if (replacement is null)
            {
                result.Append('&');
                pos = amp + 1;
            }
            else
            {
                result.Append(replacement);
                pos = semi + 1;
            }

            amp = text.IndexOf('&', pos);
        }

        result.Append(text, pos, text.Length - pos);
        return result.ToString();
    }

    // Maps an entity name (the part between '&' and ';') to its replacement text, or null when unknown.
    static string? ResolveEntity(string name) => name switch
    {
        "lt" => "<",
        "gt" => ">",
        "amp" => "&",
        "quot" => "\"",
        "apos" => "'",
        _ => DecodeCharacterReference(name),
    };

    // Decodes "#NN" / "#xHH" to the referenced character; null when it is not a valid code point.
    static string? DecodeCharacterReference(string name)
    {
        if (name.Length < 2 || name[0] != '#')
        {
            return null;
        }

        bool hex = name[1] == 'x';
        string digits = name.Substring(hex ? 2 : 1);
        var style = hex
            ? System.Globalization.NumberStyles.AllowHexSpecifier
            : System.Globalization.NumberStyles.None;

        if (!int.TryParse(digits, style, System.Globalization.CultureInfo.InvariantCulture, out int code))
        {
            return null;
        }

        // Reject NUL, values beyond Unicode and lone surrogates.
        bool valid = code > 0 && code <= 0x10FFFF && (code < 0xD800 || code > 0xDFFF);
        return valid ? char.ConvertFromUtf32(code) : null;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment