Skip to content

Instantly share code, notes, and snippets.

@nathan130200
Created January 18, 2026 22:31
Show Gist options
  • Select an option

  • Save nathan130200/66dbba9164eae46d599764994d7778a8 to your computer and use it in GitHub Desktop.

Select an option

Save nathan130200/66dbba9164eae46d599764994d7778a8 to your computer and use it in GitHub Desktop.
Basic XML parser using SAX-like events.
/// <summary>Handles the start of an element (also raised for self-closing elements).</summary>
/// <param name="name">Element name exactly as written in the tag (prefix included).</param>
/// <param name="attributes">Attributes of the element; values have entity references decoded.</param>
public delegate void StartElementHandler(string name, IReadOnlyDictionary<string, string> attributes);

/// <summary>Handles the end of an element (also raised right after the start of a self-closing element).</summary>
/// <param name="name">Element name exactly as written in the tag.</param>
public delegate void EndElementHandler(string name);

/// <summary>Handles character data: either element text or the content of a CDATA section.</summary>
/// <param name="content">The text content.</param>
public delegate void CharacterDataHandler(string content);

/// <summary>Handles a comment.</summary>
/// <param name="data">Text between <c>&lt;!--</c> and <c>--&gt;</c>, not trimmed.</param>
public delegate void CommentHandler(string data);

/// <summary>Handles a processing instruction (the XML declaration is reported this way too).</summary>
/// <param name="target">The instruction target, e.g. <c>xml</c>.</param>
/// <param name="data">Everything after the target, trimmed; empty when there is none.</param>
public delegate void ProcessingInstructionHandler(string target, string data);

/// <summary>
/// Minimal push-style (SAX-like) XML parser. Bytes are fed in arbitrary chunks through
/// <see cref="Parse"/> and the parser raises events as soon as complete markup is available.
/// </summary>
/// <remarks>
/// <para>The parser scans raw bytes for <c>&lt;</c> and <c>&gt;</c>, so the encoding must be
/// ASCII-compatible (UTF-8, ISO-8859-x, ...). When no encoding is supplied a strict UTF-8
/// encoding is used, which throws <see cref="DecoderFallbackException"/> on invalid byte sequences.</para>
/// <para>Element text is trimmed and whitespace-only text is not reported. DOCTYPE and other
/// <c>&lt;!...&gt;</c> declarations are skipped. An end tag that arrives while no element is open
/// is reported without validation.</para>
/// </remarks>
/// <param name="encoding">Encoding used to decode markup and text; defaults to strict UTF-8.</param>
public class XmlParser(Encoding? encoding = default)
{
    // Upper bound for the text between '&' and ';' that is still considered an entity
    // reference. Keeps entity decoding linear on input that contains many stray '&'.
    const int MaxEntityLength = 32;

    // Characters that separate an element/PI name from what follows it.
    const string NameTerminators = " \t\n\r";

    static readonly UTF8Encoding s_DefaultEncoding = new(false, true);

    /// <summary>Raised for every start tag and self-closing tag.</summary>
    public event StartElementHandler? OnStartElement;

    /// <summary>Raised for every end tag and right after the start of a self-closing tag.</summary>
    public event EndElementHandler? OnEndElement;

    /// <summary>Raised for non-blank element text (trimmed, entities decoded).</summary>
    public event CharacterDataHandler? OnText;

    /// <summary>Raised for every CDATA section with its raw content.</summary>
    public event CharacterDataHandler? OnCdata;

    /// <summary>Raised for every comment.</summary>
    public event CommentHandler? OnComment;

    /// <summary>Raised for every processing instruction, including the XML declaration.</summary>
    public event ProcessingInstructionHandler? OnProcessingInstruction;

    // Bytes received but not yet consumed (at most one incomplete piece of markup).
    readonly List<byte> _buffer = [];
    // Names of the currently open elements, innermost on top.
    readonly Stack<string> _elementStack = new();
    // Raw bytes of the text run being collected. Kept as bytes so that a multi-byte
    // character split across two chunks is decoded correctly on flush.
    readonly List<byte> _textBuffer = [];
    readonly Encoding _encoding = encoding ?? s_DefaultEncoding;
    bool _finished = false;

    /// <summary>Feeds the next chunk of the document to the parser.</summary>
    /// <param name="data">Array holding the chunk; may be null when there is nothing to add.</param>
    /// <param name="len">Number of bytes to take from the start of <paramref name="data"/>; values &lt;= 0 add nothing.</param>
    /// <param name="isFinal">True when this is the last chunk of the document.</param>
    /// <exception cref="XmlException">
    /// The parser already received its final chunk, markup is left unterminated at the end of the
    /// document, an end tag does not match the open element, or elements are still open at the end.
    /// </exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="len"/> exceeds the array length.</exception>
    public void Parse(byte[] data, int len, bool isFinal)
    {
        if (_finished)
            throw new XmlException("Parser finished.");

        if (data is not null && len > 0)
        {
            // Validate before touching the buffer so a bad call leaves the parser state intact.
            if (len > data.Length)
                throw new ArgumentOutOfRangeException(nameof(len), len, "Length exceeds the size of the data array.");

            _buffer.AddRange(new ArraySegment<byte>(data, 0, len));
        }

        ProcessBuffer(isFinal);

        if (isFinal)
        {
            FlushText();
            _finished = true;

            if (_elementStack.Count > 0)
                throw new XmlException("Unclosed token.");
        }
    }

    /// <summary>
    /// Consumes as much of the buffer as possible: text bytes are moved to the text buffer and
    /// every complete piece of markup is decoded and dispatched. Incomplete markup stays buffered.
    /// </summary>
    private void ProcessBuffer(bool isFinal)
    {
        int pos = 0;
        try
        {
            while (pos < _buffer.Count)
            {
                byte b = _buffer[pos];
                if (b != (byte)'<')
                {
                    // Leading whitespace of a text run is dropped early; the run is trimmed on flush anyway.
                    if (_textBuffer.Count > 0 || !IsXmlWhiteSpace(b))
                        _textBuffer.Add(b);
                    pos++;
                    continue;
                }

                FlushText();

                int tagEnd = FindTagEnd(pos);
                if (tagEnd < 0)
                {
                    if (isFinal)
                        throw new XmlException("Unclosed tag.");
                    break; // wait for the rest of the markup in a later chunk
                }

                int tagLen = tagEnd - pos + 1;
                string tag = _encoding.GetString(
                    System.Runtime.InteropServices.CollectionsMarshal.AsSpan(_buffer).Slice(pos, tagLen));
                ParseTag(tag);
                pos = tagEnd + 1;
            }
        }
        finally
        {
            // Drop everything consumed in one go instead of shifting the buffer after each tag.
            if (pos > 0)
                _buffer.RemoveRange(0, pos);
        }
    }

    static bool IsXmlWhiteSpace(byte b) => b is 0x20 or 0x09 or 0x0A or 0x0D;

    /// <summary>
    /// Returns the buffer index of the <c>&gt;</c> that terminates the markup starting at
    /// <paramref name="start"/>, or -1 when the markup is not complete yet.
    /// </summary>
    private int FindTagEnd(int start)
    {
        ReadOnlySpan<byte> markup =
            System.Runtime.InteropServices.CollectionsMarshal.AsSpan(_buffer).Slice(start);

        // Comments, CDATA sections and processing instructions may contain '>' and quotes,
        // so they end only at their own terminator.
        if (markup.StartsWith("<!--"u8))
            return FindTerminator(markup, 4, "-->"u8, start);
        if (markup.StartsWith("<![CDATA["u8))
            return FindTerminator(markup, 9, "]]>"u8, start);
        if (markup.StartsWith("<?"u8))
            return FindTerminator(markup, 2, "?>"u8, start);

        // Declarations such as DOCTYPE can carry an internal subset in [...] that holds nested markup.
        bool isDeclaration = markup.Length > 1 && markup[1] == (byte)'!';
        int bracketDepth = 0;
        byte quote = 0; // 0 = not inside a quoted value

        for (int i = 1; i < markup.Length; i++)
        {
            byte c = markup[i];
            if (quote != 0)
            {
                if (c == quote)
                    quote = 0;
            }
            else if (c == (byte)'"' || c == (byte)'\'')
            {
                quote = c;
            }
            else if (isDeclaration && c == (byte)'[')
            {
                bracketDepth++;
            }
            else if (isDeclaration && c == (byte)']')
            {
                if (bracketDepth > 0)
                    bracketDepth--;
            }
            else if (c == (byte)'>' && bracketDepth == 0)
            {
                return start + i;
            }
        }

        return -1;
    }

    // Finds `terminator` in `markup` at or after `from`; returns the buffer index of its last byte or -1.
    static int FindTerminator(ReadOnlySpan<byte> markup, int from, ReadOnlySpan<byte> terminator, int offset)
    {
        int idx = markup.Slice(from).IndexOf(terminator);
        return idx < 0 ? -1 : offset + from + idx + terminator.Length - 1;
    }

    /// <summary>Classifies one complete piece of markup (including the angle brackets) and dispatches it.</summary>
    private void ParseTag(string tag)
    {
        ReadOnlySpan<char> markup = tag.AsSpan().Trim();
        if (markup.Length < 2) return;

        // Strip '<' and '>'.
        ReadOnlySpan<char> content = markup[1..^1].Trim();

        if (content.Length > 0)
        {
            switch (content[0])
            {
                case '?':
                    ParseProcessingInstruction(content);
                    return;

                case '!':
                    if (content.StartsWith("!--".AsSpan()))
                        ParseComment(content);
                    else if (content.StartsWith("![CDATA[".AsSpan()))
                        ParseCdata(content);
                    // Anything else (DOCTYPE, ...) is a declaration, not an element: skip it.
                    return;

                case '/':
                    ParseEndTag(content);
                    return;
            }
        }

        ParseStartTag(content);
    }

    /// <summary>Parses a start or self-closing tag, updates the element stack and raises the events.</summary>
    /// <exception cref="XmlException">The tag has no element name or an invalid attribute name.</exception>
    private void ParseStartTag(ReadOnlySpan<char> content)
    {
        bool selfClosing = content.Length > 0 && content[^1] == '/';
        if (selfClosing)
            content = content[..^1].Trim();

        int nameEnd = content.IndexOfAny(NameTerminators.AsSpan());
        string tagName;
        ReadOnlySpan<char> attrString = default;

        if (nameEnd > 0)
        {
            tagName = content[..nameEnd].ToString();
            attrString = content[nameEnd..].Trim();
        }
        else
        {
            tagName = content.ToString();
        }

        if (tagName.Length == 0)
            throw new XmlException("Missing element name.");

        var attrs = ParseAttributes(attrString);

        _elementStack.Push(tagName);
        OnStartElement?.Invoke(tagName, attrs);

        if (selfClosing)
        {
            _elementStack.Pop();
            OnEndElement?.Invoke(tagName);
        }
    }

    /// <summary>Parses an end tag and checks it against the innermost open element, if any.</summary>
    /// <exception cref="XmlException">The name differs from the innermost open element.</exception>
    private void ParseEndTag(ReadOnlySpan<char> content)
    {
        string tagName = content[1..].Trim().ToString();

        // With no open element the end tag is passed through unchecked.
        if (_elementStack.Count > 0)
        {
            string expected = _elementStack.Pop();
            if (expected != tagName)
                throw new XmlException($"Tag mismatch: expected '{expected}', got '{tagName}'");
        }

        OnEndElement?.Invoke(tagName);
    }

    /// <summary>
    /// Parses the attribute part of a start tag. Parsing is lenient: a name without a quoted value
    /// gets an empty value and a repeated name keeps the last value.
    /// </summary>
    /// <exception cref="XmlException">An attribute name is not a valid XML name.</exception>
    static Dictionary<string, string> ParseAttributes(ReadOnlySpan<char> attrString)
    {
        var attrs = new Dictionary<string, string>();
        int length = attrString.Length;
        int i = 0;

        while (i < length)
        {
            while (i < length && char.IsWhiteSpace(attrString[i]))
                i++;
            if (i >= length) break;

            // Name: everything up to whitespace or '='.
            int nameStart = i;
            while (i < length && !char.IsWhiteSpace(attrString[i]) && attrString[i] != '=')
                i++;
            ReadOnlySpan<char> name = attrString[nameStart..i];

            // Separator: whitespace and '='.
            while (i < length && (char.IsWhiteSpace(attrString[i]) || attrString[i] == '='))
                i++;

            // Value: only quoted values are recognised.
            ReadOnlySpan<char> value = default;
            if (i < length && (attrString[i] == '"' || attrString[i] == '\''))
            {
                char quote = attrString[i];
                int valueStart = ++i;
                while (i < length && attrString[i] != quote)
                    i++;
                value = attrString[valueStart..i];
                if (i < length)
                    i++; // closing quote
            }

            if (name.Length > 0)
                attrs[XmlConvert.VerifyName(name.ToString())] = DecodeEntities(value.ToString());
        }

        return attrs;
    }

    /// <summary>Raises <see cref="OnComment"/> with the text between <c>!--</c> and <c>--</c>.</summary>
    private void ParseComment(ReadOnlySpan<char> content)
    {
        // Length check guards against the degenerate "<!-->" where prefix and suffix overlap.
        if (content.Length >= 5 && content.StartsWith("!--".AsSpan()) && content.EndsWith("--".AsSpan()))
            OnComment?.Invoke(content[3..^2].ToString());
    }

    /// <summary>Raises <see cref="OnCdata"/> with the text between <c>![CDATA[</c> and <c>]]</c>.</summary>
    private void ParseCdata(ReadOnlySpan<char> content)
    {
        if (content.Length >= 10 && content.StartsWith("![CDATA[".AsSpan()) && content.EndsWith("]]".AsSpan()))
            OnCdata?.Invoke(content[8..^2].ToString());
    }

    /// <summary>Splits a processing instruction into target and data and raises the event.</summary>
    private void ParseProcessingInstruction(ReadOnlySpan<char> content)
    {
        content = content[1..].Trim(); // leading '?'
        if (content.Length > 0 && content[^1] == '?')
            content = content[..^1].Trim();

        // The target ends at the first whitespace of any kind, not only at a space.
        int targetEnd = content.IndexOfAny(NameTerminators.AsSpan());
        string target, data;

        if (targetEnd > 0)
        {
            target = content[..targetEnd].ToString();
            data = content[targetEnd..].Trim().ToString();
        }
        else
        {
            target = content.ToString();
            data = "";
        }

        OnProcessingInstruction?.Invoke(target, data);
    }

    /// <summary>Decodes the collected text bytes and raises <see cref="OnText"/> unless the text is blank.</summary>
    private void FlushText()
    {
        if (_textBuffer.Count == 0)
            return;

        string raw = _encoding.GetString(
            System.Runtime.InteropServices.CollectionsMarshal.AsSpan(_textBuffer));
        _textBuffer.Clear();

        string text = DecodeEntities(raw.Trim());
        if (!string.IsNullOrWhiteSpace(text))
            OnText?.Invoke(text);
    }

    /// <summary>
    /// Replaces the five predefined entities and numeric character references in a single pass,
    /// so the output of one replacement is never decoded again. Unknown references stay as written.
    /// </summary>
    static string DecodeEntities(string text)
    {
        int amp = text.IndexOf('&');
        if (amp < 0)
            return text;

        var result = new StringBuilder(text.Length);
        int copied = 0;

        while (amp >= 0)
        {
            result.Append(text, copied, amp - copied);

            int bodyStart = amp + 1;
            int window = Math.Min(MaxEntityLength + 1, text.Length - bodyStart);
            int bodyLength = text.AsSpan(bodyStart, window).IndexOf(';');
            string? replacement = bodyLength > 0
                ? ResolveEntity(text.AsSpan(bodyStart, bodyLength))
                : null;

            if (replacement is null)
            {
                // Not a reference we know: keep the '&' literally and continue right after it.
                result.Append('&');
                copied = bodyStart;
            }
            else
            {
                result.Append(replacement);
                copied = bodyStart + bodyLength + 1;
            }

            amp = text.IndexOf('&', copied);
        }

        result.Append(text, copied, text.Length - copied);
        return result.ToString();
    }

    // Maps an entity body (the part between '&' and ';') to its replacement, or null when unknown.
    static string? ResolveEntity(ReadOnlySpan<char> body)
    {
        if (body.Length > 1 && body[0] == '#')
        {
            bool hex = body[1] == 'x';
            ReadOnlySpan<char> digits = hex ? body[2..] : body[1..];
            var style = hex
                ? System.Globalization.NumberStyles.AllowHexSpecifier
                : System.Globalization.NumberStyles.None;

            bool parsed = int.TryParse(
                digits, style, System.Globalization.CultureInfo.InvariantCulture, out int code);

            // Reject NUL, surrogates and values beyond U+10FFFF.
            return parsed && code > 0 && Rune.IsValid(code) ? char.ConvertFromUtf32(code) : null;
        }

        return body switch
        {
            "lt" => "<",
            "gt" => ">",
            "amp" => "&",
            "quot" => "\"",
            "apos" => "'",
            _ => null,
        };
    }

    /// <summary>Discards all buffered input and open elements so the instance can parse a new document.</summary>
    public void Reset()
    {
        _buffer.Clear();
        _elementStack.Clear();
        _textBuffer.Clear();
        _finished = false;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment