Last active
March 16, 2026 06:54
-
-
Save hdf/6ecfa4754ddce29d8764ea67b4a98e62 to your computer and use it in GitHub Desktop.
Small C# class that tokenizes a WHERE-condition string. It may later be extended to parse the tokens into a predicate.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| using System.Text.Json; | |
| using System.Text.Json.Serialization; | |
| using System.Text.Encodings.Web; | |
| using System.Text; | |
/// <summary>
/// System.Text.Json source-generation context that registers the
/// <c>List&lt;Dictionary&lt;string, string&gt;&gt;</c> shape. An instance of this
/// context is passed to <c>JsonSerializer.Serialize</c> by
/// <c>Program.Tokenizer.SerializeTokens</c>.
/// </summary>
[JsonSerializable(typeof(List<Dictionary<string, string>>))]
internal partial class SourceGenerationContext : JsonSerializerContext
{
}
| public class Program | |
| { | |
| public static class Tokenizer | |
| { | |
/// <summary>
/// Kinds of tokens produced by the tokenizer. The numeric values are the
/// declaration-order ordinals, written out explicitly.
/// </summary>
public enum TokenType
{
    /// <summary>Opening parenthesis, <c>(</c>.</summary>
    ParenthesisOpen = 0,

    /// <summary>Closing parenthesis, <c>)</c>.</summary>
    ParenthesisClose = 1,

    /// <summary>Opening square bracket, <c>[</c>.</summary>
    BracketOpen = 2,

    /// <summary>Closing square bracket, <c>]</c>.</summary>
    BracketClose = 3,

    /// <summary>Apostrophe that opens a quoted value.</summary>
    QuoteBegin = 4,

    /// <summary>Apostrophe that closes a quoted value.</summary>
    QuoteEnd = 5,

    /// <summary>A variable name; the token carries the name as its text.</summary>
    VariableName = 6,

    /// <summary>A constant value; the token carries the raw text of the value.</summary>
    ConstantValue = 7,

    /// <summary>The <c>equal</c> keyword.</summary>
    Equal = 8,

    /// <summary>The <c>notequal</c> keyword.</summary>
    NotEqual = 9,

    /// <summary>The <c>greaterthan</c> keyword.</summary>
    GreaterThan = 10,

    /// <summary>The <c>lessthan</c> keyword.</summary>
    LessThan = 11,

    /// <summary>The <c>greaterthanorequal</c> keyword.</summary>
    GreaterThanOrEqual = 12,

    /// <summary>The <c>lessthanorequal</c> keyword.</summary>
    LessThanOrEqual = 13,

    /// <summary>The <c>in</c> keyword.</summary>
    In = 14,

    /// <summary>The <c>notin</c> keyword.</summary>
    NotIn = 15,

    /// <summary>The <c>and</c> keyword.</summary>
    And = 16,

    /// <summary>The <c>or</c> keyword.</summary>
    Or = 17
}
| #region Private Constants | |
/// <summary>
/// Sequence rules: for every token type, the token types that are allowed to
/// come immediately after it. Consulted by AddTokenToResultWithValidation.
/// </summary>
private static readonly Dictionary<TokenType, TokenType[]> AllowedNextTokens = new()
{
    [TokenType.ParenthesisOpen] = new[] { TokenType.VariableName, TokenType.ParenthesisOpen },
    [TokenType.ParenthesisClose] = new[] { TokenType.And, TokenType.Or, TokenType.ParenthesisClose },
    [TokenType.BracketOpen] = new[] { TokenType.QuoteBegin, TokenType.ConstantValue },
    [TokenType.BracketClose] = new[] { TokenType.And, TokenType.Or, TokenType.ParenthesisClose },
    [TokenType.QuoteBegin] = new[] { TokenType.ConstantValue },
    [TokenType.QuoteEnd] = new[] { TokenType.And, TokenType.Or, TokenType.ParenthesisClose },
    [TokenType.VariableName] = new[]
    {
        TokenType.Equal, TokenType.NotEqual, TokenType.GreaterThan, TokenType.LessThan,
        TokenType.GreaterThanOrEqual, TokenType.LessThanOrEqual, TokenType.In, TokenType.NotIn
    },
    [TokenType.ConstantValue] = new[]
    {
        TokenType.And, TokenType.Or, TokenType.QuoteEnd, TokenType.BracketClose, TokenType.ParenthesisClose
    },
    [TokenType.Equal] = new[] { TokenType.QuoteBegin, TokenType.VariableName, TokenType.ConstantValue },
    [TokenType.NotEqual] = new[] { TokenType.QuoteBegin, TokenType.VariableName, TokenType.ConstantValue },
    [TokenType.GreaterThan] = new[] { TokenType.QuoteBegin, TokenType.VariableName, TokenType.ConstantValue },
    [TokenType.LessThan] = new[] { TokenType.QuoteBegin, TokenType.VariableName, TokenType.ConstantValue },
    [TokenType.GreaterThanOrEqual] = new[] { TokenType.QuoteBegin, TokenType.VariableName, TokenType.ConstantValue },
    [TokenType.LessThanOrEqual] = new[] { TokenType.QuoteBegin, TokenType.VariableName, TokenType.ConstantValue },
    [TokenType.In] = new[] { TokenType.BracketOpen },
    [TokenType.NotIn] = new[] { TokenType.BracketOpen },
    [TokenType.And] = new[] { TokenType.ParenthesisOpen, TokenType.VariableName },
    [TokenType.Or] = new[] { TokenType.ParenthesisOpen, TokenType.VariableName }
};

/// <summary>
/// Token types after which Tokenize reads the next bare word as a variable name.
/// </summary>
private static readonly HashSet<TokenType> InitialTokens = new()
{
    TokenType.ParenthesisOpen,
    TokenType.And,
    TokenType.Or
};

/// <summary>
/// Comparison and membership operators. After one of these, Tokenize reads the
/// next bare word as a number or <c>null</c>.
/// </summary>
private static readonly HashSet<TokenType> OperatorTokens = new()
{
    TokenType.Equal,
    TokenType.NotEqual,
    TokenType.GreaterThan,
    TokenType.LessThan,
    TokenType.GreaterThanOrEqual,
    TokenType.LessThanOrEqual,
    TokenType.In,
    TokenType.NotIn
};

/// <summary>
/// Token types after which Tokenize reads the next bare word as a keyword and
/// looks it up in <see cref="TokenMap"/>. And/Or are also members of
/// <see cref="InitialTokens"/>, which Tokenize tests first.
/// </summary>
private static readonly HashSet<TokenType> BeforeOperatorTokens = new()
{
    TokenType.VariableName,
    TokenType.ConstantValue,
    TokenType.QuoteEnd,
    TokenType.BracketClose,
    TokenType.ParenthesisClose,
    TokenType.And,
    TokenType.Or
};

/// <summary>Lower-case keyword text mapped to its token type.</summary>
private static readonly Dictionary<string, TokenType> TokenMap = new()
{
    ["equal"] = TokenType.Equal,
    ["notequal"] = TokenType.NotEqual,
    ["greaterthan"] = TokenType.GreaterThan,
    ["lessthan"] = TokenType.LessThan,
    ["greaterthanorequal"] = TokenType.GreaterThanOrEqual,
    ["lessthanorequal"] = TokenType.LessThanOrEqual,
    ["in"] = TokenType.In,
    ["notin"] = TokenType.NotIn,
    ["and"] = TokenType.And,
    ["or"] = TokenType.Or
};

/// <summary>
/// Text that TokensToString emits for each token type that carries no text of
/// its own. Keywords are padded with one space on each side.
/// </summary>
private static readonly Dictionary<TokenType, string> TokenToStringMap = new()
{
    [TokenType.ParenthesisOpen] = "(",
    [TokenType.ParenthesisClose] = ")",
    [TokenType.BracketOpen] = "[",
    [TokenType.BracketClose] = "]",
    [TokenType.QuoteBegin] = "'",
    [TokenType.QuoteEnd] = "'",
    [TokenType.Equal] = " equal ",
    [TokenType.NotEqual] = " notequal ",
    [TokenType.GreaterThan] = " greaterthan ",
    [TokenType.LessThan] = " lessthan ",
    [TokenType.GreaterThanOrEqual] = " greaterthanorequal ",
    [TokenType.LessThanOrEqual] = " lessthanorequal ",
    [TokenType.In] = " in ",
    [TokenType.NotIn] = " notin ",
    [TokenType.And] = " and ",
    [TokenType.Or] = " or "
};
| #endregion | |
| #region Private Methods | |
/// <summary>
/// Checks whether <paramref name="token"/> is an identifier made of ASCII
/// letters, ASCII digits and underscores that does not start with a digit.
/// </summary>
/// <param name="token">Candidate name; may be null or empty.</param>
/// <returns>
/// <c>true</c> when the name is well formed; <c>false</c> for null, empty, or
/// any name containing another character.
/// </returns>
private static bool IsValidVariableName(string token)
{
    static bool IsLetterOrUnderscore(char ch) =>
        ch is (>= 'a' and <= 'z') or (>= 'A' and <= 'Z') or '_';

    static bool IsAsciiDigit(char ch) => ch is >= '0' and <= '9';

    if (string.IsNullOrEmpty(token) || !IsLetterOrUnderscore(token[0]))
    {
        return false;
    }

    // Digits are additionally permitted from the second character onwards.
    foreach (char ch in token.AsSpan(1))
    {
        if (!IsLetterOrUnderscore(ch) && !IsAsciiDigit(ch))
        {
            return false;
        }
    }

    return true;
}
/// <summary>
/// Searches <paramref name="str"/> from <paramref name="startIndex"/> for the
/// first occurrence of <paramref name="sectionEndStr"/> that is not inside an
/// apostrophe-quoted section.
/// </summary>
/// <param name="str">Text to scan.</param>
/// <param name="startIndex">Index at which scanning starts.</param>
/// <param name="sectionEndStr">Terminator to look for; defaults to <c>]</c>.</param>
/// <returns>Index of the terminator, or -1 when it does not occur outside quotes.</returns>
/// <exception cref="ArgumentException">The text ends while a quoted section is still open.</exception>
private static int SkipOverQuoted(string str, int startIndex, string sectionEndStr = "]")
{
    bool insideLiteral = false;
    int pos = startIndex;

    while (pos < str.Length)
    {
        if (str[pos] == '\'')
        {
            // Two consecutive apostrophes inside a literal are an escaped quote, not a terminator.
            bool escapedQuote = insideLiteral && pos + 1 < str.Length && str[pos + 1] == '\'';
            if (escapedQuote)
            {
                pos += 2;
            }
            else
            {
                insideLiteral = !insideLiteral;
                pos++;
            }

            continue;
        }

        if (!insideLiteral && pos + sectionEndStr.Length <= str.Length &&
            string.CompareOrdinal(str, pos, sectionEndStr, 0, sectionEndStr.Length) == 0)
        {
            return pos;
        }

        pos++;
    }

    if (insideLiteral)
    {
        throw new ArgumentException("Unmatched quote");
    }

    return -1;
}
/// <summary>
/// Finds the apostrophe that closes a quoted value, scanning from
/// <paramref name="startIndex"/>. A pair of adjacent apostrophes is treated as
/// an escaped quote and stepped over.
/// </summary>
/// <param name="str">Text to scan.</param>
/// <param name="startIndex">Index at which scanning starts.</param>
/// <returns>Index of the closing apostrophe.</returns>
/// <exception cref="ArgumentException">No closing apostrophe was found.</exception>
private static int FindQuoteEnd(string str, int startIndex)
{
    int pos = startIndex;
    while (pos < str.Length)
    {
        bool isQuote = str[pos] == '\'';
        // Two consecutive apostrophes are an escaped quote within the string literal.
        bool isEscapedPair = isQuote && pos + 1 < str.Length && str[pos + 1] == '\'';

        if (isQuote && !isEscapedPair)
        {
            return pos;
        }

        pos += isEscapedPair ? 2 : 1;
    }

    throw new ArgumentException("Unmatched quote");
}
/// <summary>
/// Appends a token to <paramref name="result"/> after checking that it may
/// legally appear at that position.
/// </summary>
/// <remarks>
/// The first token must be an opening parenthesis or a variable name, the same
/// set that AllowedNextTokens permits after an opening parenthesis, "and" or
/// "or". Every later token must be listed in AllowedNextTokens for the token
/// that precedes it.
/// </remarks>
/// <param name="result">Token list being built; it is modified in place.</param>
/// <param name="tokenType">Type of the token to append.</param>
/// <param name="tokenValue">Text carried by the token; empty for tokens without text.</param>
/// <returns><paramref name="tokenType"/>, so the caller can record it as the latest token.</returns>
/// <exception cref="ArgumentException">The token is not allowed at this position.</exception>
private static TokenType AddTokenToResultWithValidation(List<Tuple<TokenType, string>> result, TokenType tokenType, string tokenValue = "")
{
    if (result.Count == 0)
    {
        // Without this check any token (a closing parenthesis, a quoted value,
        // a bracket list) was silently accepted as the start of the expression.
        if (tokenType is not (TokenType.ParenthesisOpen or TokenType.VariableName))
        {
            throw new ArgumentException($"Unexpected token: {tokenType} at the start of the expression!");
        }
    }
    else
    {
        TokenType previousType = result[^1].Item1;
        if (!AllowedNextTokens.TryGetValue(previousType, out var allowedNext) || !allowedNext.Contains(tokenType))
        {
            Console.WriteLine($"Current tokens: {SerializeTokens(result)}");
            throw new ArgumentException($"Unexpected token: {tokenType} after {previousType}!");
        }
    }

    result.Add(Tuple.Create(tokenType, tokenValue));
    return tokenType;
}
| #endregion | |
| #region Public Methods | |
/// <summary>
/// Splits a where-condition string into a list of (token type, text) pairs,
/// validating the token order while doing so.
/// </summary>
/// <remarks>
/// Quoted values and bracket lists are stored as raw text: doubled apostrophes
/// are not unescaped and list elements are not split. Bare words are delimited
/// by any whitespace character; a number may also be ended by a closing
/// parenthesis.
/// </remarks>
/// <param name="str">The condition text.</param>
/// <returns>The tokens in input order; an empty list for empty or all-whitespace input.</returns>
/// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
/// <exception cref="ArgumentException">
/// The input contains an unmatched quote, bracket or parenthesis, an invalid
/// variable name, an unknown operator, an invalid number, a token in a
/// position where it is not allowed, or ends in the middle of a condition.
/// </exception>
public static List<Tuple<TokenType, string>> Tokenize(string str)
{
    ArgumentNullException.ThrowIfNull(str);

    List<Tuple<TokenType, string>> result = new List<Tuple<TokenType, string>>();
    int parenthesisDepth = 0;
    TokenType? prevToken = null;
    int i = 0;
    while (i < str.Length)
    {
        char c = str[i];
        if (char.IsWhiteSpace(c))
        {
            i++;
            continue;
        }
        if (c == '(')
        {
            prevToken = AddTokenToResultWithValidation(result, TokenType.ParenthesisOpen);
            parenthesisDepth++;
            i++;
            continue;
        }
        if (c == ')')
        {
            // Reject a ')' with no open '(' right away. Checking only the final
            // depth would accept "a equal 1) and (b equal 2", where the depth
            // dips to -1 and returns to 0.
            if (parenthesisDepth == 0)
            {
                throw new ArgumentException("Unmatched parenthesis");
            }
            prevToken = AddTokenToResultWithValidation(result, TokenType.ParenthesisClose);
            parenthesisDepth--;
            i++;
            continue;
        }
        if (c == '\'')
        {
            prevToken = AddTokenToResultWithValidation(result, TokenType.QuoteBegin);
            i++;
            int quoteEndIndex = FindQuoteEnd(str, i);
            string value = str.Substring(i, quoteEndIndex - i);
            prevToken = AddTokenToResultWithValidation(result, TokenType.ConstantValue, value);
            prevToken = AddTokenToResultWithValidation(result, TokenType.QuoteEnd);
            i = quoteEndIndex + 1;
            continue;
        }
        if (c == '[')
        {
            prevToken = AddTokenToResultWithValidation(result, TokenType.BracketOpen);
            i++;
            int bracketEndIndex = SkipOverQuoted(str, i);
            if (bracketEndIndex == -1)
            {
                throw new ArgumentException("Unmatched bracket");
            }
            // The whole list body is kept as one constant; elements are not split here.
            string value = str.Substring(i, bracketEndIndex - i);
            prevToken = AddTokenToResultWithValidation(result, TokenType.ConstantValue, value);
            prevToken = AddTokenToResultWithValidation(result, TokenType.BracketClose);
            i = bracketEndIndex + 1;
            continue;
        }
        if (prevToken == null || InitialTokens.Contains(prevToken.Value)) // Variable name
        {
            int tokenEnd = IndexOfWhiteSpace(str, i);
            if (tokenEnd == -1)
            {
                throw new ArgumentException("Unexpected end of input after variable!");
            }
            string tokenStr = str.Substring(i, tokenEnd - i);
            if (!IsValidVariableName(tokenStr))
            {
                throw new ArgumentException($"Invalid variable name: {tokenStr}");
            }
            prevToken = AddTokenToResultWithValidation(result, TokenType.VariableName, tokenStr);
            i = tokenEnd + 1;
            continue;
        }
        else if (BeforeOperatorTokens.Contains(prevToken.Value)) // Operator
        {
            int tokenEnd = IndexOfWhiteSpace(str, i);
            if (tokenEnd == -1)
            {
                throw new ArgumentException("Unexpected end of input after operator!");
            }
            // Invariant casing: culture-sensitive lowering maps 'I' to a dotless
            // 'ı' under Turkish cultures, which would make "IN" an unknown operator.
            string tokenStr = str.Substring(i, tokenEnd - i).ToLowerInvariant();
            if (!TokenMap.TryGetValue(tokenStr, out var tokenType))
            {
                throw new ArgumentException($"Unknown operator: {tokenStr}");
            }
            prevToken = AddTokenToResultWithValidation(result, tokenType);
            i = tokenEnd + 1;
            continue;
        }
        else if (OperatorTokens.Contains(prevToken.Value)) // After an operator: a number (or null). String and list constants are handled above.
        {
            int tokenEnd = IndexOfWhiteSpace(str, i);
            if (tokenEnd == -1)
            {
                tokenEnd = str.Length;
            }
            // A ')' directly after the number ends it too and must stay in the
            // input so that the next iteration emits it as its own token.
            bool delimitedByParenthesis = false;
            int nextParenthesisCloseIndex = str.IndexOf(')', i);
            if (nextParenthesisCloseIndex != -1 && nextParenthesisCloseIndex < tokenEnd)
            {
                tokenEnd = nextParenthesisCloseIndex;
                delimitedByParenthesis = true;
            }
            string tokenStr = str.Substring(i, tokenEnd - i);
            if (tokenStr.Equals("null", StringComparison.OrdinalIgnoreCase))
            {
                tokenStr = "null";
            }
            else if (!decimal.TryParse(tokenStr, System.Globalization.CultureInfo.InvariantCulture, out _))
            {
                throw new ArgumentException($"Invalid number: {tokenStr}");
            }
            prevToken = AddTokenToResultWithValidation(result, TokenType.ConstantValue, tokenStr);
            i = delimitedByParenthesis ? tokenEnd : tokenEnd + 1;
            continue;
        }
        Console.WriteLine($"Current tokens: {SerializeTokens(result)}");
        throw new ArgumentException($"Unexpected character: '{c}' at position {i}");
    }
    if (parenthesisDepth != 0)
    {
        throw new ArgumentException("Unmatched parenthesis");
    }
    // The input must end on a token that completes a condition. Previously
    // "a equal " (trailing whitespace) was accepted although "a equal" was rejected.
    if (prevToken is TokenType lastToken &&
        lastToken is not (TokenType.ConstantValue or TokenType.QuoteEnd or TokenType.BracketClose or TokenType.ParenthesisClose))
    {
        throw new ArgumentException($"Unexpected end of input after {lastToken}!");
    }
    return result;

    // Index of the first whitespace character at or after startIndex, or -1 if
    // there is none. Matches the char.IsWhiteSpace test used to skip separators,
    // so tabs and line breaks delimit bare words just like spaces do.
    static int IndexOfWhiteSpace(string text, int startIndex)
    {
        for (int k = startIndex; k < text.Length; k++)
        {
            if (char.IsWhiteSpace(text[k]))
            {
                return k;
            }
        }
        return -1;
    }
}
/// <summary>
/// Rebuilds a condition string from a token list.
/// </summary>
/// <remarks>
/// Variable names and constant values are written using the text stored in the
/// token. All other token types are written using their fixed text from
/// TokenToStringMap, where keywords carry one space on each side.
/// </remarks>
/// <param name="tokens">Tokens to concatenate, in order.</param>
/// <returns>The concatenated text; empty for an empty list.</returns>
/// <exception cref="ArgumentNullException"><paramref name="tokens"/> is null.</exception>
/// <exception cref="InvalidOperationException">
/// A token has a type that neither carries its own text nor has an entry in
/// TokenToStringMap.
/// </exception>
public static string TokensToString(List<Tuple<TokenType, string>> tokens)
{
    ArgumentNullException.ThrowIfNull(tokens);

    StringBuilder sb = new StringBuilder();
    foreach (var token in tokens)
    {
        TokenType type = token.Item1;
        if (type == TokenType.VariableName || type == TokenType.ConstantValue)
        {
            sb.Append(token.Item2);
        }
        else if (TokenToStringMap.TryGetValue(type, out var tokenStr))
        {
            sb.Append(tokenStr);
        }
        else
        {
            // A specific exception type instead of the bare System.Exception;
            // callers that catch Exception are unaffected.
            throw new InvalidOperationException($"No string mapping for token type: {type}");
        }
    }
    return sb.ToString();
}
| #endregion | |
| #region Serialization | |
/// <summary>
/// Serializer context used by <see cref="SerializeTokens"/>: indented output and
/// the relaxed JSON encoder, which leaves characters such as the apostrophe unescaped.
/// </summary>
private static readonly SourceGenerationContext _serializationContext = new(
    new JsonSerializerOptions
    {
        WriteIndented = true,
        Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping
    });

/// <summary>
/// Renders a token list as a JSON array in which every token is an object with
/// one property: the token type name mapped to the token text.
/// </summary>
/// <param name="tokens">Tokens to render.</param>
/// <returns>The JSON text.</returns>
public static string SerializeTokens(List<Tuple<TokenType, string>> tokens)
{
    IEnumerable<Dictionary<string, string>> singleEntryMaps =
        from token in tokens
        select new Dictionary<string, string> { [token.Item1.ToString()] = token.Item2 };

    List<Dictionary<string, string>> shapedTokens = singleEntryMaps.ToList();

    return JsonSerializer.Serialize(
        shapedTokens,
        typeof(List<Dictionary<string, string>>),
        _serializationContext);
}
| #endregion | |
| } | |
/// <summary>
/// Demo entry point: tokenizes a sample condition, prints the tokens as JSON,
/// rebuilds the condition from the tokens, and reports whether the rebuilt
/// text equals the sample.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
public static void Main(string[] args)
{
    const string whereClause = "((region in ['Nor]''th]Zone','South','A''B'] and years_experience lessthanorequal 1.5) or (_root equal 1 and score greaterthan 90)) and (name equal 'O]'']Connor' and level lessthan 10) and (department notin [HR,Finance , 'Legal' ] or (title notequal 'Intern' and _temp1 greaterthanorequal -5)) or nothing equal null or nothing notequal ''";

    List<Tuple<Tokenizer.TokenType, string>> parsed = Tokenizer.Tokenize(whereClause);
    Console.WriteLine($"Tokens: {Tokenizer.SerializeTokens(parsed)}");

    string roundTripped = Tokenizer.TokensToString(parsed);
    Console.WriteLine($"Tokens as String: {roundTripped}");

    string verdict = whereClause == roundTripped ? "" : "DON'T ";
    Console.WriteLine($"Original and recreated strings {verdict}MATCH!");
}
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment