-
-
Save AmadorMunozBerzosa/3288771aaa199cc73acc7781bff4d49e to your computer and use it in GitHub Desktop.
| module Krow.Regex.ActivePatterns | |
| open System.Text | |
| let (|Regex|_|) (pattern:IRegex) input = | |
| if input = null then | |
| None | |
| else | |
| try | |
| let match' = RegularExpressions.Regex.Match(input, pattern |> Regex.evaluate) | |
| if match'.Success then | |
| Some( List.tail [ for groups in match'.Groups -> groups.Value ]) | |
| else | |
| None | |
| with _ -> None | |
| let (|Regexs|) (pattern:IRegex) input = | |
| if input = null then [] else | |
| try | |
| let matches = RegularExpressions.Regex.Matches(input, pattern |> Regex.evaluate) | |
| [ for match' in matches do (List.tail [ for group in match'.Groups -> group.Value ]) ] | |
| with e -> [] |
| [<AutoOpen>] | |
| module Krow.Regex.Evaluation | |
| open Krow.Regex.Types | |
| open System.Text | |
| module Regex = | |
| let escape s = (RegularExpressions.Regex.Escape s).Replace("]", "\]") | |
| let unescape (s:string) = (RegularExpressions.Regex.Unescape (s.Replace("\]", "]"))) | |
| [<AutoOpen>] | |
| module private Helpers = | |
| let groupable (regex:IRegex) = | |
| match regex with | |
| | :? Regex.Sequence | :? Regex.OneOf -> Regex.NonCapturing regex :> IRegex | |
| | _ -> regex | |
| let listGroupable (regex:IRegex) = | |
| match regex with | |
| | :? Regex.OneOf -> Regex.NonCapturing regex :> IRegex | |
| | _ -> regex | |
| let rec charsetContent charSet = | |
| match charSet with | |
| | CharSet.OneOf chars -> | |
| let string = (new string(chars |> List.toArray)).Replace("/",@"\/") | |
| $@"{string}" | |
| | CharSet.Range (first,last) -> | |
| $@"{first}-{last}" | |
| | CharSet.Multiple charSets -> | |
| charSets |> List.map charsetContent |> List.reduce (+) | |
| let referenceString reference = | |
| match reference with | |
| | Group.Positional number -> number |> string | |
| | Group.Named string -> string | |
| |> escape | |
| let rec evaluate (regex:IRegex) = | |
| match regex with | |
| | :? Regex.Literal as literal -> | |
| let (Regex.Literal literal) = literal | |
| escape literal | |
| | :? Regex.Raw as literal -> | |
| let (Regex.Raw literal) = literal | |
| literal | |
| | :? Regex.OneOf as oneOf -> | |
| let (Regex.OneOf regexs) = oneOf | |
| regexs | |
| |> List.map evaluate |> String.concat "|" | |
| | :? Regex.Sequence as sequence -> | |
| let (Regex.Sequence regexs) = sequence | |
| regexs |> List.map (listGroupable >> evaluate) |> String.concat "" | |
| | :? Regex.NonCapturing as grouping -> | |
| let (Regex.NonCapturing regex) = grouping | |
| $@"(?:{evaluate regex})" | |
| | :? Mode.WithModes as withModes -> | |
| let (Mode.WithModes (modes, regex)) = withModes | |
| let modeChar = function | |
| | Mode.CaseInsensitive -> "i" | |
| | Mode.Multiline -> "m" | |
| | Mode.ExplicitCapture -> "n" | |
| | Mode.IgnoreUnescapedWhiteSpace -> "x" | |
| let modeList = modes |> List.map modeChar |> String.concat "" | |
| $@"(?{modeList}:{regex})" | |
| | :? Group.Reference as reference -> | |
| let string = reference |> referenceString | |
| match reference with | |
| | Group.Positional _ -> $@"\{string}" | |
| | Group.Named _ -> $@"\k<{string}>" | |
| | :? Look.Look as look -> | |
| match look with | |
| | Look.Ahead regex -> $@"(?={regex |> evaluate})" | |
| | Look.Behind regex -> $@"(?<={regex |> evaluate})" | |
| | :? Look.Negated as look -> | |
| let (Look.Negated look) = look | |
| match look with | |
| | Look.Ahead regex -> $@"(?!{regex |> evaluate})" | |
| | Look.Behind regex -> $@"(?<!{regex |> evaluate})" | |
| | :? Group.Group as group -> | |
| let (Group.Group(group,regex)) = group | |
| let regex = regex |> evaluate | |
| match group with | |
| // Capturing | |
| | Group.Capturing -> $@"({regex})" | |
| | Group.CapturingWithName name -> $@"(?<{name}>{regex})" | |
| // Non capturing | |
| | Group.NonBacktrackingGrouping -> $@"(?>{regex})" | |
| // Balancing | |
| | Group.UnCapturing reference -> | |
| $@"(?<-{reference |> referenceString}>{regex})" | |
| | Group.Balancing (newName, reference) -> | |
| $@"(?<{newName |> escape}-{reference |> referenceString}>{regex})" | |
| | :? SpecialChar.SpecialChar as special -> | |
| match special with | |
| | SpecialChar.WildCard -> @"." | |
| | SpecialChar.Bell -> @"\a" | |
| | SpecialChar.Backspace -> @"\b" | |
| | SpecialChar.Tab -> @"\t" | |
| | SpecialChar.VerticalTab -> @"\v" | |
| | SpecialChar.CarriageReturn -> @"\r" | |
| | SpecialChar.NewLine -> @"\n" | |
| | SpecialChar.Escaped -> @"\e" | |
| | SpecialChar.Octal oct -> $@"\{oct}" | |
| | SpecialChar.Hexadecimal hex -> $@"\x{hex}" | |
| | SpecialChar.ASCII ascii -> $@"\u{ascii}" | |
| | :? Anchor.Anchor as anchor -> | |
| match anchor with | |
| | Anchor.Start -> @"\A" | |
| | Anchor.StartOfLine -> @"^" | |
| | Anchor.End -> @"\z" | |
| | Anchor.EndOfLine -> @"$" | |
| | Anchor.Boundary -> @"\b" | |
| | Anchor.NotBoundary -> @"\B" | |
| | Anchor.AfterMatch -> @"\G" | |
| | :? CharSet.CharSet as charSet -> | |
| $"[{charsetContent charSet}]" | |
| | :? CharSet.Negated as negated -> | |
| let (CharSet.Negated charSet) = negated | |
| $"[^{charsetContent charSet}]" | |
| | :? CharClass.CharClass as charClass -> | |
| match charClass with | |
| | CharClass.InUnicodeBlock block -> $@"\p{{{block}}}" | |
| | CharClass.LetterOrDigit -> @"\w" | |
| | CharClass.WhitespaceChar -> @"\s" | |
| | CharClass.Digit -> @"\d" | |
| | :? CharClass.Negated as negated -> | |
| let (CharClass.Negated charClass) = negated | |
| match charClass with | |
| | CharClass.InUnicodeBlock block -> $@"\P{{{block}}}" | |
| | CharClass.LetterOrDigit -> @"\W" | |
| | CharClass.WhitespaceChar -> @"\S" | |
| | CharClass.Digit -> @"\D" | |
| | :? Quantity.Quantified as quantified -> | |
| match quantified with | |
| | Quantity.Greedy (regex,quantity) -> | |
| let regex = regex |> groupable |> evaluate | |
| match quantity with | |
| | Quantity.Exactly amount -> | |
| $@"{regex}{{{amount}}}" | |
| | Quantity.AtLeast amount -> | |
| if amount = 0 then | |
| $@"{regex}*" | |
| else if amount = 1 then | |
| $@"{regex}+" | |
| else | |
| $@"{regex}{{{amount},}}" | |
| | Quantity.Between (min,max) -> | |
| if min = 0 && max = 1 then | |
| $@"{regex}?" | |
| else | |
| $@"{regex}{{{min},{max}}}" | |
| | Quantity.Lazy (regex,quantity) -> | |
| let greedQuantified = Quantity.Greedy(regex,quantity) |> evaluate | |
| greedQuantified + "?" | |
| | :? Condition.Conditional as conditional -> | |
| let evaluateCondition = function | |
| | Condition.Regex regex -> regex |> evaluate | |
| | Condition.Reference reference -> reference |> referenceString | |
| $@"(?({conditional.If |> evaluateCondition}){conditional.Then |> evaluate}|{conditional.Else |> evaluate})" | |
| | _ -> failwith "Not supported" |
| module Examples | |
| open Krow.Regex | |
| let bounded (regex:IRegex) = | |
| Anchor.Start + regex + Anchor.End | |
| let lineBounded (regex:IRegex) = | |
| Anchor.StartOfLine + regex + Anchor.EndOfLine | |
| let separatedList separator (regex:IRegex) = | |
| regex + (separator + regex) * (0,()) | |
| module Guid = | |
| let hexDigit = CharSet.Range('0', '9') / CharSet.Range('a', 'f') | |
| let guid = | |
| Regex.Sequence [ | |
| hexDigit * 8 + "-" | |
| hexDigit * 4 + "-" | |
| CharSet.Range('1', '5') + "-" | |
| CharSet.OneOf ['8';'9';'a';'b'] | |
| hexDigit * 3 + "-" | |
| hexDigit * 12 | |
| ] | |
| module Email = | |
| let allowedSpecialChars = CharSet.OneOf [ | |
| '!';'#';'$';'%';'&';''';'*';'+';'/';'=';'?';'^';'_';'`';'{';'|';'}';'~';'-' | |
| ] | |
| let alphaNumeric = CharSet.Range('a','z') / CharSet.Range('0','9') | |
| let alphaNumericOrHyphen = alphaNumeric / "-" | |
| module Hex = | |
| let group1 = | |
| ["01";"08";"0B";"0C";"0E";"1F";"21";"23";"5B";"5D";"7F"] | |
| |> List.map (fun a -> SpecialChar.Hexadecimal a :> IRegex) | |
| |> Regex.OneOf | |
| let group2 = | |
| ["01";"09";"0B";"0C";"0E";"7F"] | |
| |> List.map (fun a -> SpecialChar.Hexadecimal a :> IRegex) | |
| |> Regex.OneOf | |
| let part = group1 / ( @"\" + group2) | |
| let name = part * (0,()) | |
| let quotedName = "\"" + name + "\"" | |
| module User = | |
| let stringPart = (alphaNumeric / allowedSpecialChars) * (1,()) | |
| let stringName = separatedList "." stringPart | |
| let name = stringName / Hex.quotedName | |
| module Ip = | |
| let ipPart = | |
| Regex.OneOf [ | |
| "25" + CharSet.Range('0','5') | |
| "2" + CharSet.Range('0','4') + CharSet.Range('0','9') | |
| CharSet.OneOf ['0';'1'] + CharSet.Range('0','9') + CharSet.Range('0','9') | |
| ] | |
| let hexPart = | |
| alphaNumericOrHyphen * (0,()) + alphaNumeric + ":" + Hex.name | |
| let lastPart = ipPart / hexPart | |
| let address = "[" + (ipPart + ".") * 3 + lastPart + "]" | |
| module Domain = | |
| let part = (alphaNumeric * (1,())) |> separatedList "-" | |
| let name = part |> separatedList "." | |
| let email = bounded (User.name + "@" + (Ip.address / Domain.name)) |
| [<AutoOpen>] | |
| module Krow.Regex.Operators | |
| open Regex | |
| type RegexSequence = RegexSequence with | |
| static member (?<-) (RegexSequence, first:Sequence, second:Sequence) = | |
| let (Sequence list1) = first | |
| let (Sequence list2) = second | |
| Sequence(list1 @ list2) | |
| static member (?<-) (RegexSequence, first:IRegex, second:Sequence) = | |
| (?<-) RegexSequence (Sequence [first]) second | |
| static member (?<-) (RegexSequence, first:Sequence, second:IRegex) = | |
| (?<-) RegexSequence first (Sequence [second]) | |
| static member (?<-) (RegexSequence, first:IRegex, second:IRegex) = | |
| (?<-) RegexSequence (Sequence [first]) (Sequence [second]) | |
| static member (?<-) (RegexSequence, first:string, second:IRegex) = | |
| (?<-) RegexSequence (Sequence [Literal first]) (Sequence [second]) | |
| static member (?<-) (RegexSequence, first:IRegex, second:string) = | |
| (?<-) RegexSequence (Sequence [first]) (Sequence [Literal second]) | |
| static member inline (?<-) (RegexSequence, first, second) = | |
| first + second | |
| let inline (+) first second : 'R = ( (?<-) RegexSequence first second) | |
| type RegexOneOf = RegexOneOf with | |
| static member (?<-) (RegexOneOf, first:OneOf, second:OneOf) = | |
| let (OneOf list1) = first | |
| let (OneOf list2) = second | |
| OneOf(list1 @ list2) | |
| static member (?<-) (RegexOneOf, first:IRegex, second:OneOf) = | |
| (?<-) RegexOneOf (OneOf [first]) second | |
| static member (?<-) (RegexOneOf, first:OneOf, second:IRegex) = | |
| (?<-) RegexOneOf first (OneOf [second]) | |
| static member (?<-) (RegexOneOf, first:IRegex, second:IRegex) = | |
| OneOf [first;second] | |
| static member (?<-) (RegexOneOf, first:CharSet.CharSet, second:CharSet.CharSet) = | |
| match first,second with | |
| | CharSet.Multiple charsets1, CharSet.Multiple charsets2 -> | |
| CharSet.Multiple (charsets1 @ charsets2) | |
| | CharSet.Multiple charsets1, charset2 -> | |
| CharSet.Multiple (charsets1 @ [charset2]) | |
| | charset1, CharSet.Multiple charsets2 -> | |
| CharSet.Multiple (charset1 :: charsets2) | |
| | charset1, charset2 -> | |
| CharSet.Multiple [charset1;charset2] | |
| static member (?<-) (RegexOneOf, CharSet.Negated first, CharSet.Negated second) = | |
| CharSet.Negated ((?<-) RegexOneOf first second) | |
| static member (?<-) (RegexOneOf, first:string, second:IRegex) = | |
| (?<-) RegexOneOf (OneOf [Literal first]) (OneOf [second]) | |
| static member (?<-) (RegexOneOf, first:IRegex, second:string) = | |
| (?<-) RegexOneOf (OneOf [first]) (OneOf [Literal second]) | |
| static member inline (?<-) (RegexOneOf, first, second) = | |
| first / second | |
| let inline (/) first second : 'R = ( (?<-) RegexOneOf first second) | |
| type RegexQuantification = RegexQuantification with | |
| static member (?<-) (RegexQuantification, regex, quantity) = | |
| Quantity.Greedy(regex, Quantity.Exactly quantity) | |
| static member (?<-) (RegexQuantification, regex, quantity) = | |
| Quantity.Greedy(regex, Quantity.Between quantity) | |
| static member (?<-) (RegexQuantification, regex, quantity) = | |
| let quantity, () = quantity | |
| Quantity.Greedy(regex, Quantity.AtLeast quantity) | |
| static member inline (?<-) (RegexQuantification, first, second) = | |
| first * second | |
| let inline ( * ) first second : 'R = ( (?<-) RegexQuantification first second) | |
| type RegexLazyQuantification = RegexLazyQuantification with | |
| static member (?<-) (RegexLazyQuantification, regex, quantity) = | |
| Quantity.Lazy(regex, Quantity.Exactly quantity) | |
| static member (?<-) (RegexLazyQuantification, regex, quantity) = | |
| Quantity.Lazy(regex, Quantity.Between quantity) | |
| static member (?<-) (RegexLazyQuantification, regex, quantity) = | |
| let quantity, () = quantity | |
| Quantity.Lazy(regex, Quantity.AtLeast quantity) | |
| static member inline (?<-) (RegexLazyQuantification, first, second) = | |
| first *? second | |
| let inline ( *? ) first second : 'R = ( (?<-) RegexLazyQuantification first second) | |
| type RegexNegation = RegexNegation with | |
| static member (?<-) (RegexNegation, charClass:CharClass.CharClass, _) = | |
| CharClass.Negated charClass | |
| static member (?<-) (RegexNegation, charClass:CharClass.Negated, _) = | |
| let (CharClass.Negated charClass) = charClass | |
| charClass | |
| static member (?<-) (RegexNegation, charClass:CharSet.CharSet, _) = | |
| CharSet.Negated charClass | |
| static member (?<-) (RegexNegation, charClass:CharSet.Negated, _) = | |
| let (CharSet.Negated charClass) = charClass | |
| charClass | |
| static member (?<-) (RegexNegation, look:Look.Look, _) = | |
| Look.Negated look | |
| static member (?<-) (RegexNegation, look:Look.Negated, _) = | |
| let (Look.Negated look) = look | |
| look | |
| static member inline (?<-) (RegexNegation, first, _) = | |
| !first | |
| let inline (!) first : 'R = ( (?<-) RegexNegation first ()) | |
| let aaa = !(Look.Ahead (Literal "aa")) |
| [<AutoOpen>] | |
| module Krow.Regex.Types | |
| type IRegex = interface end | |
| module Regex = | |
| type Literal = | |
| | Literal of string | |
| interface IRegex | |
| type Raw = | |
| | Raw of string | |
| interface IRegex | |
| type OneOf = | |
| | OneOf of IRegex list | |
| interface IRegex | |
| type Sequence = | |
| | Sequence of IRegex list interface IRegex | |
| type internal NonCapturing = | |
| | NonCapturing of IRegex | |
| interface IRegex | |
| module Mode = | |
| type Mode = | |
| | CaseInsensitive | |
| | Multiline | |
| | ExplicitCapture | |
| | IgnoreUnescapedWhiteSpace | |
| type WithModes = | |
| | WithModes of Mode list * IRegex | |
| interface IRegex | |
| module Look = | |
| type Look = | |
| | Ahead of IRegex | |
| | Behind of IRegex | |
| interface IRegex | |
| type Negated = | |
| |Negated of Look interface IRegex | |
| module Group = | |
| type Reference = | |
| | Positional of int | |
| | Named of string | |
| interface IRegex | |
| type Kind = | |
| // Capturing | |
| | Capturing | |
| | CapturingWithName of string | |
| // Non capturing | |
| | NonBacktrackingGrouping | |
| // Balancing | |
| | UnCapturing of Reference // Balancing while omitting first arg | |
| | Balancing of string * Reference | |
| type Group = | |
| | Group of Kind * IRegex | |
| interface IRegex | |
| module SpecialChar = | |
| type SpecialChar = | |
| | WildCard | |
| | Bell | |
| | Backspace | |
| | Tab | |
| | VerticalTab | |
| | CarriageReturn | |
| | NewLine | |
| | Escaped | |
| | Octal of string | |
| | Hexadecimal of string | |
| | ASCII of string | |
| interface IRegex | |
| module Anchor = | |
| type Anchor = | |
| | Start | |
| | StartOfLine | |
| | End | |
| | EndOfLine | |
| | Boundary | |
| | NotBoundary | |
| | AfterMatch | |
| interface IRegex | |
| module CharSet = | |
| type CharSet = | |
| | OneOf of char list | |
| | Range of char * char | |
| | Multiple of CharSet list | |
| interface IRegex | |
| type Negated = | |
| | Negated of CharSet | |
| interface IRegex | |
| module CharClass = | |
| type CharClass = | |
| | InUnicodeBlock of string | |
| | LetterOrDigit | |
| | WhitespaceChar | |
| | Digit | |
| interface IRegex | |
| type Negated = | |
| | Negated of CharClass | |
| interface IRegex | |
| module Quantity = | |
| type Quantity = | |
| | Exactly of int | |
| | AtLeast of int | |
| | Between of int * int | |
| type Quantified = | |
| | Greedy of IRegex * Quantity | |
| | Lazy of IRegex * Quantity | |
| interface IRegex | |
| module Condition = | |
| type Condition = | |
| | Regex of IRegex | |
| | Reference of Group.Reference | |
| type Conditional = | |
| { If: Condition; Then: IRegex; Else: IRegex } | |
| interface IRegex |
It is working very well.
Except for ||| that doesn't seem to overload like + does.
Literal "this" ||| Literal "that"worksLiteral "this" ||| "that"doesn't work
I'm not sure why
So I changed to '/'. I know is not symmetrical and it doesn't seem commutative. OTOH it confers the meaning well.
For instance: Literal "Err" / "Warn" / "Info" is very readable.
Also RegularExpressions.Regex.Escape doesn't escape the character ] which is a problem when doing OneOf, so I changed it to:
let escape s = (RegularExpressions.Regex.Escape s).Replace("]", "\]")
Here is an example:
let parseErrWarnInfo = """
Err (1, 7) - (1, 12): "This shows over there as an error".
Warn (2, 7) - (2, 12): "This shows over there as a warning".
Info (3, 7) - (3, 12): "This shows over there as information".
"""
let digitsCap = MoreThanOnce Digit |> Capturing
let spaces = ManyTimesOrNone WhitespaceChar
let coords = "(" + digitsCap + "," + spaces + digitsCap + ")"
let notOneOf v = Seq.toList v |> NotOneOf
let errWarnInfo =
Sequence [
Capturing(Literal "Err" / "Warn" / "Info") + " "
coords + " - "
coords + ": "
"\"" + Capturing (notOneOf "\"" |> MoreThanOnceLazily) + "\"."
]
|> evaluate
printfn "Regex: %s" errWarnInfo
// (Err|Warn|Info)\ \((\d+),\s*(\d+)\)\ -\ \((\d+),\s*(\d+)\):\ "([^"]+?)"\.
I also changed the operator for Either so it preserves the original order. I doesn't make a difference to the regex, but the rearranging was a little disconcerting.
With a couple of active patterns:
let (|Regex|_|) pattern input =
if input = null then None else
try
let m = RegularExpressions.Regex.Match(input, pattern)
if m.Success then Some(List.tail [ for g in m.Groups -> g.Value ])
else None
with e -> None
let (|Regexs|) pattern input =
if input = null then [| |] else
try
let ms = RegularExpressions.Regex.Matches(input, pattern)
[| for m in ms do yield (List.tail [ for g in m.Groups -> g.Value ]) |]
with e -> [| |]
then it can be used very nicely like this:
match parseErrWarnInfo with Regexs errWarnInfo r -> printfn "Matches found:\n%A" r
match parseErrWarnInfo with
| Regex errWarnInfo r -> printfn "First Match: %A" r
| _ -> printfn "No match found"
match parseErrWarnInfo with Regexs errWarnInfo r -> printfn "Matches found:\n%A" r
// Matches found:
// [|["Err"; "1"; "7"; "1"; "12"; "This shows over there as an error"];
// ["Warn"; "2"; "7"; "2"; "12"; "This shows over there as a warning"];
// ["Info"; "3"; "7"; "3"; "12"; "This shows over there as information"]|]
match parseErrWarnInfo with
| Regex errWarnInfo r -> printfn "First Match: %A" r
| _ -> printfn "No match found"
// First Match: ["Err"; "1"; "7"; "1"; "12"; "This shows over there as an error"]
Guid:
let hexDigit = InRange('0', '9') / InRange('a', 'f')
let hexDigits n = Exactly(uint32 n, hexDigit)
Sequence [
hexDigits 8 + "-"
hexDigits 4 + "-"
InRange('1', '5')
hexDigits 3 + "-"
oneOf "89ab"
hexDigits 3 + "-"
hexDigits 12
]
|> evaluate
|> printfn "%s"
// (?:[0-9]|[a-f]){8}-(?:[0-9]|[a-f]){4}-[1-5](?:[0-9]|[a-f]){3}-[89ab](?:[0-9]|[a-f]){3}-(?:[0-9]|[a-f]){12}
email:
let allowed = NotOneOfEscaped (escape "<>()[].,;:@" + evaluate WhitespaceChar) |> MoreThanOnce
let listSep sep elems = elems + ManyTimesOrNone (Literal sep + elems)
listSep "." allowed + "@" + listSep "." allowed
|> evaluate
|> printfn "%s"
in this case allowed needed to include \s without further escaping. For this case I added:
...
| LiteralRegex of string
| OneOfEscaped of string
| NotOneOfEscaped of string
...
| LiteralRegex rx -> rx
| OneOfEscaped string -> sprintf @"[%s]" string
| NotOneOfEscaped string -> sprintf @"[^%s]" string
...
to allow for cases not contemplated or for composing with regex from other sources.
Here is a version of the evaluator the uses
sprintffor those in prior versions of F#: