| 1 | #Region "Microsoft.VisualBasic::14e6e7d6f16d2a7c7404557abfd5af5a, Microsoft.VisualBasic.Core\Extensions\StringHelpers\RegexExtensions.vb" |
| 2 | |
| 3 | ' Author: |
| 4 | ' |
| 5 | ' asuka (amethyst.asuka@gcmodeller.org) |
| 6 | ' xie (genetics@smrucc.org) |
| 7 | ' xieguigang (xie.guigang@live.com) |
| 8 | ' |
| 9 | ' Copyright (c) 2018 GPL3 Licensed |
| 10 | ' |
| 11 | ' |
| 12 | ' GNU GENERAL PUBLIC LICENSE (GPL3) |
| 13 | ' |
| 14 | ' |
| 15 | ' This program is free software: you can redistribute it and/or modify |
| 16 | ' it under the terms of the GNU General Public License as published by |
| 17 | ' the Free Software Foundation, either version 3 of the License, or |
| 18 | ' (at your option) any later version. |
| 19 | ' |
| 20 | ' This program is distributed in the hope that it will be useful, |
| 21 | ' but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 22 | ' MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 23 | ' GNU General Public License for more details. |
| 24 | ' |
| 25 | ' You should have received a copy of the GNU General Public License |
| 26 | ' along with this program. If not, see <http://www.gnu.org/licenses/>. |
| 27 | |
| 28 | |
| 29 | |
| 30 | ' /********************************************************************************/ |
| 31 | |
| 32 | ' Summaries: |
| 33 | |
| 34 | ' Module RegexExtensions |
| 35 | ' |
| 36 | ' Properties: RegexpTimeout |
| 37 | ' |
| 38 | ' Constructor: (+1 Overloads) Sub New |
| 39 | ' Function: (+2 Overloads) EachValue, IsPattern, Locates, PythonRawRegexp, (+2 Overloads) ToArray |
| 40 | ' Structure [NameOf] |
| 41 | ' |
| 42 | ' |
| 43 | ' |
| 44 | ' |
| 45 | ' |
| 46 | ' /********************************************************************************/ |
| 47 | |
| 48 | #End Region |
| 49 | |
| 50 | Imports System.Runtime.CompilerServices |
| 51 | Imports System.Text.RegularExpressions |
| 52 | Imports Microsoft.VisualBasic.CommandLine.Reflection |
| 53 | Imports Microsoft.VisualBasic.Language |
| 54 | |
''' <summary>
''' Extension helpers for <see cref="Regex"/>: an application-wide match timeout setting,
''' converters from <see cref="MatchCollection"/> to sequences/arrays, whole-string
''' pattern tests and 1-based match location.
''' </summary>
Public Module RegexExtensions

    ''' <summary>
    ''' Regex options used to emulate a Python raw-string regular expression:
    ''' multiline, pattern white space ignored, compiled.
    ''' </summary>
    Public Const RegexPythonRawString As RegexOptions =
        RegexOptions.Multiline Or
        RegexOptions.IgnorePatternWhitespace Or
        RegexOptions.Compiled

    ''' <summary>
    ''' Name of the <see cref="AppDomain"/> data slot in which the timeout is stored.
    ''' </summary>
    Const TimeoutConfig$ = "REGEX_DEFAULT_MATCH_TIMEOUT"

    ''' <summary>
    ''' Gets or sets the regex match timeout, in seconds, kept in the current
    ''' <see cref="AppDomain"/> under the <c>REGEX_DEFAULT_MATCH_TIMEOUT</c> data slot.
    ''' </summary>
    ''' <value>
    ''' Timeout in whole seconds. The getter returns 0 when no value has been stored yet.
    ''' </value>
    ''' <remarks>
    ''' The setter stores the value as a <see cref="TimeSpan"/>, so the getter converts it
    ''' back to seconds instead of returning the boxed <see cref="TimeSpan"/> directly
    ''' (which cannot be converted to <see cref="Integer"/>).
    ''' NOTE(review): the .NET documentation describes this slot as the application-wide
    ''' default match timeout, which must be a positive <see cref="TimeSpan"/> and is
    ''' presumably only read when the Regex class is first initialized - confirm before
    ''' relying on changing it at a later point, and avoid assigning zero or negative values.
    ''' </remarks>
    Public Property RegexpTimeout As Integer
        Get
            Dim timeout As Object = AppDomain.CurrentDomain.GetData(TimeoutConfig)

            If TypeOf timeout Is TimeSpan Then
                ' the setter below always stores a TimeSpan: report it back in seconds
                Return CInt(DirectCast(timeout, TimeSpan).TotalSeconds)
            End If

            ' slot is empty (Nothing converts to 0) or was filled by foreign code
            ' with a plain numeric value
            Return CInt(timeout)
        End Get
        Set(value As Integer)
            Dim domain As AppDomain = AppDomain.CurrentDomain
            Dim timeout = TimeSpan.FromSeconds(value)

            ' Store the timeout interval (given in seconds) as a TimeSpan.
            Call domain.SetData(TimeoutConfig, timeout)
        End Set
    End Property

    ''' <summary>
    ''' Module initializer: sets the timeout to 5 seconds.
    ''' </summary>
    Sub New()
        RegexpTimeout = 5
    End Sub

    ''' <summary>
    ''' Name of <see cref="RegexOptions"/>
    ''' </summary>
    Public Structure [NameOf]

        ''' <summary>
        ''' Specifies that no options are set. For more information about the default behavior
        ''' of the regular expression engine, see the "Default Options" section in the Regular
        ''' Expression Options topic.
        ''' </summary>
        Public Const None As String = NameOf(None)

        ''' <summary>
        ''' Specifies case-insensitive matching. For more information, see the "Case-Insensitive
        ''' Matching " section in the Regular Expression Options topic.
        ''' </summary>
        Public Const IgnoreCase As String = NameOf(IgnoreCase)

        ''' <summary>
        ''' Multiline mode. Changes the meaning of ^ and $ so they match at the beginning
        ''' and end, respectively, of any line, and not just the beginning and end of the
        ''' entire string. For more information, see the "Multiline Mode" section in the
        ''' Regular Expression Options topic.
        ''' </summary>
        Public Const Multiline As String = NameOf(Multiline)

        ''' <summary>
        ''' Specifies that the only valid captures are explicitly named or numbered groups
        ''' of the form (?&lt;name&gt;…). This allows unnamed parentheses to act as noncapturing
        ''' groups without the syntactic clumsiness of the expression (?:…). For more information,
        ''' see the "Explicit Captures Only" section in the Regular Expression Options topic.
        ''' </summary>
        Public Const ExplicitCapture As String = NameOf(ExplicitCapture)

        ''' <summary>
        ''' Specifies that the regular expression is compiled to an assembly. This yields
        ''' faster execution but increases startup time. This value should not be assigned
        ''' to the System.Text.RegularExpressions.RegexCompilationInfo.Options property when
        ''' calling the System.Text.RegularExpressions.Regex.CompileToAssembly(System.Text.RegularExpressions.RegexCompilationInfo[],System.Reflection.AssemblyName)
        ''' method. For more information, see the "Compiled Regular Expressions" section
        ''' in the Regular Expression Options topic.
        ''' </summary>
        Public Const Compiled As String = NameOf(Compiled)

        ''' <summary>
        ''' Specifies single-line mode. Changes the meaning of the dot (.) so it matches
        ''' every character (instead of every character except \n). For more information,
        ''' see the "Single-line Mode" section in the Regular Expression Options topic.
        ''' </summary>
        Public Const Singleline As String = NameOf(Singleline)

        ''' <summary>
        ''' Eliminates unescaped white space from the pattern and enables comments marked
        ''' with #. However, this value does not affect or eliminate white space in character
        ''' classes, numeric quantifiers, or tokens that mark the beginning of individual
        ''' regular expression language elements. For more information, see the "Ignore White
        ''' Space" section of the Regular Expression Options topic.
        ''' </summary>
        Public Const IgnorePatternWhitespace As String = NameOf(IgnorePatternWhitespace)

        ''' <summary>
        ''' Specifies that the search will be from right to left instead of from left to
        ''' right. For more information, see the "Right-to-Left Mode" section in the Regular
        ''' Expression Options topic.
        ''' </summary>
        Public Const RightToLeft As String = NameOf(RightToLeft)

        ''' <summary>
        ''' Enables ECMAScript-compliant behavior for the expression. This value can be used
        ''' only in conjunction with the System.Text.RegularExpressions.RegexOptions.IgnoreCase,
        ''' System.Text.RegularExpressions.RegexOptions.Multiline, and System.Text.RegularExpressions.RegexOptions.Compiled
        ''' values. The use of this value with any other values results in an exception. For
        ''' more information on the System.Text.RegularExpressions.RegexOptions.ECMAScript
        ''' option, see the "ECMAScript Matching Behavior" section in the Regular Expression
        ''' Options topic.
        ''' </summary>
        Public Const ECMAScript As String = NameOf(ECMAScript)

        ''' <summary>
        ''' Specifies that cultural differences in language is ignored. For more information,
        ''' see the "Comparison Using the Invariant Culture" section in the Regular Expression
        ''' Options topic.
        ''' </summary>
        Public Const CultureInvariant As String = NameOf(CultureInvariant)
    End Structure

    ''' <summary>
    ''' Projects the text of every match in the collection through <paramref name="parser"/>.
    ''' </summary>
    ''' <typeparam name="T">Type produced by the parser.</typeparam>
    ''' <param name="m">The regex match results.</param>
    ''' <param name="parser">Converts the text of one match into a <typeparamref name="T"/> value.</param>
    ''' <returns>A query over the parsed match values.</returns>
    <Extension>
    Public Function EachValue(Of T)(m As MatchCollection, parser As Func(Of String, T)) As IEnumerable(Of T)
        Return From s As Match In m Select parser(s.Value)
    End Function

    ''' <summary>
    ''' Each match its value in the source match collection.
    ''' </summary>
    ''' <param name="m">The regex match results.</param>
    ''' <returns>A query over the matched strings.</returns>
    <Extension>
    Public Function EachValue(m As MatchCollection) As IEnumerable(Of String)
        Return From s As Match In m Select s.Value
    End Function

    ''' <summary>
    ''' Gets the matched strings from the regex match result as source
    ''' </summary>
    ''' <param name="source">The regex match results.</param>
    ''' <returns>The text of every match, as an array.</returns>
    <ExportAPI("As.Array")>
    <Extension> Public Function ToArray(source As MatchCollection) As String()
        Dim LQuery$() = LinqAPI.Exec(Of String) _
 _
            () <= From m As Match
                  In source
                  Select m.Value

        Return LQuery
    End Function

    ''' <summary>
    ''' Converts the <see cref="Regex"/> string pattern match results to the objects.
    ''' (This function is not parallelized, so there is no need to worry about the
    ''' order of the results being shuffled.)
    ''' </summary>
    ''' <typeparam name="T">Type produced by the parser.</typeparam>
    ''' <param name="source">The regex match results.</param>
    ''' <param name="[ctype]">The object parser</param>
    ''' <returns>The parsed value of every match, as an array.</returns>
    <Extension>
    Public Function ToArray(Of T)(source As MatchCollection, [ctype] As Func(Of String, T)) As T()
        Dim LQuery As T() = LinqAPI.Exec(Of T) _
 _
            () <= From m As Match
                  In source
                  Let s As String = m.Value
                  Select [ctype](s)

        Return LQuery
    End Function

    ''' <summary>
    ''' The entire string input equals to the pattern's matched.
    ''' </summary>
    ''' <param name="s">The text to test.</param>
    ''' <param name="pattern">The regular expression pattern.</param>
    ''' <param name="opt">The regex options used for the match.</param>
    ''' <returns>
    ''' True when <paramref name="s"/> is not empty and the first match of
    ''' <paramref name="pattern"/> in it has the same text as <paramref name="s"/> itself.
    ''' </returns>
    <Extension>
    Public Function IsPattern(s$, pattern$, Optional opt As RegexOptions = RegexICSng) As Boolean
        ' 2018-6-1: an empty string can never match the target pattern, so the match
        ' function always returns an empty string for it. As the s parameter itself is
        ' the empty string in that case, the comparison would succeed, which caused the
        ' bug that an empty string was "fully matched" by any pattern.
        Return Not s.StringEmpty AndAlso Regex.Match(s, pattern, opt).Value = s
    End Function

    ''' <summary>
    ''' Emulates building a regular expression from a raw string as in the Python language.
    ''' </summary>
    ''' <param name="raw">The regular expression pattern text.</param>
    ''' <returns>
    ''' A <see cref="Regex"/> built with the <see cref="RegexOptions.Multiline"/> and
    ''' <see cref="RegexOptions.IgnorePatternWhitespace"/> options.
    ''' </returns>
    <Extension> Public Function PythonRawRegexp(raw As String) As Regex
        Return New Regex(raw, RegexOptions.Multiline Or RegexOptions.IgnorePatternWhitespace)
    End Function

    ''' <summary>
    ''' Returns the 1-based position of the first match of <paramref name="pattern"/>
    ''' in <paramref name="str"/>; returns zero when nothing is found.
    ''' </summary>
    ''' <param name="str">The text to search in.</param>
    ''' <param name="pattern">The regular expression pattern.</param>
    ''' <param name="opt">The regex options used for the match.</param>
    ''' <returns>
    ''' The 1-based start position of the match, or 0 when there is no match or the
    ''' match is an empty string.
    ''' </returns>
    <Extension>
    Public Function Locates(str$, pattern$, Optional opt As RegexOptions = RegexICSng) As Integer
        Dim hit As Match = Regex.Match(str, pattern, opt)

        ' an empty match is reported as "not found"
        If Not hit.Success OrElse hit.Length = 0 Then
            Return 0
        Else
            ' Take the position from the match itself. Searching the input again for
            ' the matched text would find its first textual occurrence, which can lie
            ' before the real match when the pattern uses anchors or lookarounds
            ' (for example "a$" on "aba").
            Return hit.Index + 1
        End If
    End Function
End Module