1 | #Region "Microsoft.VisualBasic::14e6e7d6f16d2a7c7404557abfd5af5a, Microsoft.VisualBasic.Core\Extensions\StringHelpers\RegexExtensions.vb" |
2 | |
3 | ' Author: |
4 | ' |
5 | ' asuka (amethyst.asuka@gcmodeller.org) |
6 | ' xie (genetics@smrucc.org) |
7 | ' xieguigang (xie.guigang@live.com) |
8 | ' |
9 | ' Copyright (c) 2018 GPL3 Licensed |
10 | ' |
11 | ' |
12 | ' GNU GENERAL PUBLIC LICENSE (GPL3) |
13 | ' |
14 | ' |
15 | ' This program is free software: you can redistribute it and/or modify |
16 | ' it under the terms of the GNU General Public License as published by |
17 | ' the Free Software Foundation, either version 3 of the License, or |
18 | ' (at your option) any later version. |
19 | ' |
20 | ' This program is distributed in the hope that it will be useful, |
21 | ' but WITHOUT ANY WARRANTY; without even the implied warranty of |
22 | ' MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
23 | ' GNU General Public License for more details. |
24 | ' |
25 | ' You should have received a copy of the GNU General Public License |
26 | ' along with this program. If not, see <http://www.gnu.org/licenses/>. |
27 | |
28 | |
29 | |
30 | ' /********************************************************************************/ |
31 | |
32 | ' Summaries: |
33 | |
34 | ' Module RegexExtensions |
35 | ' |
36 | ' Properties: RegexpTimeout |
37 | ' |
38 | ' Constructor: (+1 Overloads) Sub New |
39 | ' Function: (+2 Overloads) EachValue, IsPattern, Locates, PythonRawRegexp, (+2 Overloads) ToArray |
40 | ' Structure [NameOf] |
41 | ' |
42 | ' |
43 | ' |
44 | ' |
45 | ' |
46 | ' /********************************************************************************/ |
47 | |
48 | #End Region |
49 | |
50 | Imports System.Runtime.CompilerServices |
51 | Imports System.Text.RegularExpressions |
52 | Imports Microsoft.VisualBasic.CommandLine.Reflection |
53 | Imports Microsoft.VisualBasic.Language |
54 | |
55 | Public Module RegexExtensions |
56 | |
''' <summary>
''' Option set used to emulate a multi-line Python raw-string regular expression:
''' <see cref="RegexOptions.Multiline"/>, <see cref="RegexOptions.IgnorePatternWhitespace"/>
''' and <see cref="RegexOptions.Compiled"/> combined.
''' </summary>
Public Const RegexPythonRawString As RegexOptions =
    RegexOptions.Compiled Or RegexOptions.IgnorePatternWhitespace Or RegexOptions.Multiline
64 | |
''' <summary>
''' Name of the application-domain data slot that holds the default regular
''' expression match timeout.
''' </summary>
Const TimeoutConfig$ = "REGEX_DEFAULT_MATCH_TIMEOUT"

''' <summary>
''' Gets or sets, in whole seconds, the default regular expression match timeout
''' that is stored in the current <see cref="AppDomain"/> under the
''' <c>REGEX_DEFAULT_MATCH_TIMEOUT</c> data slot.
''' </summary>
''' <value>
''' The stored timeout rounded to whole seconds, or 0 when the slot does not
''' currently hold a <see cref="TimeSpan"/>.
''' </value>
''' <exception cref="ArgumentOutOfRangeException">
''' The assigned value is zero or negative. Such a value would be stored as a
''' <see cref="TimeSpan"/> that the regular expression engine rejects as a default timeout.
''' </exception>
''' <remarks>
''' NOTE(review): the runtime appears to read this slot only when the
''' <see cref="Regex"/> type is first initialized, so assignments made after
''' the first regex usage may have no effect - confirm against the target runtime.
''' </remarks>
Public Property RegexpTimeout As Integer
    Get
        Dim stored As Object = AppDomain.CurrentDomain.GetData(TimeoutConfig)

        ' The setter stores a TimeSpan, so the value must be unboxed as a TimeSpan
        ' and converted back to seconds; returning the boxed object directly as an
        ' Integer fails with an InvalidCastException at runtime.
        If TypeOf stored Is TimeSpan Then
            Return CInt(DirectCast(stored, TimeSpan).TotalSeconds)
        End If

        Return 0
    End Get
    Set(value As Integer)
        If value <= 0 Then
            Throw New ArgumentOutOfRangeException(
                NameOf(value), value, "The regular expression timeout must be a positive number of seconds.")
        End If

        ' Publish the timeout as a TimeSpan, which is the type that the
        ' regular expression engine expects to find in this data slot.
        Call AppDomain.CurrentDomain.SetData(TimeoutConfig, TimeSpan.FromSeconds(value))
    End Set
End Property
82 | |
''' <summary>
''' Module initializer: assigns a default of 5 seconds to <see cref="RegexpTimeout"/>.
''' </summary>
Sub New()
    Const defaultTimeoutSeconds As Integer = 5

    RegexpTimeout = defaultTimeoutSeconds
End Sub
86 | |
''' <summary>
''' String constants holding the member names of the <see cref="RegexOptions"/>
''' enumeration, one constant per option.
''' </summary>
Public Structure [NameOf]

    ''' <summary>
    ''' Name of <see cref="RegexOptions.None"/>: no options are set.
    ''' </summary>
    Public Const None$ = NameOf(RegexOptions.None)

    ''' <summary>
    ''' Name of <see cref="RegexOptions.IgnoreCase"/>: case-insensitive matching.
    ''' </summary>
    Public Const IgnoreCase$ = NameOf(RegexOptions.IgnoreCase)

    ''' <summary>
    ''' Name of <see cref="RegexOptions.Multiline"/>: multiline mode, where ^ and $
    ''' match at the beginning and end of any line and not just of the entire string.
    ''' </summary>
    Public Const Multiline$ = NameOf(RegexOptions.Multiline)

    ''' <summary>
    ''' Name of <see cref="RegexOptions.ExplicitCapture"/>: the only valid captures are
    ''' explicitly named or numbered groups of the form (?&lt;name&gt;…), so unnamed
    ''' parentheses act as noncapturing groups without the (?:…) syntax.
    ''' </summary>
    Public Const ExplicitCapture$ = NameOf(RegexOptions.ExplicitCapture)

    ''' <summary>
    ''' Name of <see cref="RegexOptions.Compiled"/>: the regular expression is compiled,
    ''' which yields faster execution but increases startup time.
    ''' </summary>
    Public Const Compiled$ = NameOf(RegexOptions.Compiled)

    ''' <summary>
    ''' Name of <see cref="RegexOptions.Singleline"/>: single-line mode, where the dot (.)
    ''' matches every character instead of every character except \n.
    ''' </summary>
    Public Const Singleline$ = NameOf(RegexOptions.Singleline)

    ''' <summary>
    ''' Name of <see cref="RegexOptions.IgnorePatternWhitespace"/>: unescaped white space
    ''' is eliminated from the pattern and comments marked with # are enabled.
    ''' </summary>
    Public Const IgnorePatternWhitespace$ = NameOf(RegexOptions.IgnorePatternWhitespace)

    ''' <summary>
    ''' Name of <see cref="RegexOptions.RightToLeft"/>: the search runs from right to left
    ''' instead of from left to right.
    ''' </summary>
    Public Const RightToLeft$ = NameOf(RegexOptions.RightToLeft)

    ''' <summary>
    ''' Name of <see cref="RegexOptions.ECMAScript"/>: ECMAScript-compliant behavior. This
    ''' option can be combined only with <see cref="RegexOptions.IgnoreCase"/>,
    ''' <see cref="RegexOptions.Multiline"/> and <see cref="RegexOptions.Compiled"/>;
    ''' using it with any other option results in an exception.
    ''' </summary>
    Public Const ECMAScript$ = NameOf(RegexOptions.ECMAScript)

    ''' <summary>
    ''' Name of <see cref="RegexOptions.CultureInvariant"/>: cultural differences in
    ''' language are ignored.
    ''' </summary>
    Public Const CultureInvariant$ = NameOf(RegexOptions.CultureInvariant)
End Structure
172 | |
''' <summary>
''' Projects the matched text of every <see cref="Match"/> in the collection
''' through <paramref name="parser"/>. The result is evaluated lazily.
''' </summary>
''' <typeparam name="T">The type produced by the parser.</typeparam>
''' <param name="m">The regular expression match results.</param>
''' <param name="parser">Converts the text of one match into a value.</param>
''' <returns>The parsed values, in match order.</returns>
<Extension>
Public Function EachValue(Of T)(m As MatchCollection, parser As Func(Of String, T)) As IEnumerable(Of T)
    Return m.Cast(Of Match)().Select(Function(hit) parser(hit.Value))
End Function
177 | |
''' <summary>
''' Enumerates the matched text of every <see cref="Match"/> in the source
''' match collection. The result is evaluated lazily.
''' </summary>
''' <param name="m">The regular expression match results.</param>
''' <returns>The matched strings, in match order.</returns>
<Extension>
Public Function EachValue(m As MatchCollection) As IEnumerable(Of String)
    Return m.Cast(Of Match)().Select(Function(hit) hit.Value)
End Function
187 | |
''' <summary>
''' Collects the matched strings of the regex match result into an array.
''' </summary>
''' <param name="source">The regular expression match results.</param>
''' <returns>The text of each match, in match order.</returns>
<ExportAPI("As.Array")>
<Extension>
Public Function ToArray(source As MatchCollection) As String()
    ' The project's LinqAPI.Exec helper turns the sequence into the result array.
    Return LinqAPI.Exec(Of String)() <=
        source.Cast(Of Match)().Select(Function(hit) hit.Value)
End Function
203 | |
''' <summary>
''' Converts the <see cref="Regex"/> string pattern match results to objects.
''' (Per the original author's note, this function is not parallelized, so the
''' order of the results is not shuffled.)
''' </summary>
''' <typeparam name="T">The type produced by the parser.</typeparam>
''' <param name="source">The regular expression match results.</param>
''' <param name="[ctype]">The object parser, applied to the text of each match.</param>
''' <returns>The parsed objects, in match order.</returns>
<Extension>
Public Function ToArray(Of T)(source As MatchCollection, [ctype] As Func(Of String, T)) As T()
    ' The project's LinqAPI.Exec helper turns the sequence into the result array.
    Return LinqAPI.Exec(Of T)() <=
        source.Cast(Of Match)().Select(Function(hit) [ctype](hit.Value))
End Function
223 | |
''' <summary>
''' Tests whether the entire input string equals the text matched by the pattern.
''' </summary>
''' <param name="s">The input string to test.</param>
''' <param name="pattern">The regular expression pattern.</param>
''' <param name="opt">The regular expression options used for the match.</param>
''' <returns>
''' True when the input is not empty and the first match of the pattern is
''' the whole input string; otherwise false.
''' </returns>
<Extension>
Public Function IsPattern(s$, pattern$, Optional opt As RegexOptions = RegexICSng) As Boolean
    ' 2018-6-1: a failed match yields an empty matched value, and an empty input
    ' would compare equal to that empty value. Without this guard an empty string
    ' would be reported as fully matched by any pattern.
    If s.StringEmpty Then
        Return False
    End If

    Dim hit As Match = Regex.Match(s, pattern, opt)

    Return hit.Value = s
End Function
237 | |
''' <summary>
''' Builds a <see cref="Regex"/> from a raw pattern string, emulating how a regular
''' expression is created from a raw string in Python. The expression is created with
''' <see cref="RegexOptions.Multiline"/> and <see cref="RegexOptions.IgnorePatternWhitespace"/>.
''' </summary>
''' <param name="raw">The raw regular expression pattern text.</param>
''' <returns>A new regular expression object for the pattern.</returns>
<Extension>
Public Function PythonRawRegexp(raw As String) As Regex
    Dim rawStringMode As RegexOptions = RegexOptions.IgnorePatternWhitespace Or RegexOptions.Multiline
    Dim expression As New Regex(raw, rawStringMode)

    Return expression
End Function
246 | |
''' <summary>
''' Finds where the pattern first matches in the input string. The returned
''' position is 1-based; zero is returned when nothing is found.
''' </summary>
''' <param name="str">The input string to search.</param>
''' <param name="pattern">The regular expression pattern.</param>
''' <param name="opt">The regular expression options used for the match.</param>
''' <returns>
''' The 1-based start position of the match, or 0 when the pattern does not
''' match or the matched text is empty.
''' </returns>
<Extension>
Public Function Locates(str$, pattern$, Optional opt As RegexOptions = RegexICSng) As Integer
    Dim hit As Match = Regex.Match(str, pattern, opt)

    ' An empty match is reported as "not found", the same as a failed match.
    If Not hit.Success OrElse hit.Length = 0 Then
        Return 0
    End If

    ' Use the position reported by the regex engine. Searching the input again
    ' for the matched text returns the first textual occurrence, which is the
    ' wrong place whenever the pattern depends on context (anchors, \b,
    ' lookarounds, right-to-left matching): "\bcat\b" in "concat cat" matches
    ' at position 8, while a plain text search for "cat" finds position 4.
    Return hit.Index + 1
End Function
264 | End Module |