1 |
#Region "Microsoft.VisualBasic::3cd812a048bce23c28e1dd026724f40f, Microsoft.VisualBasic.Core\Text\SearchEngine\TextIndexing\TextIndexing.vb"
|
2 |
|
3 |
|
4 |
|
5 |
|
6 |
|
7 |
|
8 |
|
9 |
|
10 |
|
11 |
|
12 |
|
13 |
|
14 |
|
15 |
|
16 |
|
17 |
|
18 |
|
19 |
|
20 |
|
21 |
|
22 |
|
23 |
|
24 |
|
25 |
|
26 |
|
27 |
|
28 |
|
29 |
|
30 |
|
31 |
|
32 |
|
33 |
|
34 |
|
35 |
|
36 |
|
37 |
|
38 |
|
39 |
|
40 |
|
41 |
|
42 |
|
43 |
|
44 |
|
45 |
#End Region
|
46 |
|
47 |
Imports Microsoft.VisualBasic.CommandLine.Reflection
|
48 |
Imports Microsoft.VisualBasic.ComponentModel
|
49 |
Imports Microsoft.VisualBasic.Language
|
50 |
Imports Microsoft.VisualBasic.Linq.Extensions
|
51 |
Imports Microsoft.VisualBasic.Scripting.MetaData
|
52 |
Imports Microsoft.VisualBasic.Text.Levenshtein
|
53 |
|
54 |
Namespace Text.Search
|
55 |
|
56 |
<Package("Text.Indexing", Publisher:="xie.guigang@gcmodeller.org", Category:=APICategories.UtilityTools)>
|
57 |
Public Class TextIndexing
|
58 |
|
59 |
|
60 |
|
61 |
|
62 |
|
63 |
Public ReadOnly Property cache As TextSegment()
|
64 |
|
65 |
ReadOnly _text As String
|
66 |
ReadOnly _mMatches As Dictionary(Of Integer, String)
|
67 |
ReadOnly _max As Integer
|
68 |
|
69 |
|
70 |
|
71 |
|
72 |
|
73 |
|
74 |
|
75 |
Sub New(text As String, min As Integer, max As Integer)
|
76 |
If min = max Then
|
77 |
cache = __cache(text, max)
|
78 |
Else
|
79 |
cache = LinqAPI.Exec(Of TextSegment) _
|
80 |
_
|
81 |
() <= From d As Integer
|
82 |
In (max - min) _
|
83 |
.Sequence _
|
84 |
.AsParallel
|
85 |
Let len As Integer = min + d
|
86 |
Select __cache(text, len)
|
87 |
End If
|
88 |
|
89 |
_text = text
|
90 |
_max = max
|
91 |
_mMatches = max _
|
92 |
.Sequence _
|
93 |
.ToDictionary(Function(l) l,
|
94 |
Function(d) New String("m"c, d))
|
95 |
|
96 |
Call $"{cache.Length} cache data from length range from {min} to {max}...".__DEBUG_ECHO
|
97 |
End Sub
|
98 |
|
99 |
Public Overrides Function ToString() As String
|
100 |
Return Mid(_text, 1, 120) & "..."
|
101 |
End Function
|
102 |
|
103 |
Private Shared Function __cache(text As String, len As Integer) As TextSegment()
|
104 |
Dim out As New List(Of TextSegment)
|
105 |
|
106 |
For i As Integer = 1 To text.Length - len
|
107 |
Dim s As String = Mid(text, i, len)
|
108 |
|
109 |
|
110 |
out += New TextSegment(s) With {
|
111 |
.Index = i
|
112 |
}
|
113 |
Next
|
114 |
|
115 |
Return out
|
116 |
End Function
|
117 |
|
118 |
|
119 |
|
120 |
|
121 |
<param name="keyword"></param>
|
122 |
<param name="cutoff"></param>
|
123 |
<param name="numPartition">每一个分区之中的元素的数量,负数表示不进行分区</param>
|
124 |
|
125 |
Public Function Found(keyword$, Optional cutoff# = 0.6, Optional numPartition% = 1024) As Map(Of TextSegment, DistResult)()
|
126 |
If numPartition <= 0 Then
|
127 |
Dim out = __indexing(cache, keyword, cutoff, True)
|
128 |
Return out
|
129 |
Else
|
130 |
Return __workParts(keyword, cutoff, numPartition)
|
131 |
End If
|
132 |
End Function
|
133 |
|
134 |
Private Function __workParts(keyword$, cutoff#, numPartitions%) As Map(Of TextSegment, DistResult)()
|
135 |
Dim resultSet As New List(Of Map(Of TextSegment, DistResult))
|
136 |
Dim partitions = cache.Split(numPartitions)
|
137 |
|
138 |
Call $"{partitions.Length} partitions...".__DEBUG_ECHO
|
139 |
|
140 |
Dim LQuery = From part As TextSegment()
|
141 |
In partitions.AsParallel
|
142 |
Select __indexing(
|
143 |
part, keyword, cutoff,
|
144 |
parallel:=False)
|
145 |
|
146 |
For Each part As Map(Of TextSegment, DistResult)() In LQuery
|
147 |
Call resultSet.Add(part)
|
148 |
Next
|
149 |
|
150 |
Return resultSet
|
151 |
End Function
|
152 |
|
153 |
|
154 |
|
155 |
|
156 |
<param name="keyword"></param>
|
157 |
<param name="cutoff">表示出现连续的m匹配的片段的长度,-1表示所搜索的关键词片段的长度一半</param>
|
158 |
|
159 |
Public Function Found(keyword$, Optional cutoff% = -1) As Map(Of TextSegment, DistResult)()
|
160 |
If cutoff = -1 Then
|
161 |
cutoff = Len(keyword) / 2
|
162 |
End If
|
163 |
|
164 |
Dim LQuery = LinqAPI.Exec(Of Map(Of TextSegment, DistResult)) _
|
165 |
_
|
166 |
() <= From piece As TextSegment
|
167 |
In cache
|
168 |
Let levl As DistResult = LevenshteinDistance.ComputeDistance(piece.Array, keyword)
|
169 |
Where Not levl Is Nothing AndAlso IsMatch(levl.DistEdits, cutoff)
|
170 |
Select New Map(Of TextSegment, DistResult) With {
|
171 |
.Key = piece,
|
172 |
.Maps = levl
|
173 |
}
|
174 |
|
175 |
Return LQuery
|
176 |
End Function
|
177 |
|
178 |
|
179 |
函数返回最长的匹配的个数,-1表示没有匹配
|
180 |
|
181 |
<param name="m"></param>
|
182 |
<param name="cutoff"></param>
|
183 |
|
184 |
Public Function IsMatch(m As String, cutoff As Integer) As Integer
|
185 |
For i As Integer = _max To cutoff Step -1
|
186 |
If Not _mMatches.ContainsKey(i) Then
|
187 |
Continue For
|
188 |
End If
|
189 |
|
190 |
|
191 |
|
192 |
Dim cache As String = _mMatches(i)
|
193 |
|
194 |
If InStr(m, cache) > 0 Then
|
195 |
Return i
|
196 |
End If
|
197 |
Next
|
198 |
|
199 |
Return -1
|
200 |
End Function
|
201 |
|
202 |
Private Shared Function __indexing(part As TextSegment(), keyword$, cutoff#, parallel As Boolean) As Map(Of TextSegment, DistResult)()
|
203 |
Dim LQuery = From index As TextSegment
|
204 |
In part.Populate(parallel)
|
205 |
Let levenshtein As DistResult = index Like keyword
|
206 |
Where Not levenshtein Is Nothing AndAlso
|
207 |
levenshtein.Score >= cutoff
|
208 |
Select New Map(Of TextSegment, DistResult) With {
|
209 |
.Key = index,
|
210 |
.Maps = levenshtein
|
211 |
}
|
212 |
|
213 |
Dim out As Map(Of TextSegment, DistResult)() = LQuery.ToArray
|
214 |
Return out
|
215 |
End Function
|
216 |
|
217 |
<ExportAPI("Index.Fuzzy")>
|
218 |
Public Shared Function FuzzyIndex(text As String, keyword As String,
|
219 |
Optional cutoff As Double = 0.6,
|
220 |
Optional min As Integer = 3,
|
221 |
Optional max As Integer = 20) As Map(Of TextSegment, DistResult)()
|
222 |
Dim indexr As New TextIndexing(text, min, max)
|
223 |
Dim searchs = indexr.Found(keyword, cutoff)
|
224 |
Return searchs
|
225 |
End Function
|
226 |
|
227 |
|
228 |
|
229 |
|
230 |
|
231 |
<param name="keyword"></param>
|
232 |
<param name="Matches">
|
233 |
The continues length of the matches, if this value is ZERO or negative value, then the function will using the expression len(keyword)/2 as the default value.
|
234 |
</param>
|
235 |
|
236 |
|
237 |
|
238 |
<ExportAPI("Index.Fuzzy")>
|
239 |
Public Shared Function FuzzyIndex(text$, keyword$,
|
240 |
<Parameter("Cutoff", "The continues length of the matches, if this value is ZERO or negative value, then the function will using the expression len(keyword)/2 as the default value.")>
|
241 |
Optional matches% = -1,
|
242 |
Optional min% = 3,
|
243 |
Optional max% = 20) As Map(Of TextSegment, DistResult)()
|
244 |
|
245 |
Dim indexr As New TextIndexing(text, min, max)
|
246 |
Dim searchs = indexr.Found(keyword, matches)
|
247 |
Return searchs
|
248 |
End Function
|
249 |
End Class
|
250 |
End Namespace
|