1 |
#Region "Microsoft.VisualBasic::d98097891d41c4aca973057a20980e05, Microsoft.VisualBasic.Core\Text\StringSimilarity\Similarity.vb"
|
2 |
|
3 |
|
4 |
|
5 |
|
6 |
|
7 |
|
8 |
|
9 |
|
10 |
|
11 |
|
12 |
|
13 |
|
14 |
|
15 |
|
16 |
|
17 |
|
18 |
|
19 |
|
20 |
|
21 |
|
22 |
|
23 |
|
24 |
|
25 |
|
26 |
|
27 |
|
28 |
|
29 |
|
30 |
|
31 |
|
32 |
|
33 |
|
34 |
|
35 |
|
36 |
|
37 |
|
38 |
|
39 |
|
40 |
|
41 |
|
42 |
|
43 |
|
44 |
|
45 |
|
46 |
|
47 |
|
48 |
|
49 |
|
50 |
#End Region
|
51 |
|
52 |
Imports System.Runtime.CompilerServices
|
53 |
Imports Microsoft.VisualBasic.GenericLambda(Of String)
|
54 |
Imports Microsoft.VisualBasic.Language
|
55 |
Imports Microsoft.VisualBasic.Language.Default
|
56 |
Imports Microsoft.VisualBasic.Text.Levenshtein
|
57 |
|
58 |
Namespace Text.Similarity
|
59 |
|
60 |
|
61 |
|
62 |
|
63 |
|
64 |
Public Delegate Function ISimilarity(s1 As String, s2 As String) As Double
|
65 |
|
66 |
Public Module Evaluations
|
67 |
|
68 |
ReadOnly ignoreCase As New DefaultValue(Of IEquals)(AddressOf tokenEqualsIgnoreCase)
|
69 |
|
70 |
|
71 |
两个字符串之间是通过单词的排布的相似度来比较相似度的
|
72 |
|
73 |
<param name="s1"></param>
|
74 |
<param name="s2"></param>
|
75 |
<param name="ignoreCase"></param>
|
76 |
<param name="cost#"></param>
|
77 |
<param name="dist"></param>
|
78 |
<returns></returns>
|
79 |
Public Function Evaluate(s1$, s2$,
|
80 |
Optional ignoreCase As Boolean = True,
|
81 |
Optional cost# = 0.7,
|
82 |
Optional ByRef dist As DistResult = Nothing) As Double
|
83 |
|
84 |
If String.Equals(s1, s2, If(ignoreCase, StringComparison.OrdinalIgnoreCase, StringComparison.Ordinal)) Then
|
85 |
Return 1
|
86 |
End If
|
87 |
|
88 |
Dim tokenEquals As IEquals = New IEquals(AddressOf Evaluations.tokenEquals) Or Evaluations.ignoreCase.When(ignoreCase)
|
89 |
|
90 |
dist = LevenshteinDistance.ComputeDistance(
|
91 |
s1.Split,
|
92 |
s2.Split,
|
93 |
tokenEquals,
|
94 |
Function(s) s.FirstOrDefault,
|
95 |
cost)
|
96 |
|
97 |
If dist Is Nothing Then
|
98 |
Return 0
|
99 |
Else
|
100 |
Return dist.MatchSimilarity
|
101 |
End If
|
102 |
End Function
|
103 |
|
104 |
<MethodImpl(MethodImplOptions.AggressiveInlining)>
|
105 |
Private Function tokenEquals(w1$, w2$) As Boolean
|
106 |
Return w1$ = w2$
|
107 |
End Function
|
108 |
|
109 |
<MethodImpl(MethodImplOptions.AggressiveInlining)>
|
110 |
Private Function tokenEqualsIgnoreCase(w1$, w2$) As Boolean
|
111 |
Return String.Equals(w1, w2, StringComparison.OrdinalIgnoreCase)
|
112 |
End Function
|
113 |
|
114 |
Public Delegate Function IEvaluate(s1$, s2$, ignoreCase As Boolean, cost#, ByRef dist As DistResult) As Double
|
115 |
|
116 |
|
117 |
计算字符串,这个是直接通过计算字符而非像<see cref="Evaluate"/>方法之中计算单词的
|
118 |
|
119 |
<param name="s1$"></param>
|
120 |
<param name="s2$"></param>
|
121 |
<param name="ignoreCase"></param>
|
122 |
<param name="cost#"></param>
|
123 |
<param name="dist"></param>
|
124 |
<returns></returns>
|
125 |
Public Function LevenshteinEvaluate(s1$, s2$,
|
126 |
Optional ignoreCase As Boolean = True,
|
127 |
Optional cost# = 0.7,
|
128 |
Optional ByRef dist As DistResult = Nothing) As Double
|
129 |
If ignoreCase Then
|
130 |
s1 = s1.ToLower
|
131 |
s2 = s2.ToLower
|
132 |
End If
|
133 |
|
134 |
If s1 = s2 Then
|
135 |
Return 1
|
136 |
End If
|
137 |
|
138 |
dist = LevenshteinDistance.ComputeDistance(s1, s2, cost)
|
139 |
|
140 |
If dist Is Nothing Then
|
141 |
Return 0
|
142 |
Else
|
143 |
Return dist.MatchSimilarity
|
144 |
End If
|
145 |
End Function
|
146 |
|
147 |
|
148 |
以s1为准则,将s2进行比较,返回s2之中的单词在s1之中的排列顺序
|
149 |
|
150 |
<param name="s1"></param>
|
151 |
<param name="s2"></param>
|
152 |
<returns>序列之中的-1表示s2之中的单词在s1之中不存在</returns>
|
153 |
Public Function TokenOrders(s1 As String, s2 As String, Optional caseSensitive As Boolean = False) As Integer()
|
154 |
Dim t1$() = s1.Split
|
155 |
Return t1$.TokenOrders(s2, caseSensitive)
|
156 |
End Function
|
157 |
|
158 |
<MethodImpl(MethodImplOptions.AggressiveInlining)>
|
159 |
<Extension>
|
160 |
Public Function TokenOrders(s1$(), s2$, Optional caseSensitive As Boolean = False) As Integer()
|
161 |
Return TokenOrders(s1, s2.Split.Distinct, caseSensitive)
|
162 |
End Function
|
163 |
|
164 |
<Extension>
|
165 |
Public Function TokenOrders(s1$(), s2 As IEnumerable(Of String), Optional caseSensitive As Boolean = False, Optional fuzzy As Boolean = True) As Integer()
|
166 |
Dim orders As New List(Of Integer)
|
167 |
|
168 |
For Each t$ In s2
|
169 |
orders += s1.Located(t$, caseSensitive, fuzzy)
|
170 |
Next
|
171 |
|
172 |
Return orders
|
173 |
End Function
|
174 |
|
175 |
<MethodImpl(MethodImplOptions.AggressiveInlining)>
|
176 |
<Extension>
|
177 |
Public Function IsOrdered(s1$(), s2$, Optional caseSensitive As Boolean = False) As Boolean
|
178 |
Return s1.IsOrdered(s2.Split, caseSensitive)
|
179 |
End Function
|
180 |
|
181 |
|
182 |
查看<paramref name="s2"/>之中的字符串的顺序是否是在<paramref name="s1"/>之中按顺序排序的
|
183 |
|
184 |
<param name="s1$"></param>
|
185 |
<param name="s2$"></param>
|
186 |
<param name="caseSensitive"></param>
|
187 |
<returns></returns>
|
188 |
<Extension>
|
189 |
Public Function IsOrdered(s1$(), s2$(), Optional caseSensitive As Boolean = False, Optional fuzzy As Boolean = True) As Boolean
|
190 |
Dim orders%() = s1.TokenOrders(s2, caseSensitive, fuzzy)
|
191 |
orders = orders.Where(Function(x) x <> -1).ToArray
|
192 |
|
193 |
If orders.Length = 0 Then
|
194 |
Return False
|
195 |
End If
|
196 |
|
197 |
|
198 |
If orders.SequenceEqual(orders.OrderBy(Function(x) x)) Then
|
199 |
|
200 |
Return True
|
201 |
Else
|
202 |
Return False
|
203 |
End If
|
204 |
End Function
|
205 |
|
206 |
<MethodImpl(MethodImplOptions.AggressiveInlining)>
|
207 |
<Extension>
|
208 |
Public Function IsOrdered(s1$, s2$, Optional caseSensitive As Boolean = False) As Boolean
|
209 |
Return s1.Split.IsOrdered(s2$, caseSensitive)
|
210 |
End Function
|
211 |
|
212 |
<MethodImpl(MethodImplOptions.AggressiveInlining)>
|
213 |
<Extension>
|
214 |
Public Function IsOrdered(s1$, s2$(), Optional caseSensitive As Boolean = False, Optional fuzzy As Boolean = True) As Boolean
|
215 |
Return s1.Split.IsOrdered(s2$, caseSensitive, fuzzy)
|
216 |
End Function
|
217 |
|
218 |
Public Function StringSelection(query As String, collection As IEnumerable(Of String), Optional cutoff# = 0.6, Optional ignoreCase As Boolean = True, Optional tokenBased As Boolean = False) As String
|
219 |
Dim compare As IEvaluate
|
220 |
|
221 |
If tokenBased Then
|
222 |
compare = AddressOf Evaluate
|
223 |
Else
|
224 |
compare = AddressOf LevenshteinEvaluate
|
225 |
End If
|
226 |
|
227 |
Dim LQuery = From s As String
|
228 |
In collection.AsParallel
|
229 |
Let score As Double = compare(query, s, ignoreCase, 0.7, Nothing)
|
230 |
Where score >= cutoff
|
231 |
Select s,
|
232 |
score
|
233 |
Order By score Descending
|
234 |
Dim result = LQuery.FirstOrDefault
|
235 |
|
236 |
If result Is Nothing Then
|
237 |
Return Nothing
|
238 |
Else
|
239 |
Return result.s
|
240 |
End If
|
241 |
End Function
|
242 |
End Module
|
243 |
End Namespace
|