| 1 |
#Region "Microsoft.VisualBasic::280babb704bc1ffe077b26228589e065, Microsoft.VisualBasic.Core\ComponentModel\Algorithm\Levenshtein\LevenshteinDistance.vb"
|
| 2 |
|
| 3 |
|
| 4 |
|
| 5 |
|
| 6 |
|
| 7 |
|
| 8 |
|
| 9 |
|
| 10 |
|
| 11 |
|
| 12 |
|
| 13 |
|
| 14 |
|
| 15 |
|
| 16 |
|
| 17 |
|
| 18 |
|
| 19 |
|
| 20 |
|
| 21 |
|
| 22 |
|
| 23 |
|
| 24 |
|
| 25 |
|
| 26 |
|
| 27 |
|
| 28 |
|
| 29 |
|
| 30 |
|
| 31 |
|
| 32 |
|
| 33 |
|
| 34 |
|
| 35 |
|
| 36 |
|
| 37 |
|
| 38 |
|
| 39 |
|
| 40 |
|
| 41 |
|
| 42 |
|
| 43 |
|
| 44 |
|
| 45 |
|
| 46 |
#End Region
|
| 47 |
|
| 48 |
Imports Microsoft.VisualBasic.CommandLine.Reflection
|
| 49 |
Imports Microsoft.VisualBasic.ComponentModel.Algorithm
|
| 50 |
Imports Microsoft.VisualBasic.ComponentModel.Algorithm.DynamicProgramming
|
| 51 |
Imports Microsoft.VisualBasic.ComponentModel.DataStructures
|
| 52 |
Imports Microsoft.VisualBasic.Linq.Extensions
|
| 53 |
Imports Microsoft.VisualBasic.Net.Protocols
|
| 54 |
Imports Microsoft.VisualBasic.Scripting.MetaData
|
| 55 |
Imports Microsoft.VisualBasic.Text.Xml.Models
|
| 56 |
Imports sys = System.Math
|
| 57 |
|
| 58 |
Namespace Text.Levenshtein
|
| 59 |
|
| 60 |
|
| 61 |
|
| 62 |
|
| 63 |
|
| 64 |
|
| 65 |
|
| 66 |
'''
|
| 67 |
<Package("Distance.Levenshtein",
|
| 68 |
Description:="Implement the Levenshtein Edit Distance algorithm and result data visualization.",
|
| 69 |
Publisher:="furkanavcu",
|
| 70 |
Category:=APICategories.UtilityTools,
|
| 71 |
Url:="http://www.codeproject.com/Tips/697588/Levenshtein-Edit-Distance-Algorithm")>
|
| 72 |
<Cite(Title:="Binary codes capable of correcting deletions, insertions, and reversals",
|
| 73 |
Pages:="707–710", StartPage:=707, Issue:="8", Volume:=10, Authors:="Levenshtein,
|
| 74 |
Vladimir I",
|
| 75 |
Journal:="Soviet Physics Doklady", Year:=1966)>
|
| 76 |
Public Module LevenshteinDistance
|
| 77 |
|
| 78 |
|
| 79 |
|
| 80 |
|
| 81 |
|
| 82 |
|
| 83 |
|
| 84 |
|
| 85 |
Private Function __createTable(reference As Integer(), hypotheses As Integer(), cost As Double) As Double(,)
|
| 86 |
Return CreateTable(Of Integer)(reference, hypotheses, DynamicProgramming.Cost(Of Integer).DefaultCost(cost), AddressOf __int32Equals)
|
| 87 |
End Function
|
| 88 |
|
| 89 |
Private Function __int32Equals(a As Integer, b As Integer) As Boolean
|
| 90 |
Return a = b
|
| 91 |
End Function
|
| 92 |
|
| 93 |
|
| 94 |
|
| 95 |
|
| 96 |
|
| 97 |
|
| 98 |
|
| 99 |
|
| 100 |
|
| 101 |
|
| 102 |
Public Function CreateTable(Of T)(reference As T(), hypotheses As T(), cost As Cost(Of T), equals As GenericLambda(Of T).IEquals) As Double(,)
|
| 103 |
Dim distTable As Double(,) = New Double(reference.Length, hypotheses.Length) {}
|
| 104 |
|
| 105 |
For i As Integer = 0 To reference.Length - 1
|
| 106 |
distTable(i, 0) = i * cost.insert(reference(i))
|
| 107 |
Next
|
| 108 |
|
| 109 |
For j As Integer = 0 To hypotheses.Length - 1
|
| 110 |
distTable(0, j) = j * cost.delete(hypotheses(j))
|
| 111 |
Next
|
| 112 |
|
| 113 |
distTable(reference.Length, 0) = cost.insert(Nothing)
|
| 114 |
distTable(0, hypotheses.Length) = cost.delete(Nothing)
|
| 115 |
|
| 116 |
|
| 117 |
|
| 118 |
|
| 119 |
|
| 120 |
For i As Integer = 1 To reference.Length
|
| 121 |
For j As Integer = 1 To hypotheses.Length
|
| 122 |
|
| 123 |
If equals(reference(i - 1), hypotheses(j - 1)) Then
|
| 124 |
|
| 125 |
distTable(i, j) = distTable(i - 1, j - 1)
|
| 126 |
Else
|
| 127 |
Dim n As Double = sys.Min(
|
| 128 |
distTable(i - 1, j - 1) + cost.substitute(reference(i - 1), hypotheses(j - 1)),
|
| 129 |
distTable(i - 1, j) + cost.delete(reference(i - 1)))
|
| 130 |
distTable(i, j) = sys.Min(n, distTable(i, j - 1) + cost.insert(hypotheses(j - 1)))
|
| 131 |
End If
|
| 132 |
Next
|
| 133 |
Next
|
| 134 |
|
| 135 |
Return distTable
|
| 136 |
End Function
|
| 137 |
|
| 138 |
|
| 139 |
|
| 140 |
|
| 141 |
|
| 142 |
|
| 143 |
|
| 144 |
|
| 145 |
|
| 146 |
|
| 147 |
Public Function ComputeDistance(Of T)(reference As T(), hypotheses As T(), equals As GenericLambda(Of T).IEquals, Optional cost As Double = 0.7) As Double
|
| 148 |
If hypotheses Is Nothing Then hypotheses = New T() {}
|
| 149 |
If reference Is Nothing Then reference = New T() {}
|
| 150 |
|
| 151 |
Dim distTable#(,) = CreateTable(Of T)(reference, hypotheses, DynamicProgramming.Cost(Of T).DefaultCost(cost), equals)
|
| 152 |
Dim i As Integer = reference.Length, j As Integer = hypotheses.Length
|
| 153 |
|
| 154 |
Return distTable(i, j)
|
| 155 |
End Function
|
| 156 |
|
| 157 |
|
| 158 |
|
| 159 |
|
| 160 |
|
| 161 |
|
| 162 |
|
| 163 |
|
| 164 |
|
| 165 |
|
| 166 |
|
| 167 |
Public Function ComputeDistance(Of T)(reference As T(), hypotheses As T(), equals As GenericLambda(Of T).IEquals, asChar As ToChar(Of T), Optional cost As Double = 0.7) As DistResult
|
| 168 |
If hypotheses Is Nothing Then hypotheses = New T() {}
|
| 169 |
If reference Is Nothing Then reference = New T() {}
|
| 170 |
|
| 171 |
Dim distTable#(,) = CreateTable(Of T)(reference, hypotheses, DynamicProgramming.Cost(Of T).DefaultCost(cost), equals)
|
| 172 |
Dim i As Integer = reference.Length,
|
| 173 |
j As Integer = hypotheses.Length
|
| 174 |
Dim sHyp As String = New String(hypotheses.Select(Function(x) asChar(x)).ToArray)
|
| 175 |
Dim sRef As String = New String(reference.Select(Function(x) asChar(x)).ToArray)
|
| 176 |
Dim result As New DistResult With {
|
| 177 |
.Hypotheses = sHyp,
|
| 178 |
.Reference = sRef
|
| 179 |
}
|
| 180 |
Return __computeRoute(sHyp, result, i, j, distTable)
|
| 181 |
End Function
|
| 182 |
|
| 183 |
<ExportAPI("ToHTML", Info:="View distance evolve route of the Levenshtein Edit Distance calculation.")>
|
| 184 |
Public Function GetVisulization(res As DistResult) As String
|
| 185 |
Return res.HTMLVisualize
|
| 186 |
End Function
|
| 187 |
|
| 188 |
<ExportAPI("Write.Xml.DistResult")>
|
| 189 |
Public Function SaveMatch(result As DistResult, SaveTo As String) As Boolean
|
| 190 |
Return result.GetXml.SaveTo(SaveTo)
|
| 191 |
End Function
|
| 192 |
|
| 193 |
Public Delegate Function ToChar(Of T)(x As T) As Char
|
| 194 |
|
| 195 |
|
| 196 |
|
| 197 |
|
| 198 |
|
| 199 |
|
| 200 |
|
| 201 |
|
| 202 |
<ExportAPI("ComputeDistance", Info:="Implement the Levenshtein Edit Distance algorithm.")>
|
| 203 |
Public Function ComputeDistance(reference As Integer(), hypotheses As String, Optional cost As Double = 0.7) As DistResult
|
| 204 |
If hypotheses Is Nothing Then hypotheses = ""
|
| 205 |
If reference Is Nothing Then reference = New Integer() {}
|
| 206 |
|
| 207 |
Dim distTable#(,) = __createTable(reference,
|
| 208 |
hypotheses.Select(Function(ch) Asc(ch)).ToArray,
|
| 209 |
cost)
|
| 210 |
Dim i As Integer = reference.Length,
|
| 211 |
j As Integer = hypotheses.Length
|
| 212 |
Dim result As New DistResult With {
|
| 213 |
.Hypotheses = hypotheses,
|
| 214 |
.Reference = Nothing
|
| 215 |
}
|
| 216 |
Return __computeRoute(hypotheses, result, i, j, distTable)
|
| 217 |
End Function
|
| 218 |
|
| 219 |
Const a As Integer = Asc("a"c)
|
| 220 |
|
| 221 |
Public Function Similarity(Of T)(query As T(), subject As T(), Optional penalty As Double = 0.75) As Double
|
| 222 |
If query.IsNullOrEmpty OrElse subject.IsNullOrEmpty Then
|
| 223 |
Return 0
|
| 224 |
End If
|
| 225 |
|
| 226 |
Dim distinct As T() =
|
| 227 |
(New [Set](query) + New [Set](subject)) _
|
| 228 |
.ToArray _
|
| 229 |
.Select(Function(x) DirectCast(x, T)) _
|
| 230 |
.ToArray
|
| 231 |
Dim dict = (From index As Integer
|
| 232 |
In distinct.Sequence(offSet:=a)
|
| 233 |
Select ch = ChrW(index),
|
| 234 |
obj = distinct(index - a)) _
|
| 235 |
.ToDictionary(Function(x) x.obj,
|
| 236 |
Function(x) x.ch)
|
| 237 |
Dim ref As String = New String(query.Select(Function(x) dict(x)).ToArray)
|
| 238 |
Dim sbj As String = New String(subject.Select(Function(x) dict(x)).ToArray)
|
| 239 |
|
| 240 |
If String.IsNullOrEmpty(ref) OrElse String.IsNullOrEmpty(sbj) Then
|
| 241 |
Return 0
|
| 242 |
End If
|
| 243 |
|
| 244 |
Dim result As DistResult = ComputeDistance(ref, sbj, penalty)
|
| 245 |
If result Is Nothing Then
|
| 246 |
Return 0
|
| 247 |
Else
|
| 248 |
Return result.Score
|
| 249 |
End If
|
| 250 |
End Function
|
| 251 |
|
| 252 |
|
| 253 |
|
| 254 |
|
| 255 |
|
| 256 |
|
| 257 |
|
| 258 |
|
| 259 |
|
| 260 |
|
| 261 |
Private Function __computeRoute(hypotheses$,
|
| 262 |
result As DistResult,
|
| 263 |
i%, j%,
|
| 264 |
distTable#(,)) As DistResult
|
| 265 |
|
| 266 |
Dim css As New List(Of Coordinate)
|
| 267 |
Dim evolve As List(Of Char) = New List(Of Char)
|
| 268 |
Dim edits As New List(Of Char)
|
| 269 |
|
| 270 |
While True
|
| 271 |
|
| 272 |
Call css.Add({i - 1, j})
|
| 273 |
|
| 274 |
If i = 0 AndAlso j = 0 Then
|
| 275 |
Dim evolveRoute As Char() = evolve.ToArray
|
| 276 |
Call Array.Reverse(evolveRoute)
|
| 277 |
Call css.Add({i, j})
|
| 278 |
|
| 279 |
result.DistTable = distTable _
|
| 280 |
.ToVectorList _
|
| 281 |
.Select(Function(vec) New Streams.Array.Double With {
|
| 282 |
.Values = vec
|
| 283 |
}) _
|
| 284 |
.ToArray
|
| 285 |
result.DistEdits = New String(evolveRoute)
|
| 286 |
result.Path = css.ToArray
|
| 287 |
result.Matches = New String(edits.ToArray.Reverse.ToArray)
|
| 288 |
|
| 289 |
Exit While
|
| 290 |
|
| 291 |
ElseIf i = 0 AndAlso j > 0 Then
|
| 292 |
Call evolve.Add("d"c)
|
| 293 |
Call css.Add({i - 1, j})
|
| 294 |
Call edits.Add("-"c)
|
| 295 |
j -= 1
|
| 296 |
|
| 297 |
ElseIf i > 0 AndAlso j = 0 Then
|
| 298 |
Call evolve.Add("i"c)
|
| 299 |
Call css.Add({i - 1, j})
|
| 300 |
Call edits.Add("-"c)
|
| 301 |
|
| 302 |
i -= 1
|
| 303 |
|
| 304 |
Else
|
| 305 |
If distTable(i - 1, j - 1) <= distTable(i - 1, j) AndAlso
|
| 306 |
distTable(i - 1, j - 1) <= distTable(i, j - 1) Then
|
| 307 |
Call css.Add({i - 1, j})
|
| 308 |
If distTable(i - 1, j - 1) = distTable(i, j) Then
|
| 309 |
Call evolve.Add("m"c)
|
| 310 |
Call edits.Add(hypotheses(j - 1))
|
| 311 |
Else
|
| 312 |
Call evolve.Add("s"c)
|
| 313 |
Call edits.Add("-"c)
|
| 314 |
End If
|
| 315 |
|
| 316 |
i -= 1
|
| 317 |
j -= 1
|
| 318 |
|
| 319 |
ElseIf distTable(i - 1, j) < distTable(i, j - 1) Then
|
| 320 |
Call css.Add({i - 1, j})
|
| 321 |
Call evolve.Add("i")
|
| 322 |
Call edits.Add("-"c)
|
| 323 |
i -= 1
|
| 324 |
|
| 325 |
ElseIf distTable(i, j - 1) < distTable(i - 1, j) Then
|
| 326 |
Call css.Add({i - 1, j})
|
| 327 |
Call evolve.Add("d")
|
| 328 |
Call edits.Add("-"c)
|
| 329 |
j -= 1
|
| 330 |
|
| 331 |
End If
|
| 332 |
End If
|
| 333 |
|
| 334 |
If css.Count > 1024 AndAlso css.Count - evolve.Count > 128 Then
|
| 335 |
' Call $"{reference} ==> {hypotheses} stack could not be solve, operation abort!".__DEBUG_ECHO
|
| 336 |
Return Nothing
|
| 337 |
End If
|
| 338 |
End While
|
| 339 |
|
| 340 |
Return result
|
| 341 |
End Function
|
| 342 |
|
| 343 |
|
| 344 |
|
| 345 |
|
| 346 |
|
| 347 |
|
| 348 |
|
| 349 |
|
| 350 |
|
| 351 |
|
| 352 |
<ExportAPI("ComputeDistance", Info:="Implement the Levenshtein Edit Distance algorithm.")>
|
| 353 |
Public Function ComputeDistance(reference$, hypotheses$, Optional cost# = 0.7) As DistResult
|
| 354 |
|
| 355 |
If hypotheses Is Nothing Then hypotheses = ""
|
| 356 |
If reference Is Nothing Then reference = ""
|
| 357 |
|
| 358 |
Dim distTable As Double(,) = __createTable(
|
| 359 |
reference.Select(Function(ch) AscW(ch)).ToArray,
|
| 360 |
hypotheses.Select(Function(ch) AscW(ch)).ToArray,
|
| 361 |
cost)
|
| 362 |
Dim i As Integer = reference.Length,
|
| 363 |
j As Integer = hypotheses.Length
|
| 364 |
Dim result As New DistResult With {
|
| 365 |
.Hypotheses = hypotheses,
|
| 366 |
.Reference = reference
|
| 367 |
}
|
| 368 |
|
| 369 |
Return __computeRoute(hypotheses, result, i, j, distTable)
|
| 370 |
End Function
|
| 371 |
End Module
|
| 372 |
End Namespace
|