1 | #Region "Microsoft.VisualBasic::524e5682b47e871f38b9f6e90ababafd, Microsoft.VisualBasic.Core\ComponentModel\DataStructures\FuzzyGroup.vb" |
2 | |
3 | ' Author: |
4 | ' |
5 | ' asuka (amethyst.asuka@gcmodeller.org) |
6 | ' xie (genetics@smrucc.org) |
7 | ' xieguigang (xie.guigang@live.com) |
8 | ' |
9 | ' Copyright (c) 2018 GPL3 Licensed |
10 | ' |
11 | ' |
12 | ' GNU GENERAL PUBLIC LICENSE (GPL3) |
13 | ' |
14 | ' |
15 | ' This program is free software: you can redistribute it and/or modify |
16 | ' it under the terms of the GNU General Public License as published by |
17 | ' the Free Software Foundation, either version 3 of the License, or |
18 | ' (at your option) any later version. |
19 | ' |
20 | ' This program is distributed in the hope that it will be useful, |
21 | ' but WITHOUT ANY WARRANTY; without even the implied warranty of |
22 | ' MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
23 | ' GNU General Public License for more details. |
24 | ' |
25 | ' You should have received a copy of the GNU General Public License |
26 | ' along with this program. If not, see <http://www.gnu.org/licenses/>. |
27 | |
28 | |
29 | |
30 | ' /********************************************************************************/ |
31 | |
32 | ' Summaries: |
33 | |
34 | ' Module FuzzyGroup |
35 | ' |
36 | ' Function: (+2 Overloads) FuzzyGroups |
37 | ' Structure __groupHelper |
38 | ' |
39 | ' Function: Equals, ToString |
40 | ' |
41 | ' |
42 | ' |
43 | ' |
44 | ' /********************************************************************************/ |
45 | |
46 | #End Region |
47 | |
48 | Imports System.Runtime.CompilerServices |
49 | Imports Microsoft.VisualBasic.ComponentModel.Collection.Generic |
50 | Imports Microsoft.VisualBasic.Language |
51 | Imports Microsoft.VisualBasic.Linq |
52 | Imports Microsoft.VisualBasic.Parallel |
53 | Imports Microsoft.VisualBasic.Parallel.Linq |
54 | Imports Microsoft.VisualBasic.Serialization.JSON |
55 | Imports Microsoft.VisualBasic.Text.Levenshtein |
56 | |
57 | Namespace ComponentModel.Collection |
58 | |
59 | ''' <summary> |
60 | ''' 对数据进行分组,通过标签数据的相似度 |
61 | ''' </summary> |
62 | Public Module FuzzyGroup |
63 | |
64 | ''' <summary> |
65 | ''' Grouping objects in a collection based on their <see cref="INamedValue.Key"/> string Fuzzy equals to others'. |
66 | ''' </summary> |
67 | ''' <typeparam name="T"></typeparam> |
68 | ''' <param name="source"></param> |
69 | ''' <param name="cut">字符串相似度的阈值</param> |
70 | ''' <returns></returns> |
71 | <Extension> |
72 | Public Function FuzzyGroups(Of T As INamedValue)( |
73 | source As IEnumerable(Of T), |
74 | Optional cut As Double = 0.6, |
75 | Optional parallel As Boolean = False) As GroupResult(Of T, String)() |
76 | |
77 | Return source.FuzzyGroups(Function(x) x.Key, cut, parallel).ToArray |
78 | End Function |
79 | |
80 | ''' <summary> |
81 | ''' Grouping objects in a collection based on their unique key string Fuzzy equals to others'. |
82 | ''' </summary> |
83 | ''' <typeparam name="T"></typeparam> |
84 | ''' <param name="source"></param> |
85 | ''' <param name="getKey">The unique key provider</param> |
86 | ''' <param name="cut">字符串相似度的阈值</param> |
87 | ''' <returns></returns> |
88 | ''' <remarks> |
89 | ''' 由于list在查找方面的速度非常的慢,而字典可能在生成的时候会慢一些,但是查找很快,所以在这里函数里面使用字典来替代列表 |
90 | ''' </remarks> |
91 | <Extension> |
92 | Public Iterator Function FuzzyGroups(Of T)( |
93 | source As IEnumerable(Of T), |
94 | getKey As Func(Of T, String), |
95 | Optional cut As Double = 0.6, |
96 | Optional parallel As Boolean = False) As IEnumerable(Of GroupResult(Of T, String)) |
97 | |
98 | Dim tmp As New List(Of __groupHelper(Of T)) |
99 | Dim buf As List(Of __groupHelper(Of T)) = |
100 | LinqAPI.MakeList(Of __groupHelper(Of T)) <= From x As T |
101 | In source |
102 | Let s_key As String = getKey(x) |
103 | Select New __groupHelper(Of T) With { |
104 | .cut = cut, |
105 | .key = s_key, |
106 | .keyASC = s_key.Select(AddressOf Asc).ToArray, |
107 | .x = x |
108 | } |
109 | Dim out As GroupResult(Of T, String) |
110 | Dim lhash As Dictionary(Of __groupHelper(Of T), Object) = |
111 | buf.ToDictionary(Function(x) x, Function(x) Nothing) |
112 | |
113 | If parallel Then |
114 | Call "Fuzzy grouping running in parallel mode...".__DEBUG_ECHO |
115 | End If |
116 | |
117 | Do While lhash.Count > 0 |
118 | Dim ref As __groupHelper(Of T) = lhash.First.Key |
119 | |
120 | Call tmp.Clear() |
121 | Call tmp.Add(ref) ' 重置缓存 |
122 | Call lhash.Remove(ref) ' 写入Group的参考数据 |
123 | |
124 | If parallel Then |
125 | tmp += LQuerySchedule.LQuery(lhash.Keys, Function(x) x, where:=Function(x) ref.Equals(x:=x)) |
126 | Else |
127 | For Each x As __groupHelper(Of T) In lhash.Values |
128 | If ref.Equals(x:=x) Then |
129 | Call tmp.Add(x) |
130 | End If |
131 | Next |
132 | End If |
133 | |
134 | Call Console.Write("-") |
135 | |
136 | For Each x As __groupHelper(Of T) In tmp |
137 | Call lhash.Remove(x) |
138 | Next |
139 | |
140 | Call Console.Write("*") |
141 | |
142 | out = New GroupResult(Of T, String) With { |
143 | .Group = tmp.Select(Function(x) x.x).ToArray, |
144 | .Tag = ref.key |
145 | } |
146 | Yield out |
147 | Loop |
148 | End Function |
149 | |
150 | ''' <summary> |
151 | ''' 分组操作的内部帮助类 |
152 | ''' </summary> |
153 | ''' <typeparam name="T"></typeparam> |
154 | Private Structure __groupHelper(Of T) |
155 | |
156 | ''' <summary> |
157 | ''' Key for represent this object. |
158 | ''' </summary> |
159 | Public key As String |
160 | ''' <summary> |
161 | ''' Target element object in the grouping |
162 | ''' </summary> |
163 | Public x As T |
164 | Public cut As Double |
165 | ''' <summary> |
166 | ''' Key cache |
167 | ''' </summary> |
168 | Public keyASC As Integer() |
169 | |
170 | Public Overrides Function ToString() As String |
171 | Return Me.GetJson |
172 | End Function |
173 | |
174 | ''' <summary> |
175 | ''' 判断Key是否模糊相等 |
176 | ''' </summary> |
177 | ''' <param name="x"></param> |
178 | ''' <returns></returns> |
179 | Public Overloads Function Equals(x As __groupHelper(Of T)) As Boolean |
180 | Dim edits As DistResult = ComputeDistance( |
181 | keyASC, x.keyASC, |
182 | Function(a, b) a = b, |
183 | AddressOf Chr) |
184 | |
185 | If edits Is Nothing Then |
186 | Return False |
187 | Else |
188 | Return edits.MatchSimilarity >= cut |
189 | End If |
190 | End Function |
191 | End Structure |
192 | End Module |
193 | End Namespace |