1 | #Region "Microsoft.VisualBasic::1714b08589beffbc1a086cc0914198cf, Microsoft.VisualBasic.Core\Extensions\Math\NumberGroups.vb" |
2 | |
3 | ' Author: |
4 | ' |
5 | ' asuka (amethyst.asuka@gcmodeller.org) |
6 | ' xie (genetics@smrucc.org) |
7 | ' xieguigang (xie.guigang@live.com) |
8 | ' |
9 | ' Copyright (c) 2018 GPL3 Licensed |
10 | ' |
11 | ' |
12 | ' GNU GENERAL PUBLIC LICENSE (GPL3) |
13 | ' |
14 | ' |
15 | ' This program is free software: you can redistribute it and/or modify |
16 | ' it under the terms of the GNU General Public License as published by |
17 | ' the Free Software Foundation, either version 3 of the License, or |
18 | ' (at your option) any later version. |
19 | ' |
20 | ' This program is distributed in the hope that it will be useful, |
21 | ' but WITHOUT ANY WARRANTY; without even the implied warranty of |
22 | ' MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
23 | ' GNU General Public License for more details. |
24 | ' |
25 | ' You should have received a copy of the GNU General Public License |
26 | ' along with this program. If not, see <http://www.gnu.org/licenses/>. |
27 | |
28 | |
29 | |
30 | ' /********************************************************************************/ |
31 | |
32 | ' Summaries: |
33 | |
34 | ' Module NumberGroups |
35 | ' |
36 | ' Function: BinarySearch, (+2 Overloads) GroupBy, GroupByImpl, Groups, Match |
37 | ' Min |
38 | ' Interface IVector |
39 | ' |
40 | ' Properties: Data |
41 | ' |
42 | ' |
43 | ' |
44 | ' Interface INumberTag |
45 | ' |
46 | ' Properties: Tag |
47 | ' |
48 | ' |
49 | ' /********************************************************************************/ |
50 | |
51 | #End Region |
52 | |
53 | Imports System.Runtime.CompilerServices |
54 | Imports Microsoft.VisualBasic.ComponentModel.DataSourceModel |
55 | Imports Microsoft.VisualBasic.ComponentModel.TagData |
56 | Imports Microsoft.VisualBasic.Language |
57 | Imports Microsoft.VisualBasic.Linq |
58 | Imports Microsoft.VisualBasic.Math.Statistics |
59 | Imports Microsoft.VisualBasic.Parallel |
60 | |
61 | Namespace Math |
62 | |
63 | ''' <summary> |
64 | ''' Simple number vector grouping |
65 | ''' </summary> |
66 | Public Module NumberGroups |
67 | |
68 | ''' <summary> |
69 | ''' The numeric vector model |
70 | ''' </summary> |
71 | Public Interface IVector |
72 | ReadOnly Property Data As Double() |
73 | End Interface |
74 | |
75 | <Extension> |
76 | Public Function Match(Of T As IVector)(a As IEnumerable(Of T), b As IEnumerable(Of T)) As Double |
77 | Dim target As New List(Of T)(a) |
78 | Dim mins = b.Select(Function(x) target.Min(x)) |
79 | Dim result As Double = mins.Sum(Function(tt) tt.Tag) |
80 | |
81 | With target |
82 | For Each x In mins.Select(Function(o) o.Value) |
83 | Call .Remove(item:=x) |
84 | If .Count = 0 Then |
85 | Exit For |
86 | End If |
87 | Next |
88 | End With |
89 | |
90 | Return result * (target.Count + 1) |
91 | End Function |
92 | |
93 | ''' <summary> |
94 | ''' 计算出<paramref name="target"/>集合之众的与<paramref name="v"/>距离最小的元素 |
95 | ''' (或者说是匹配度最高的元素) |
96 | ''' </summary> |
97 | ''' <typeparam name="T"></typeparam> |
98 | ''' <param name="target"></param> |
99 | ''' <param name="v"></param> |
100 | ''' <returns></returns> |
101 | <Extension> |
102 | Public Function Min(Of T As IVector)(target As IEnumerable(Of T), v As T) As DoubleTagged(Of T) |
103 | Dim minV# = Double.MaxValue |
104 | Dim minX As T |
105 | Dim vector#() = v.Data |
106 | |
107 | For Each x As T In target |
108 | Dim d# = x.Data.EuclideanDistance(vector) |
109 | |
110 | If d < minV Then |
111 | minV = d |
112 | minX = x |
113 | End If |
114 | Next |
115 | |
116 | Return New DoubleTagged(Of T) With { |
117 | .Tag = minV, |
118 | .Value = minX |
119 | } |
120 | End Function |
121 | |
122 | ''' <summary> |
123 | ''' Returns ``-1`` means no search result |
124 | ''' </summary> |
125 | ''' <param name="seq"></param> |
126 | ''' <param name="target#"></param> |
127 | ''' <param name="equals"></param> |
128 | ''' <returns></returns> |
129 | <Extension> |
130 | Public Function BinarySearch(seq As IEnumerable(Of Double), target#, equals As GenericLambda(Of Double).IEquals) As Double |
131 | With seq _ |
132 | .SeqIterator _ |
133 | .OrderBy(Function(x) x.value) _ |
134 | .ToArray |
135 | |
136 | Dim x As SeqValue(Of Double) |
137 | Dim min% = 0 |
138 | Dim max% = .Length - 1 |
139 | Dim index% |
140 | Dim value# |
141 | |
142 | If max = -1 Then |
143 | ' no elements |
144 | Return -1 |
145 | ElseIf max = 0 Then |
146 | ' one element |
147 | If equals(.ByRef(0).value, target) Then |
148 | Return 0 |
149 | Else |
150 | ' 序列只有一个元素,但是不相等,则返回-1,否则后面的while会无限死循环 |
151 | Return -1 |
152 | End If |
153 | End If |
154 | |
155 | Do While max <> (min + 1) |
156 | index = (max - min) / 2 + min |
157 | x = .ByRef(index) |
158 | value = x.value |
159 | |
160 | If equals(target, value) Then |
161 | Return x.i |
162 | ElseIf target > value Then |
163 | min = index |
164 | Else |
165 | max = index |
166 | End If |
167 | Loop |
168 | |
169 | If equals(.ByRef(min).value, target) Then |
170 | Return .ByRef(min).i |
171 | ElseIf equals(.ByRef(max).value, target) Then |
172 | Return .ByRef(max).i |
173 | Else |
174 | Return -1 |
175 | End If |
176 | End With |
177 | End Function |
178 | |
179 | ''' <summary> |
180 | ''' 将一维的数据按照一定的偏移量分组输出 |
181 | ''' </summary> |
182 | ''' <param name="source"></param> |
183 | ''' <returns></returns> |
184 | <Extension> Public Iterator Function GroupBy(Of T)(source As IEnumerable(Of T), |
185 | evaluate As Func(Of T, Double), |
186 | equals As GenericLambda(Of Double).IEquals, |
187 | Optional parallel As Boolean = False) As IEnumerable(Of NamedCollection(Of T)) |
188 | If Not parallel Then |
189 | |
190 | ' For Each group In source.AsList.GroupByImpl(evaluate, equals) |
191 | ' Yield group |
192 | ' Next |
193 | |
194 | ' 先进行预处理:求值然后进行排序 |
195 | Dim tagValues = source _ |
196 | .Select(Function(o) (evaluate(o), o)) _ |
197 | .OrderBy(Function(o) o.Item1) _ |
198 | .ToArray |
199 | Dim means As New Average |
200 | Dim members As New List(Of T) |
201 | |
202 | ' 根据分组的平均值来进行分组操作 |
203 | For Each x As (val#, o As T) In tagValues |
204 | If means.N = 0 Then |
205 | means += x.Item1 |
206 | members += x.Item2 |
207 | Else |
208 | If equals(means.Average, x.Item1) Then |
209 | means += x.Item1 |
210 | members += x.Item2 |
211 | Else |
212 | Yield New NamedCollection(Of T)(CStr(means.Average), members) |
213 | |
214 | means = New Average({x.Item1}) |
215 | members = New List(Of T) From {x.Item2} |
216 | End If |
217 | End If |
218 | Next |
219 | |
220 | If members > 0 Then |
221 | Yield New NamedCollection(Of T)(CStr(means.Average), members) |
222 | End If |
223 | Else |
224 | Dim partitions = source _ |
225 | .SplitIterator(20000) _ |
226 | .AsParallel _ |
227 | .Select(Function(part) |
228 | Return part.AsList.GroupByImpl(evaluate, equals) |
229 | End Function) _ |
230 | .IteratesALL _ |
231 | .AsList |
232 | |
233 | ' 先分割,再对name做分组 |
234 | Dim union = partitions.GroupByImpl(Function(part) Val(part.Name), equals) |
235 | |
236 | For Each unionGroup In union |
237 | Dim name$ = unionGroup.Name |
238 | Dim data = unionGroup _ |
239 | .Value _ |
240 | .Select(Function(member) member.Value) _ |
241 | .IteratesALL _ |
242 | .ToArray |
243 | |
244 | Yield New NamedCollection(Of T) With { |
245 | .Name = name, |
246 | .Value = data |
247 | } |
248 | Next |
249 | End If |
250 | End Function |
251 | |
252 | <Extension> |
253 | Private Function GroupByImpl(Of T)(source As List(Of T), evaluate As Func(Of T, Double), equals As GenericLambda(Of Double).IEquals) As NamedCollection(Of T)() |
254 | Dim tmp As New With { |
255 | .avg = New Average({}), |
256 | .list = New List(Of T) |
257 | } |
258 | Dim groups = { |
259 | tmp |
260 | }.AsList * 0 |
261 | |
262 | Do While source.Count > 0 |
263 | Dim x As T = source.Pop |
264 | Dim value# = evaluate(x) |
265 | Dim hit% = groups _ |
266 | .Select(Function(g) |
267 | Return g.avg.Average |
268 | End Function) _ |
269 | .BinarySearch(value, equals) |
270 | |
271 | ' 在这里应该使用二分法查找来加快计算速度的 |
272 | If hit > -1 Then |
273 | With groups(hit) |
274 | .avg += value |
275 | .list.Add(x) |
276 | End With |
277 | Else |
278 | groups += New With { |
279 | .avg = New Average({value}), |
280 | .list = New List(Of T) From {x} |
281 | } |
282 | End If |
283 | Loop |
284 | |
285 | Return groups _ |
286 | .Select(Function(tuple) |
287 | Return New NamedCollection(Of T) With { |
288 | .Name = tuple.avg.Average, |
289 | .Value = tuple.list |
290 | } |
291 | End Function) _ |
292 | .OrderBy(Function(tuple) Val(tuple.Name)) _ |
293 | .ToArray |
294 | End Function |
295 | |
296 | ''' <summary> |
297 | ''' 将一维的数据按照一定的偏移量分组输出 |
298 | ''' </summary> |
299 | ''' <param name="source"></param> |
300 | ''' <param name="offsets"></param> |
301 | ''' <returns></returns> |
302 | ''' |
303 | <MethodImpl(MethodImplOptions.AggressiveInlining)> |
304 | <Extension> Public Function GroupBy(Of T)(source As IEnumerable(Of T), evaluate As Func(Of T, Double), offsets#) As NamedCollection(Of T)() |
305 | Return source.GroupBy(evaluate, equals:=Function(a, b) Abs(a - b) <= offsets) |
306 | End Function |
307 | |
308 | ''' <summary> |
309 | ''' 按照相邻的两个数值是否在offset区间内来进行简单的分组操作 |
310 | ''' </summary> |
311 | ''' <typeparam name="TagObject"></typeparam> |
312 | ''' <param name="source"></param> |
313 | ''' <param name="offset"></param> |
314 | ''' <returns></returns> |
315 | <Extension> |
316 | Public Function Groups(Of TagObject As INumberTag)(source As IEnumerable(Of TagObject), offset As Integer) As GroupResult(Of TagObject, Integer)() |
317 | Dim list As New List(Of GroupResult(Of TagObject, Integer)) |
318 | Dim orders As TagObject() = (From x As TagObject |
319 | In source |
320 | Select x |
321 | Order By x.Tag Ascending).ToArray |
322 | Dim tag As TagObject = orders(Scan0) |
323 | Dim tmp As New List(Of TagObject) From {tag} |
324 | |
325 | For Each x As TagObject In orders.Skip(1) |
326 | If x.Tag - tag.Tag <= offset Then ' 因为已经是经过排序了的,所以后面总是大于前面的 |
327 | tmp += x |
328 | Else |
329 | tag = x |
330 | list += New GroupResult(Of TagObject, Integer)(tag.Tag, tmp) |
331 | tmp = New List(Of TagObject) From {x} |
332 | End If |
333 | Next |
334 | |
335 | If tmp.Count > 0 Then |
336 | list += New GroupResult(Of TagObject, Integer)(tag.Tag, tmp) |
337 | End If |
338 | |
339 | Return list |
340 | End Function |
341 | End Module |
342 | |
343 | Public Interface INumberTag |
344 | ReadOnly Property Tag As Integer |
345 | End Interface |
346 | End Namespace |