1 | #Region "Microsoft.VisualBasic::987d69e81c50f3c2f0004fee3b75461d, Microsoft.VisualBasic.Core\Extensions\Trinity\NLP\TextTokens.vb" |
2 | |
3 | ' Author: |
4 | ' |
5 | ' asuka (amethyst.asuka@gcmodeller.org) |
6 | ' xie (genetics@smrucc.org) |
7 | ' xieguigang (xie.guigang@live.com) |
8 | ' |
9 | ' Copyright (c) 2018 GPL3 Licensed |
10 | ' |
11 | ' |
12 | ' GNU GENERAL PUBLIC LICENSE (GPL3) |
13 | ' |
14 | ' |
15 | ' This program is free software: you can redistribute it and/or modify |
16 | ' it under the terms of the GNU General Public License as published by |
17 | ' the Free Software Foundation, either version 3 of the License, or |
18 | ' (at your option) any later version. |
19 | ' |
20 | ' This program is distributed in the hope that it will be useful, |
21 | ' but WITHOUT ANY WARRANTY; without even the implied warranty of |
22 | ' MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
23 | ' GNU General Public License for more details. |
24 | ' |
25 | ' You should have received a copy of the GNU General Public License |
26 | ' along with this program. If not, see <http://www.gnu.org/licenses/>. |
27 | |
28 | |
29 | |
30 | ' /********************************************************************************/ |
31 | |
32 | ' Summaries: |
33 | |
34 | ' Interface ITokenCount |
35 | ' |
36 | ' Properties: Count, Id, Token |
37 | ' |
38 | ' Interface ILink |
39 | ' |
40 | ' Properties: Count, source, target |
41 | ' |
42 | ' Module TextTokens |
43 | ' |
44 | ' Sub: (+2 Overloads) Analysis |
45 | ' |
46 | ' |
47 | ' /********************************************************************************/ |
48 | |
49 | #End Region |
50 | |
51 | Imports System.Text.RegularExpressions |
52 | Imports Microsoft.VisualBasic.Linq |
53 | |
54 | Namespace Data.Trinity.NLP |
55 | |
''' <summary>
''' Contract for a counted token node, as consumed by the <see cref="TextTokens"/> analysis routines.
''' </summary>
Public Interface ITokenCount
    ''' <summary>
    ''' Presumably the token text this node stands for. The analysis routines never read or
    ''' write this property themselves; they only pass the token string to the node factory
    ''' (TODO confirm against implementers).
    ''' </summary>
    Property Token As String
    ''' <summary>
    ''' Identifier of the node. The analysis routines read it when a link between two nodes is
    ''' created; the value itself is expected to be set by the node factory (the factory receives
    ''' the current node count plus one as a candidate id).
    ''' </summary>
    Property Id As Integer
    ''' <summary>
    ''' Occurrence counter. The analysis routines add 1 for every kept occurrence of the token.
    ''' </summary>
    Property Count As Integer
End Interface
61 | |
''' <summary>
''' Contract for a counted link between two token nodes, as consumed by the
''' <see cref="TextTokens"/> analysis routines.
''' </summary>
Public Interface ILink
    ''' <summary>
    ''' Presumably the id of the first node of the link. The analysis routines do not touch this
    ''' property directly; they pass two node ids to the link factory (TODO confirm how the
    ''' factory maps them onto source/target).
    ''' </summary>
    Property source As Integer
    ''' <summary>
    ''' Presumably the id of the second node of the link (see the note on <see cref="source"/>).
    ''' </summary>
    Property target As Integer
    ''' <summary>
    ''' Co-occurrence counter. The analysis routines add 1 every time the token pair behind this
    ''' link is visited.
    ''' </summary>
    Property Count As Integer
End Interface
67 | |
''' <summary>
''' Natural language processing helpers for string analysis: whitespace tokenisation,
''' token occurrence counting and token co-occurrence (link) counting.
''' </summary>
Public Module TextTokens

    ''' <summary>
    ''' A token that is matched completely by this pattern (digits with an optional
    ''' trailing colon, e.g. "12" or "12:") is not counted.
    ''' </summary>
    Private Const NumericTokenPattern As String = "\d+:?"

    ''' <summary>
    ''' Counts token occurrences over all records in <paramref name="data"/>.
    ''' </summary>
    ''' <typeparam name="T">Type of the input records.</typeparam>
    ''' <typeparam name="Tnode">Type of the token node that carries the counter.</typeparam>
    ''' <param name="data">The input records. <c>Nothing</c> is treated as an empty sequence.</param>
    ''' <param name="getValue">
    ''' Extracts the text of a record. The text is lower-cased and split on whitespace;
    ''' a <c>Nothing</c> text yields no tokens.
    ''' </param>
    ''' <param name="nodeNew">
    ''' Node factory; it is called with the token and <c>nodes.Count + 1</c> the first time
    ''' a token is seen.
    ''' </param>
    ''' <param name="nodes">
    ''' Token table, keyed by the lower-cased token. It is updated in place; when the caller
    ''' passes <c>Nothing</c> a new dictionary is created and returned through this parameter.
    ''' </param>
    ''' <param name="ignores">
    ''' Tokens that should not be counted. Entries are compared in lower case; blank entries
    ''' are discarded.
    ''' </param>
    Public Sub Analysis(Of T, Tnode As ITokenCount)(
            data As IEnumerable(Of T),
            getValue As Func(Of T, String),
            nodeNew As Func(Of String, Integer, Tnode),
            ByRef nodes As Dictionary(Of String, Tnode),
            Optional ignores As String() = Nothing)

        If nodes Is Nothing Then
            nodes = New Dictionary(Of String, Tnode)
        End If
        If data Is Nothing Then
            Return
        End If

        Dim stopWords As HashSet(Of String) = NormalizeIgnores(ignores)

        For Each x As T In data
            CountTokens(Tokenize(getValue(x), stopWords), nodeNew, nodes)
        Next
    End Sub

    ''' <summary>
    ''' Counts token occurrences and, when <paramref name="linkNew"/> is given, the
    ''' co-occurrence of every pair of distinct tokens inside the same record.
    ''' </summary>
    ''' <typeparam name="T">Type of the input records.</typeparam>
    ''' <typeparam name="Tnode">Type of the token node that carries the counter.</typeparam>
    ''' <typeparam name="Tlink">Type of the link that carries the co-occurrence counter.</typeparam>
    ''' <param name="data">The input records. <c>Nothing</c> is treated as an empty sequence.</param>
    ''' <param name="getValue">
    ''' Extracts the text of a record. The text is lower-cased and split on whitespace;
    ''' a <c>Nothing</c> text yields no tokens.
    ''' </param>
    ''' <param name="nodeNew">
    ''' Node factory; it is called with the token and <c>nodes.Count + 1</c> the first time
    ''' a token is seen.
    ''' </param>
    ''' <param name="nodes">
    ''' Token table, keyed by the lower-cased token. It is updated in place; when the caller
    ''' passes <c>Nothing</c> a new dictionary is created and returned through this parameter.
    ''' </param>
    ''' <param name="linkNew">
    ''' Link factory; it is called with the ids of the two nodes the first time a pair is seen.
    ''' When it is <c>Nothing</c> no links are collected.
    ''' </param>
    ''' <param name="links">
    ''' Link table, keyed by the two tokens in sorted order joined with " --> ". When links are
    ''' requested and the caller passes <c>Nothing</c>, a new dictionary is created and returned
    ''' through this parameter.
    ''' </param>
    ''' <param name="ignores">
    ''' Tokens that should not be counted. Entries are compared in lower case; blank entries
    ''' are discarded.
    ''' </param>
    Public Sub Analysis(Of T, Tnode As ITokenCount,
                           Tlink As ILink)(
            data As IEnumerable(Of T),
            getValue As Func(Of T, String),
            nodeNew As Func(Of String, Integer, Tnode),
            ByRef nodes As Dictionary(Of String, Tnode),
            Optional linkNew As Func(Of Integer, Integer, Tlink) = Nothing,
            Optional ByRef links As Dictionary(Of String, Tlink) = Nothing,
            Optional ignores As String() = Nothing)

        If nodes Is Nothing Then
            nodes = New Dictionary(Of String, Tnode)
        End If
        ' The link table is optional: create it on demand so that asking for links
        ' without supplying a table no longer fails with a NullReferenceException.
        If linkNew IsNot Nothing AndAlso links Is Nothing Then
            links = New Dictionary(Of String, Tlink)
        End If
        If data Is Nothing Then
            Return
        End If

        Dim stopWords As HashSet(Of String) = NormalizeIgnores(ignores)

        For Each x As T In data
            Dim tokens As String() = Tokenize(getValue(x), stopWords)

            CountTokens(tokens, nodeNew, nodes)

            If linkNew IsNot Nothing Then
                CountLinks(tokens, linkNew, nodes, links)
            End If
        Next
    End Sub

    ''' <summary>
    ''' Builds the lower-cased lookup set of the tokens to ignore; blank entries are dropped.
    ''' </summary>
    Private Function NormalizeIgnores(ignores As String()) As HashSet(Of String)
        If ignores Is Nothing Then
            Return New HashSet(Of String)
        End If

        Return New HashSet(Of String)(
            ignores _
                .Where(Function(s) Not s.StringEmpty) _
                .Select(Function(s) s.ToLower))
    End Function

    ''' <summary>
    ''' Lower-cases the text, splits it on whitespace and removes ignored and numeric tokens.
    ''' </summary>
    Private Function Tokenize(text As String, ignores As HashSet(Of String)) As String()
        If text Is Nothing Then
            Return New String() {}
        End If

        ' The text is lower-cased before splitting, so a plain lookup in the (also
        ' lower-cased) ignore set is sufficient.
        ' A failed regex match has an empty Value, so the empty tokens produced by
        ' consecutive whitespace are removed by the same comparison.
        Return text.ToLower.Split _
            .Where(Function(s) Not ignores.Contains(s) AndAlso
                               Regex.Match(s, NumericTokenPattern).Value <> s) _
            .ToArray
    End Function

    ''' <summary>
    ''' Adds one to the counter of every token, creating the node of a token on first sight.
    ''' </summary>
    Private Sub CountTokens(Of Tnode As ITokenCount)(
            tokens As String(),
            nodeNew As Func(Of String, Integer, Tnode),
            nodes As Dictionary(Of String, Tnode))

        For Each s As String In tokens
            If Not nodes.ContainsKey(s) Then
                nodes(s) = nodeNew(s, nodes.Count + 1)
            End If

            nodes(s).Count += 1
        Next
    End Sub

    ''' <summary>
    ''' Adds one to the link counter for every ordered pair of different tokens of one record.
    ''' </summary>
    ''' <remarks>
    ''' Both (a, b) and (b, a) are visited and map onto the same sorted key, so a pair that
    ''' occurs once in a record is incremented twice (more often when a token is repeated).
    ''' The link itself is created with the node ids in the order of the first visit.
    ''' </remarks>
    Private Sub CountLinks(Of Tnode As ITokenCount, Tlink As ILink)(
            tokens As String(),
            linkNew As Func(Of Integer, Integer, Tlink),
            nodes As Dictionary(Of String, Tnode),
            links As Dictionary(Of String, Tlink))

        For Each s As String In tokens
            For Each tt As String In tokens

                If s = tt Then
                    Continue For ' a token is never linked to itself
                End If

                Dim key As String = {s, tt}.OrderBy(Function(ss) ss).JoinBy(" --> ")

                If Not links.ContainsKey(key) Then
                    links(key) = linkNew(nodes(s).Id, nodes(tt).Id)
                End If

                links(key).Count += 1
            Next
        Next
    End Sub
End Module
160 | End Namespace |