| 1 | #Region "Microsoft.VisualBasic::987d69e81c50f3c2f0004fee3b75461d, Microsoft.VisualBasic.Core\Extensions\Trinity\NLP\TextTokens.vb" |
| 2 | |
| 3 | ' Author: |
| 4 | ' |
| 5 | ' asuka (amethyst.asuka@gcmodeller.org) |
| 6 | ' xie (genetics@smrucc.org) |
| 7 | ' xieguigang (xie.guigang@live.com) |
| 8 | ' |
| 9 | ' Copyright (c) 2018 GPL3 Licensed |
| 10 | ' |
| 11 | ' |
| 12 | ' GNU GENERAL PUBLIC LICENSE (GPL3) |
| 13 | ' |
| 14 | ' |
| 15 | ' This program is free software: you can redistribute it and/or modify |
| 16 | ' it under the terms of the GNU General Public License as published by |
| 17 | ' the Free Software Foundation, either version 3 of the License, or |
| 18 | ' (at your option) any later version. |
| 19 | ' |
| 20 | ' This program is distributed in the hope that it will be useful, |
| 21 | ' but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 22 | ' MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 23 | ' GNU General Public License for more details. |
| 24 | ' |
| 25 | ' You should have received a copy of the GNU General Public License |
| 26 | ' along with this program. If not, see <http://www.gnu.org/licenses/>. |
| 27 | |
| 28 | |
| 29 | |
| 30 | ' /********************************************************************************/ |
| 31 | |
| 32 | ' Summaries: |
| 33 | |
| 34 | ' Interface ITokenCount |
| 35 | ' |
| 36 | ' Properties: Count, Id, Token |
| 37 | ' |
| 38 | ' Interface ILink |
| 39 | ' |
| 40 | ' Properties: Count, source, target |
| 41 | ' |
| 42 | ' Module TextTokens |
| 43 | ' |
| 44 | ' Sub: (+2 Overloads) Analysis |
| 45 | ' |
| 46 | ' |
| 47 | ' /********************************************************************************/ |
| 48 | |
| 49 | #End Region |
| 50 | |
| 51 | Imports System.Text.RegularExpressions |
| 52 | Imports Microsoft.VisualBasic.Linq |
| 53 | |
| 54 | Namespace Data.Trinity.NLP |
| 55 | |
''' <summary>
''' Contract of a token node that <see cref="TextTokens"/> creates and counts.
''' </summary>
Public Interface ITokenCount

    ''' <summary>
    ''' Presumably the token text this node stands for; it is not read by the
    ''' analysis routines in this file.
    ''' </summary>
    Property Token As String

    ''' <summary>
    ''' Integer identifier of the node. The analysis routines read it when they
    ''' build a link between two nodes.
    ''' </summary>
    Property Id As Integer

    ''' <summary>
    ''' Occurrence counter; the analysis routines add 1 to it every time the
    ''' token is seen.
    ''' </summary>
    Property Count As Integer
End Interface
| 61 | |
''' <summary>
''' Contract of a link between two token nodes, as created and counted by
''' <see cref="TextTokens"/>.
''' </summary>
Public Interface ILink

    ''' <summary>
    ''' Integer value on the source side of the link; presumably the
    ''' <see cref="ITokenCount.Id"/> of the first node handed to the link factory.
    ''' </summary>
    Property source As Integer

    ''' <summary>
    ''' Integer value on the target side of the link; presumably the
    ''' <see cref="ITokenCount.Id"/> of the second node handed to the link factory.
    ''' </summary>
    Property target As Integer

    ''' <summary>
    ''' Co-occurrence counter; the analysis routines add 1 to it each time the
    ''' linked pair of tokens is visited.
    ''' </summary>
    Property Count As Integer
End Interface
| 67 | |
''' <summary>
''' Natural language processing helpers for string analysis: counts the tokens
''' found in a collection of texts and, optionally, the links between tokens that
''' occur in the same text.
''' </summary>
Public Module TextTokens

    ''' <summary>
    ''' A run of digits with an optional trailing colon. A token that consists of
    ''' nothing but such a run is dropped by <see cref="Tokenize"/>.
    ''' </summary>
    Private ReadOnly numericToken As New Regex("\d+:?")

    ''' <summary>
    ''' Counts the tokens of every text in <paramref name="data"/>.
    ''' </summary>
    ''' <typeparam name="T">Type of the input items.</typeparam>
    ''' <typeparam name="Tnode">Type of the token nodes.</typeparam>
    ''' <param name="data">The items to analyse.</param>
    ''' <param name="getValue">
    ''' Extracts the text of an item. Items whose text is <c>Nothing</c> are skipped.
    ''' </param>
    ''' <param name="nodeNew">
    ''' Factory for a new node; it receives the token and a 1-based number, which
    ''' is the size of <paramref name="nodes"/> before the insert plus one.
    ''' </param>
    ''' <param name="nodes">
    ''' Token to node table that is filled in place. When <c>Nothing</c> is passed,
    ''' a new table is created and handed back through this ByRef parameter.
    ''' </param>
    ''' <param name="ignores">
    ''' Tokens to leave out. Empty entries are dropped and the rest is compared in
    ''' lower case.
    ''' </param>
    Public Sub Analysis(Of T, Tnode As ITokenCount)(
            data As IEnumerable(Of T),
            getValue As Func(Of T, String),
            nodeNew As Func(Of String, Integer, Tnode),
            ByRef nodes As Dictionary(Of String, Tnode),
            Optional ignores As String() = Nothing)

        Dim ignoreSet As HashSet(Of String) = NormalizeIgnores(ignores)

        If nodes Is Nothing Then
            nodes = New Dictionary(Of String, Tnode)
        End If

        For Each x As T In data
            CountTokens(Tokenize(getValue(x), ignoreSet), nodeNew, nodes)
        Next
    End Sub

    ''' <summary>
    ''' Counts the tokens of every text in <paramref name="data"/> and, when
    ''' <paramref name="linkNew"/> is given, the links between distinct tokens
    ''' that occur in the same text.
    ''' </summary>
    ''' <typeparam name="T">Type of the input items.</typeparam>
    ''' <typeparam name="Tnode">Type of the token nodes.</typeparam>
    ''' <typeparam name="Tlink">Type of the token links.</typeparam>
    ''' <param name="data">The items to analyse.</param>
    ''' <param name="getValue">
    ''' Extracts the text of an item. Items whose text is <c>Nothing</c> are skipped.
    ''' </param>
    ''' <param name="nodeNew">
    ''' Factory for a new node; it receives the token and a 1-based number, which
    ''' is the size of <paramref name="nodes"/> before the insert plus one.
    ''' </param>
    ''' <param name="nodes">
    ''' Token to node table that is filled in place. When <c>Nothing</c> is passed,
    ''' a new table is created and handed back through this ByRef parameter.
    ''' </param>
    ''' <param name="linkNew">
    ''' Factory for a new link; it receives the <see cref="ITokenCount.Id"/> of the
    ''' two nodes. When it is <c>Nothing</c>, no links are counted.
    ''' </param>
    ''' <param name="links">
    ''' Link table that is filled in place, keyed by the two tokens in sorted order
    ''' joined with <c>" --> "</c>. When <paramref name="linkNew"/> is given and this
    ''' table is <c>Nothing</c>, a new table is created and handed back through
    ''' this ByRef parameter.
    ''' </param>
    ''' <param name="ignores">
    ''' Tokens to leave out. Empty entries are dropped and the rest is compared in
    ''' lower case.
    ''' </param>
    ''' <remarks>
    ''' Within one text every pair of token positions is visited in both orders,
    ''' so two distinct tokens that each occur once in a text add 2 to the
    ''' <see cref="ILink.Count"/> of their link.
    ''' </remarks>
    Public Sub Analysis(Of T, Tnode As ITokenCount, Tlink As ILink)(
            data As IEnumerable(Of T),
            getValue As Func(Of T, String),
            nodeNew As Func(Of String, Integer, Tnode),
            ByRef nodes As Dictionary(Of String, Tnode),
            Optional linkNew As Func(Of Integer, Integer, Tlink) = Nothing,
            Optional ByRef links As Dictionary(Of String, Tlink) = Nothing,
            Optional ignores As String() = Nothing)

        Dim ignoreSet As HashSet(Of String) = NormalizeIgnores(ignores)
        Dim countLinks As Boolean = linkNew IsNot Nothing

        If nodes Is Nothing Then
            nodes = New Dictionary(Of String, Tnode)
        End If

        ' The link table is optional; without this a caller that supplies linkNew
        ' but omits links would hit a NullReferenceException on the first pair.
        If countLinks AndAlso links Is Nothing Then
            links = New Dictionary(Of String, Tlink)
        End If

        For Each x As T In data
            Dim tokens As String() = Tokenize(getValue(x), ignoreSet)

            CountTokens(tokens, nodeNew, nodes)

            If countLinks Then
                CountTokenLinks(tokens, nodes, linkNew, links)
            End If
        Next
    End Sub

    ''' <summary>
    ''' Builds the lookup set of ignored tokens: empty entries removed, the rest
    ''' lower-cased.
    ''' </summary>
    Private Function NormalizeIgnores(ignores As String()) As HashSet(Of String)
        If ignores Is Nothing Then
            Return New HashSet(Of String)
        End If

        Dim cleaned As IEnumerable(Of String) = ignores _
            .Where(Function(s) Not s.StringEmpty) _
            .Select(Function(s) s.ToLower)

        Return New HashSet(Of String)(cleaned)
    End Function

    ''' <summary>
    ''' Lower-cases the text, splits it on white space and drops the ignored
    ''' tokens as well as the tokens that are fully matched by
    ''' <see cref="numericToken"/>.
    ''' </summary>
    Private Function Tokenize(text As String, ignores As HashSet(Of String)) As String()
        If text Is Nothing Then
            Return New String() {}
        End If

        ' The text is already lower-cased here, so a plain set lookup is enough
        ' to test a token against the (lower-cased) ignore list.
        Return text.ToLower().Split() _
            .Where(Function(s) Not ignores.Contains(s) AndAlso
                               numericToken.Match(s).Value <> s) _
            .ToArray
    End Function

    ''' <summary>
    ''' Adds 1 to the node of every token, creating the node on first sight.
    ''' </summary>
    Private Sub CountTokens(Of Tnode As ITokenCount)(
            tokens As String(),
            nodeNew As Func(Of String, Integer, Tnode),
            nodes As Dictionary(Of String, Tnode))

        For Each s As String In tokens
            Dim node As Tnode = Nothing

            If Not nodes.TryGetValue(s, node) Then
                node = nodeNew(s, nodes.Count + 1)
                nodes(s) = node
            End If

            node.Count += 1
        Next
    End Sub

    ''' <summary>
    ''' Adds 1 to the link of every pair of different tokens, creating the link
    ''' on first sight.
    ''' </summary>
    Private Sub CountTokenLinks(Of Tnode As ITokenCount, Tlink As ILink)(
            tokens As String(),
            nodes As Dictionary(Of String, Tnode),
            linkNew As Func(Of Integer, Integer, Tlink),
            links As Dictionary(Of String, Tlink))

        For Each s As String In tokens
            For Each tt As String In tokens

                ' A token paired with itself is not counted.
                If s = tt Then
                    Continue For
                End If

                ' Sorting the two tokens gives (s, tt) and (tt, s) the same key.
                Dim key As String = {s, tt}.OrderBy(Function(ss) ss).JoinBy(" --> ")
                Dim link As Tlink = Nothing

                If Not links.TryGetValue(key, link) Then
                    link = linkNew(nodes(s).Id, nodes(tt).Id)
                    links(key) = link
                End If

                link.Count += 1
            Next
        Next
    End Sub
End Module
| 160 | End Namespace |