1 | #Region "Microsoft.VisualBasic::1099d60b78992290328f879c558bae05, Microsoft.VisualBasic.Core\ComponentModel\DataSource\Tsv.vb" |
2 | |
3 | ' Author: |
4 | ' |
5 | ' asuka (amethyst.asuka@gcmodeller.org) |
6 | ' xie (genetics@smrucc.org) |
7 | ' xieguigang (xie.guigang@live.com) |
8 | ' |
9 | ' Copyright (c) 2018 GPL3 Licensed |
10 | ' |
11 | ' |
12 | ' GNU GENERAL PUBLIC LICENSE (GPL3) |
13 | ' |
14 | ' |
15 | ' This program is free software: you can redistribute it and/or modify |
16 | ' it under the terms of the GNU General Public License as published by |
17 | ' the Free Software Foundation, either version 3 of the License, or |
18 | ' (at your option) any later version. |
19 | ' |
20 | ' This program is distributed in the hope that it will be useful, |
21 | ' but WITHOUT ANY WARRANTY; without even the implied warranty of |
22 | ' MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
23 | ' GNU General Public License for more details. |
24 | ' |
25 | ' You should have received a copy of the GNU General Public License |
26 | ' along with this program. If not, see <http://www.gnu.org/licenses/>. |
27 | |
28 | |
29 | |
30 | ' /********************************************************************************/ |
31 | |
32 | ' Summaries: |
33 | |
34 | ' Module TsvFileIO |
35 | ' |
36 | ' Function: GetTsvHeader, Load, LoadByIndex, LoadFile |
37 | ' |
38 | ' |
39 | ' /********************************************************************************/ |
40 | |
41 | #End Region |
42 | |
43 | Imports System.IO |
44 | Imports System.Reflection |
45 | Imports System.Runtime.CompilerServices |
46 | Imports System.Text |
47 | Imports Microsoft.VisualBasic.ComponentModel.Collection |
48 | Imports Microsoft.VisualBasic.ComponentModel.DataSourceModel.SchemaMaps |
49 | Imports Microsoft.VisualBasic.Language |
50 | Imports Microsoft.VisualBasic.Language.Default |
51 | Imports Microsoft.VisualBasic.Linq |
52 | Imports Microsoft.VisualBasic.Text |
53 | Imports FieldTuple = System.Collections.Generic.KeyValuePair(Of String, System.Reflection.PropertyInfo) |
54 | Imports RowTokens = System.Collections.Generic.IEnumerable(Of System.String) |
55 | |
56 | Namespace ComponentModel.DataSourceModel |
57 | |
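''' <summary>
''' Reads a text file and splits every line into its column tokens. Because no custom attribute is used to mark
''' the column names, this very simple tsv loader requires the property names to be exactly the same as the column names.
''' In addition, the property types can not be non-primitive data types: this module only provides simple data type
''' conversions. It is just a simple built-in tsv file reader module.
''' </summary>
''' <remarks></remarks>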
63 | Public Module TsvFileIO |
64 | |
''' <summary>
''' Columns indexing by title. Parses the rows of a tsv file and deserializes each data row
''' into a new object of type <typeparamref name="T"/>, binding columns to properties by the
''' titles found in the first line of the file.
''' </summary>
''' <typeparam name="T">The row type, an instance is created for every data row via <see cref="Activator.CreateInstance(Type)"/>.</typeparam>
''' <param name="path">The path of the tsv file.</param>
''' <param name="encoding">The text encoding of the file, UTF8 by default.</param>
''' <returns>An iterator that yields one object for each data row (the title line is skipped).</returns>
Public Iterator Function Load(Of T As Class)(path$, Optional encoding As Encodings = Encodings.UTF8) As IEnumerable(Of T)
    Dim data As IEnumerable(Of RowTokens) = TsvFileIO.LoadFile(path, encoding.CodePage, skipFirstLine:=True)
    Dim tableSchema = DataFramework.Schema(Of T)(PropertyAccess.ReadWrite, True)
    Dim type As Type = GetType(T)
    Dim schemaOrdinals As Index(Of String)

    ' The reader is only required for reading the title line,
    ' dispose it right away so that the file handle is not leaked.
    Using reader As StreamReader = path.OpenReader(encoding.CodePage)
        schemaOrdinals = reader.GetTsvHeader(False)
    End Using

    ' The column ordinal of a property is the same for all rows,
    ' so resolve it once here instead of once per row.
    Dim bindings As KeyValuePair(Of Integer, PropertyInfo)() = tableSchema _
        .Select(Function(field As FieldTuple)
                    Return New KeyValuePair(Of Integer, PropertyInfo)(schemaOrdinals(field.Key), field.Value)
                End Function) _
        .ToArray

    For Each line As String() In data.Select(Function(r) DirectCast(r, String()))
        Dim o As Object = Activator.CreateInstance(type)

        For Each binding As KeyValuePair(Of Integer, PropertyInfo) In bindings
            Dim index As Integer = binding.Key

            ' A property without a matching column (presumably reported as a negative
            ' ordinal by the index - TODO confirm) or a row that is shorter than the
            ' title line keeps its default value instead of crashing on line(index).
            If index < 0 OrElse index >= line.Length Then
                Continue For
            End If

            Dim value As Object = Scripting.CTypeDynamic(line(index), binding.Value.PropertyType)

            Call binding.Value.SetValue(o, value)
        Next

        Yield DirectCast(o, T)
    Next
End Function
99 | |
''' <summary>
''' Columns indexing by position. Every line of the file is treated as a data row (no title line is skipped)
''' and each column is assigned to the property whose <see cref="DataFrameColumnAttribute"/> index matches
''' the column position.
''' </summary>
''' <typeparam name="T">The row type, an instance is created for every row via <see cref="Activator.CreateInstance(Type)"/>.</typeparam>
''' <param name="path">The path of the tsv file.</param>
''' <param name="encoding">The text encoding of the file, UTF8 by default.</param>
''' <param name="base">
''' By default the array index is ZERO based, or you can specific this array index base from 1, or any other integer values
''' </param>
''' <returns>An iterator that yields one object for each row; nothing is yielded for an empty file.</returns>
Public Iterator Function LoadByIndex(Of T As Class)(path$, Optional encoding As Encodings = Encodings.UTF8, Optional base% = 0) As IEnumerable(Of T)
    Dim data As IEnumerable(Of RowTokens) = TsvFileIO.LoadFile(path, encoding.CodePage, skipFirstLine:=False)
    Dim firstRow As RowTokens = data.FirstOrDefault

    ' An empty file has no first row for measuring the column count:
    ' end the iteration instead of failing inside First().
    If firstRow Is Nothing Then
        Return
    End If

    Dim type As Type = GetType(T)
    ' Attribute index => mapped member. No sorting is needed before building a dictionary.
    Dim index = DataFrameColumnAttribute _
        .LoadMapping(type, mapsAll:=False) _
        .Values _
        .ToDictionary(Function(i) i.field.Index)
    ' One slot for each column of the first row, slot n is bound to the attribute index (n + base offset)
    Dim fields As PropertyInfo() = firstRow _
        .Count _
        .SeqIterator(offset:=base) _
        .Select(Function(i)
                    ' The tsv file have 10 columns, but only have 7 columns was indexed in target class schema type
                    ' Set all of the no-indexed column in tsv file its reader property to nothing.
                    If index.ContainsKey(i) Then
                        Return DirectCast(index(i).member, PropertyInfo)
                    Else
                        Return Nothing
                    End If
                End Function) _
        .ToArray

    For Each line As RowTokens In data
        Dim o = Activator.CreateInstance(type)
        Dim ordinal As Integer = 0

        For Each token As String In line
            ' Columns beyond the width of the first row have no slot in the fields array, ignore them
            If ordinal < fields.Length Then
                Dim target As PropertyInfo = fields(ordinal)

                If target IsNot Nothing Then
                    Call target.SetValue(
                        obj:=o,
                        value:=Scripting.CTypeDynamic(token, target.PropertyType))
                End If
            End If

            ordinal += 1
        Next

        Yield DirectCast(o, T)
    Next
End Function
154 | |
''' <summary>
''' The fallback header processor: an identity function which gives back its input string unchanged.
''' It is combined with the optional <c>process</c> argument of <see cref="GetTsvHeader"/>.
''' </summary>
Private ReadOnly withoutProcess As DefaultValue(Of Func(Of String, String)) =
    New DefaultValue(Of Func(Of String, String))(Function(text) text)
159 | |
''' <summary>
''' Reads one line from the given reader, splits it by the TAB character and builds
''' an index of the resulting column titles.
''' </summary>
''' <param name="stream">The text reader, one line is consumed from its current position.</param>
''' <param name="lower">Convert all of the column titles to lower case before indexing them?</param>
''' <param name="process">
''' An optional function that is applied to every column title; <see cref="withoutProcess"/> is
''' supplied as the default through the <c>Or</c> operator when this argument is omitted.
''' </param>
''' <returns>The index of the column titles; built from an empty title array when the reader has no more lines.</returns>
''' <remarks>
''' Original author note: on the Linux platform, mono seems to have a bug here,
''' <see cref="StreamReader.ReadLine()"/> was observed to always output a null value.
''' </remarks>
<Extension>
Public Function GetTsvHeader(stream As StreamReader,
                             Optional lower As Boolean = False,
                             Optional process As Func(Of String, String) = Nothing) As Index(Of String)

    Dim line$ = stream.ReadLine
    Dim headers$()

    If line Is Nothing Then
        ' ReadLine returns Nothing at the end of the stream (an empty file for example),
        ' use an empty title list instead of failing with a null reference on Split.
        headers = New String() {}
    Else
        headers = line _
            .Split(ASCII.TAB) _
            .Select(selector:=process Or withoutProcess) _
            .ToArray
    End If

    If lower Then
        Return headers _
            .Select(AddressOf Strings.LCase) _
            .Indexing
    Else
        Return New Index(Of String)(headers)
    End If
End Function
189 | |
''' <summary>
''' Reads all lines of the file and splits each of them by the TAB character.
''' </summary>
''' <param name="path">The path of the text file.</param>
''' <param name="encoding">The text encoding; <c>UTF8</c> is supplied as the default through the <c>Or</c> operator when omitted.</param>
''' <param name="skipFirstLine">The first line of the text document maybe is the title headers, skip this line?</param>
''' <returns>The token collection of each line, without the first line if <paramref name="skipFirstLine"/> is true.</returns>
Private Function LoadFile(path$, Optional encoding As Encoding = Nothing, Optional skipFirstLine As Boolean = False) As IEnumerable(Of RowTokens)
    Dim lines As String() = TextDoc.ReadAllLines(path, encoding Or UTF8)
    ' Split eagerly into an array, so that the rows can be enumerated more than once by the callers
    Dim rows As RowTokens() = lines _
        .Select(Function(text) DirectCast(Strings.Split(text, vbTab), RowTokens)) _
        .ToArray

    If Not skipFirstLine Then
        Return rows
    End If

    ' Skip the title line
    Return rows.Skip(1)
End Function
211 | End Module |
212 | End Namespace |