1 | #Region "Microsoft.VisualBasic::5baaf99dd798b4a071f66a7c70985bbc, Microsoft.VisualBasic.Core\Extensions\Doc\LargeTextFile.vb" |
2 | |
3 | ' Author: |
4 | ' |
5 | ' asuka (amethyst.asuka@gcmodeller.org) |
6 | ' xie (genetics@smrucc.org) |
7 | ' xieguigang (xie.guigang@live.com) |
8 | ' |
9 | ' Copyright (c) 2018 GPL3 Licensed |
10 | ' |
11 | ' |
12 | ' GNU GENERAL PUBLIC LICENSE (GPL3) |
13 | ' |
14 | ' |
15 | ' This program is free software: you can redistribute it and/or modify |
16 | ' it under the terms of the GNU General Public License as published by |
17 | ' the Free Software Foundation, either version 3 of the License, or |
18 | ' (at your option) any later version. |
19 | ' |
20 | ' This program is distributed in the hope that it will be useful, |
21 | ' but WITHOUT ANY WARRANTY; without even the implied warranty of |
22 | ' MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
23 | ' GNU General Public License for more details. |
24 | ' |
25 | ' You should have received a copy of the GNU General Public License |
26 | ' along with this program. If not, see <http://www.gnu.org/licenses/>. |
27 | |
28 | |
29 | |
30 | ' /********************************************************************************/ |
31 | |
32 | ' Summaries: |
33 | |
34 | ' Module LargeTextFile |
35 | ' |
36 | ' Function: GetLastLine, IteratesStream, IteratesTableData, Merge, Peeks |
37 | ' Tails |
38 | ' |
39 | ' /********************************************************************************/ |
40 | |
41 | #End Region |
42 | |
43 | Imports System.IO |
44 | Imports System.Runtime.CompilerServices |
45 | Imports System.Text |
46 | Imports Microsoft.VisualBasic.CommandLine.Reflection |
47 | Imports Microsoft.VisualBasic.Scripting.MetaData |
48 | Imports Microsoft.VisualBasic.Text |
49 | |
50 | ''' <summary> |
51 | ''' Wrapper for the file operations. |
52 | ''' </summary> |
53 | ''' <remarks></remarks> |
54 | <[Namespace]("Large_Text_File")> |
55 | Public Module LargeTextFile |
56 | |
''' <summary>
''' Lazily iterates the data lines of a very large text file (for example a huge csv/tsv file)
''' without loading the whole file into memory.
''' </summary>
''' <param name="path$">file path</param>
''' <param name="title$">
''' Receives the header line of the file: the first line that follows the skipped lines.
''' It is assigned before this function returns.
''' </param>
''' <param name="skip%">
''' Number of leading lines to discard before the header line is read.
''' A value that is not greater than zero skips nothing.
''' </param>
''' <param name="encoding">Text file encoding.</param>
''' <returns>
''' A deferred, single-pass sequence of the lines that follow the header line.
''' </returns>
''' <remarks>
''' The file reader stays open while the returned sequence is being enumerated and is
''' disposed as soon as the enumeration finishes or its enumerator is disposed (a
''' <c>For Each</c> loop does that automatically). The reader is only released by the
''' enumeration, so the returned sequence should be enumerated exactly once.
''' </remarks>
<Extension>
Public Function IteratesTableData(path$, ByRef title$, Optional skip% = -1, Optional encoding As Encodings = Encodings.ASCII) As IEnumerable(Of String)
    ' No Using block here: the returned iterator runs after this function has
    ' returned, so the reader must outlive this call. Ownership of the reader is
    ' handed over to the iterator below.
    Dim reader As StreamReader = path.OpenReader(encoding.CodePage)

    Try
        Dim remaining% = skip

        ' skip lines
        Do While remaining > 0
            reader.ReadLine()
            remaining -= 1
        Loop

        title = reader.ReadLine
    Catch
        ' The iterator never gets the reader on failure, so release it here.
        reader.Dispose()
        Throw
    End Try

    Return ReadLinesThenDispose(reader)
End Function

''' <summary>
''' Yields every remaining line of the reader, then disposes the reader.
''' </summary>
Private Iterator Function ReadLinesThenDispose(reader As StreamReader) As IEnumerable(Of String)
    Try
        Do While Not reader.EndOfStream
            Yield reader.ReadLine
        Loop
    Finally
        ' Runs on normal completion and when the enumerator is disposed early.
        reader.Dispose()
    End Try
End Function
81 | |
''' <summary>
''' Lazily populates the remaining text lines of the given reader, one line per iteration.
''' </summary>
''' <param name="s">An opened text reader. This function does not close or dispose it.</param>
''' <returns>A deferred sequence of lines, read until the reader reports the end of its stream.</returns>
<Extension>
Public Iterator Function IteratesStream(s As StreamReader) As IEnumerable(Of String)
    Do Until s.EndOfStream
        Dim line As String = s.ReadLine()
        Yield line
    Loop
End Function
88 | |
''' <summary>
''' Reads the leading part of a text file. When a file is too large to be opened in any
''' available text editor, this function can be used to view a portion of its data.
''' </summary>
''' <param name="path">Path of the text file to peek into.</param>
''' <param name="length">Maximum number of characters to read from the start of the file.</param>
''' <returns>
''' At most <paramref name="length"/> characters from the start of the file. The result is
''' shorter when the file holds fewer characters than requested.
''' </returns>
''' <remarks></remarks>
'''
<ExportAPI("Peeks")>
Public Function Peeks(path As String, Optional length% = 5 * 1024) As String
    Dim buffer As Char() = New Char(length - 1) {}
    Dim count As Integer

    Using reader As StreamReader = FileIO.FileSystem.OpenTextFileReader(path)
        ' ReadBlock reports how many characters were actually read; this is less
        ' than the buffer size when the file is shorter than the requested length.
        count = reader.ReadBlock(buffer, 0, buffer.Length)
    End Using

    ' Only keep what was read, otherwise the unused tail of the buffer would be
    ' returned as trailing NUL characters.
    Return New String(buffer, 0, count)
End Function
105 | |
''' <summary>
''' Peek the tails of a large text file.
''' </summary>
''' <param name="path">If the file is not exists, then this function will returns nothing.</param>
''' <param name="length">
''' Peeks of the number of characters. The number of bytes that is read from the end of the
''' file is this value multiplied by (the encoded byte size of the character "a" plus one),
''' limited to the file size.
''' </param>
''' <param name="encoding">Default value is <see cref="DefaultEncoding"/></param>
''' <returns>
''' The decoded text of the bytes that were read from the end of the file, or nothing if the
''' file does not exist. Because spare bytes are read for every requested character, the
''' result may contain more than <paramref name="length"/> characters.
''' </returns>
''' <remarks>
''' Note: if the character encoding is variable-width, the read may start in the middle of
''' a character, so the returned string may begin with garbled characters.
''' </remarks>
<ExportAPI("Tails")>
<Extension>
Public Function Tails(path$, <Parameter("characters", "The number of the characters, not the bytes value.")> length%, Optional encoding As Encoding = Nothing) As String
    Dim textEncoder As Encoding = encoding Or DefaultEncoding

    If Not path.FileExists Then
        Return Nothing
    End If

    ' Computed as Long so that a large character count cannot overflow Integer
    ' before it has been limited to the file size.
    Dim wanted As Long = CLng(length) * (textEncoder.GetBytes("a").Length + 1)

    ' Read-only access is enough here. The (path, mode) constructor would request
    ' read/write access, which fails on read-only files.
    Using reader As New FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)
        Dim size As Integer = CInt(System.Math.Min(wanted, reader.Length))
        Dim buffer As Byte() = New Byte(size - 1) {}
        Dim filled As Integer = 0

        Call reader.Seek(reader.Length - size, SeekOrigin.Begin)

        ' Stream.Read may return fewer bytes than requested, so keep reading
        ' until the buffer is full or the end of the file is reached.
        Do While filled < buffer.Length
            Dim n As Integer = reader.Read(buffer, filled, buffer.Length - filled)

            If n <= 0 Then
                Exit Do
            End If

            filled += n
        Loop

        Return textEncoder.GetString(buffer, 0, filled)
    End Using
End Function
141 | |
''' <summary>
''' Get last line of the target text file.
''' </summary>
''' <param name="path$">Path of the text file.</param>
''' <param name="encoding">Text encoding of the file; <c>UTF8</c> is used when this is nothing.</param>
''' <param name="newLine$">The line separator that marks the start of the last line.</param>
''' <returns>
''' The text that follows the last <paramref name="newLine"/> separator, with leading and
''' trailing CR/LF characters removed. If no separator is found, the whole file content is
''' returned, trimmed in the same way.
''' </returns>
''' <remarks>
''' The file is scanned backwards one byte at a time and the tail is decoded again on every
''' step, so the cost grows quickly with the length of the last line.
''' </remarks>
<Extension>
Public Function GetLastLine(path$, Optional encoding As Encoding = Nothing, Optional newLine$ = vbLf) As String
    Using sr As New StreamReader(path, encoding Or UTF8)
        Dim fileSize As Long = sr.BaseStream.Length
        Dim lastline As String = Nothing
        Dim i As Long = 2

        Do
            If i <= fileSize Then
                sr.BaseStream.Seek(fileSize - i, SeekOrigin.Begin)
                ' The reader keeps its own buffer and decoder state, which must be
                ' dropped every time the underlying stream position is moved.
                sr.DiscardBufferedData()
                lastline = sr.ReadToEnd

                ' Ordinal: this is a match on the raw separator characters, it must
                ' not depend on the current culture.
                If lastline.StartsWith(newLine, System.StringComparison.Ordinal) Then
                    Exit Do
                End If

                i += 1
            Else
                ' The target text file contains only one line.
                sr.BaseStream.Seek(Scan0, SeekOrigin.Begin)
                sr.DiscardBufferedData()
                lastline = sr.ReadToEnd

                Exit Do
            End If
        Loop

        ' Whitespace may be part of the wanted string data, so only the leading
        ' and trailing newline characters are removed here. This applies to the
        ' single-line case as well, so that both paths return a trimmed line.
        Return lastline.Trim(ASCII.CR, ASCII.LF)
    End Using
End Function
179 | |
''' <summary>
''' Reads every file in the given directory, including all of its sub directories, and
''' joins the file contents into one string, separated by a carriage return character.
''' </summary>
''' <param name="dir$">The directory to search for files.</param>
''' <returns>The merged text of all files; an empty string if no file is found.</returns>
<ExportAPI(".Merge", Info:="Please make sure all of the file in the target directory is text file not binary file.")>
Public Function Merge(<Parameter("Dir", "The default directory parameter value is the current directory.")> Optional dir$ = "./") As String
    Dim merged As New StringBuilder()
    Dim isFirst As Boolean = True

    For Each filePath As String In FileIO.FileSystem.GetFiles(dir, FileIO.SearchOption.SearchAllSubDirectories, "*.*")
        ' The separator goes between entries only, never before the first one.
        If Not isFirst Then
            merged.Append(vbCr)
        End If

        merged.Append(FileIO.FileSystem.ReadAllText(filePath))
        isFirst = False
    Next

    Return merged.ToString()
End Function
188 | End Module |