Text\IO\TextFileEncodingDetector.vb

1	#Region "Microsoft.VisualBasic::5f43b698ba21ad10df9cb0c9dd72dc57, Microsoft.VisualBasic.Core\Text\IO\TextFileEncodingDetector.vb"
2
3	' Author:
4	'
5	' asuka (amethyst.asuka@gcmodeller.org)
6	' xie (genetics@smrucc.org)
7	' xieguigang (xie.guigang@live.com)
8	'
9	' Copyright (c) 2018 GPL3 Licensed
10	'
11	'
12	' GNU GENERAL PUBLIC LICENSE (GPL3)
13	'
14	'
15	' This program is free software: you can redistribute it and/or modify
16	' it under the terms of the GNU General Public License as published by
17	' the Free Software Foundation, either version 3 of the License, or
18	' (at your option) any later version.
19	'
20	' This program is distributed in the hope that it will be useful,
21	' but WITHOUT ANY WARRANTY; without even the implied warranty of
22	' MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23	' GNU General Public License for more details.
24	'
25	' You should have received a copy of the GNU General Public License
26	' along with this program. If not, see <http://www.gnu.org/licenses/>.
27
28
29
30	' /********************************************************************************/
31
32	' Summaries:
33
34	' Module TextFileEncodingDetector
35	'
36	' Properties: TextCodings
37	'
38	' Function: DetectBOMBytes, DetectSuspiciousUTF8SequenceLength, DetectTextByteArrayEncoding, (+2 Overloads) DetectTextFileEncoding, DetectUnicodeInByteSampleByHeuristics
39	' IsCommonUSASCIIByte, ToString
40	'
41	'
42	' /********************************************************************************/
43
44	#End Region
45
46	Imports System.Text
47	Imports System.IO
48	Imports System.Text.RegularExpressions
49
50	Namespace Text
51
52	''' <summary>
53	''' Encoding fileEncoding = TextFileEncodingDetector.DetectTextFileEncoding("you file path",Encoding.Default);
54	''' </summary>
55	''' <remarks></remarks>
56	Public Module TextFileEncodingDetector
57
58	Public ReadOnly Property TextCodings As Dictionary(Of String, System.Text.Encoding) =
59	New Dictionary(Of String, Encoding) From {
60	_
61	{"ascii", System.Text.Encoding.ASCII},
62	{"unicode", System.Text.Encoding.Unicode},
63	{"utf8", System.Text.Encoding.UTF8}
64	}
65
66	Public Function ToString(data As Generic.IEnumerable(Of Byte), Optional encoding As String = "") As String
67	If Not String.IsNullOrEmpty(encoding) Then encoding = encoding.ToLower
68	Dim TextEncoding As System.Text.Encoding = If(String.IsNullOrEmpty(encoding) OrElse Not TextCodings.ContainsKey(encoding), System.Text.Encoding.Default, TextCodings(encoding))
69	Return TextEncoding.GetString(data.ToArray)
70	End Function
71
72	' '* Simple class to handle text file encoding woes (in a primarily English-speaking tech
73	'* world).
74	'*
75	'* - This code is fully managed, no shady calls to MLang (the unmanaged codepage
76	'* detection library originally developed for Internet Explorer).
77	'*
78	'* - This class does NOT try to detect arbitrary codepages/charsets, it really only
79	'* aims to differentiate between some of the most common variants of Unicode
80	'* encoding, and a "default" (western / ascii-based) encoding alternative provided
81	'* by the caller.
82	'*
83	'* - As there is no "Reliable" way to distinguish between UTF-8 (without BOM) and
84	'* Windows-1252 (in .Net, also incorrectly called "ASCII") encodings, we use a
85	'* heuristic - so the more of the file we can sample the better the guess. If you
86	'* are going to read the whole file into memory at some point, then best to pass
87	'* in the whole byte byte array directly. Otherwise, decide how to trade off
88	'* reliability against performance / memory usage.
89	'*
90	'* - The UTF-8 detection heuristic only works for western text, as it relies on
91	'* the presence of UTF-8 encoded accented and other characters found in the upper
92	'* ranges of the Latin-1 and (particularly) Windows-1252 codepages.
93	'*
94	'* - For more general detection routines, see existing projects / resources:
95	'* - MLang - Microsoft library originally for IE6, available in Windows XP and later APIs now (I think?)
96	'* - MLang .Net bindings: http://www.codeproject.com/KB/recipes/DetectEncoding.aspx
97	'* - CharDet - Mozilla browser's detection routines
98	'* - Ported to Java then .Net: http://www.conceptdevelopment.net/Localization/NCharDet/
99	'* - Ported straight to .Net: http://code.google.com/p/chardetsharp/source/browse
100	'*
101	'* Copyright Tao Klerks, Jan 2010, tao@klerks.biz
102	'* Licensed under the modified BSD license:
103	'*
104	' 'Redistribution and use in source and binary forms, with or without modification, are
105	'permitted provided that the following conditions are met:
106	' '- Redistributions of source code must retain the above copyright notice, this list of
107	'conditions and the following disclaimer.
108	'- Redistributions in binary form must reproduce the above copyright notice, this list
109	'of conditions and the following disclaimer in the documentation and/or other materials
110	'provided with the distribution.
111	'- The name of the author may not be used to endorse or promote products derived from
112	'this software without specific prior written permission.
113	' 'THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
114	'INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
115	'A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
116	'DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
117	'BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
118	'PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
119	'WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
120	'ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
121	'OF SUCH DAMAGE.
122	' '*
123	'
124
125	Const _defaultHeuristicSampleSize As Long = &H10000
126
127	''' <summary>
128	''' completely arbitrary - inappropriate for high numbers of files / high speed requirements
129	''' </summary>
130	''' <param name="InputFilename"></param>
131	''' <param name="DefaultEncoding"></param>
132	''' <returns></returns>
133	''' <remarks></remarks>
134	Public Function DetectTextFileEncoding(InputFilename As String, Optional DefaultEncoding As Encoding = Nothing) As Encoding
135	Using textfileStream As FileStream = IO.File.OpenRead(InputFilename)
136	If DefaultEncoding Is Nothing Then
137	DefaultEncoding = System.Text.Encoding.Default
138	End If
139	Return DetectTextFileEncoding(textfileStream, DefaultEncoding, _defaultHeuristicSampleSize)
140	End Using
141	End Function
142
143	Public Function DetectTextFileEncoding(InputFileStream As FileStream, DefaultEncoding As Encoding, HeuristicSampleSize As Long) As Encoding
144	If InputFileStream Is Nothing Then
145	Throw New ArgumentNullException("Must provide a valid Filestream!", "InputFileStream")
146	End If
147
148	If Not InputFileStream.CanRead Then
149	Throw New ArgumentException("Provided file stream is not readable!", "InputFileStream")
150	End If
151
152	If Not InputFileStream.CanSeek Then
153	Throw New ArgumentException("Provided file stream cannot seek!", "InputFileStream")
154	End If
155
156	Dim encodingFound As Encoding = Nothing
157	Dim originalPos As Long = InputFileStream.Position
158
159	InputFileStream.Position = 0
160
161	'First read only what we need for BOM detection
162
163	Dim bomBytes As Byte() = New Byte(If(InputFileStream.Length > 4, 4, InputFileStream.Length) - 1) {}
164	InputFileStream.Read(bomBytes, 0, bomBytes.Length)
165
166	encodingFound = DetectBOMBytes(bomBytes)
167
168	If encodingFound IsNot Nothing Then
169	InputFileStream.Position = originalPos
170	Return encodingFound
171	End If
172
173	'BOM Detection failed, going for heuristics now.
174	' create sample byte array and populate it
175	Dim sampleBytes As Byte() = New Byte(If(HeuristicSampleSize > InputFileStream.Length, InputFileStream.Length, HeuristicSampleSize) - 1) {}
176
177	Call Array.Copy(bomBytes, sampleBytes, bomBytes.Length)
178
179	If InputFileStream.Length > bomBytes.Length Then
180	InputFileStream.Read(sampleBytes, bomBytes.Length, sampleBytes.Length - bomBytes.Length)
181	End If
182
183	InputFileStream.Position = originalPos
184
185	'test byte array content
186	encodingFound = DetectUnicodeInByteSampleByHeuristics(sampleBytes)
187
188	If encodingFound IsNot Nothing Then
189	Return encodingFound
190	Else
191	Return DefaultEncoding
192	End If
193	End Function
194
195	Public Function DetectTextByteArrayEncoding(TextData As Byte(), DefaultEncoding As Encoding) As Encoding
196	If TextData Is Nothing Then
197	Throw New ArgumentNullException("Must provide a valid text data byte array!", "TextData")
198	End If
199
200	Dim encodingFound As Encoding = DetectBOMBytes(TextData)
201
202	If encodingFound IsNot Nothing Then
203	Return encodingFound
204	Else
205	'test byte array content
206	encodingFound = DetectUnicodeInByteSampleByHeuristics(TextData)
207
208	If encodingFound IsNot Nothing Then
209	Return encodingFound
210	Else
211	Return DefaultEncoding
212	End If
213	End If
214
215	End Function
216
217	Public Function DetectBOMBytes(BOMBytes As Byte()) As Encoding
218	If BOMBytes Is Nothing Then
219	Throw New ArgumentNullException("Must provide a valid BOM byte array!", "BOMBytes")
220	End If
221
222	If BOMBytes.Length < 2 Then
223	Return Nothing
224	End If
225
226	If BOMBytes(0) = &HFF AndAlso BOMBytes(1) = &HFE AndAlso (BOMBytes.Length < 4 OrElse BOMBytes(2) <> 0 OrElse BOMBytes(3) <> 0) Then
227	Return Encoding.Unicode
228	End If
229
230	If BOMBytes(0) = &HFE AndAlso BOMBytes(1) = &HFF Then
231	Return Encoding.BigEndianUnicode
232	End If
233
234	If BOMBytes.Length < 3 Then
235	Return Nothing
236	End If
237
238	If BOMBytes(0) = &HEF AndAlso BOMBytes(1) = &HBB AndAlso BOMBytes(2) = &HBF Then
239	Return Encoding.UTF8
240	End If
241
242	If BOMBytes(0) = &H2B AndAlso BOMBytes(1) = &H2F AndAlso BOMBytes(2) = &H76 Then
243	Return Encoding.UTF7
244	End If
245
246	If BOMBytes.Length < 4 Then
247	Return Nothing
248	End If
249
250	If BOMBytes(0) = &HFF AndAlso BOMBytes(1) = &HFE AndAlso BOMBytes(2) = 0 AndAlso BOMBytes(3) = 0 Then
251	Return Encoding.UTF32
252	End If
253
254	If BOMBytes(0) = 0 AndAlso BOMBytes(1) = 0 AndAlso BOMBytes(2) = &HFE AndAlso BOMBytes(3) = &HFF Then
255	Return Encoding.GetEncoding(12001)
256	End If
257
258	Return Nothing
259	End Function
260
261	Public Function DetectUnicodeInByteSampleByHeuristics(SampleBytes As Byte()) As Encoding
262	Dim oddBinaryNullsInSample As Long = 0
263	Dim evenBinaryNullsInSample As Long = 0
264	Dim suspiciousUTF8SequenceCount As Long = 0
265	Dim suspiciousUTF8BytesTotal As Long = 0
266	Dim likelyUSASCIIBytesInSample As Long = 0
267
268	'Cycle through, keeping count of binary null positions, possible UTF-8
269	' sequences from upper ranges of Windows-1252, and probable US-ASCII
270	' character counts.
271
272	Dim currentPos As Long = 0
273	Dim skipUTF8Bytes As Integer = 0
274
275	While currentPos < SampleBytes.Length
276	'binary null distribution
277	If SampleBytes(currentPos) = 0 Then
278	If currentPos Mod 2 = 0 Then
279	evenBinaryNullsInSample += 1
280	Else
281	oddBinaryNullsInSample += 1
282	End If
283	End If
284
285	'likely US-ASCII characters
286	If IsCommonUSASCIIByte(SampleBytes(currentPos)) Then
287	likelyUSASCIIBytesInSample += 1
288	End If
289
290	'suspicious sequences (look like UTF-8)
291	If skipUTF8Bytes = 0 Then
292	Dim lengthFound As Integer = DetectSuspiciousUTF8SequenceLength(SampleBytes, currentPos)
293
294	If lengthFound > 0 Then
295	suspiciousUTF8SequenceCount += 1
296	suspiciousUTF8BytesTotal += lengthFound
297	skipUTF8Bytes = lengthFound - 1
298	End If
299	Else
300	skipUTF8Bytes -= 1
301	End If
302
303	currentPos += 1
304	End While
305
306	'1: UTF-16 LE - in english / european environments, this is usually characterized by a
307	' high proportion of odd binary nulls (starting at 0), with (as this is text) a low
308	' proportion of even binary nulls.
309	' The thresholds here used (less than 20% nulls where you expect non-nulls, and more than
310	' 60% nulls where you do expect nulls) are completely arbitrary.
311
312	If ((evenBinaryNullsInSample * 2.0) / SampleBytes.Length) < 0.2 AndAlso ((oddBinaryNullsInSample * 2.0) / SampleBytes.Length) > 0.6 Then
313	Return Encoding.Unicode
314	End If
315
316	'2: UTF-16 BE - in english / european environments, this is usually characterized by a
317	' high proportion of even binary nulls (starting at 0), with (as this is text) a low
318	' proportion of odd binary nulls.
319	' The thresholds here used (less than 20% nulls where you expect non-nulls, and more than
320	' 60% nulls where you do expect nulls) are completely arbitrary.
321
322	If ((oddBinaryNullsInSample * 2.0) / SampleBytes.Length) < 0.2 AndAlso ((evenBinaryNullsInSample * 2.0) / SampleBytes.Length) > 0.6 Then
323	Return Encoding.BigEndianUnicode
324	End If
325
326	'3: UTF-8 - Martin Dürst outlines a method for detecting whether something CAN be UTF-8 content
327	' using regexp, in his w3c.org unicode FAQ entry:
328	' http://www.w3.org/International/questions/qa-forms-utf-8
329	' adapted here for C#.
330	Dim potentiallyMangledString As String = Encoding.ASCII.GetString(SampleBytes)
331	Dim UTF8Validator As New Regex("\A(" & "[\x09\x0A\x0D\x20-\x7E]" & "\|[\xC2-\xDF][\x80-\xBF]" & "\|\xE0[\xA0-\xBF][\x80-\xBF]" & "\|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}" & "\|\xED[\x80-\x9F][\x80-\xBF]" & "\|\xF0[\x90-\xBF][\x80-\xBF]{2}" & "\|[\xF1-\xF3][\x80-\xBF]{3}" & "\|\xF4[\x80-\x8F][\x80-\xBF]{2}" & ")*\z")
332
333	If UTF8Validator.IsMatch(potentiallyMangledString) Then
334	'Unfortunately, just the fact that it CAN be UTF-8 doesn't tell you much about probabilities.
335	'If all the characters are in the 0-127 range, no harm done, most western charsets are same as UTF-8 in these ranges.
336	'If some of the characters were in the upper range (western accented characters), however, they would likely be mangled to 2-byte by the UTF-8 encoding process.
337	' So, we need to play stats.
338
339	' The "Random" likelihood of any pair of randomly generated characters being one
340	' of these "suspicious" character sequences is:
341	' 128 / (256 * 256) = 0.2%.
342	' ' In western text data, that is SIGNIFICANTLY reduced - most text data stays in the <127
343	' character range, so we assume that more than 1 in 500,000 of these character
344	' sequences indicates UTF-8. The number 500,000 is completely arbitrary - so sue me.
345	' ' We can only assume these character sequences will be rare if we ALSO assume that this
346	' IS in fact western text - in which case the bulk of the UTF-8 encoded data (that is
347	' not already suspicious sequences) should be plain US-ASCII bytes. This, I
348	' arbitrarily decided, should be 80% (a random distribution, eg binary data, would yield
349	' approx 40%, so the chances of hitting this threshold by accident in random data are
350	' VERY low).
351
352	'suspicious sequences
353	'all suspicious, so cannot evaluate proportion of US-Ascii
354	If (suspiciousUTF8SequenceCount * 500000.0 / SampleBytes.Length >= 1) AndAlso (SampleBytes.Length - suspiciousUTF8BytesTotal = 0 OrElse likelyUSASCIIBytesInSample * 1.0 / (SampleBytes.Length - suspiciousUTF8BytesTotal) >= 0.8) Then
355	Return Encoding.UTF8
356	End If
357	End If
358
359	Return Nothing
360	End Function
361
362	Private Function IsCommonUSASCIIByte(testByte As Byte) As Boolean
363	'lf
364	'cr
365	'tab
366	'common punctuation
367	'digits
368	'common punctuation
369	'capital letters
370	'common punctuation
371	'lowercase letters
372	If testByte = &HA OrElse testByte = &HD OrElse testByte = &H9 OrElse (testByte >= &H20 AndAlso testByte <= &H2F) OrElse (testByte >= &H30 AndAlso testByte <= &H39) OrElse (testByte >= &H3A AndAlso testByte <= &H40) OrElse (testByte >= &H41 AndAlso testByte <= &H5A) OrElse (testByte >= &H5B AndAlso testByte <= &H60) OrElse (testByte >= &H61 AndAlso testByte <= &H7A) OrElse (testByte >= &H7B AndAlso testByte <= &H7E) Then
373	'common punctuation
374	Return True
375	Else
376	Return False
377	End If
378	End Function
379
380	Private Function DetectSuspiciousUTF8SequenceLength(SampleBytes As Byte(), currentPos As Long) As Integer
381	Dim lengthFound As Integer = 0
382
383	If SampleBytes.Length >= currentPos + 1 AndAlso SampleBytes(currentPos) = &HC2 Then
384	If SampleBytes(currentPos + 1) = &H81 OrElse SampleBytes(currentPos + 1) = &H8D OrElse SampleBytes(currentPos + 1) = &H8F Then
385	lengthFound = 2
386	ElseIf SampleBytes(currentPos + 1) = &H90 OrElse SampleBytes(currentPos + 1) = &H9D Then
387	lengthFound = 2
388	ElseIf SampleBytes(currentPos + 1) >= &HA0 AndAlso SampleBytes(currentPos + 1) <= &HBF Then
389	lengthFound = 2
390	End If
391	ElseIf SampleBytes.Length >= currentPos + 1 AndAlso SampleBytes(currentPos) = &HC3 Then
392	If SampleBytes(currentPos + 1) >= &H80 AndAlso SampleBytes(currentPos + 1) <= &HBF Then
393	lengthFound = 2
394	End If
395	ElseIf SampleBytes.Length >= currentPos + 1 AndAlso SampleBytes(currentPos) = &HC5 Then
396	If SampleBytes(currentPos + 1) = &H92 OrElse SampleBytes(currentPos + 1) = &H93 Then
397	lengthFound = 2
398	ElseIf SampleBytes(currentPos + 1) = &HA0 OrElse SampleBytes(currentPos + 1) = &HA1 Then
399	lengthFound = 2
400	ElseIf SampleBytes(currentPos + 1) = &HB8 OrElse SampleBytes(currentPos + 1) = &HBD OrElse SampleBytes(currentPos + 1) = &HBE Then
401	lengthFound = 2
402	End If
403	ElseIf SampleBytes.Length >= currentPos + 1 AndAlso SampleBytes(currentPos) = &HC6 Then
404	If SampleBytes(currentPos + 1) = &H92 Then
405	lengthFound = 2
406	End If
407	ElseIf SampleBytes.Length >= currentPos + 1 AndAlso SampleBytes(currentPos) = &HCB Then
408	If SampleBytes(currentPos + 1) = &H86 OrElse SampleBytes(currentPos + 1) = &H9C Then
409	lengthFound = 2
410	End If
411	ElseIf SampleBytes.Length >= currentPos + 2 AndAlso SampleBytes(currentPos) = &HE2 Then
412	If SampleBytes(currentPos + 1) = &H80 Then
413	If SampleBytes(currentPos + 2) = &H93 OrElse SampleBytes(currentPos + 2) = &H94 Then
414	lengthFound = 3
415	End If
416	If SampleBytes(currentPos + 2) = &H98 OrElse SampleBytes(currentPos + 2) = &H99 OrElse SampleBytes(currentPos + 2) = &H9A Then
417	lengthFound = 3
418	End If
419	If SampleBytes(currentPos + 2) = &H9C OrElse SampleBytes(currentPos + 2) = &H9D OrElse SampleBytes(currentPos + 2) = &H9E Then
420	lengthFound = 3
421	End If
422	If SampleBytes(currentPos + 2) = &HA0 OrElse SampleBytes(currentPos + 2) = &HA1 OrElse SampleBytes(currentPos + 2) = &HA2 Then
423	lengthFound = 3
424	End If
425	If SampleBytes(currentPos + 2) = &HA6 Then
426	lengthFound = 3
427	End If
428	If SampleBytes(currentPos + 2) = &HB0 Then
429	lengthFound = 3
430	End If
431	If SampleBytes(currentPos + 2) = &HB9 OrElse SampleBytes(currentPos + 2) = &HBA Then
432	lengthFound = 3
433	End If
434	ElseIf SampleBytes(currentPos + 1) = &H82 AndAlso SampleBytes(currentPos + 2) = &HAC Then
435	lengthFound = 3
436	ElseIf SampleBytes(currentPos + 1) = &H84 AndAlso SampleBytes(currentPos + 2) = &HA2 Then
437	lengthFound = 3
438	End If
439	End If
440
441	Return lengthFound
442	End Function
443	End Module
444	End Namespace