1 | #Region "Microsoft.VisualBasic::451a2205c4f8963de8185e32eaf6b0f3, Microsoft.VisualBasic.Core\Text\Parser\HtmlParser\Table.vb" |
2 | |
3 | ' Author: |
4 | ' |
5 | ' asuka (amethyst.asuka@gcmodeller.org) |
6 | ' xie (genetics@smrucc.org) |
7 | ' xieguigang (xie.guigang@live.com) |
8 | ' |
9 | ' Copyright (c) 2018 GPL3 Licensed |
10 | ' |
11 | ' |
12 | ' GNU GENERAL PUBLIC LICENSE (GPL3) |
13 | ' |
14 | ' |
15 | ' This program is free software: you can redistribute it and/or modify |
16 | ' it under the terms of the GNU General Public License as published by |
17 | ' the Free Software Foundation, either version 3 of the License, or |
18 | ' (at your option) any later version. |
19 | ' |
20 | ' This program is distributed in the hope that it will be useful, |
21 | ' but WITHOUT ANY WARRANTY; without even the implied warranty of |
22 | ' MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
23 | ' GNU General Public License for more details. |
24 | ' |
25 | ' You should have received a copy of the GNU General Public License |
26 | ' along with this program. If not, see <http://www.gnu.org/licenses/>. |
27 | |
28 | |
29 | |
30 | ' /********************************************************************************/ |
31 | |
32 | ' Summaries: |
33 | |
34 | ' Module TableParser |
35 | ' |
36 | ' Function: GetColumnsHTML, GetRowsHTML, GetTablesHTML |
37 | ' |
38 | ' |
39 | ' /********************************************************************************/ |
40 | |
41 | #End Region |
42 | |
43 | Imports System.Runtime.CompilerServices |
44 | Imports System.Text.RegularExpressions |
45 | Imports Microsoft.VisualBasic.Linq |
46 | |
47 | Namespace Text.HtmlParser |
48 | |
''' <summary>
''' Regex-based string helpers for slicing the raw text of an html table into
''' table, row and cell fragments.
''' </summary>
Public Module TableParser

    ''' <summary>
    ''' Extracts every <c>&lt;table&gt;...&lt;/table&gt;</c> fragment from the html text by using a regular expression.
    ''' </summary>
    ''' <param name="html">The html text to scan. <c>Nothing</c> yields an empty array.</param>
    ''' <param name="greedy">
    ''' When <c>True</c> the greedy pattern <c>&lt;table.+&lt;/table&gt;</c> is used, otherwise the
    ''' lazy pattern <c>&lt;table.+?&lt;/table&gt;</c>, which stops at the nearest closing tag.
    ''' </param>
    ''' <returns>The matched table fragments; an empty array when there is no match.</returns>
    <Extension>
    Public Function GetTablesHTML(html As String, Optional greedy As Boolean = False) As String()
        ' Keep the same contract as GetRowsHTML: a missing input means "no tables",
        ' instead of letting Regex.Matches throw an ArgumentNullException.
        If html Is Nothing Then
            Return {}
        End If

        Dim pattern As String = If(greedy, "<table.+</table>", "<table.+?</table>")
        ' NOTE(review): RegexICSng is declared outside this file; presumably it is
        ' IgnoreCase Or Singleline, like the flags used in GetRowsHTML - confirm.
        Dim tables As String() = Regex.Matches(html, pattern, RegexICSng).ToArray

        Return tables
    End Function

    ''' <summary>
    ''' Extracts every <c>&lt;tr&gt;...&lt;/tr&gt;</c> fragment from the table html text by using a regular expression.
    ''' </summary>
    ''' <param name="table">The html text of a table. <c>Nothing</c> yields an empty array.</param>
    ''' <returns>The matched row fragments; an empty array when there is no match.</returns>
    <Extension>
    Public Function GetRowsHTML(table As String) As String()
        If table Is Nothing Then
            Return {}
        End If

        ' Singleline lets "." run across line breaks, so a row that is written
        ' over several lines is still captured as one fragment.
        Dim options As RegexOptions = RegexOptions.Singleline Or RegexOptions.IgnoreCase
        Dim rows As String() = Regex.Matches(table, "<tr.+?</tr>", options).ToArray

        Return rows
    End Function

    ''' <summary>
    ''' Extracts the <c>&lt;td&gt;</c> and <c>&lt;th&gt;</c> cells from the row html text.
    ''' Please note that the enclosing cell tag is trimmed off from each returned element.
    ''' </summary>
    ''' <param name="row">The html text of a table row. <c>Nothing</c> yields an empty array.</param>
    ''' <returns>One element per matched cell, in document order.</returns>
    <Extension>
    Public Function GetColumnsHTML(row As String) As String()
        ' Keep the same contract as GetRowsHTML: a missing input means "no cells",
        ' instead of letting Regex.Matches throw an ArgumentNullException.
        If row Is Nothing Then
            Return {}
        End If

        Dim cells As String() = Regex.Matches(row, "(<td.+?</td>)|(<th.+?</th>)", RegexICSng).ToArray

        ' GetValue is an extension declared outside this file; going by the original
        ' author's note it is what removes the surrounding cell tag - confirm.
        Return cells _
            .Select(Function(s) s.GetValue) _
            .ToArray
    End Function
End Module
100 | End Namespace |