1 | #Region "Microsoft.VisualBasic::4b1c90a0695cbf375286d54d64b838f0, Microsoft.VisualBasic.Core\Extensions\Math\Information\Entropy.vb" |
2 | |
3 | ' Author: |
4 | ' |
5 | ' asuka (amethyst.asuka@gcmodeller.org) |
6 | ' xie (genetics@smrucc.org) |
7 | ' xieguigang (xie.guigang@live.com) |
8 | ' |
9 | ' Copyright (c) 2018 GPL3 Licensed |
10 | ' |
11 | ' |
12 | ' GNU GENERAL PUBLIC LICENSE (GPL3) |
13 | ' |
14 | ' |
15 | ' This program is free software: you can redistribute it and/or modify |
16 | ' it under the terms of the GNU General Public License as published by |
17 | ' the Free Software Foundation, either version 3 of the License, or |
18 | ' (at your option) any later version. |
19 | ' |
20 | ' This program is distributed in the hope that it will be useful, |
21 | ' but WITHOUT ANY WARRANTY; without even the implied warranty of |
22 | ' MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
23 | ' GNU General Public License for more details. |
24 | ' |
25 | ' You should have received a copy of the GNU General Public License |
26 | ' along with this program. If not, see <http://www.gnu.org/licenses/>. |
27 | |
28 | |
29 | |
30 | ' /********************************************************************************/ |
31 | |
32 | ' Summaries: |
33 | |
34 | ' Module Entropy |
35 | ' |
36 | ' Function: ShannonEnt, ShannonEntropy |
37 | ' |
38 | ' |
39 | ' /********************************************************************************/ |
40 | |
41 | #End Region |
42 | |
43 | Imports System.Runtime.CompilerServices |
44 | |
45 | Namespace Math.Information |
46 | |
''' <summary>
''' Shannon information entropy helpers. A larger entropy value means the
''' data carries more information (its outcome is less predictable).
''' </summary>
Public Module Entropy

    ''' <summary>
    ''' Computes the Shannon entropy, in bits, of the empirical distribution of
    ''' the values found in <paramref name="collection"/>.
    ''' </summary>
    ''' <typeparam name="T">Element type; equal elements are counted as the same symbol.</typeparam>
    ''' <param name="collection">The observed sequence of symbols.</param>
    ''' <returns>
    ''' The entropy of the relative symbol frequencies. An empty sequence, or a
    ''' sequence that contains only one distinct value, yields (positive) zero.
    ''' </returns>
    ''' <remarks>
    ''' ###### Formula
    '''
    ''' ```
    ''' H(x) = E[ I(xi) ]
    '''      = E[ log(2, 1/p(xi)) ]
    '''      = -∑ p(xi)log(2, p(xi))    (i=1, 2, ..., n)
    ''' ```
    '''
    ''' Here ``x`` is the random variable whose possible outputs form the symbol
    ''' set, and ``p(xi)`` is the probability of output ``xi``. The more
    ''' uncertain the variable is, the larger its entropy, and the more
    ''' information is needed to determine its value.
    ''' </remarks>
    <Extension>
    Public Function ShannonEnt(Of T)(collection As IEnumerable(Of T)) As Double
        ' Number of occurrences of each distinct symbol.
        Dim occurrences As Integer() = collection.
            GroupBy(Function(symbol) symbol).
            Select(Function(group) group.Count()).
            ToArray()
        Dim total As Integer = occurrences.Sum()
        ' "/" is floating-point division in VB, so this is the relative frequency.
        ' The projection is never evaluated when the sequence is empty, so there
        ' is no division by zero.
        Dim frequencies = occurrences.Select(Function(n) n / total)

        Return ShannonEntropy(frequencies)
    End Function

    ''' <summary>
    ''' Computes the Shannon entropy, in bits, directly from a probability vector.
    ''' </summary>
    ''' <param name="probs">
    ''' The probability vector; its elements are expected to sum to 1 (this is
    ''' not validated). Elements that are not greater than zero are skipped.
    ''' </param>
    ''' <returns>
    ''' ``-∑ p * log2(p)`` over the positive elements of <paramref name="probs"/>.
    ''' The result is positive zero (never negative zero) when the sum is zero.
    ''' </returns>
    <Extension>
    Public Function ShannonEntropy(probs As IEnumerable(Of Double)) As Double
        ' A zero probability contributes nothing to the sum (p * log p -> 0 as
        ' p -> 0), so such terms are filtered out instead of evaluating log(0).
        Dim weighted As Double = probs.
            Where(Function(p) p > 0).
            Sum(Function(p) p * Math.Log(p, newBase:=2))

        ' The entropy is the negated sum. It is written as "0.0 - weighted"
        ' rather than "-weighted" on purpose: both are identical for any
        ' non-zero sum, but negating a zero sum would produce IEEE negative
        ' zero, which formats as "-0" on newer .NET runtimes.
        Return 0.0 - weighted
    End Function
End Module
98 | End Namespace |