MIC package
v1 = [R, R, R, R]
v2 = [R, R, R, R]
# Libraries
import numpy as np
import pandas as pd

from scipy.stats.contingency import crosstab
from sklearn.metrics import mutual_info_score


def mutual_info_matrix_v3(x=None, y=None, ct=None):
    """Compute the component information score.

    .. note:: Might be inefficient but good for testing.

    .. note:: Computing the mutual information score requires
              variation within the variable. Thus, if there is
              only one class, should we return a result or a
              warning?

    Parameters
    ----------
    x: list
        List with the classes
    y: list
        List with the classes
    ct: np.ndarray
        Pre-computed contingency table. If provided, x and y are not
        used to build it.

    Returns
    -------
    np.ndarray
        Matrix with the component information scores.
    """

    def _check_nparray(obj, param_name):
        if obj is not None:
            if isinstance(obj, np.ndarray):
                return obj
            elif isinstance(obj, pd.Series):
                return obj.to_numpy()
            elif isinstance(obj, pd.DataFrame):
                return obj.to_numpy()
            elif isinstance(obj, list):
                return np.array(obj)
            else:
                raise ValueError("""
                    The input parameter '{0}' is of type '{1}' which is
                    not supported. Please ensure it is a np.ndarray."""
                    .format(param_name, type(obj)))

    # Ensure they are all np arrays
    x = _check_nparray(x, 'x')
    y = _check_nparray(y, 'y')
    ct = _check_nparray(ct, 'ct')

    # Compute contingency
    if ct is None:
        c = crosstab(x, y)
        if isinstance(c, tuple):
            ct = c[-1]    # older scipy
        else:
            ct = c.count  # newer scipy

    # Variables
    n = ct.sum()
    pi = np.ravel(ct.sum(axis=1)) / n
    pj = np.ravel(ct.sum(axis=0)) / n

    # Create empty matrix
    m = np.empty(ct.shape)
    m[:] = np.nan

    # Fill with component information score
    with np.errstate(all='ignore'):
        for i in range(m.shape[0]):
            for j in range(m.shape[1]):
                pxy = ct[i, j] / n
                m[i, j] = pxy * np.log(pxy / (pi[i] * pj[j]))

    # Fill NaN with 0 (lim x->0 of x*log(x) is 0)
    m[np.isnan(m)] = 0

    # Return
    return m

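
# Quick check (illustrative): the component scores returned by
# mutual_info_matrix_v3 should sum to the mutual information reported by
# sklearn.metrics.mutual_info_score. The vectors below (x_demo, y_demo) are
# made-up R/S susceptibility labels used only as an example.
x_demo = ['R', 'R', 'S', 'R', 'S', 'S', 'R', 'S']
y_demo = ['R', 'S', 'S', 'R', 'S', 'R', 'R', 'S']
m3 = mutual_info_matrix_v3(x=x_demo, y=y_demo)
print(m3)
# The two values below should agree up to floating point error.
print(m3.sum(), mutual_info_score(x_demo, y_demo))

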
def mutual_info_matrix_v2(x=None, y=None, ct=None):
    """Computes the component information.

    The component information is calculated as below where X/Y
    denotes a state (e.g. RR).

        C(X/Y) = P(X/Y) * log(P(X/Y) / (P(X)*P(Y)))

            c11 c12
        C = c21 c22

        pi = [pi1, pi2]
        pj = [pj1, pj2]

        ci11 = c11/n * np.log((c11/n) / (pi1*pj1))
        ci12 = c12/n * np.log((c12/n) / (pi1*pj2))
        ci21 = c21/n * np.log((c21/n) / (pi2*pj1))
        ci22 = c22/n * np.log((c22/n) / (pi2*pj2))

    .. warning:: It only works for square contingency matrices; that is, the
                 number of different classes appearing in the vectors x and y
                 must be the same.

                 Enough for susceptibility tests with only R and S. It will
                 fail if only one class is present or if I is introduced.

    Example
    -------
    sex           Female  Male  Total
    lbl
    Not Survived      89   483    572
    Survived         230   112    342
    Total            319   595    914

    # Manual example.
    ci11 = (89 / 914) * np.log((89 / 914) / ((572 / 914) * (319 / 914)))
    ci12 = (483 / 914) * np.log((483 / 914) / ((572 / 914) * (595 / 914)))
    ci21 = (230 / 914) * np.log((230 / 914) / ((342 / 914) * (319 / 914)))
    ci22 = (112 / 914) * np.log((112 / 914) / ((342 / 914) * (595 / 914)))
    m = np.array([[ci11, ci12], [ci21, ci22]])

    # Compute
    m = mutual_info_matrix_v2(x=data.lbl, y=data.sex)

    Parameters
    ----------
    x: list
        List with the classes
    y: list
        List with the classes
    ct: np.ndarray
        Pre-computed contingency table. If provided, x and y are not
        used to build it.

    Returns
    -------
    np.ndarray
        Matrix with the component information scores.
    """

    def _check_nparray(obj, param_name):
        if obj is not None:
            if isinstance(obj, np.ndarray):
                return obj
            elif isinstance(obj, pd.Series):
                return obj.to_numpy()
            elif isinstance(obj, pd.DataFrame):
                return obj.to_numpy()
            elif isinstance(obj, list):
                return np.array(obj)
            else:
                raise ValueError("""
                    The input parameter '{0}' is of type '{1}' which is
                    not supported. Please ensure it is a np.ndarray."""
                    .format(param_name, type(obj)))

    # Ensure they are all np arrays
    x = _check_nparray(x, 'x')
    y = _check_nparray(y, 'y')
    ct = _check_nparray(ct, 'ct')

    # Compute contingency
    if ct is None:
        c = crosstab(x, y)
        if isinstance(c, tuple):
            ct = c[-1]    # older scipy
        else:
            ct = c.count  # newer scipy

    with np.errstate(divide='ignore', invalid='ignore'):
        # Variables
        n = ct.sum()
        pi = np.ravel(ct.sum(axis=1))
        pj = np.ravel(ct.sum(axis=0))

        # Compute matrix
        b = np.repeat(pi.reshape(-1, 1), len(pi), axis=1)
        c = np.repeat(pj.reshape(1, -1), len(pj), axis=0)
        m = (ct/n) * np.log((ct/n) / ((b/n) * (c/n)))

    # Fill NaN with 0 (lim x->0 of x*log(x) is 0)
    m[np.isnan(m)] = 0

    # Return
    return m

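
# Illustrative check for mutual_info_matrix_v2 using the contingency table
# from its docstring (survival vs. sex counts). ct_demo is just that table
# written out as an array. Passing it through ct should reproduce the manual
# ci11..ci22 values, and the sum should match sklearn's mutual_info_score
# computed from the same contingency table.
ct_demo = np.array([[89, 483], [230, 112]])
m2 = mutual_info_matrix_v2(ct=ct_demo)
print(m2)
# The two values below should agree up to floating point error.
print(m2.sum(), mutual_info_score(None, None, contingency=ct_demo))

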
def mutual_info_matrix_v1(x=None, y=None, *, ct=None):
    """Computes the mutual information matrix.

    The component information is calculated as below where X/Y
    denotes a state (e.g. RR).

        C(X/Y) = P(XY) * log(P(XY) / (P(X)*P(Y)))
               = P(XY) * [log(P(XY)) - log(P(X)*P(Y))]
               = P(XY) * [log(P(XY)) - (log(P(X)) + log(P(Y)))]
               = P(XY) * [log(P(XY)) - log(P(X)) - log(P(Y))]

    .. note:: It is inspired by the code from sklearn.metrics.mutual_info_score.

    Notes
    -----
    The logarithm used is the natural logarithm (base-e).
    """
    from math import log
    from scipy import sparse as sp
    from sklearn.metrics.cluster._supervised import check_clusterings
    from sklearn.metrics.cluster._supervised import check_array
    from sklearn.metrics.cluster._supervised import contingency_matrix

    labels_true, labels_pred, contingency = x, y, ct

    if contingency is None:
        labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
        contingency = contingency_matrix(labels_true, labels_pred, sparse=True)
    else:
        contingency = check_array(
            contingency,
            accept_sparse=["csr", "csc", "coo"],
            dtype=[int, np.int32, np.int64],
        )

    if isinstance(contingency, np.ndarray):
        # For an array
        nzx, nzy = np.nonzero(contingency)
        nz_val = contingency[nzx, nzy]
    elif sp.issparse(contingency):
        # For a sparse matrix
        nzx, nzy, nz_val = sp.find(contingency)
    else:
        raise ValueError("Unsupported type for 'contingency': %s" % type(contingency))

    contingency_sum = contingency.sum()
    pi = np.ravel(contingency.sum(axis=1))
    pj = np.ravel(contingency.sum(axis=0))

    # Since MI <= min(H(X), H(Y)), any labelling with zero entropy, i.e.
    # containing a single cluster, implies MI = 0
    if pi.size == 1 or pj.size == 1:
        return 0.0

    log_contingency_nm = np.log(nz_val)
    contingency_nm = nz_val / contingency_sum
    # Don't need to calculate the full outer product, just for non-zeroes
    outer = pi.take(nzx).astype(np.int64, copy=False) * \
            pj.take(nzy).astype(np.int64, copy=False)  # b*c
    log_outer = -np.log(outer) + np.log(pi.sum()) + np.log(pj.sum())

    mi = (
        contingency_nm * (log_contingency_nm - np.log(contingency_sum))
        + contingency_nm * log_outer
    )
    mi = np.where(np.abs(mi) < np.finfo(mi.dtype).eps, 0.0, mi)

    #return np.clip(mi.sum(), 0.0, None), mi
    try:
        # Reshape the per-cell contributions back into the contingency shape
        return np.array(mi).reshape(contingency.shape).T
    except ValueError:
        # Reshaping fails when zero cells were dropped; return the flat values
        return mi
    #return mutual_info_matrix_v3(x=x, y=y, ct=ct)
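

# Final sanity check (illustrative): compare the three implementations on
# small, made-up R/S vectors (x_demo, y_demo). v2 and v3 should agree
# element-wise, and all three should sum to sklearn's mutual_info_score;
# note that the row/column ordering of v1 may differ, since it follows
# sklearn's contingency layout.
x_demo = ['R', 'R', 'S', 'R', 'S', 'S', 'R', 'S']
y_demo = ['R', 'S', 'S', 'R', 'S', 'R', 'R', 'S']
m1 = mutual_info_matrix_v1(x=x_demo, y=y_demo)
m2 = mutual_info_matrix_v2(x=x_demo, y=y_demo)
m3 = mutual_info_matrix_v3(x=x_demo, y=y_demo)
print(np.allclose(m2, m3))
print(m1.sum(), m2.sum(), m3.sum(), mutual_info_score(x_demo, y_demo))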