MIC package

v1 = [R, R, R, R]
v2 = [R, R, R, R]

# Libraries
import numpy as np
import pandas as pd

from scipy.stats.contingency import crosstab
from sklearn.metrics import mutual_info_score


def mutual_info_matrix_v3(x=None, y=None, ct=None):
    """Compute the component information score.

    .. note:: Might be inefficient but good for testing.

    .. note:: In order to be able to compute the mutual
              information score it is necessary to have
              variation within the variable. Thus, if
              there is only one class, should we return
              a result or a warning?

    Parameters
    ----------
    x: list
        List with the classes
    y: list
        List with the classes
    ct: np.ndarray
        Pre-computed contingency table (optional). If given,
        x and y are ignored.

    Returns
    -------
    np.ndarray
        Matrix with the component information scores.
    """

    def _check_nparray(obj, param_name):
        if obj is not None:
            if isinstance(obj, np.ndarray):
                return obj
            elif isinstance(obj, pd.Series):
                return obj.to_numpy()
            elif isinstance(obj, pd.DataFrame):
                return obj.to_numpy()
            elif isinstance(obj, list):
                return np.array(obj)
            else:
                raise ValueError("""
                       The input parameter '{0}' is of type '{1}' which is
                       not supported. Please ensure it is a np.ndarray."""
                                 .format(param_name, type(obj)))

    # Ensure they are all np arrays
    x = _check_nparray(x, 'x')
    y = _check_nparray(y, 'y')
    ct = _check_nparray(ct, 'ct')

    # Compute contingency
    if ct is None:
        c = crosstab(x, y)
        if isinstance(c, tuple):
            ct = c[-1]    # older scipy
        else:
            ct = c.count  # newer scipy

    # Variables
    n = ct.sum()
    pi = np.ravel(ct.sum(axis=1)) / n
    pj = np.ravel(ct.sum(axis=0)) / n

    # Create empty matrix
    m = np.empty(ct.shape)
    m[:] = np.nan

    # Fill with component information score
    with np.errstate(all='ignore'):
        for i in range(m.shape[0]):
            for j in range(m.shape[1]):
                pxy = ct[i, j] / n
                m[i, j] = pxy * np.log(pxy / (pi[i] * pj[j]))

    # Fill nan with 0 (lim x->0 of x*log(x) is 0)
    m[np.isnan(m)] = 0

    # Return
    return m


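# Hypothetical usage sketch (not part of the original script): the two vectors
# below are made-up susceptibility-like data. The component matrix returned by
# mutual_info_matrix_v3 should add up to the overall mutual information
# reported by sklearn.metrics.mutual_info_score (both use the natural log).
x_demo = ['R', 'R', 'S', 'S', 'R', 'S']
y_demo = ['R', 'S', 'S', 'S', 'R', 'R']
m3 = mutual_info_matrix_v3(x=x_demo, y=y_demo)
print(m3)
print(np.isclose(m3.sum(), mutual_info_score(x_demo, y_demo)))  # expect True

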
def mutual_info_matrix_v2(x=None, y=None, ct=None):
    """Computes the component information.

    The component information is calculated as below where X/Y
    denotes a state (e.g. RR).

        C(X/Y) = P(X/Y) * log(P(X/Y) / (P(X)*P(Y)))

          c11 c12
    C =   c21 c22

    pi = [pi1, pi2]
    pj = [pj1, pj2]

    ci11 = c11/n * np.log((c11/n) / (pi1*pj1))
    ci12 = c12/n * np.log((c12/n) / (pi1*pj2))
    ci21 = c21/n * np.log((c21/n) / (pi2*pj1))
    ci22 = c22/n * np.log((c22/n) / (pi2*pj2))

    .. warning:: It only works for square contingency matrices; that is, the
                 number of different classes appearing in the vectors x and y
                 must be the same.

                 This is enough for a susceptibility test with only R and S,
                 but it will fail if only one of them is present or if I is
                 introduced.

    Example
    -------
    sex           Female  Male  Total
    lbl
    Not Survived      89   483    572
    Survived         230   112    342
    Total            319   595    914

    # Manual example.
    ci11 = (89  / 914) * np.log((89  / 914) / ((572 / 914) * (319 / 914)))
    ci12 = (483 / 914) * np.log((483 / 914) / ((572 / 914) * (595 / 914)))
    ci21 = (230 / 914) * np.log((230 / 914) / ((342 / 914) * (319 / 914)))
    ci22 = (112 / 914) * np.log((112 / 914) / ((342 / 914) * (595 / 914)))
    m = np.array([[ci11, ci12], [ci21, ci22]])

    # Compute
    m = mutual_info_matrix_v2(x=data.lbl, y=data.sex)

    Parameters
    ----------
    x: list
        List with the classes
    y: list
        List with the classes
    ct: np.ndarray
        Pre-computed contingency table (optional). If given,
        x and y are ignored.

    Returns
    -------
    np.ndarray
        Matrix with the component information scores.
    """

    def _check_nparray(obj, param_name):
        if obj is not None:
            if isinstance(obj, np.ndarray):
                return obj
            elif isinstance(obj, pd.Series):
                return obj.to_numpy()
            elif isinstance(obj, pd.DataFrame):
                return obj.to_numpy()
            elif isinstance(obj, list):
                return np.array(obj)
            else:
                raise ValueError("""
                       The input parameter '{0}' is of type '{1}' which is
                       not supported. Please ensure it is a np.ndarray."""
                                 .format(param_name, type(obj)))

    # Ensure they are all np arrays
    x = _check_nparray(x, 'x')
    y = _check_nparray(y, 'y')
    ct = _check_nparray(ct, 'ct')

    # Compute contingency
    if ct is None:
        c = crosstab(x, y)
        if isinstance(c, tuple):
            ct = c[-1]    # older scipy
        else:
            ct = c.count  # newer scipy

    with np.errstate(divide='ignore', invalid='ignore'):
        # Variables
        n = ct.sum()
        pi = np.ravel(ct.sum(axis=1))
        pj = np.ravel(ct.sum(axis=0))

        # Compute matrix
        b = np.repeat(pi.reshape(-1, 1), len(pi), axis=1)
        c = np.repeat(pj.reshape(1, -1), len(pj), axis=0)
        m = (ct/n) * np.log((ct/n) / ((b/n) * (c/n)))

    # Fill nan with 0 (lim x->0 of x*log(x) is 0)
    m[np.isnan(m)] = 0

    # Return
    return m


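# Hypothetical check (not part of the original script): reuse the contingency
# table from the docstring example (survival vs sex) through the ct= argument
# and compare the vectorised v2 against the loop-based v3; under these
# assumptions the two matrices should agree element-wise.
ct_demo = np.array([[89, 483],
                    [230, 112]])
m2 = mutual_info_matrix_v2(ct=ct_demo)
print(m2)
print(np.allclose(m2, mutual_info_matrix_v3(ct=ct_demo)))  # expect True

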
def mutual_info_matrix_v1(x=None, y=None, *, ct=None):
    """Computes the mutual information matrix.

    The component information is calculated as below where X/Y
    denotes a state (e.g. RR).

        C(X/Y) = P(XY) * log(P(XY) / (P(X)*P(Y)))
               = P(XY) * [log(P(XY)) - log(P(X)*P(Y))]
               = P(XY) * [log(P(XY)) - (log(P(X)) + log(P(Y)))]
               = P(XY) * [log(P(XY)) - log(P(X)) - log(P(Y))]

    .. note:: It is inspired by the code from sklearn.metrics.mutual_info_score.

    Notes
    -----
    The logarithm used is the natural logarithm (base-e).
    """
    from math import log
    from scipy import sparse as sp
    from sklearn.metrics.cluster._supervised import check_clusterings
    from sklearn.metrics.cluster._supervised import check_array
    from sklearn.metrics.cluster._supervised import contingency_matrix

    labels_true, labels_pred, contingency = x, y, ct

    if contingency is None:
        labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
        contingency = contingency_matrix(labels_true, labels_pred, sparse=True)
    else:
        contingency = check_array(
            contingency,
            accept_sparse=["csr", "csc", "coo"],
            dtype=[int, np.int32, np.int64],
        )

    if isinstance(contingency, np.ndarray):
        # For an array
        nzx, nzy = np.nonzero(contingency)
        nz_val = contingency[nzx, nzy]
    elif sp.issparse(contingency):
        # For a sparse matrix
        nzx, nzy, nz_val = sp.find(contingency)
    else:
        raise ValueError("Unsupported type for 'contingency': %s" % type(contingency))

    contingency_sum = contingency.sum()
    pi = np.ravel(contingency.sum(axis=1))
    pj = np.ravel(contingency.sum(axis=0))

    # Since MI <= min(H(X), H(Y)), any labelling with zero entropy, i.e.
    # containing a single cluster, implies MI = 0
    if pi.size == 1 or pj.size == 1:
        return 0.0

    log_contingency_nm = np.log(nz_val)
    contingency_nm = nz_val / contingency_sum
    # Don't need to calculate the full outer product, just for non-zeroes
    outer = pi.take(nzx).astype(np.int64, copy=False) * \
            pj.take(nzy).astype(np.int64, copy=False)  # b*c
    log_outer = -np.log(outer) + np.log(pi.sum()) + np.log(pj.sum())

    mi = (
        contingency_nm * (log_contingency_nm - np.log(contingency_sum))
        + contingency_nm * log_outer
    )
    mi = np.where(np.abs(mi) < np.finfo(mi.dtype).eps, 0.0, mi)

    # return np.clip(mi.sum(), 0.0, None), mi
    try:
        return np.array(mi).reshape(contingency.shape).T
    except Exception:
        return mi
        # return mutual_info_matrix_v3(x=x, y=y, ct=ct)
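

# Hypothetical cross-check (not part of the original script): the vectors below
# are made up for illustration. Each matrix should sum to the overall mutual
# information reported by sklearn.metrics.mutual_info_score (natural log), and
# v2 and v3 should agree element-wise; v1 may arrange the components in a
# different orientation, so only its total is compared here.
x_chk = ['R', 'S', 'R', 'S', 'R', 'R', 'S', 'S']
y_chk = ['S', 'S', 'R', 'S', 'R', 'R', 'R', 'S']
m_v1 = mutual_info_matrix_v1(x=x_chk, y=y_chk)
m_v2 = mutual_info_matrix_v2(x=x_chk, y=y_chk)
m_v3 = mutual_info_matrix_v3(x=x_chk, y=y_chk)
mi_ref = mutual_info_score(x_chk, y_chk)
print(np.allclose(m_v2, m_v3))          # expect True
print(np.isclose(m_v1.sum(), mi_ref))   # expect True
print(np.isclose(m_v3.sum(), mi_ref))   # expect True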
