05. IQRFilter

Example of implementing an IQR filter.

Out:

[[ 1.  2.  3.  4.]
 [ 5.  6. nan nan]
 [ 9.  1. nan  3.]
 [ 1.  2.  3.  4.]
 [ 1.  2.  3.  3.]
 [ 3.  7.  3.  4.]
 [ 1.  2.  3. nan]
 [ 3.  7.  3.  4.]
 [ 1.  2.  3.  4.]
 [ 3.  6.  3.  4.]
 [ 2.  2. nan nan]]
[1 1 1 1 1 1 0 0 0 0 0]

  9 # Import from future.
 10 from __future__ import division
 11
 12 # Libraries
 13 import numpy as np
 14
 15 # --------------------------------------------------------------------------
 16 #                       Inter-Quantile Range filter
 17 # --------------------------------------------------------------------------
 18 class IQRFilter():
 19     """This filter set those cells which lie outside of the
 20     interquantile range rule as np.nan. Ir performs iqr
 21     filtering for a single data matrix or a matrix with the
 22     corresponding classes. The latter performs the filtering
 23     for each class independently.
 24
 25     .. todo: Return indicator with values set as nan.
 26     .. note: The code could be simplified.
 27     .. note: The coud could check input classes and raise error.
 28
 29     """
 30
 31     def __init__(self, iqrrange=[25, 75], coefficient=1.5):
 32         """The constructor"""
 33         self.iqrrange = iqrrange
 34         self.coefficient = coefficient
 35         self.lower_coefs = None
 36         self.upper_coefs = None
 37
 38     def __repr__(self):
 39         """
 40         """
 41         return "IQRFilter(iqrrange=%s, coefficient=%s)" % \
 42                (self.iqrrange, self.coefficient)
 43
 44     # --------------------------
 45     #   generic methods
 46     # --------------------------
 47     def _fit(self, X):
 48         """This method computes the lower and upper percentiles
 49         """
 50         # Compute lower and uper quartiles
 51         lower_quartiles, upper_quartiles = \
 52             np.nanpercentile(X, self.iqrrange, axis=0)
 53
 54         # Compute the interquantile range
 55         iqrs = (upper_quartiles - lower_quartiles) * self.coefficient
 56
 57         # Set parameters
 58         return lower_quartiles - iqrs, upper_quartiles + iqrs
 59
 60     # --------------------------
 61     #   single class methods
 62     # --------------------------
 63     def _fit_s(self, X):
 64         """This method fits single category.
 65
 66          Parameters
 67          ----------
 68          X :
 69
 70          Returns
 71          -------
 72          IQRFIlter instance
 73         """
 74         # Create the array coefficients
 75         self.lower_coefs, self.upper_coefs = self._fit(X)
 76
 77         # Format to array
 78         self.lower_coefs = self.lower_coefs.reshape(1, -1)
 79         self.upper_coefs = self.upper_coefs.reshape(1, -1)
 80
 81         # Return
 82         return self
 83
 84     def _filter_s(self, X):
 85         """This method filters single category.
 86
 87         Parameters
 88         ----------
 89         X :
 90
 91         Returns
 92         -------
 93         np.ndarray
 94         """
 95         # Copy X
 96         F = np.copy(X)
 97
 98         # Indexes
 99         is_lower = F < self.lower_coefs[0, :]
100         is_upper = F > self.upper_coefs[0, :]
101
102         # Filter
103         F[is_lower | is_upper] = np.nan
104
105         # Return
106         return F
107
108     # ----------------------------
109     #   multiple class methods
110     # ----------------------------
111     def _fit_m(self, X, y):
112         """This method fits multiple category
113
114         Note: the attribute _classes is a dictionary in which the key is
115               the ategory identifier and the value is the index within
116               the lower_coefs and upper_coefs.
117
118         Parameters
119         ----------
120         X :
121         y :
122
123         Returns
124         -------
125         IQRFIlter instance
126         """
127         # Create matrices with coefficients for each class
128         self.lower_coefs = [self._fit(X[y == c])[0] for c in np.unique(y)]
129         self.upper_coefs = [self._fit(X[y == c])[1] for c in np.unique(y)]
130
131         # Format to array
132         self.lower_coefs = np.array(self.lower_coefs)
133         self.upper_coefs = np.array(self.upper_coefs)
134
135         # Set classes
136         self._classes = {c: idx for idx, c in enumerate(np.unique(y))}
137
138         # Return
139         return self
140
141     def _filter_m(self, X, y):
142         """This method filters multiple category.
143
144
145         Parameters
146         ----------
147         X :
148         y :
149
150         Returns
151         -------
152         np.ndarray
153         """
154         # Copy matrix
155         F = np.copy(X).astype(float)
156
157         # For each category
158         for category, index in self._classes.items():
159             # Indexes
160             is_y = np.repeat((y == category).reshape(-1, 1), X.shape[1], 1)
161             is_lower = F < self.lower_coefs[index, :]
162             is_upper = F > self.upper_coefs[index, :]
163
164             # Filter
165             F[is_y & (is_lower | is_upper)] = np.nan
166
167         # Return
168         return F
169
170     # -------------------------
171     #   caller methods
172     # -------------------------
173     def fit(self, X, y=None):
174         """This method fits the filter to the data.
175
176         Parameters
177         ----------
178         X :
179         y :
180
181         Returns
182         -------
183         self
184         """
185         # Fit filter
186         if y is None:
187             self._fit_s(X)
188         else:
189             self._fit_m(X, y)
190         # Return
191         return self
192
193     def filter(self, X, y=None):
194         """This method filters the input
195
196         Parameters
197         ----------
198         X :
199
200         Returns
201         -------
202         np.ndarray
203         """
204         # The object has not been previously fitted
205         if self.lower_coefs is None or self.upper_coefs is None:
206             raise TypeError("The instance IQRFilter has not been fitted.")
207
208         # The instance has been fitted with classes
209         if hasattr(self, '_classes') and y is None:
210             raise TypeError("The instance IQRFilter has been fitted with "
211                             "several categories (%s). As such, the y "
212                             "parameter is required to identify the "
213                             "categories." % self._classes.keys())
214
215         # Verify that all classes are included
216         if hasattr(self, '_classes'):
217             y_classes = set(np.unique(y))
218             f_classes = set(self._classes.keys())
219             if bool(y_classes - f_classes):
220                 raise TypeError("There are categories in the inputed y (%s) which "
221                                 "were not seen during the fiting process (%s). As "
222                                 "such the data cannot be filtered." %
223                                 (y_classes, f_classes))
224
225         # Filter
226         if y is None:
227             return self._filter_s(X)
228         # Multiple category
229         return self._filter_m(X, y)
230
231     def fit_filter(self, X, y=None):
232         """This method fits and filters.
233         """
234         # Fit
235         self.fit(X, y)
236         # Filter
237         if y is None:
238             return self.filter(X)
239         # Return
240         return self.filter(X, y), y
241
242
243
244
if __name__ == '__main__':
    # Demonstration: fit the filter per category on a small matrix and
    # print the filtered matrix followed by the categories.

    # Import
    import warnings

    # ------------------------------------
    # basic configuration
    # ------------------------------------
    # Ignore all the warnings
    warnings.simplefilter('ignore')

    # Set print options
    np.set_printoptions(precision=2)

    # ------------------------------------
    # create data
    # ------------------------------------
    # Create feature data
    data = np.array([[1, 2, 3, 4],
                     [5, 6, 7, 8],
                     [9, 1, 2, 3],
                     [1, 2, 3, 4],
                     [1, 2, 3, 3],
                     [3, 7, 3, 4],
                     [1, 2, 3, 3],
                     [3, 7, 3, 4],
                     [1, 2, 3, 4],
                     [3, 6, 3, 4],
                     [2, 2, -55, 55]])

    # Create categories (one per row of data)
    categories = np.array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

    # --------------
    # IQR filtering
    # --------------
    # Create filter object
    iqr = IQRFilter(iqrrange=[25, 75], coefficient=1.5)

    # Fit and filter (the bounds are computed for each category)
    X, y = iqr.fit_filter(data, categories)

    # Show
    print(X)
    print(y)

Total running time of the script: ( 0 minutes 0.008 seconds)

Gallery generated by Sphinx-Gallery