Note
Click here to download the full example code
05. IQRFilter
Example of implementing an IQR filter.
Out:
[[ 1. 2. 3. 4.]
[ 5. 6. nan nan]
[ 9. 1. nan 3.]
[ 1. 2. 3. 4.]
[ 1. 2. 3. 3.]
[ 3. 7. 3. 4.]
[ 1. 2. 3. nan]
[ 3. 7. 3. 4.]
[ 1. 2. 3. 4.]
[ 3. 6. 3. 4.]
[ 2. 2. nan nan]]
[1 1 1 1 1 1 0 0 0 0 0]
9 # Import from future.
10 from __future__ import division
11
12 # Libraries
13 import numpy as np
14
15 # --------------------------------------------------------------------------
16 # Inter-Quantile Range filter
17 # --------------------------------------------------------------------------
class IQRFilter():
    """Set the cells lying outside the interquartile range rule to np.nan.

    For every column, the lower and upper percentiles given by ``iqrrange``
    are computed (ignoring nan values) and the accepted interval is::

        [lower - coefficient * (upper - lower),
         upper + coefficient * (upper - lower)]

    Values outside that interval are replaced by ``np.nan``. The filter
    can be fitted on a single data matrix, or on a data matrix together
    with a vector of classes. The latter computes the bounds and performs
    the filtering for each class independently.

    .. todo: Return indicator with values set as nan.
    .. note: The code could be simplified.
    .. note: The code could check input classes and raise error.

    """

    def __init__(self, iqrrange=(25, 75), coefficient=1.5):
        """The constructor.

        Parameters
        ----------
        iqrrange : sequence of two numbers
            Lower and upper percentiles (in 0-100) delimiting the range.
        coefficient : float
            Multiplier applied to the range to widen the accepted interval.
        """
        # Keep a private list so that instances never share (or mutate)
        # the default argument.
        self.iqrrange = list(iqrrange)
        self.coefficient = coefficient
        self.lower_coefs = None
        self.upper_coefs = None
        # Mapping {category: row index in lower_coefs/upper_coefs}. It is
        # None while the filter is unfitted or fitted without classes.
        self._classes = None

    def __repr__(self):
        """Return the textual representation of the filter."""
        return "IQRFilter(iqrrange=%s, coefficient=%s)" % \
               (self.iqrrange, self.coefficient)

    # --------------------------
    # generic methods
    # --------------------------
    def _fit(self, X):
        """Compute the lower and upper bounds for each column of X.

        Parameters
        ----------
        X : array-like
            The data; percentiles are computed along axis 0 ignoring nan.

        Returns
        -------
        tuple
            The pair (lower bounds, upper bounds).
        """
        # Compute lower and upper percentiles
        lower_quartiles, upper_quartiles = \
            np.nanpercentile(X, self.iqrrange, axis=0)

        # Compute the (scaled) interquartile range
        iqrs = (upper_quartiles - lower_quartiles) * self.coefficient

        # Return the bounds
        return lower_quartiles - iqrs, upper_quartiles + iqrs

    # --------------------------
    # single class methods
    # --------------------------
    def _fit_s(self, X):
        """Fit the filter for a single category.

        Parameters
        ----------
        X : array-like
            The data.

        Returns
        -------
        IQRFilter instance
        """
        # Compute the bounds
        lower, upper = self._fit(np.asarray(X))

        # Format to arrays with a single row
        self.lower_coefs = np.asarray(lower).reshape(1, -1)
        self.upper_coefs = np.asarray(upper).reshape(1, -1)

        # Forget the categories of any previous multi-class fit, otherwise
        # filter() would keep requesting the y parameter.
        self._classes = None

        # Return
        return self

    def _filter_s(self, X):
        """Filter the data using the single category bounds.

        Parameters
        ----------
        X : array-like
            The data.

        Returns
        -------
        np.ndarray
            A float copy of X with the outliers set to np.nan.
        """
        # Copy X as float (integer arrays cannot hold np.nan)
        F = np.array(X, dtype=float)

        # Indexes (comparisons against existing nan values are False)
        with np.errstate(invalid='ignore'):
            is_lower = F < self.lower_coefs[0, :]
            is_upper = F > self.upper_coefs[0, :]

        # Filter
        F[is_lower | is_upper] = np.nan

        # Return
        return F

    # ----------------------------
    # multiple class methods
    # ----------------------------
    def _fit_m(self, X, y):
        """Fit the filter for multiple categories.

        Note: the attribute _classes is a dictionary in which the key is
              the category identifier and the value is the row index
              within lower_coefs and upper_coefs.

        Parameters
        ----------
        X : array-like
            The data.
        y : array-like
            The category of each row in X.

        Returns
        -------
        IQRFilter instance
        """
        # Format inputs so that boolean indexing works for lists too
        X = np.asarray(X)
        y = np.asarray(y).ravel()

        # Find the categories once and fit each of them a single time
        categories = np.unique(y)
        bounds = [self._fit(X[y == c]) for c in categories]

        # Format to arrays (one row per category)
        self.lower_coefs = np.array([lower for lower, _ in bounds])
        self.upper_coefs = np.array([upper for _, upper in bounds])

        # Set classes
        self._classes = {c: idx for idx, c in enumerate(categories)}

        # Return
        return self

    def _filter_m(self, X, y):
        """Filter the data using the bounds of each category.

        Parameters
        ----------
        X : array-like
            The data.
        y : array-like
            The category of each row in X.

        Returns
        -------
        np.ndarray
            A float copy of X with the outliers set to np.nan.
        """
        # Copy matrix as float (integer arrays cannot hold np.nan)
        F = np.array(X, dtype=float)
        y = np.asarray(y).ravel()

        # For each category
        for category, index in self._classes.items():
            # Column vector selecting the rows of this category; it
            # broadcasts against the (rows, columns) outlier mask.
            is_y = (y == category).reshape(-1, 1)

            # Indexes (comparisons against existing nan values are False)
            with np.errstate(invalid='ignore'):
                is_lower = F < self.lower_coefs[index, :]
                is_upper = F > self.upper_coefs[index, :]

            # Filter
            F[is_y & (is_lower | is_upper)] = np.nan

        # Return
        return F

    # -------------------------
    # caller methods
    # -------------------------
    def fit(self, X, y=None):
        """Fit the filter to the data.

        Parameters
        ----------
        X : array-like
            The data.
        y : array-like, optional
            The category of each row in X. If None, a single set of
            bounds is computed from the whole of X.

        Returns
        -------
        self
        """
        # Fit filter
        if y is None:
            self._fit_s(X)
        else:
            self._fit_m(X, y)
        # Return
        return self

    def filter(self, X, y=None):
        """Filter the input.

        Parameters
        ----------
        X : array-like
            The data.
        y : array-like, optional
            The category of each row in X. Required if (and only if)
            the filter was fitted with categories.

        Returns
        -------
        np.ndarray
            A float copy of X with the outliers set to np.nan.

        Raises
        ------
        TypeError
            If the filter has not been fitted, if y does not agree with
            the way the filter was fitted, or if y contains categories
            not seen during the fitting process.
        """
        # The object has not been previously fitted
        if self.lower_coefs is None or self.upper_coefs is None:
            raise TypeError("The instance IQRFilter has not been fitted.")

        # getattr keeps instances created before _classes existed working
        classes = getattr(self, '_classes', None)

        # The instance has been fitted without classes
        if classes is None:
            if y is not None:
                raise TypeError("The instance IQRFilter has been fitted "
                                "without categories. As such, the y "
                                "parameter cannot be used.")
            return self._filter_s(X)

        # The instance has been fitted with classes
        if y is None:
            raise TypeError("The instance IQRFilter has been fitted with "
                            "several categories (%s). As such, the y "
                            "parameter is required to identify the "
                            "categories." % classes.keys())

        # Verify that all classes are included
        y_classes = set(np.unique(y))
        f_classes = set(classes.keys())
        if y_classes - f_classes:
            raise TypeError("There are categories in the inputed y (%s) which "
                            "were not seen during the fiting process (%s). As "
                            "such the data cannot be filtered." %
                            (y_classes, f_classes))

        # Multiple category
        return self._filter_m(X, y)

    def fit_filter(self, X, y=None):
        """Fit the filter and filter the same data.

        Parameters
        ----------
        X : array-like
            The data.
        y : array-like, optional
            The category of each row in X.

        Returns
        -------
        np.ndarray or tuple
            The filtered data if y is None, otherwise the pair
            (filtered data, y).
        """
        # Fit
        self.fit(X, y)
        # Filter
        if y is None:
            return self.filter(X)
        # Return
        return self.filter(X, y), y
241
242
243
244
if __name__ == '__main__':
    # Demonstration: filter a small matrix in which every row belongs to
    # one of two categories, computing the IQR bounds per category.

    # Import
    import warnings

    import matplotlib as mpl
    import numpy as np

    # ------------------------------------
    # basic configuration
    # ------------------------------------
    # Ignore all the warnings
    warnings.simplefilter('ignore')

    # Set matplotlib
    mpl.rcParams.update({
        'xtick.labelsize': 9,
        'ytick.labelsize': 9,
        'axes.titlesize': 11,
        'legend.fontsize': 9,
    })

    # Set print options
    np.set_printoptions(precision=2)

    # ------------------------------------
    # create data
    # ------------------------------------
    # Create feature data
    data = np.array([[1, 2, 3, 4],
                     [5, 6, 7, 8],
                     [9, 1, 2, 3],
                     [1, 2, 3, 4],
                     [1, 2, 3, 3],
                     [3, 7, 3, 4],
                     [1, 2, 3, 3],
                     [3, 7, 3, 4],
                     [1, 2, 3, 4],
                     [3, 6, 3, 4],
                     [2, 2, -55, 55]])

    # Create categories (one per row of data)
    y = np.array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

    # --------------
    # IQR filtering
    # --------------
    # Create filter object
    iqr = IQRFilter(iqrrange=[25, 75], coefficient=1.5)

    # Fit and filter (with categories the labels are returned as well)
    X, y = iqr.fit_filter(data, y)

    # Show
    print(X)
    print(y)
Total running time of the script: ( 0 minutes 0.008 seconds)