Note
Click here to download the full example code
02. Aggregation
Aggregate each patient’s data into a single record of summary statistics (one column per patient in the output below).
Out:
1/ 300. Patient '0'
11/ 300. Patient '10'
21/ 300. Patient '20'
31/ 300. Patient '30'
41/ 300. Patient '40'
51/ 300. Patient '50'
61/ 300. Patient '60'
71/ 300. Patient '70'
81/ 300. Patient '80'
91/ 300. Patient '90'
101/ 300. Patient '100'
0 1 2 3 4 5 6 ... 94 95 96 97 98 99 100
max_f_0 0.999396 0.999916 0.998421 0.999023 0.999431 0.997913 0.997867 ... 0.995800 0.998766 0.998735 0.998977 9.976324e-01 0.991941 0.998891
max_f_1 0.999436 0.997938 0.999444 0.999926 0.998605 0.995591 0.999488 ... 0.998451 0.999851 0.995863 0.998557 9.971850e-01 0.992134 0.994609
max_f_2 0.996942 0.998603 0.998015 0.995936 0.993793 0.999142 0.998222 ... 0.999786 0.996589 0.994065 0.998225 9.991617e-01 0.999670 0.997319
max_f_3 0.999834 0.999682 0.994780 0.998556 0.999465 0.997353 0.998338 ... 0.998686 0.999914 0.999150 0.996743 9.990733e-01 0.998847 0.998722
max_f_4 0.998651 0.999780 0.999122 0.998361 0.998318 0.999930 0.999977 ... 0.998310 0.999673 0.993660 0.999548 9.964524e-01 0.989648 0.995319
max_f_5 0.997312 0.998754 0.998809 0.997283 0.999338 0.998211 0.999350 ... 0.994484 0.999991 0.998394 0.999013 9.987785e-01 0.999315 0.999718
max_f_6 0.999305 0.999753 0.999907 0.996438 0.999014 0.998860 0.999388 ... 0.993969 0.997109 0.999011 0.997794 9.979976e-01 0.999533 0.999551
max_f_7 0.998653 0.999945 0.996116 0.990794 0.999622 0.994723 0.998606 ... 0.997663 0.999109 0.999812 0.999841 9.911849e-01 0.999201 0.996596
max_f_8 0.996390 0.997969 0.997322 0.997818 0.997749 0.997650 0.999295 ... 0.999591 0.998346 0.991803 0.999475 9.987050e-01 0.998879 0.997566
max_f_9 0.999218 0.985888 0.999487 0.995975 0.993271 0.998376 0.996804 ... 0.996486 0.998782 0.998949 0.999446 9.992994e-01 0.999501 0.999945
max_id 0.000000 1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 ... 94.000000 95.000000 96.000000 97.000000 9.800000e+01 99.000000 100.000000
min_f_0 0.006612 0.005354 0.001194 0.001723 0.000572 0.001657 0.000188 ... 0.000205 0.003465 0.002174 0.000060 4.641318e-04 0.003149 0.012990
min_f_1 0.000657 0.009678 0.003852 0.005713 0.000374 0.001143 0.000361 ... 0.002627 0.001206 0.000043 0.000317 2.485503e-04 0.000097 0.001037
min_f_2 0.004106 0.002320 0.000009 0.001365 0.002018 0.003379 0.005193 ... 0.001478 0.000453 0.003230 0.000050 2.346579e-03 0.004809 0.000394
min_f_3 0.001637 0.000820 0.001093 0.000946 0.002665 0.000166 0.006021 ... 0.003038 0.002047 0.003247 0.000594 3.530193e-03 0.001577 0.000516
min_f_4 0.001236 0.000880 0.001519 0.002644 0.002253 0.004729 0.001125 ... 0.000415 0.004389 0.003957 0.001519 1.975811e-03 0.000841 0.000739
min_f_5 0.007147 0.001026 0.000837 0.000039 0.000605 0.002531 0.001165 ... 0.002278 0.000446 0.003155 0.002953 4.909229e-03 0.000199 0.001498
min_f_6 0.001654 0.000702 0.000005 0.002121 0.002108 0.003364 0.001350 ... 0.001140 0.001168 0.004485 0.001613 5.254541e-08 0.001442 0.000083
min_f_7 0.001018 0.000803 0.010409 0.002907 0.003485 0.002294 0.004204 ... 0.001379 0.001513 0.005337 0.003602 5.742431e-04 0.002944 0.001785
min_f_8 0.004807 0.000146 0.002599 0.000593 0.001437 0.000679 0.000507 ... 0.001697 0.001800 0.001131 0.002471 1.088023e-03 0.000636 0.001334
min_f_9 0.001023 0.000182 0.000290 0.000437 0.000455 0.008994 0.002949 ... 0.000819 0.003768 0.000417 0.003779 1.259451e-03 0.001748 0.001426
min_id 0.000000 1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 ... 94.000000 95.000000 96.000000 97.000000 9.800000e+01 99.000000 100.000000
id 0.000000 1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 ... 94.000000 95.000000 96.000000 97.000000 9.800000e+01 99.000000 100.000000
[23 rows x 101 columns]
8 # Generic
9 import warnings
10 import numpy as np
11 import pandas as pd
12
13 # Ignore warnings
14 warnings.simplefilter("ignore")
15
def load_data():
    """Read the laboratory dataset from ``./laboratory.csv``.

    The ``date`` column is parsed as dates while reading.

    Returns
    -------
    The object returned by ``pd.read_csv`` for that file.
    """
    csv_path = './laboratory.csv'
    frame = pd.read_csv(csv_path, parse_dates=['date'])
    return frame
18
def create_data(rows=150000, cols=10, patients=300):
    """Create a random dataset that mimics per-patient measurements.

    Parameters
    ----------
    rows: int
        Number of observations (rows) to generate.
    cols: int
        Number of feature columns; they are named ``f_0``,
        ``f_1``, ... ``f_<cols-1>``.
    patients: int
        Exclusive upper bound for the patient identifiers, which
        are drawn at random from ``[0, patients)``.

    Returns
    -------
    pd.DataFrame
        DataFrame with ``cols`` feature columns holding random
        floats in ``[0, 1)`` plus an integer ``id`` column with
        the patient identifier of each row.
    """
    # Create random values
    features = np.random.random_sample((rows, cols))
    # Draw the identifiers as a 1D vector (one id per row) so that
    # they map directly onto a single DataFrame column.
    ids = np.random.randint(patients, size=rows)

    # Create DataFrame
    df = pd.DataFrame(data=features)
    df = df.add_prefix('f_')
    df['id'] = ids

    # Return
    return df
36
37
# -----------------------
# Read data
# -----------------------
# To work with the real dataset, read it from disk instead:
#data = pd.read_csv('./laboratory.csv', parse_dates=['date'])

data = create_data()


# -----------------------
# Format
# -----------------------
# Configuration
show_progress_every = 10  # Print progress every N patients
# Stop after the patient at this zero-based position (or None to
# process everything). Since the comparison is against the
# zero-based index, break_loop_after + 1 patients are aggregated.
break_loop_after = 100

# Per-patient outcomes. They are collected in a list and
# concatenated once at the end; concatenating inside the loop
# copies the whole accumulated DataFrame on every iteration.
collected = []

# Groups
groups = data.groupby(by='id')
n_groups = len(groups)

# Step by step (one group per patient)
for i, (k, g) in enumerate(groups):
    # Show information
    if (i % show_progress_every) == 0:
        print("%5s/%5s. Patient '%s'" % (i+1, n_groups, k))

    # Show dataframe
    #print(g)

    # Sort by dates (if needed)

    # Fill empty values
    #g.fillna(method='backfill', inplace=True)

    # Compute statistics
    # ------------------
    # .. note: Forward/backward filling does not affect
    #          the max/min but it affects the mean or
    #          median.
    #
    # .. note: You could also create a map with all the
    #          functions you want to apply instead of
    #          using describe. This is useful if you need
    #          specific stats

    # Get the common stats
    #d = g.describe()

    # Get specific stats for all columns (this includes the 'id'
    # column, hence the 'max_id' and 'min_id' entries).
    d = g.agg({c: ['max', 'min'] for c in g.columns})

    # Stack the outcome into a single Series and flatten the
    # (statistic, column) index into labels such as 'max_f_0'.
    d = d.stack()
    d.index = ['_'.join(e) for e in d.index.tolist()]
    d['id'] = k                    # patient identifier
    #d['date'] = min(g['date'])    # admission date

    # Append result
    collected.append(d)

    # Break clause for testing. When break_loop_after is None the
    # comparison is never true and all the groups are processed.
    if i == break_loop_after:
        break

# Build the outcome: one column per patient, one row per statistic.
if collected:
    results = pd.concat(collected, ignore_index=True, axis=1)
else:
    results = pd.DataFrame()

# Show columns
#print(results.columns.values)

print(results)

# Show results (partially)
#print(results[['id', 'date', 'max_wcc']])

# .. note: Once it works as you want, you can try to do it
#          in one single line and compare the results to
#          verify that it is correct.
Total running time of the script: ( 0 minutes 0.868 seconds)