Note
Click here to download the full example code
02. Aggregation
Aggregate patient’s data in a single row.
Out:
1/ 300. Patient '0'
11/ 300. Patient '10'
21/ 300. Patient '20'
31/ 300. Patient '30'
41/ 300. Patient '40'
51/ 300. Patient '50'
61/ 300. Patient '60'
71/ 300. Patient '70'
81/ 300. Patient '80'
91/ 300. Patient '90'
101/ 300. Patient '100'
0 1 2 3 4 5 6 7 8 9 10 ... 90 91 92 93 94 95 96 97 98 99 100
max_f_0 0.999166 0.995902 0.990377 0.999191 0.997868 0.997490 0.997091 0.998154 0.998121 0.995806 0.998504 ... 0.998043 0.997191 0.997522 0.994520 0.999044 0.999562 0.997723 0.998982 0.997026 0.998981 0.996704
max_f_1 0.995113 0.998210 0.999969 0.999881 0.998306 0.995908 0.999442 0.998846 0.999314 0.996838 0.999810 ... 0.999437 0.998326 0.999892 0.994770 0.999142 0.999939 0.998722 0.999566 0.997692 0.999654 0.994968
max_f_2 0.997346 0.999410 0.996470 0.993386 0.999083 0.999827 0.998781 0.999226 0.998101 0.997776 0.998826 ... 0.999026 0.996765 0.996579 0.998870 0.999100 0.996796 0.999992 0.993990 0.997819 0.998853 0.997467
max_f_3 0.999931 0.997206 0.999767 0.999457 0.999507 0.999245 0.999124 0.997490 0.999097 0.997301 0.994163 ... 0.999800 0.999824 0.999891 0.999744 0.999810 0.994255 0.998567 0.996788 0.992755 0.997598 0.997755
max_f_4 0.999696 0.998864 0.999455 0.998960 0.999457 0.999056 0.998377 0.998563 0.999333 0.995236 0.998548 ... 0.997040 0.996794 0.996363 0.995791 0.999824 0.997369 0.999177 0.998516 0.994851 0.999645 0.997877
max_f_5 0.997081 0.997976 0.999938 0.998338 0.995882 0.999872 0.999770 0.998595 0.995579 0.999844 0.997590 ... 0.998979 0.999847 0.998569 0.997307 0.999024 0.998335 0.993630 0.996399 0.999485 0.993851 0.996774
max_f_6 0.998300 0.999404 0.994047 0.999286 0.999829 0.994948 0.995964 0.995430 0.995857 0.999728 0.998278 ... 0.996676 0.998386 0.997306 0.999243 0.998980 0.998677 0.996776 0.999945 0.997120 0.999502 0.999313
max_f_7 0.998698 0.996507 0.997965 0.999789 0.999582 0.999186 0.999822 0.999902 0.998861 0.998005 0.999013 ... 0.998608 0.995124 0.996183 0.997998 0.996933 0.991781 0.997209 0.999821 0.999399 0.999406 0.998789
max_f_8 0.996938 0.999981 0.998824 0.999493 0.997097 0.996062 0.998225 0.999827 0.997406 0.999337 0.999036 ... 0.999644 0.997824 0.998300 0.998979 0.999669 0.999156 0.997957 0.993494 0.996177 0.998384 0.996794
max_f_9 0.997907 0.998515 0.999214 0.999294 0.998242 0.997000 0.991230 0.998908 0.997639 0.999711 0.998592 ... 0.998912 0.999204 0.994821 0.995526 0.998980 0.996725 0.999728 0.992341 0.997040 0.998998 0.999728
max_id 0.000000 1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.000000 8.000000 9.000000 10.000000 ... 90.000000 91.000000 92.000000 93.000000 94.000000 95.000000 96.000000 97.000000 98.000000 99.000000 100.000000
min_f_0 0.001158 0.000341 0.001459 0.001328 0.003367 0.000282 0.003246 0.000030 0.002715 0.006384 0.002116 ... 0.001583 0.002259 0.000094 0.001797 0.000738 0.002264 0.000749 0.000525 0.001551 0.001233 0.000840
min_f_1 0.001629 0.000667 0.002398 0.000303 0.004581 0.000233 0.000054 0.000806 0.002058 0.000366 0.002019 ... 0.008139 0.003167 0.005427 0.001694 0.001227 0.000321 0.000411 0.000395 0.002667 0.000293 0.003114
min_f_2 0.004299 0.001128 0.002832 0.003280 0.000253 0.001473 0.002829 0.002240 0.000996 0.000508 0.002240 ... 0.000079 0.000102 0.001354 0.001029 0.000172 0.001957 0.001380 0.001959 0.002138 0.003181 0.002199
min_f_3 0.000050 0.002135 0.001911 0.001414 0.007655 0.000319 0.003766 0.000352 0.000134 0.000252 0.003958 ... 0.004889 0.001966 0.000667 0.001421 0.000879 0.002310 0.001971 0.001254 0.000090 0.001126 0.001467
min_f_4 0.006641 0.005255 0.002976 0.002150 0.002447 0.007052 0.000135 0.000283 0.001841 0.005424 0.000027 ... 0.000459 0.000446 0.000177 0.003633 0.001138 0.001786 0.000104 0.003367 0.000429 0.001537 0.001116
min_f_5 0.003584 0.003522 0.002967 0.002021 0.005047 0.000035 0.001684 0.002565 0.007492 0.002283 0.000773 ... 0.001026 0.005296 0.002557 0.002348 0.003484 0.003363 0.002438 0.000201 0.002090 0.001767 0.003236
min_f_6 0.000212 0.001931 0.003289 0.001045 0.001465 0.001397 0.001558 0.000534 0.001040 0.003722 0.001215 ... 0.000075 0.007023 0.000834 0.003977 0.000709 0.000584 0.003408 0.002159 0.000877 0.004342 0.001188
min_f_7 0.000478 0.000270 0.001497 0.001129 0.001838 0.000184 0.000110 0.000193 0.000597 0.000239 0.004206 ... 0.002373 0.000382 0.000904 0.001935 0.001082 0.000176 0.000153 0.000111 0.008172 0.000221 0.001106
min_f_8 0.001668 0.005238 0.000820 0.002464 0.001592 0.005168 0.001785 0.000654 0.001021 0.005254 0.003541 ... 0.002647 0.001073 0.000753 0.000128 0.000405 0.001337 0.001039 0.000515 0.002775 0.001568 0.002494
min_f_9 0.000979 0.002115 0.002794 0.002009 0.001399 0.000759 0.001002 0.001899 0.001864 0.001549 0.001589 ... 0.000904 0.000853 0.000076 0.001616 0.000143 0.009148 0.001240 0.000396 0.002274 0.002052 0.002189
min_id 0.000000 1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.000000 8.000000 9.000000 10.000000 ... 90.000000 91.000000 92.000000 93.000000 94.000000 95.000000 96.000000 97.000000 98.000000 99.000000 100.000000
id 0.000000 1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.000000 8.000000 9.000000 10.000000 ... 90.000000 91.000000 92.000000 93.000000 94.000000 95.000000 96.000000 97.000000 98.000000 99.000000 100.000000
[23 rows x 101 columns]
8 # Generic
9 import warnings
10 import numpy as np
11 import pandas as pd
12
13 # Ignore warnings
14 warnings.simplefilter("ignore")
15
16 def load_data():
17 return pd.read_csv('./laboratory.csv', parse_dates=['date'])
18
19 def create_data():
20 """"""
21 # Configuration
22 ROWS, COLS = 150000, 10
23 PATIENTS = 300
24
25 # Create random values
26 features = np.random.random_sample((ROWS, COLS))
27 patients = np.random.randint(PATIENTS, size=(ROWS, 1))
28
29 # Create DataFrame
30 df = pd.DataFrame(data=features)
31 df = df.add_prefix('f_')
32 df['id'] = patients
33
34 # Return
35 return df
36
37
38 # -----------------------
39 # Read data
40 # -----------------------
41 # Read data
42 #data = pd.read_csv('./laboratory.csv', parse_dates=['date'])
43
44 data = create_data()
45
46
47 # -----------------------
48 # Format
49 # -----------------------
50 # Configuration
51 show_progress_every = 10 # Number of patients
52 break_loop_after = 100 # Number of patients or None
53
54 # Create empty outcome
55 results = pd.DataFrame()
56
57 # Groups
58 groups = data.groupby(by='id')
59
60 # Step by step (16270 groups!)
61 for i, (k, g) in enumerate(groups):
62 # Show information
63 if (i % show_progress_every) == 0:
64 print("%5s/%5s. Patient '%s'" % (i+1, len(groups), k))
65
66 # Show dataframe
67 #print(g)
68
69 # Sort by dates (if needed)
70
71 # Fill empty values
72 #g.fillna(method='backfill', inplace=True)
73
74 # Compute statistics
75 # ------------------
76 # .. note: Forward/backward filling does not affect
77 # the max/min but it affects the mean or
78 # median.
79 #
80 # .. note: You could also create a map with all the
81 # functions you want to apply instead of
82 # using describe. This is useful if you need
83 # specific stats
84
85 # Get the common stats
86 #d = g.describe()
87
88 # Get specific stats for all columns
89 d = g.agg({c: ['max', 'min'] for c in g.columns})
90
91 # Stack the describe outcome
92 d = d.stack()
93 d.index = ['_'.join(e) for e in d.index.tolist()]
94 d['id'] = k # patient identifier
95 #d['date'] = min(g['date']) # admission date
96
97 # Append result
98 results = pd.concat([results, d], ignore_index=True, axis=1)
99
100 # Break clause for testing
101 #if break_loop_after is not None:
102 if i==break_loop_after:
103 break
104
105
106 # Show columns
107 #print(results.columns.values)
108
109 print(results)
110
111 # Show results (partially)
112 #print(results[['id', 'date', 'max_wcc']])
113
114 # .. note: Once it works as you want, you can try to do it
115 # in one single line and compare the results to
116 # verify that it is correct.
Total running time of the script: ( 0 minutes 1.007 seconds)