02. Aggregation

Aggregate patient’s data in a single row.

Out:

    1/  300. Patient '0'
   11/  300. Patient '10'
   21/  300. Patient '20'
   31/  300. Patient '30'
   41/  300. Patient '40'
   51/  300. Patient '50'
   61/  300. Patient '60'
   71/  300. Patient '70'
   81/  300. Patient '80'
   91/  300. Patient '90'
  101/  300. Patient '100'
              0         1         2         3         4         5         6         7         8         9          10   ...        90         91         92         93         94         95         96         97         98         99          100
max_f_0  0.999166  0.995902  0.990377  0.999191  0.997868  0.997490  0.997091  0.998154  0.998121  0.995806   0.998504  ...   0.998043   0.997191   0.997522   0.994520   0.999044   0.999562   0.997723   0.998982   0.997026   0.998981    0.996704
max_f_1  0.995113  0.998210  0.999969  0.999881  0.998306  0.995908  0.999442  0.998846  0.999314  0.996838   0.999810  ...   0.999437   0.998326   0.999892   0.994770   0.999142   0.999939   0.998722   0.999566   0.997692   0.999654    0.994968
max_f_2  0.997346  0.999410  0.996470  0.993386  0.999083  0.999827  0.998781  0.999226  0.998101  0.997776   0.998826  ...   0.999026   0.996765   0.996579   0.998870   0.999100   0.996796   0.999992   0.993990   0.997819   0.998853    0.997467
max_f_3  0.999931  0.997206  0.999767  0.999457  0.999507  0.999245  0.999124  0.997490  0.999097  0.997301   0.994163  ...   0.999800   0.999824   0.999891   0.999744   0.999810   0.994255   0.998567   0.996788   0.992755   0.997598    0.997755
max_f_4  0.999696  0.998864  0.999455  0.998960  0.999457  0.999056  0.998377  0.998563  0.999333  0.995236   0.998548  ...   0.997040   0.996794   0.996363   0.995791   0.999824   0.997369   0.999177   0.998516   0.994851   0.999645    0.997877
max_f_5  0.997081  0.997976  0.999938  0.998338  0.995882  0.999872  0.999770  0.998595  0.995579  0.999844   0.997590  ...   0.998979   0.999847   0.998569   0.997307   0.999024   0.998335   0.993630   0.996399   0.999485   0.993851    0.996774
max_f_6  0.998300  0.999404  0.994047  0.999286  0.999829  0.994948  0.995964  0.995430  0.995857  0.999728   0.998278  ...   0.996676   0.998386   0.997306   0.999243   0.998980   0.998677   0.996776   0.999945   0.997120   0.999502    0.999313
max_f_7  0.998698  0.996507  0.997965  0.999789  0.999582  0.999186  0.999822  0.999902  0.998861  0.998005   0.999013  ...   0.998608   0.995124   0.996183   0.997998   0.996933   0.991781   0.997209   0.999821   0.999399   0.999406    0.998789
max_f_8  0.996938  0.999981  0.998824  0.999493  0.997097  0.996062  0.998225  0.999827  0.997406  0.999337   0.999036  ...   0.999644   0.997824   0.998300   0.998979   0.999669   0.999156   0.997957   0.993494   0.996177   0.998384    0.996794
max_f_9  0.997907  0.998515  0.999214  0.999294  0.998242  0.997000  0.991230  0.998908  0.997639  0.999711   0.998592  ...   0.998912   0.999204   0.994821   0.995526   0.998980   0.996725   0.999728   0.992341   0.997040   0.998998    0.999728
max_id   0.000000  1.000000  2.000000  3.000000  4.000000  5.000000  6.000000  7.000000  8.000000  9.000000  10.000000  ...  90.000000  91.000000  92.000000  93.000000  94.000000  95.000000  96.000000  97.000000  98.000000  99.000000  100.000000
min_f_0  0.001158  0.000341  0.001459  0.001328  0.003367  0.000282  0.003246  0.000030  0.002715  0.006384   0.002116  ...   0.001583   0.002259   0.000094   0.001797   0.000738   0.002264   0.000749   0.000525   0.001551   0.001233    0.000840
min_f_1  0.001629  0.000667  0.002398  0.000303  0.004581  0.000233  0.000054  0.000806  0.002058  0.000366   0.002019  ...   0.008139   0.003167   0.005427   0.001694   0.001227   0.000321   0.000411   0.000395   0.002667   0.000293    0.003114
min_f_2  0.004299  0.001128  0.002832  0.003280  0.000253  0.001473  0.002829  0.002240  0.000996  0.000508   0.002240  ...   0.000079   0.000102   0.001354   0.001029   0.000172   0.001957   0.001380   0.001959   0.002138   0.003181    0.002199
min_f_3  0.000050  0.002135  0.001911  0.001414  0.007655  0.000319  0.003766  0.000352  0.000134  0.000252   0.003958  ...   0.004889   0.001966   0.000667   0.001421   0.000879   0.002310   0.001971   0.001254   0.000090   0.001126    0.001467
min_f_4  0.006641  0.005255  0.002976  0.002150  0.002447  0.007052  0.000135  0.000283  0.001841  0.005424   0.000027  ...   0.000459   0.000446   0.000177   0.003633   0.001138   0.001786   0.000104   0.003367   0.000429   0.001537    0.001116
min_f_5  0.003584  0.003522  0.002967  0.002021  0.005047  0.000035  0.001684  0.002565  0.007492  0.002283   0.000773  ...   0.001026   0.005296   0.002557   0.002348   0.003484   0.003363   0.002438   0.000201   0.002090   0.001767    0.003236
min_f_6  0.000212  0.001931  0.003289  0.001045  0.001465  0.001397  0.001558  0.000534  0.001040  0.003722   0.001215  ...   0.000075   0.007023   0.000834   0.003977   0.000709   0.000584   0.003408   0.002159   0.000877   0.004342    0.001188
min_f_7  0.000478  0.000270  0.001497  0.001129  0.001838  0.000184  0.000110  0.000193  0.000597  0.000239   0.004206  ...   0.002373   0.000382   0.000904   0.001935   0.001082   0.000176   0.000153   0.000111   0.008172   0.000221    0.001106
min_f_8  0.001668  0.005238  0.000820  0.002464  0.001592  0.005168  0.001785  0.000654  0.001021  0.005254   0.003541  ...   0.002647   0.001073   0.000753   0.000128   0.000405   0.001337   0.001039   0.000515   0.002775   0.001568    0.002494
min_f_9  0.000979  0.002115  0.002794  0.002009  0.001399  0.000759  0.001002  0.001899  0.001864  0.001549   0.001589  ...   0.000904   0.000853   0.000076   0.001616   0.000143   0.009148   0.001240   0.000396   0.002274   0.002052    0.002189
min_id   0.000000  1.000000  2.000000  3.000000  4.000000  5.000000  6.000000  7.000000  8.000000  9.000000  10.000000  ...  90.000000  91.000000  92.000000  93.000000  94.000000  95.000000  96.000000  97.000000  98.000000  99.000000  100.000000
id       0.000000  1.000000  2.000000  3.000000  4.000000  5.000000  6.000000  7.000000  8.000000  9.000000  10.000000  ...  90.000000  91.000000  92.000000  93.000000  94.000000  95.000000  96.000000  97.000000  98.000000  99.000000  100.000000

[23 rows x 101 columns]

  8 # Generic
  9 import warnings
 10 import numpy as np
 11 import pandas as pd
 12
 13 # Ignore warnings
 14 warnings.simplefilter("ignore")
 15
 16 def load_data():
 17     return pd.read_csv('./laboratory.csv', parse_dates=['date'])
 18
 19 def create_data():
 20     """"""
 21     # Configuration
 22     ROWS, COLS = 150000, 10
 23     PATIENTS = 300
 24
 25     # Create random values
 26     features = np.random.random_sample((ROWS, COLS))
 27     patients = np.random.randint(PATIENTS, size=(ROWS, 1))
 28
 29     # Create DataFrame
 30     df = pd.DataFrame(data=features)
 31     df = df.add_prefix('f_')
 32     df['id'] = patients
 33
 34     # Return
 35     return df
 36
 37
 38 # -----------------------
 39 # Read data
 40 # -----------------------
 41 # Read data
 42 #data = pd.read_csv('./laboratory.csv', parse_dates=['date'])
 43
 44 data = create_data()
 45
 46
 47 # -----------------------
 48 # Format
 49 # -----------------------
 50 # Configuration
 51 show_progress_every = 10 # Number of patients
 52 break_loop_after = 100  # Number of patients or None
 53
 54 # Create empty outcome
 55 results = pd.DataFrame()
 56
 57 # Groups
 58 groups = data.groupby(by='id')
 59
 60 # Step by step (16270 groups!)
 61 for i, (k, g) in enumerate(groups):
 62     # Show information
 63     if (i % show_progress_every) == 0:
 64             print("%5s/%5s. Patient '%s'" % (i+1, len(groups), k))
 65
 66     # Show dataframe
 67     #print(g)
 68
 69     # Sort by dates (if needed)
 70
 71     # Fill empty values
 72     #g.fillna(method='backfill', inplace=True)
 73
 74     # Compute statistics
 75     # ------------------
 76     # .. note: Forward/backward filling does not affect
 77     #          the max/min but it affects the mean or
 78     #          median.
 79     #
 80     # .. note: You could also create a map with all the
 81     #          functions you want to apply instead of
 82     #          using describe. This is useful if you need
 83     #          specific stats
 84
 85     # Get the common stats
 86     #d = g.describe()
 87
 88     # Get specific stats for all columns
 89     d = g.agg({c: ['max', 'min'] for c in g.columns})
 90
 91     # Stack the describe outcome
 92     d = d.stack()
 93     d.index = ['_'.join(e) for e in d.index.tolist()]
 94     d['id'] = k                  # patient identifier
 95     #d['date'] = min(g['date'])   # admission date
 96
 97     # Append result
 98     results = pd.concat([results, d], ignore_index=True, axis=1)
 99
100     # Break clause for testing
101     #if break_loop_after is not None:
102     if i==break_loop_after:
103             break
104
105
106 # Show columns
107 #print(results.columns.values)
108
109 print(results)
110
111 # Show results (partially)
112 #print(results[['id', 'date', 'max_wcc']])
113
114 # .. note: Once it works as you want, you can try to do it
115 #          in one single line and compare the results to
116 #          verify that it is correct.

Total running time of the script: ( 0 minutes 1.007 seconds)

Gallery generated by Sphinx-Gallery