Note
Click here to download the full example code
05. Format damien sepsis data
Example
7 # Libraries
8 import time
9 import pandas as pd
10 import seaborn as sns
11 import matplotlib.pyplot as plt
12
# ---------------------------
# Constants
# ---------------------------
# Forward slashes are used as separators because they are accepted on
# both Windows and POSIX systems; backslash separators only resolve on
# Windows.

# Path to biochemical markers
path_bio = './datasets/damien-sepsis-biomarkers.csv'

# Path to nhs to hos mappings
path_nth = './datasets/damien-sepsis-nhs_to_hos.csv'

# Path to data request megalist
path_drm = './datasets/data-request-megalist.xlsx'

# Path template to save output ({0} and {1} are filled in with
# str.format when saving)
path_save = './outputs/{0}-damien-sepsis-biomarkers-pm{1}.csv'

# Whether to write the filtered data to disk
SAVE = True

# Days +- first micro sample
WINDOW = 30
33
# ---------------------------
# Main
# ---------------------------

# -----------
# Read data
# -----------
# Read biomarkers, parsing the two date columns on load
# (pass nrows=10000 to read_csv to work on a subset while debugging)
bio = pd.read_csv(path_bio,
                  parse_dates=['date_collected',
                               'date_outcome'])

# Read nhs to hos
nth = pd.read_csv(path_nth)

# Read data request megalist
drm = pd.read_excel(path_drm,
                    parse_dates=['Sampledate'])

# Rename drm columns to the snake_case names used in the rest of
# the script
drm = drm.rename(columns={
    'Sampledate': 'date_sample',
    'Hospital Number': 'hos_number'})

# parse_dates does not reliably produce datetime64[ns] for this
# column, so force the conversion before sorting. Sorting unconverted
# values would not be chronological and "first" below could pick the
# wrong sample. Values that cannot be parsed become NaT, which
# sort_values places last.
drm['date_sample'] = pd.to_datetime(drm['date_sample'], errors='coerce')

# Sort by date (important if keeping first)
drm = drm.sort_values(by='date_sample')

# Keep first appearance only (one row per hospital number)
drm = drm.groupby(by='hos_number') \
    .first().reset_index()

# Show
print("\nShow datasets:")
print(bio)
print(nth)
print(drm)

# Show columns
print("\nShow columns:")
print(bio.columns)
print(nth.columns)
print(drm.columns)
77
# -----------
# Merge
# -----------
# Attach the nhs-to-hos mapping to every biomarker row; a left join
# keeps biomarker rows even when no mapping exists for them
bio = pd.merge(bio, nth,
               how='left',
               left_on='patient_nhs_number',
               right_on='nhs_number')

# Attach the first sample date per hospital number; an inner join
# keeps only rows whose hospital number appears in drm
bio = pd.merge(bio, drm,
               how='inner',
               left_on='hos_number',
               right_on='hos_number')

# .. note: Sampledate is not always converted to datetime64[ns] by
#          parse_dates, so the conversion is forced here. Values
#          that cannot be parsed are set to NaT (not a time).
sample_dates = pd.to_datetime(bio['date_sample'], errors='coerce')
bio['date_sample'] = sample_dates

# Whole days between the sample date and the collection date
# (positive when the biomarker was collected before the sample)
elapsed = bio['date_sample'] - bio['date_collected']
bio['day'] = elapsed.dt.days
100
# -----------
# Plot
# -----------
# Number of rows per day offset, ordered by day
count = bio.day.value_counts().sort_index()

# Configure sns
sns.set_color_codes("muted")

# Create the figure and axes explicitly so that every call below
# targets the same axes
fig, ax = plt.subplots()

# Plot bars
ax.bar(count.index.values,
       count.values, color='b', alpha=0.5)

# Fill the selected area (+-WINDOW days)
ax.fill_between(x=[-WINDOW, WINDOW],
                y1=0, y2=count.max(), alpha=0.25,
                color='orange')

# Draw vertical dashed lines at -WINDOW and +WINDOW
ax.vlines([-WINDOW, WINDOW], ymin=0,
          ymax=count.max(), color='k',
          linestyle='dashed', linewidth=0.75)

# Configure
ax.grid(False)
ax.set_xlabel('Day from sample')
ax.set_ylabel('Count')
ax.set_title('Day from sample count')

# Remove the spines once the axes exist; calling despine before
# anything has been drawn has no axes to act on
sns.despine(ax=ax, left=True, bottom=True)

# Layout
fig.tight_layout()

# Show
plt.show()
137
# ---------------
# Filter and save
# ---------------
# Keep only rows within +-WINDOW days of the sample date. Rows whose
# day difference is missing fail the comparison and are dropped too.
bio = bio[bio.day.abs() <= WINDOW]

# Save
if SAVE:
    # Timestamp shared by both output files. It is stored under its
    # own name so the imported `time` module is not overwritten.
    timestamp = time.strftime('%Y%m%d-%H%M%S')
    # Save with all info
    bio.to_csv(path_save.format(timestamp, str(WINDOW)))
    # Save anonymised (patient identifier columns removed)
    bio = bio.drop(columns=['patient_nhs_number',
                            'nhs_number',
                            'hos_number'])
    # Show columns
    print(bio.columns)
    bio.to_csv(path_save.format(timestamp, str(WINDOW) + '-anonymised'))

# Show
plt.show()
Total running time of the script: ( 0 minutes 0.000 seconds)