Note
Click here to download the full example code
02. Clean OUCRU v2.0
6 # Libraries
7 import pandas as pd
8
9 # -----------------------------
10 # Helper methods
11 # -----------------------------
12 def create_ids(series):
13 """This method creates the ids"""
14 # Unique patient numbers
15 unique = series.unique()
16 # Patient mapping
17 mapping = dict(zip(unique, range(len(unique))))
18 # Result (mysql pk ids from 1)
19 aux = series.map(mapping) + 1
20 # Return
21 return aux
22
23 def keep_by_table(df, tablename, remove_prefix=True):
24 """Keep columns starting with table name"""
25 # Define tag
26 tag = '%s_' % tablename
27 # Find columns with tag
28 cols = [c for c in df.columns if c.startswith(tag)]
29 # Keep only those columns
30 aux = df[cols].copy(deep=True)
31 aux = aux.drop_duplicates()
32 aux = aux.reset_index(drop=True)
33 # Remove prefix
34 if remove_prefix:
35 aux.columns = [c.replace(tag, '') for c in cols]
36 # Return
37 return aux
38
39
40 # -----------------------------
41 # Load data
42 # -----------------------------
43 # Load data
44 data = pd.read_csv('./data/stacked-oucru/combined_stacked.csv',
45 #nrows=1000,
46 parse_dates=['date'])
47
48 # Show
49 print(data)
50 print(data.columns)
51
52 # -----------------------------
53 # Format
54 # -----------------------------
55 # Drop
56 drop = [
57 'Unnamed: 0',
58 'Unnamed: 0.1',
59 'result_old',
60 'date_old',
61 'dsource'
62 ]
63
64 # Rename
65 rename = {
66 'study_no': 'patient_nhs_number',
67 'date': 'date_collected',
68 'column': 'laboratory_code',
69 }
70
71 # Replace
72 replace = {
73 'result': {
74 'True': 1,
75 'False': 0
76 }
77 }
78
79 # Basic format
80 data = data.drop(columns=drop)
81 data = data.rename(columns=rename)
82
83 # Boolean to number
84 data = data.replace(replace)
85
86 # Keep only those whose result can be cast to number
87 data.result = pd.to_numeric(data.result, errors='coerce')
88
89 # These columns are required
90 data = data.dropna(how='any',
91 subset=['date_collected', 'result'])
92
93 # Reset index
94 data = data.reset_index()
95
96 # Show types
97 print(data.dtypes)
98
99
100
101 # ---------------------------------
102 # Create patients
103 # ---------------------------------
104 # Create ids
105 data['patient_id'] = create_ids(data.patient_nhs_number)
106
107 # Create DataFrame
108 patient = keep_by_table(data, tablename='patient')
109
110 # Add dates
111 patient['date_created'] = pd.to_datetime('today').normalize()
112 patient['date_updated'] = pd.to_datetime('today').normalize()
113 patient['name'] = ''
114 patient['surname'] = ''
115 patient['ext_number'] = patient.nhs_number # can be null or unique
116 patient['hos_number'] = patient.nhs_number # can be null or unique
117 patient['dob'] = pd.to_datetime('today').normalize() # can be null
118 patient['gender'] = 0
119
120 # Order
121 patient = patient[['id',
122 'date_created',
123 'date_updated',
124 'name',
125 'surname',
126 'ext_number',
127 'nhs_number',
128 'hos_number',
129 'dob',
130 'gender']]
131
132 # Show
133 print("\nPatient table:")
134 print(patient)
135
136 # ---------------------------------
137 # Create laboratory code
138 # ---------------------------------
139 # Create ids
140 data['laboratory_id'] = create_ids(data.laboratory_code)
141
142 # Create DataFrame
143 laboratory_codes = \
144 keep_by_table(data, tablename='laboratory')
145
146 # Add dates
147 laboratory_codes['date_created'] = pd.to_datetime('today').normalize()
148 laboratory_codes['date_updated'] = pd.to_datetime('today').normalize()
149 laboratory_codes['name'] = laboratory_codes.code
150 laboratory_codes['description'] = ''
151 laboratory_codes['is_visible'] = 1
152
153 # Order
154 laboratory_codes = laboratory_codes[['id',
155 'date_created',
156 'date_updated',
157 'code',
158 'name',
159 'description',
160 'is_visible']]
161
162 # Show
163 print("\nLaboratory codes table:")
164 print(laboratory_codes)
165
166 # ---------------------------------
167 # Create laboratory test
168 # ---------------------------------
169 # Drop columns
170 data = data.drop(columns=[
171 'patient_nhs_number',
172 'laboratory_code'
173 ])
174
175 # Rename columns
176 data = data.rename(columns={
177 'laboratory_id': 'code_id',
178 })
179
180 # Add columns
181 data['id'] = data.index.values + 1
182 data['uuid'] = data.id
183 data['date_outcome'] = data.date_collected
184 data['date_created'] = data.date_collected
185 data['date_updated'] = data.date_collected
186 data['unit_range'] = ''
187 data['result_status'] = 'UNKNOWN'
188 data['abnormal_status'] = ''
189
190 # Order
191 data = data[['id',
192 'uuid',
193 'date_created',
194 'date_updated',
195 'date_collected',
196 'date_outcome',
197 'result',
198 'unit',
199 'unit_range',
200 'result_status',
201 'abnormal_status',
202 'code_id',
203 'patient_id']]
204
205
206
207 # Show
208 print("\nLaboratory tests table:")
209 print(data)
210
211 # ---------------------------------
212 # Save
213 # ---------------------------------
214 # Save
215 patient.to_csv('./outputs/patient.csv',
216 date_format='%Y-%m-%d %H:%M:%S', index=False)
217 laboratory_codes.to_csv('./outputs/laboratory_code.csv',
218 date_format='%Y-%m-%d %H:%M:%S', index=False)
219 data.to_csv('./outputs/laboratory_tests.csv',
220 date_format='%Y-%m-%d %H:%M:%S', index=False)
Total running time of the script: ( 0 minutes 0.000 seconds)