02. Clean OUCRU v2.0

  6 # Libraries
  7 import pandas as pd
  8
  9 # -----------------------------
 10 # Helper methods
 11 # -----------------------------
 12 def create_ids(series):
 13     """This method creates the ids"""
 14     # Unique patient numbers
 15     unique = series.unique()
 16     # Patient mapping
 17     mapping = dict(zip(unique, range(len(unique))))
 18     # Result (mysql pk ids from 1)
 19     aux = series.map(mapping) + 1
 20     # Return
 21     return aux
 22
 23 def keep_by_table(df, tablename, remove_prefix=True):
 24     """Keep columns starting with table name"""
 25     # Define tag
 26     tag = '%s_' % tablename
 27     # Find columns with tag
 28     cols = [c for c in df.columns if c.startswith(tag)]
 29     # Keep only those columns
 30     aux = df[cols].copy(deep=True)
 31     aux = aux.drop_duplicates()
 32     aux = aux.reset_index(drop=True)
 33     # Remove prefix
 34     if remove_prefix:
 35         aux.columns = [c.replace(tag, '') for c in cols]
 36     # Return
 37     return aux
 38
 39
 40 # -----------------------------
 41 # Load data
 42 # -----------------------------
 43 # Load data
 44 data = pd.read_csv('./data/stacked-oucru/combined_stacked.csv',
 45     #nrows=1000,
 46     parse_dates=['date'])
 47
 48 # Show
 49 print(data)
 50 print(data.columns)
 51
 52 # -----------------------------
 53 # Format
 54 # -----------------------------
 55 # Drop
 56 drop = [
 57     'Unnamed: 0',
 58     'Unnamed: 0.1',
 59     'result_old',
 60     'date_old',
 61     'dsource'
 62 ]
 63
 64 # Rename
 65 rename = {
 66     'study_no': 'patient_nhs_number',
 67     'date': 'date_collected',
 68     'column': 'laboratory_code',
 69 }
 70
 71 # Replace
 72 replace = {
 73     'result': {
 74         'True': 1,
 75         'False': 0
 76     }
 77 }
 78
 79 # Basic format
 80 data = data.drop(columns=drop)
 81 data = data.rename(columns=rename)
 82
 83 # Boolean to number
 84 data = data.replace(replace)
 85
 86 # Keep only those whose result can be cast to number
 87 data.result = pd.to_numeric(data.result, errors='coerce')
 88
 89 # These columns are required
 90 data = data.dropna(how='any',
 91     subset=['date_collected', 'result'])
 92
 93 # Reset index
 94 data = data.reset_index()
 95
 96 # Show types
 97 print(data.dtypes)
 98
 99
100
101 # ---------------------------------
102 # Create patients
103 # ---------------------------------
104 # Create ids
105 data['patient_id'] = create_ids(data.patient_nhs_number)
106
107 # Create DataFrame
108 patient = keep_by_table(data, tablename='patient')
109
110 # Add dates
111 patient['date_created'] = pd.to_datetime('today').normalize()
112 patient['date_updated'] = pd.to_datetime('today').normalize()
113 patient['name'] = ''
114 patient['surname'] = ''
115 patient['ext_number'] = patient.nhs_number # can be null or unique
116 patient['hos_number'] = patient.nhs_number # can be null or unique
117 patient['dob'] = pd.to_datetime('today').normalize()      # can be null
118 patient['gender'] = 0
119
120 # Order
121 patient = patient[['id',
122                    'date_created',
123                    'date_updated',
124                    'name',
125                    'surname',
126                    'ext_number',
127                    'nhs_number',
128                    'hos_number',
129                    'dob',
130                    'gender']]
131
132 # Show
133 print("\nPatient table:")
134 print(patient)
135
136 # ---------------------------------
137 # Create laboratory code
138 # ---------------------------------
139 # Create ids
140 data['laboratory_id'] = create_ids(data.laboratory_code)
141
142 # Create DataFrame
143 laboratory_codes = \
144     keep_by_table(data, tablename='laboratory')
145
146 # Add dates
147 laboratory_codes['date_created'] = pd.to_datetime('today').normalize()
148 laboratory_codes['date_updated'] = pd.to_datetime('today').normalize()
149 laboratory_codes['name'] = laboratory_codes.code
150 laboratory_codes['description'] = ''
151 laboratory_codes['is_visible'] = 1
152
153 # Order
154 laboratory_codes = laboratory_codes[['id',
155                                      'date_created',
156                                      'date_updated',
157                                      'code',
158                                      'name',
159                                      'description',
160                                      'is_visible']]
161
162 # Show
163 print("\nLaboratory codes table:")
164 print(laboratory_codes)
165
166 # ---------------------------------
167 # Create laboratory test
168 # ---------------------------------
169 # Drop columns
170 data = data.drop(columns=[
171     'patient_nhs_number',
172     'laboratory_code'
173 ])
174
175 # Rename columns
176 data = data.rename(columns={
177     'laboratory_id': 'code_id',
178 })
179
180 # Add columns
181 data['id'] = data.index.values + 1
182 data['uuid'] = data.id
183 data['date_outcome'] = data.date_collected
184 data['date_created'] = data.date_collected
185 data['date_updated'] = data.date_collected
186 data['unit_range'] = ''
187 data['result_status'] = 'UNKNOWN'
188 data['abnormal_status'] = ''
189
190 # Order
191 data = data[['id',
192              'uuid',
193              'date_created',
194              'date_updated',
195              'date_collected',
196              'date_outcome',
197              'result',
198              'unit',
199              'unit_range',
200              'result_status',
201              'abnormal_status',
202              'code_id',
203              'patient_id']]
204
205
206
207 # Show
208 print("\nLaboratory tests table:")
209 print(data)
210
211 # ---------------------------------
212 # Save
213 # ---------------------------------
214 # Save
215 patient.to_csv('./outputs/patient.csv',
216     date_format='%Y-%m-%d %H:%M:%S', index=False)
217 laboratory_codes.to_csv('./outputs/laboratory_code.csv',
218     date_format='%Y-%m-%d %H:%M:%S', index=False)
219 data.to_csv('./outputs/laboratory_tests.csv',
220     date_format='%Y-%m-%d %H:%M:%S', index=False)

Total running time of the script: ( 0 minutes 0.000 seconds)

Gallery generated by Sphinx-Gallery