Note
Click here to download the full example code
06. Clean OUCRU v1.0
# Libraries
from pathlib import Path

import pandas as pd

# -----------------------------
# Configuration
# -----------------------------
# Stacked input file and the folder that receives the cleaned outputs.
INPUT_PATH = './data/stacked-oucru/combined_stacked.csv'
OUTPUT_DIR = Path('./outputs')

# -----------------------------
# Load data
# -----------------------------
# Load the stacked data; the 'date' column is parsed as datetime.
data = pd.read_csv(INPUT_PATH, parse_dates=['date'])

# Show
print(data)
print(data.columns)

# -----------------------------
# Format
# -----------------------------
# Columns removed from the output.
drop = [
    'Unnamed: 0',
    'Unnamed: 0.1',
    'result_old',
    'date_old',
    'dsource',
]

# Column renames (old name -> new name).
rename = {
    'study_no': 'patient',
    'date': 'date_collected',
    'column': 'code',
}

# Value replacements, per column: the strings 'True'/'False' found
# in 'result' become 1/0 so they survive the numeric cast below.
replace = {
    'result': {
        'True': 1,
        'False': 0,
    },
}

# Drop the unwanted columns. Missing ones are skipped rather than
# raising KeyError: the 'Unnamed: *' entries presumably come from
# earlier index round-trips and may not always be present.
data = data.drop(columns=drop, errors='ignore')
data = data.rename(columns=rename)

# Both identifiers are taken from the current index values (assigned
# before any row is filtered out below).
data['id'] = data.index.values
data['uuid'] = data.index.values

# The outcome date is a copy of the collection date.
data['date_outcome'] = data['date_collected']

data = data.replace(replace)

# Order the columns alphabetically.
data = data[sorted(data.columns.values)]

# Keep only those whose result can be cast to number; anything
# else is coerced to NaN ...
data['result'] = pd.to_numeric(data['result'], errors='coerce')

# ... and those NaN rows are removed.
data = data[data['result'].notna()]

# Show types
print(data.dtypes)

# -----------------------------
# Save
# -----------------------------
# Make sure the output folder exists; to_csv does not create it and
# would otherwise fail with OSError on a fresh checkout.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# NOTE(review): the index is written as an unnamed first column,
# which shows up as 'Unnamed: 0' when the file is read back. Kept
# as-is so the output format does not change for existing consumers.
data.head(10000).to_csv(OUTPUT_DIR / 'combined_stack_head10000.csv')
data.to_csv(OUTPUT_DIR / 'combined_stacked.csv')
Total running time of the script: ( 0 minutes 0.000 seconds)