Source code for pyamr.datasets.microbiology.create_susceptibility

# Libraries
import glob
import time
import pandas as pd

# Import pyAMR
from pyamr.datasets.registries import acronym_series
from pyamr.datasets.load import load_registry_microorganisms
from pyamr.datasets.load import load_registry_antimicrobials

# ---------------------------------
# Methods
# ---------------------------------
def create_microorganisms_lookup_table(orgs):
    """Creates the lookup table for the microorganisms.

    This method uses the information in the organisms DataFrame and
    the information in the default microorganisms registry to create
    a unique lookup table for the data.

    Parameters
    ----------
    orgs: pd.DataFrame
        The DataFrame with the organism genus and organism species for
        which the lookup table should be created. The DataFrame must
        contain the following columns:

            microorganism_name
            genus
            species

    Returns
    -------
    pd.DataFrame
        Lookup table DataFrame with the following columns:

            'domain'
            'phylum'
            'class'
            'order'
            'family'
            'genus'
            'species'
            'acronym'
            'exists_in_registry'
            'gram_stain'
            'microorganism_code'
            'microorganism_name'
            'microorganism_name_original'
    """
    # Check
    if 'genus' not in orgs:
        print("Missing <genus> column.")
    if 'species' not in orgs:
        print("Missing <species> column.")
    if 'microorganism_name' not in orgs:
        print("Missing <microorganism_name> column.")

    # Read microorganisms registry
    reg = load_registry_microorganisms()

    # --------------
    # Step 1
    # --------------
    # First, merge in gram_stain and taxonomy for those rows that
    # have equal genus and species. Note that we are only merging
    # if both values exist and are equal.

    # Merge
    orgs = pd.merge(orgs, reg, how='left',
        left_on=['genus', 'species'],
        right_on=['genus', 'species'])

    # Those merged exist in the registry.
    orgs['exists_in_registry'] = orgs.acronym.notna()

    # --------------
    # Step 2
    # --------------
    # Second, for those rows whose taxonomy-related columns are
    # null, use the taxonomy information based only on the genus.
    # This does not overwrite step 1.

    # Taxonomy columns
    ctaxonomy = ['domain', 'phylum', 'class', 'order', 'family']

    # Create aux
    aux = pd.merge(orgs[['genus']],
        reg.drop(columns=['species', 'acronym'])
           .drop_duplicates()
           .groupby('genus')
           .head(1),
        how='left',
        left_on=['genus'],
        right_on=['genus'])

    # Update orgs
    orgs.update(aux)

    # -------------------
    # Create new acronyms
    # -------------------
    # .. note: In order to be similar to the ones used in
    #          HH hospital, we can pass the minimum value
    #          as lg=1, ls=4 and length of 4 if only one
    #          word.

    # Rows with missing acronyms
    idxs = orgs.acronym.isna()

    # Fill with new acronyms
    orgs.loc[idxs, 'acronym'] = \
        acronym_series(orgs.loc[idxs, 'microorganism_name'].fillna(''),
            exclude_acronyms=reg.acronym.unique().tolist(),
            unique_acronyms=True,
            verbose=0,
            kwgs_acronym={'sep': '_'})

    # Columns
    keep = ['domain', 'phylum', 'class', 'order', 'family',
            'genus', 'species', 'acronym', 'exists_in_registry',
            'gram_stain', 'microorganism_code', 'microorganism_name',
            'microorganism_name_original']

    # Filter
    orgs = orgs[[c for c in keep if c in orgs]]

    # Return
    return orgs
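# ---------------------------------------------------------------------
# Example (illustrative sketch, not part of the original module): how
# create_microorganisms_lookup_table is meant to be called. It assumes
# pyAMR and its default registries are available; the organism names
# and codes below are made up for demonstration purposes.
def _example_microorganisms_lookup():  # hypothetical helper
    orgs = pd.DataFrame({
        'microorganism_code': ['ECOL', 'SAUR'],  # made-up codes
        'microorganism_name': ['escherichia coli',
                               'staphylococcus aureus'],
        'microorganism_name_original': ['ESCHERICHIA COLI',
                                        'STAPH. AUREUS'],
        'genus': ['Escherichia', 'Staphylococcus'],
        'species': ['coli', 'aureus']})
    # Taxonomy, gram stain and acronyms are filled in from the
    # default registry where a (genus, species) match exists.
    lookup = create_microorganisms_lookup_table(orgs)
    print(lookup[['genus', 'species', 'acronym', 'exists_in_registry']])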
def create_antimicrobials_lookup_table(abxs):
    """Creates the lookup table for the antimicrobials.

    This method uses the information in the antibiotics DataFrame and
    the information in the default antimicrobials registry to create
    a unique lookup table for the data.

    Parameters
    ----------
    abxs: pd.DataFrame
        The DataFrame with the antimicrobials for which the lookup
        table should be created. The DataFrame must contain the
        following columns:

            antimicrobial_name

    Returns
    -------
    pd.DataFrame
        Lookup table DataFrame with the following columns:

            'name'
            'category'
            'acronym'
            'exists_in_registry'
            'antimicrobial_code'
    """
    # Check
    if 'antimicrobial_name' not in abxs:
        print("Missing <antimicrobial_name> column.")

    # Read antimicrobials registry
    reg = load_registry_antimicrobials()

    # --------------
    # Step 1
    # --------------
    # First, merge those rows with equal names.

    # Merge
    abxs = pd.merge(abxs, reg, how='left',
        left_on=['antimicrobial_name'],
        right_on=['name'])

    # Those merged exist in the registry.
    abxs['exists_in_registry'] = abxs.acronym.notna()

    # -------------------
    # Create new acronyms
    # -------------------
    # .. note: In order to be similar to the ones used in
    #          HH hospital, we can pass the minimum value
    #          as lg=1, ls=4 and length of 4 if only one
    #          word.

    # Rows with missing acronyms
    idxs = abxs.acronym.isna()

    # Fill with new acronyms
    abxs.loc[idxs, 'acronym'] = \
        acronym_series(abxs.loc[idxs, 'antimicrobial_name'].fillna(''),
            exclude_acronyms=reg.acronym.unique().tolist(),
            unique_acronyms=True,
            verbose=0,
            kwgs_acronym={
                'sep': '_',
                'exceptions': []})

    # Columns
    keep = ['name', 'category', 'acronym',
            'exists_in_registry', 'antimicrobial_code']

    # Filter
    abxs = abxs[[c for c in keep if c in abxs]]

    # Return
    return abxs
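# ---------------------------------------------------------------------
# Example (illustrative sketch, not part of the original module): usage
# of create_antimicrobials_lookup_table. Names and codes are made up;
# acronyms for names missing from the registry are generated by
# acronym_series and will not clash with existing registry acronyms.
def _example_antimicrobials_lookup():  # hypothetical helper
    abxs = pd.DataFrame({
        'antimicrobial_code': ['AAMOX', 'ACIP'],  # made-up codes
        'antimicrobial_name': ['Amoxicillin', 'Ciprofloxacin']})
    lookup = create_antimicrobials_lookup_table(abxs)
    print(lookup[['name', 'acronym', 'exists_in_registry']])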
if __name__ == '__main__':

    # Import
    import csv
    import yaml
    import time
    import logging
    import logging.config
    import pandas as pd

    # Specific
    from pathlib import Path

    # PyAMR specific methods
    from pyamr.datasets.clean import clean_clwsql008
    from pyamr.datasets.clean import clean_legacy
    from pyamr.datasets.clean import clean_mimic

    # Configure logging
    with open('./logging.yaml', 'rt') as f:
        config = yaml.safe_load(f.read())
        logging.config.dictConfig(config)

    # Get logger
    logger = logging.getLogger('dev')

    # ---------------------------------
    # Constants
    # ---------------------------------
    # Time
    timestr = time.strftime("%Y%m%d-%H%M%S")

    # ---------------------------------
    # Methods
    # ---------------------------------
    def strdf(df):
        return "\n\t{0}\n".format(
            df.to_string().replace('\n', '\n\t'))

    # ---------------------------------
    # Load data
    # ---------------------------------
    # Define the (path, cleaning method) tuples.
    tuples = [
        ('./nhs/legacy', clean_legacy),
        ('./nhs/clwsql008', clean_clwsql008),
        #('./nhs/test', clean_clwsql008),
        #('./nhs/test2', clean_legacy),
        #('./mimic/mimic-iv-v0.4', clean_mimic),
        ('./yujia/raw', clean_clwsql008)
    ]

    # Combined data
    combined = []

    # For each tuple
    for path, f_clean in tuples:
        print("Loading... {0}".format(path))

        # Load data (multiple files)
        data = pd.concat([
            pd.read_csv(f, encoding="ISO-8859-1", engine='c')
                for f in glob.glob(path + "/*.csv")])

        # Clean data
        data = f_clean(data)

        # Combine
        combined.append(data)

    # Merge
    data = pd.concat(combined)

    # Basic formatting
    data = data.drop_duplicates()

    # -------------------
    # Anonymise
    # -------------------
    # Create hos_number to id mapper
    #unique = data.patient_id.unique()
    #pid_map = dict(zip(unique, range(len(unique))))

    # Include categories
    #data.patient_id = data.patient_id.map(pid_map)

    #unique = data.patient_hos_number.unique()
    #pid_map = dict(zip(unique, range(len(unique))))

    # Include categories
    #data.patient_id = data.patient_hos_number.map(pid_map)

    # Show
    #logger.info("\nData:\n{0}".format(strdf(data.head(10))))
    logger.info("\nColumns:\n\t{0}\n".format(data.columns))
    logger.info("\nDTypes:\n{0}".format(strdf(data.dtypes)))
    logger.info("\nNaNs:\n{0}".format(strdf(data.isna().sum(axis=0))))

    # ----------------------------------
    # Create Microorganisms LookUp table
    # ----------------------------------
    # Organism columns
    columns = ['microorganism_code',
               'microorganism_name',
               'microorganism_name_original']

    # Extract organisms information from susceptibility
    orgs = data[columns].copy(deep=True)
    orgs = orgs.drop_duplicates(subset=['microorganism_name'])

    # Create genus and species
    orgs[['genus', 'species']] = \
        orgs.microorganism_name.str.split(expand=True, n=1)

    # Format genus and species
    orgs.genus = orgs.genus.str.title()
    orgs.species = orgs.species.str.lower()

    # Sort
    orgs = orgs.sort_values(by=['genus', 'species'])

    # Create microorganisms lookup table
    orgs = create_microorganisms_lookup_table(orgs)

    # ----------------------------------
    # Create Antimicrobials LookUp table
    # ----------------------------------
    # Antimicrobial columns
    columns = ['antimicrobial_code',
               'antimicrobial_name']

    # Extract antimicrobials information from susceptibility
    abxs = data[columns].copy(deep=True)
    abxs = abxs.drop_duplicates(subset=['antimicrobial_name'])

    # Format antimicrobial names
    abxs.antimicrobial_name = abxs.antimicrobial_name.str.capitalize()

    # Sort
    abxs = abxs.sort_values(by=['antimicrobial_name'])

    # Create antimicrobials lookup table
    abxs = create_antimicrobials_lookup_table(abxs)
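    # .. note (illustrative, not in the original script): since both
    #    lookup tables carry the exists_in_registry flag created above,
    #    the rows that could not be matched against the default
    #    registries can be inspected before logging, e.g.:
    #
    #        orgs[~orgs.exists_in_registry]
    #        abxs[~abxs.exists_in_registry]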
    # --------------------------
    # Logging useful information
    # --------------------------
    # This code logs the unique values and the corresponding
    # count for the columns specified in the array. Note that
    # it silently skips columns that do not exist (maybe warn?).

    # Report unique values.
    for c in ['sensitivity_code', 'sensitivity_name',
              'method_code', 'method_name']:
        if c not in data:
            continue
        # Get value counts
        aux = data[c].value_counts()
        # Log information
        logger.info("\n{0}:\n{1}".format(c, strdf(aux)))

    # This code logs the duplicated values for the subsets
    # included in the array regarding the MICROORGANISMS.
    # Should we also include names?
    for subset in [['microorganism_code'], ['acronym']]:
        # Get duplicates
        idxs_dup = orgs[['microorganism_name',
                         'microorganism_code',
                         'acronym']] \
            .duplicated(subset=subset, keep=False)
        # Log information
        logger.info("\nDuplicated: {0}\n\n{1}"
            .format(subset, strdf(orgs[idxs_dup])))

    # This code logs the duplicated values for the subsets
    # included in the array regarding the ANTIMICROBIALS.
    # Should we also include names?
    for subset in [['name'], ['acronym']]:
        # Get duplicates
        idxs_dup = abxs[['name',
                         'antimicrobial_code',
                         'acronym']] \
            .duplicated(subset=subset, keep=False)
        # Log information
        logger.info("\nDuplicated {0}:\n\n{1}"
            .format(subset, strdf(abxs[idxs_dup])))

    # Report duplicated values (antimicrobials)
    """
    # Create basic information
    sensitivity = data.sensitivity_code.value_counts()
    gram_stain = orgs.gram_stain.value_counts()

    # Create duplicates
    aux = orgs[['microorganism_name',
                'microorganism_code',
                'acronym']]

    # Find duplicates
    idxs_dup_code = orgs.duplicated(subset=['microorganism_code'], keep=False)
    idxs_dup_acrm = orgs.duplicated(subset=['acronym'], keep=False)

    # Basic information
    logger.info("\nSensitivity:\n{0}".format(strdf(sensitivity)))
    logger.info("\nGram stain:\n{0}".format(strdf(gram_stain)))
    logger.info("\nDuplicate codes:\n{0}".format(strdf(aux[idxs_dup_code])))
    logger.info("\nDuplicate acronyms:\n{0}".format(strdf(aux[idxs_dup_acrm])))
    """

    # ----------
    # Filter
    # ----------
    # We have to remove those rows in which date_received is null
    # because otherwise we cannot group the data by year to store
    # it in different files. In addition, the other columns are
    # also required for a meaningful susceptibility test record.
    data = data.dropna(how='any',
        subset=['date_received',
                'specimen_name',
                'microorganism_name',
                'antimicrobial_name',
                'sensitivity_name'])

    # ------------
    # Keep columns
    # ------------
    # Columns
    keep = ['date_received',
            'date_outcome',
            'patient_hos_number',
            'laboratory_number',
            'specimen_code',
            'specimen_name',
            'specimen_description',
            'microorganism_code',
            'microorganism_name',
            'antimicrobial_code',
            'antimicrobial_name',
            'method_code',
            'method_name',
            'sensitivity_code',
            'sensitivity_name',
            'mic',
            'reported']

    # Filter
    data = data[[c for c in keep if c in data]]

    # ----------
    # Save
    # ----------
    # Define path
    path = Path('./%s' % timestr)

    # Create path if it does not exist
    path.mkdir(parents=True, exist_ok=True)

    # Save lookup tables
    abxs.to_csv(path / 'antimicrobials.csv', index=False)
    orgs.to_csv(path / 'microorganisms.csv', index=False)

    # Create grouper
    grouper = pd.Grouper(key='date_received', freq='Y')

    # Save susceptibility data grouped by year
    for n, g in data.groupby(grouper):
        # Create filename
        filename = "susceptibility-%s.csv" % n.strftime('%Y')
        # Save
        g.to_csv(path / filename, index=False,
            date_format='%Y-%m-%d %H:%M:%S',
            # don't check excels! check csvs!
            quoting=csv.QUOTE_ALL)  # QUOTE_NONNUMERIC

    # Logging
    logger.info('The results have been saved in: %s' % path)
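# ---------------------------------------------------------------------
# Example (illustrative sketch, not part of the original module): how
# pd.Grouper splits the susceptibility data into one file per calendar
# year in the save step above. All values below are made up.
def _example_yearly_grouping():  # hypothetical helper
    df = pd.DataFrame({
        'date_received': pd.to_datetime(
            ['2019-03-01', '2019-07-15', '2020-01-02']),
        'sensitivity_name': ['sensitive', 'resistant', 'sensitive']})
    # One group per year, so the loop above would write
    # susceptibility-2019.csv (2 rows) and susceptibility-2020.csv (1 row).
    for n, g in df.groupby(pd.Grouper(key='date_received', freq='Y')):
        print(n.strftime('%Y'), len(g))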