Source code for pyamr.core.table.organism

###############################################################################
# Author: Bernard Hernandez
# Filename: organisms.py
# Date: 02/09/2015
# Description:
# 
# This script creates a map to solve possible issues of misspelled organisms,
# or differences such as letter case, unnecessary spaces, missing codes, ...
# Hence, it creates a conversion from the original organism name to a
# homogeneous organism name, and from the original code to a homogeneous code.
#
###############################################################################


# Generic libraries.
import sys
import datetime
import numpy as np
import pandas as pd
import re

"""
# Own libraries.
sys.path.append('../../../../modules/')
import settings.organisms as ORG
import settings.antibiotics as ANT
import settings.microbiology as MBL
import others.io.read as pd_read
import others.clean.generic as pd_clean_gen
"""

# IMPORTANT
# =========
# Note that some of the constants used to indicate which columns should be
# lowercase, uppercase, ... are stated in the settings modules which can be
# found in modules/settings/organisms.py. In addition, the rename_map
# stored in modules/settings/microbiology.py should contain the mapping
# from the column names holding the organism name and code to the
# standard names: organismNameOrig and organismCodeOrig



class OrganismsTable:
    """Build a conversion table between raw and homogenised organism labels.

    The table links the organism name/code found in the input data
    (``organismNameOrig`` / ``organismCodeOrig``) with a formatted name and
    code (``organismName`` / ``organismCode``) and a species-level name and
    code (``specieName`` / ``specieCode``). Organisms without an original
    code receive an automatic code prefixed with ``A_``.
    """

    # ----------
    # Constants
    # ----------
    # Column identifiers for the name and the code. Ensure that there is
    # a conversion in the file modules/settings/microbiology.py from the
    # input data file column names to 'organismCodeOrig' and
    # 'organismNameOrig'. The different letters indicate:
    #    O - the original value that will be stored in the organisms table
    #    F - the formatted value that will be stored in the organisms table
    #    S - the species-level value derived from the formatted name
    nameO, codeO = 'organismNameOrig', 'organismCodeOrig'
    nameF, codeF = 'organismName', 'organismCode'
    nameS, codeS = 'specieName', 'specieCode'

    # The columns that should be read from the file. Note that the original
    # files are huge and might not fit in memory. Therefore reducing
    # the columns kept in memory helps.
    usecols = [nameO, codeO]

    # Length (in characters) used when truncating names for automatic codes.
    len_sp = 8
    len_tp = 8

    # Constructor
    def __init__(self):
        """Print a reminder about the required column mapping."""
        txt = "Check that the columns with the ORGANISM name and code have "
        txt+= "mapped in the rename_map variable in the file "
        txt+= "module/settings/microbiology.py\n"
        print(txt)

    #------------------------------------------------------
    #                 PRIVATE METHODS
    #------------------------------------------------------
    def _acronym(self, words):
        """Return the acronym (first letters) of several words.

        Empty strings (produced by consecutive spaces when splitting) are
        skipped instead of raising ``IndexError``.
        """
        return "".join(word[0] for word in words if word)

    def _compute_main(self, name):
        """Create the main (textual) part of the automatic code.

        Parameters
        ----------
        name : str
            The formatted organism name.

        Returns
        -------
        str
            - names containing ``'sp.'`` or with a single word: the first
              ``len_sp`` characters of the name;
            - two words: first letter of the first word followed by the
              first ``len_tp`` characters of the second word;
            - more words: the acronym of the words; for names containing
              ``'beta-haemolytic'`` the last word (stripped from
              non-alphanumeric characters) is appended in full.
        """
        words = name.split(" ")

        if 'sp.' in name or len(words) == 1:
            return name[:self.len_sp]

        if len(words) == 2:
            # Slicing (instead of indexing) tolerates an empty first token.
            return words[0][:1] + words[1][:self.len_tp]

        if 'beta-haemolytic' in name:
            # Raw string: '\W' is not a valid escape in a normal literal.
            tp = re.sub(r'[\W_]+', '', words[-1])
            return self._acronym(words[:-1]) + tp

        return self._acronym(words)

    def _compute_numb(self, name):
        """Return the ordinal (2 or 3) mentioned in the name, if any.

        Returns ``None`` when none of 'second', 'third', '2nd' or '3rd'
        appears in ``name`` (the match is case sensitive).
        """
        for token, number in (('second', 2), ('third', 3),
                              ('2nd', 2), ('3rd', 3)):
            if token in name:
                return number
        return None

    def _compute_automatic_code(self, row, len_sp=8, len_tp=8):
        """Compute the automatic code (starting with ``A_``) for a row.

        Parameters
        ----------
        row : mapping (e.g. pd.Series or dict)
            Must provide the formatted name (``nameF``) and the original
            name (``nameO``).
        len_sp, len_tp : int
            Unused; kept for backward compatibility. The truncation
            lengths are taken from the ``len_sp`` / ``len_tp`` attributes.

        Returns
        -------
        str
            Upper-cased code ``A_<main>[<number>]``.
        """
        main = self._compute_main(row[self.nameF])
        numb = self._compute_numb(row[self.nameO])

        if numb is None:
            code = "A_%s" % main
        else:
            code = "A_%s%s" % (main, numb)

        return code.upper()

    def _compute_organism_codes(self, df):
        """Fill the formatted code of rows lacking an original code.

        Rows that already have an original code, or whose original or
        formatted name is missing, are left untouched. The dataframe is
        modified in place and also returned.
        """
        for idx, row in df.iterrows():
            # It already has a code.
            if not pd.isnull(row[self.codeO]):
                continue
            # Needs an automatic code but the name is empty.
            if pd.isnull(row[self.nameO]) or pd.isnull(row[self.nameF]):
                continue
            df.loc[idx, self.codeF] = self._compute_automatic_code(row)
        return df

    def _compute_specie_codes(self, df):
        """Fill the species name and code columns.

        For every formatted name containing the literal ``'sp.'`` the first
        word is taken as the species label; every row whose formatted name
        contains that label receives it as ``specieName`` and
        ``'A_' + label.upper()`` as ``specieCode``. The dataframe is
        modified in place and also returned.
        """
        # Make sure both columns exist even if no 'sp.' entry is found,
        # since they are read unconditionally by show_information.
        for col in (self.nameS, self.codeS):
            if col not in df.columns:
                df[col] = np.full(len(df), np.nan, dtype=object)

        names = df[self.nameF]

        # Literal match ('.' must not act as a regex wildcard) and missing
        # names must not poison the boolean mask.
        is_species = names.str.contains('sp.', regex=False, na=False)

        for sp in names[is_species].unique():
            sp_name = sp.split(" ")[0]
            # NOTE(review): substring match, so a label also tags names that
            # merely contain it (e.g. 'bacillus' in 'lactobacillus').
            sp_rows = names.str.contains(sp_name, regex=False, na=False)
            df.loc[sp_rows, self.nameS] = sp_name
            df.loc[sp_rows, self.codeS] = 'A_%s' % sp_name.upper()

        return df

    #------------------------------------------------------
    #                 PUBLIC METHODS
    #------------------------------------------------------
    def merge_table(self, df1, df2, on, conflicts):
        """Merge the old and new tables, keeping old values on conflict.

        Parameters
        ----------
        df1 : pd.DataFrame
            The old dataframe.
        df2 : pd.DataFrame
            The new dataframe.
        on : str
            Name of the column used as merge key.
        conflicts : str
            Name of a column present in both dataframes whose differing
            values are reported as conflicts.

        Returns
        -------
        df_m : pd.DataFrame
            Outer merge of both tables. For columns present in both, the
            value of ``df1`` is kept and missing values are filled with the
            value of ``df2``.
        df_c : pd.DataFrame
            Rows where the ``conflicts`` column differs between the tables,
            with columns ``[on, conflicts + '_x', conflicts + '_y']``.
        """
        df_m = pd.merge(df1, df2, on=on, how='outer')

        # Fill the gaps of each side with the value from the other side.
        # Assign back explicitly: fillna(inplace=True) on a column selection
        # is chained assignment and does not update the frame reliably.
        for col_x in [c for c in df_m.columns if c.endswith('_x')]:
            col_y = col_x[:-2] + '_y'
            if col_y not in df_m.columns:
                continue
            filled_x = df_m[col_x].fillna(df_m[col_y])
            filled_y = df_m[col_y].fillna(df_m[col_x])
            df_m[col_x] = filled_x
            df_m[col_y] = filled_y

        # Find conflicts (two missing values are not a conflict).
        c1, c2 = conflicts + "_x", conflicts + "_y"
        both_missing = df_m[c1].isnull() & df_m[c2].isnull()
        differ = (df_m[c1] != df_m[c2]) & ~both_missing
        df_c = df_m.loc[differ, [on, c1, c2]]

        # Remove duplicated columns (only the '_y' merge suffix).
        df_m = df_m.drop(
            columns=[c for c in df_m.columns if c.endswith('_y')])

        # Remove the '_x' merge suffix from the remaining columns.
        df_m = df_m.rename(
            columns={c: c[:-2] for c in df_m.columns if c.endswith('_x')})

        return df_m, df_c

    def create_table(self, data_path):
        """Create the table from input data (currently disabled).

        The implementation is kept below inside a string literal and is not
        executed; it references names (``pd_read``, ``MBL``, ``ORG``,
        ``pd_clean_gen``) that are not imported in this module.

        Parameters
        ----------
        data_path : str
            The path with the input data (unused while disabled).

        Returns
        -------
        None
        """
        """
        # Read data.
        df = pd_read.read_data(ftype='csv', path=data_path,
                  rename_map=MBL.rename_map,          # renaming cols
                  keep_cols=self.usecols,             # keep cols
                  std_cols=[self.nameO, self.codeO],  # std cols
                  low_memory=False)

        # Unique combinations (NaN are not grouped so they are set to -1).
        df_u = df.replace(np.nan, "None")
        df_u = df_u.groupby([self.nameO, self.codeO]).size()
        df_u = df_u.reset_index().rename(columns={0:'count'})
        df_u = df_u.replace("None", np.nan)
        df_u[self.nameF] = df_u[self.nameO]
        df_u[self.codeF] = df_u[self.codeO]
        df_u.dropna(subset=[self.nameO], inplace=True)
        #df_u = df_u.reindex(columns=ANT.database_cols)
        del df_u['count']

        # Cleaning data (order matters).
        df_u = pd_clean_gen.lettercase(df_u, ORG.to_lowercase, 'lower')
        df_u = pd_clean_gen.lettercase(df_u, ORG.to_uppercase, 'upper')
        df_u = pd_clean_gen.delete_spaces(df_u, ORG.dl_spaces)
        df_u = pd_clean_gen.ending_specie(df_u, ORG.specie_ending)
        df_u = pd_clean_gen.specie_abbreviation(df_u, ORG.specie_abbreviation)
        df_u = pd_clean_gen.delete_parenthesis(df_u, ORG.dl_parenthesis)
        df_u = pd_clean_gen.delete_numbers(df_u, ORG.dl_numbers)

        # Computing automatic codes.
        df_u = self._compute_organism_codes(df_u)
        df_u = self._compute_specie_codes(df_u)

        # Drop duplicates keeping the one in which the code is not NAN. For that
        # purpose we sort by values (therefore sending nans to the end) and then
        # use drop_duplicates() removing the last instance found.
        df_u = df_u.sort_values(by=self.codeO)
        df_u = df_u.drop_duplicates(subset=[self.nameO], keep='first')
        df_u = df_u.sort_values(by=self.nameO)

        # Name of columns
        df_u.columns = [self.nameO, self.codeO,
                        self.nameF, self.codeF,
                        self.nameS, self.codeS]

        # Return
        return df_u
        """
        pass

    def compute(self, input_path, output_path):
        """Compute/update a table from data (currently disabled).

        The implementation is kept below inside a string literal and is not
        executed, so calling this method has no effect.

        Parameters
        ----------
        input_path : str
            The path with the data (unused while disabled).
        output_path : str
            The path to store the table (unused while disabled).

        Returns
        -------
        None
        """
        """
        # Create table using input data.
        tnew = self.create_table(input_path)
        told = pd_read.read_data(ftype='csv', path=output_path)

        # Merge tables.
        dfm, dfc = tnew, None
        if told is not None:
            cols = [self.nameO, self.codeF]
            dfm, dfc = self.merge_table(told, tnew, self.nameO, self.codeF)

        # Sort and save.
        dfm.sort_values(by=self.nameO, inplace=True)
        dfm.to_csv(output_path, index=False)

        # Show information.
        self.show_information(dfm, dfc, output_path)

        # return
        return output_path
        """
        pass

    def show_information(self, df, dfc, pathname):
        """Print a summary of the table, its duplicates and its conflicts.

        Parameters
        ----------
        df : pd.DataFrame
            The table; must contain the six name/code columns declared as
            class constants.
        dfc : pd.DataFrame or None
            The conflicts found when merging, or None if no merge was done.
        pathname : str
            Path of the csv file the user is asked to revise.

        Returns
        -------
        None
        """
        # Text displayed as info when merging different files.
        txt_init = "Organisms table conversion created!\n"
        txt_merge = "\nThe following conflicts have been found. Please take action "
        txt_merge += "by opening the automatically created organism csv file "
        txt_merge += "(organism database) and solving such conflicts before "
        txt_merge += "continuing (by default the old values have been kept):"

        # Summary of distinct values per column.
        print("-"*80 + "\n"+txt_init)
        print("Different Original Names: %s" % len(df[self.nameO].unique()))
        print("Different Formated Names: %s " % len(df[self.nameF].unique()))
        print("Different Organism Codes: %s" % len(df[self.codeF].unique()))
        print("Different Species Names:  %s" % len(df[self.nameS].unique()))
        print("Different Species Codes:  %s\n" % len(df[self.codeS].unique()))

        # Rows repeating an original name, original code or formatted code.
        for column in (self.nameO, self.codeO, self.codeF):
            duplicates = df[df.duplicated(column)]
            if len(duplicates) > 0:
                print(duplicates)

        # Conflicts found while merging with a previous table.
        if dfc is not None and len(dfc) > 0:
            print("Conflicts:")
            print(dfc)
            print(txt_merge)
            print("Please revise: %s" % pathname)
if __name__ == '__main__':

    # Make the package importable when this file is run as a script.
    sys.path.append('../../../')

    # Import specific libraries
    from pyamr.datasets import load

    # ---------------------
    # load data
    # ---------------------
    # NOTE(review): the loaded sample is not consumed by the builder below;
    # the call is kept because it was executed by the original script.
    data = load.dataset_epicimpoc_susceptibility_year(nrows=1000)

    # ---------------------
    # create builder
    # ---------------------
    # Constants. (An earlier set of these constants, pointing to
    # 'antibiotics.csv', was overwritten before being used and has been
    # removed.)
    goback = "../../../../"
    input_path = goback + "data/raw/microbiology/csv/luke02"
    output_path = goback + "data/tables/organisms.csv"

    # Object.
    builder = OrganismsTable()

    # Format.
    builder.compute(input_path, output_path)