Source code for pyamr.core.table.antibiotic

###############################################################################
# Author: Bernard Hernandez
# Filename: antibiotics.py
# Date: 02/09/2015
# Description:
# 
# This script creates a map to resolve issues such as misspelled antibiotic
# names, upper/lower case differences, unnecessary spaces, missing codes, ...
# Hence, it creates a conversion from the original antibiotic name to a
# homogeneous antibiotic name, and from the original code to a homogeneous code.
#
###############################################################################

# Generic libraries.
import sys
import datetime
import numpy as np
import pandas as pd
import re

"""
# Own libraries.
sys.path.append('../../../../modules/')
import settings.organisms as ORG
import settings.antibiotics as ANT
import settings.microbiology as MBL
import others.io.read as pd_read
import others.clean.generic as pd_clean_gen
"""

# IMPORTANT
# =========
# Note that some of the constants used to indicate which columns should be
# lowercase, uppercase, ... are defined in the settings module which can be
# found in modules/settings/antibiotics.py. In addition, the rename_map
# stored in modules/settings/microbiology.py should map the input column
# names holding the antibiotic name and code to the standard names:
# antibioticNameOrig and antibioticCodeOrig.



class AntibioticsTable:
  """Builder of the antibiotics conversion table.

  The table relates the original antibiotic name and code found in the
  input data (``nameO``/``codeO``) to their formatted counterparts
  (``nameF``/``codeF``). Rows without an original code receive an
  automatic code that starts with ``A_``.

  NOTE: ``create_table`` and ``compute`` are currently disabled; their
  former implementation is kept inside string literals and both methods
  return None.
  """

  # ----------
  # Constants
  # ----------
  # Columns identifiers for the name and the code. Ensure that there is
  # a conversion in the file modules/settings/microbiology.py from the
  # input data file column names to 'antibioticCodeOrig' and
  # 'antibioticNameOrig'. The different letters indicate:
  # O - the original value that will be stored in the antibiotics table
  # F - the formatted value that will be stored in the antibiotics table
  nameO, codeO = 'antibioticNameOrig', 'antibioticCodeOrig'
  nameF, codeF = 'antibioticName', 'antibioticCode'

  # The columns that should be read from the file. Note that the original
  # files are huge and might not fit in memory. Therefore reducing the
  # columns kept in memory helps.
  usecols = [nameO, codeO]

  # Number of characters kept from the last word of a name when building
  # an automatic code.
  l = 8

  # Constructor
  def __init__(self):
    """Print a reminder about the expected column mapping."""
    txt = ("Check that the columns with the ANTIBIOTIC name and code have "
           "mapped in the rename_map variable in the file "
           "module/settings/microbiology.py\n")
    print(txt)


  #------------------------------------------------------
  #                   PRIVATE METHODS
  #------------------------------------------------------
  def _acronym(self, words):
    """Return the acronym (first letter of each word) of several words.

    Empty words are ignored so that they cannot raise an IndexError.
    """
    return "".join(w[0] for w in words if w)

  def _compute_main(self, name):
    """Create the main part of an automatic code from a name.

    Parameters
    ----------
    name : string with the (formatted) antibiotic name.

    Returns
    -------
    The first ``l`` characters for a one-word name; the initials of the
    leading words followed by the first ``l`` characters of the last word
    for two- and three-word names; the acronym for longer names.
    """
    # Discard the empty words produced by consecutive, leading or trailing
    # spaces; otherwise indexing their first character raises IndexError.
    words = [w for w in name.split(" ") if w]
    if not words:
      return ""
    if len(words) > 3:
      return self._acronym(words)
    initials = "".join(w[0] for w in words[:-1])
    return initials + words[-1][:self.l]

  def _compute_automatic_code(self, row, l=8):
    """Compute the automatic code (starting with A_) of a row.

    Parameters
    ----------
    row : mapping or pandas Series containing the ``nameF`` entry.
    l   : unused; kept for backward compatibility (the class attribute
          ``l`` is the one applied by ``_compute_main``).

    Returns
    -------
    The upper case code ``A_<main>``.
    """
    main = self._compute_main(row[self.nameF])
    code = "A_%s" % main
    return code.upper()

  def _compute_codes(self, df):
    """Fill ``codeF`` with automatic codes for rows lacking a code.

    Parameters
    ----------
    df : dataframe with the columns nameO, codeO, nameF and codeF.

    Returns
    -------
    The same dataframe, modified in place.
    """
    for idx, elm in df.iterrows():
      # It already has a code.
      if not pd.isnull(elm[self.codeO]):
        continue
      # Needs an automatic code but there is no name to build it from.
      # The formatted name is checked too because it is the value
      # actually used to build the code.
      if pd.isnull(elm[self.nameO]) or pd.isnull(elm[self.nameF]):
        continue
      # Compute automatic code
      df.loc[idx, self.codeF] = self._compute_automatic_code(elm)
    return df



  #------------------------------------------------------
  #                  PUBLIC METHODS
  #------------------------------------------------------
  def merge_table(self, df1, df2, on, conflicts):
    """Merge an old and a new table keeping the old values on conflict.

    Parameters
    ----------
    df1 : the old dataframe with name and code.
    df2 : the new dataframe with name and code.
    on : name of the column used as key for the (outer) merge.
    conflicts : name of the column, present in both dataframes, whose
                disagreements are reported.

    Returns
    -------
    df_m : merged dataframe (values of df1 take precedence).
    df_c : conflicts dataframe with the columns on, conflicts_x and
           conflicts_y for the rows in which both tables disagree.
    """
    # Merged dataframe
    df_m = pd.merge(df1, df2, on=on, how='outer')

    # Columns present in both inputs come back as <name>_x / <name>_y.
    # Only genuine pairs are handled so that columns which merely end in
    # 'x' or 'y' (e.g. 'family', 'index') are left untouched.
    shared = [c[:-2] for c in df_m.columns
              if c.endswith('_x') and (c[:-2] + '_y') in df_m.columns]

    # Fill the gaps of each side with the value of the other side. The
    # result is assigned back because an in-place fillna on a selected
    # column does not reliably update the parent dataframe.
    for base in shared:
      cx, cy = base + '_x', base + '_y'
      df_m[cx] = df_m[cx].fillna(df_m[cy])
      df_m[cy] = df_m[cy].fillna(df_m[cx])

    # Find conflicts. Rows missing on both sides are not conflicts even
    # though NaN != NaN evaluates to True.
    c1, c2 = conflicts + "_x", conflicts + "_y"
    differ = df_m[c1] != df_m[c2]
    both_missing = df_m[c1].isnull() & df_m[c2].isnull()
    df_c = df_m.loc[differ & ~both_missing, [on, c1, c2]]

    # Remove duplicated columns (y) and the endings of the kept ones (x).
    df_m = df_m.drop([base + '_y' for base in shared], axis=1)
    df_m = df_m.rename(columns=dict((base + '_x', base) for base in shared))

    # Return
    return df_m, df_c


  def create_table(self, data_path):
    """Create the table from input data (currently disabled).

    Parameters
    ----------
    data_path : the path with the input data (unused while disabled).

    Returns
    -------
    None
    """
    # NOTE: the implementation below is disabled (kept as a string
    # literal). It relies on helper modules whose imports are also
    # disabled at the top of this file.
    """
    # Read data.
    df = pd_read.read_data(ftype='csv', 
                           path=data_path, 
                           rename_map=MBL.rename_map,          # renaming cols
                           keep_cols=self.usecols,             # keep cols
                           std_cols=[self.nameO, self.codeO],  # std cols 
                           low_memory=False)
    # Unique combinations (NaN are not grouped so they are set to -1).
    df_u = df.replace(np.nan, "None")
    df_u = df_u.groupby([self.nameO, self.codeO]).size()
    df_u = df_u.reset_index().rename(columns={0:'count'})
    df_u = df_u.replace("None", np.nan)
    df_u[self.nameF] = df_u[self.nameO]
    df_u[self.codeF] = df_u[self.codeO]
    #df_u = df_u.reindex(columns=ANT.database_cols)
    del df_u['count']

    # Cleaning data (order matters).
    df_u = pd_clean_gen.lettercase(df_u, ANT.to_lowercase, 'lower')
    df_u = pd_clean_gen.lettercase(df_u, ANT.to_uppercase, 'upper')
    df_u = pd_clean_gen.delete_spaces(df_u, ANT.dl_spaces)
    df_u = pd_clean_gen.delete_parenthesis(df_u, ANT.dl_parenthesis)

    # Computing automatic codes.
    df_u = self._compute_codes(df_u)

    # Drop duplicates keeping the one in which the code is not NAN. For that
    # purpose we sort by values (therefore sending nans to the end) and then
    # use drop_duplicates() removing the last instance found.
    df_u = df_u.sort_values(by=self.codeO)
    df_u = df_u.drop_duplicates(subset=[self.nameO], keep='first')
    df_u = df_u.sort_values(by=self.nameO)

    # Name of columns
    df_u.columns = [self.nameO, self.codeO, self.nameF, self.codeF]

    # Return
    return df_u
    """
    return None


  def update(self):
    """Placeholder; not implemented (returns None)."""

  def fit(self, values, acronym):
    """Placeholder; not implemented (returns None)."""

  def compute(self, input_path, output_path):
    """Compute/update a table from data (currently disabled).

    Parameters
    ----------
    input_path  : the path with the data (unused while disabled).
    output_path : the path to store the table (unused while disabled).

    Returns
    -------
    None
    """
    # NOTE: the implementation below is disabled (kept as a string
    # literal). It relies on create_table and on helper modules whose
    # imports are also disabled at the top of this file.
    """
    # Create table using input data.
    tnew = self.create_table(input_path)
    told = pd_read.read_data(ftype='csv', path=output_path) 

    # Merge tables.
    dfm, dfc = tnew, None
    if told is not None:
      cols = [self.nameO, self.codeF]
      dfm, dfc = self.merge_table(told, tnew, self.nameO, self.codeF)
    
    # Sort and save.
    dfm.sort_values(by=self.nameO, inplace=True)
    dfm.to_csv(output_path, index=False)

    # Show information.
    self.show_information(dfm, dfc, output_path)

    # return
    return output_path
    """
    return None


  def show_information(self, df, dfc, pathname):
    """Print a summary of the table, its duplicates and its conflicts.

    Parameters
    ----------
    df : table with the columns nameO, codeO, nameF and codeF.
    dfc : dataframe with the conflicts found when merging, or None.
    pathname : location of the table, printed so that it can be revised.

    Returns
    -------
    None
    """
    # Text displayed as info when merging different files.
    txt_init = "Antibiotics table conversion created!\n"
    txt_merge = ("\nThe following conflicts have been found. Please take action "
                 "by opening the automatically created antibiotic csv file "
                 "(antibotic database) and solving such conflicts before "
                 "continuing (by default the old values have been kept):")

    # Print text init
    print("-"*80 + "\n"+txt_init)
    print("Different Original Names: %s" % len(df[self.nameO].unique()))
    print("Different Formated Names: %s " % len(df[self.nameF].unique()))
    print("Different Antibiotics Codes: %s\n" % len(df[self.codeF].unique()))

    # Print the rows whose original name, original code or formatted
    # code repeats an earlier row.
    for column in (self.nameO, self.codeO, self.codeF):
      duplicates = df[df.duplicated(column)]
      if len(duplicates) > 0:
        print(duplicates)

    # Print conflicts (only when there is at least one, so that the call
    # for action is not shown for an empty conflicts table).
    if dfc is not None and len(dfc) > 0:
      print("Conflicts:")
      print(dfc)
      print(txt_merge)

    print("Please revise: %s" % pathname)
















class AntibioticTable():
  """Work-in-progress builder that summarises antibiotic name/code pairs.

  The cleaning options received in the constructor are stored but not
  applied yet.
  """

  # Columns holding the antibiotic name and code in the input dataframe.
  # NOTE(review): fit() referenced self.nameO/self.codeO without this class
  # defining them (AttributeError). The values below match the columns that
  # the __main__ demo of this file passes to fit() - confirm they are the
  # intended ones.
  nameO, codeO = 'antibioticName', 'antibioticCode'

  def __init__(self,  to_lowercase=None,
                      to_uppercase=None,
                      delete_spaces=None,
                      delete_parenthesis=None):
    """Store the cleaning configuration.

    Parameters
    ----------
    to_lowercase : stored as is; not used yet.
    to_uppercase : stored as is; not used yet.
    delete_spaces : stored as is; not used yet.
    delete_parenthesis : stored as is; not used yet.
    """
    self.to_lowercase = to_lowercase
    self.to_uppercase = to_uppercase
    self.delete_spaces = delete_spaces
    self.delete_parenthesis = delete_parenthesis


  def fit(self, dataframe):
    """Print the unique (name, code) combinations and their counts.

    Parameters
    ----------
    dataframe : dataframe containing the columns nameO and codeO.

    Returns
    -------
    None
    """
    # Missing values are swapped for the "None" placeholder while grouping
    # and restored afterwards, so that rows with a missing name or code
    # are kept. Each step works on the result of the previous one.
    df_u = dataframe.replace(np.nan, "None")
    df_u = df_u.groupby([self.nameO, self.codeO]).size()
    df_u = df_u.reset_index(name='count')
    df_u = df_u.replace("None", np.nan)

    print(df_u)

    # TODO: apply the configured cleaning steps (to_lowercase,
    # to_uppercase, delete_spaces, delete_parenthesis) when they are set.





if __name__ == '__main__':

  # Extend the import path before importing from the pyamr package
  # (presumably so that it resolves when run from this folder).
  sys.path.append('../../../')
  from pyamr.datasets import load

  # Locations of the raw data and of the generated table.
  repo_root = "../../../../"
  raw_dir = "".join([repo_root, "data/raw/microbiology/csv/luke02/"])
  table_csv = './antibiotics.csv'

  # ---------------------
  # load data
  # ---------------------
  # Load a sample and keep only the antibiotic name and code columns.
  columns = ['antibioticName', 'antibioticCode']
  susceptibility = load.dataset_epicimpoc_susceptibility_year(nrows=1000)
  susceptibility = susceptibility[columns]

  AntibioticTable().fit(susceptibility)

  # ---------------------
  # create builder
  # ---------------------
  table_builder = AntibioticsTable()
  for view in (susceptibility.columns, susceptibility.head(10)):
    print(view)

  # Compute antibiotic table.
  table_builder.compute(raw_dir, table_csv)