Source code for pyamr.core.table.acronym

###############################################################################
# Author: Bernard Hernandez
# Filename: acronym.py
# Date: 02/09/2015
# Description:
# 
# This script creates a map to solve possible issues of misspelled antibiotics,
# or differences as upper/lower cases, unnecessary spaces, missing codes, ...
# Hence, it creates a conversion between the original antibiotic name to an
# homogenous antibiotic name, and the original code and an homogeneous code.
#
###############################################################################

# Libraries
import pandas as pd

# --------------------------------------------------------------------------
#                              helper methods
# --------------------------------------------------------------------------
[docs]def acronym(name, length=3, prefix='A_'):
  """This method created the acronym for a given string

  Parameters
  ----------
  name: string-like
    The string to construct the acronym

  length: int-like
    The number of letters used for the first word of the acronym

  prefix: string-like
    The prefix to add to the acronym for identification purposes

  Returns
  -------
  string
  """
  # Check that the acronym is not nul
  # Remove spaces
  name = ' '.join(name.split())
  # Array with individual words
  words = name.split()   
  # Check there are words to create acronym
  if not len(words):
    return ('%sNONE' % (prefix)).upper()
  # Create acronym for other words
  acronym = [w[0] for w in words[1:]]
  # Create acronym adding first word
  acronym = '%s%s%s' % (prefix, words[0][:length], ''.join(acronym))
  # Return
  return acronym.upper()


def _check_acronym_conflicts(transdict):
  """This method check that all acronyms are unique.

  Parameters
  ----------
  transdict: dict-like
    The dictionar containing the string and the corresponding acronym.

  Returns
  -------
  """
  # Import library
  from collections import Counter
  # Count number of times the acronym appears
  counter = Counter(transdict.values())
  # Create a dict with the duplicated elements
  duplicated = {k:v for k,v in counter.items() if v>1}
  # Raise an error
  if len(duplicated):
    raise ValueError("The following acronyms within the dictionary "
                     "are not unique %s." % duplicated)


[docs]class AcronymBuilder:
  """
  """

  # Attributes
  _func = acronym

  # Constructor
  def __init__(self):
    """
    """
    pass
  
[docs]  def has_acronym():
    pass

[docs]  def has_value():
    pass

[docs]  def update(self):
    pass

[docs]  def fit(self, values, acronyms):
    """This method creates a dictionary with the acronyms

    TODO: Instead of just checking whether there are acronym conflicts or not
    ensure that always unique acronyms are created. Note that the implemented
    approach to create acronyms does not ensure they are always unique.

    Parameters
    ----------
    values: array-like (contains strings)
      The array with the possible values

    acronyms: array-like (contains strings)
      The array with the corresponding acronyms.

    Returns
    -------
    """
    # Check that lengths are the same
    if len(values)!=len(acronyms):
      raise ValueError("The length of the parameters values (%s) and "
                       "acronyms (%s) mismatch. They must be the same "
                       "length." % (len(values), len(acronyms)))

    # Remove duplicated spaces within values
    values = [' '.join(v.split()).lower() for v in values]

    # Create acronyms for nan entries
    acronyms = [acronym(v) if pd.isnull(a) else a 
      for v,a in zip(values, acronyms)]

    # Create translation dictionary
    self._transdict = dict(zip(values, acronyms))
    self._reverdict = dict(zip(acronyms, values))

    # Check acronym conflicts
    _check_acronym_conflicts(self._transdict)
    _check_acronym_conflicts(self._reverdict)

    # Return
    return self


[docs]  def transform(self, values):
    """This method returns the acronyms for given values

    Parameters
    ----------
    values: array-like

    Returns
    -------
    array with acronyms
    """
    return [self._transdict[value] for value in values]

[docs]  def inverse_transform(self, acronyms):
    """This method returns the values for given acronyms.

    Parameters
    ----------
    values: array-like

    Returns
    -------
    array with acronyms
    """
    return [self._reverdict[acronym] for acronym in acronyms]

  












if __name__ == '__main__':

  # Libraries
  import sys

  # Import specific libraries
  from pyamr.datasets import load


  # ---------------------
  # Example I
  # ---------------------
  # This example shows how to use the acronym builder to generate acronyms
  # to be used for the EPiC IMPOC research. In particular, it creates the
  # acronyms for the organisms.

  # Load data
  data = load.dataset_epicimpoc_susceptibility_year()
  data = data[['organismNameOrig', 'organismCodeOrig']].drop_duplicates()

  # Create the acronym builder
  builder = AcronymBuilder()

  # Get values and acronyms
  values = data['organismNameOrig'].values
  acronyms = data['organismCodeOrig'].values

  print(pd.DataFrame(values, acronyms))

  # Fit builder
  builder.fit(values=values, acronyms=acronyms)

  # Transform
  print(builder.transform(['enterococcus sp.', 'pseudomonas sp.']))

  # Inverse transform
  print(builder.inverse_transform(['ENTC', 'PSEUD']))


  # ---------------------
  # Example II
  # ---------------------
  # This example shows how the existing acronym generation approach 
  # fails for the terms 'feo guapo' and 'feo gordo' which produce the 
  # same acronym.
  # Create data
  a = ''
  b = 'feo'
  c = 'feo   guapo'
  d = 'feo guapo tonto'
  e = 'feo gordo'

  # Create the acronym builder
  builder = AcronymBuilder()

  # Get values and acronyms
  values = [a, b, c, d, e]
  acronyms = [acronym(name) for name in values]

  # Fit builder
  builder.fit(values=values, acronyms=acronyms)