# Source code for pyamr.datasets.registries

# Libraries
import collections
import pandas as pd

# Import specific
from itertools import product

# -----------------------------------------
# Helper methods
# -----------------------------------------
def length_exceptions(x):
    """Return the acronym lengths for names with special word counts.

    Parameters
    ----------
    x: list
        The words obtained after splitting a name.

    Returns
    -------
    list or None
        - One word: a single length equal to the length of that
          word (i.e. the whole word is kept).
        - Four or more words: a length of 1 for every word (i.e.
          only the initials are kept).
        - Otherwise None, meaning that no exception applies.
    """
    if len(x) == 1:
        return [len(x[0])]
    if len(x) >= 4:
        return [1] * len(x)
    return None

def _loops_strategy_lengths(series):
    """
    :param x:
    :return:
    """
    import numpy as np
    # Create DataFrame with words
    words = series.str.split(expand=True, n=1)
    # Create DataFrame with word lengths
    lengths = words.astype('str').applymap(lambda x: len(x))
    # Create loops
    return list(product(*[range(4, n) for n in lengths.max()]))

# -----------------------------------------
# Constants
# -----------------------------------------
# Default exception rules used when building acronyms. Each entry is a
# function that receives the list of words of a name and returns either
# the list of lengths to use or None when the rule does not apply.
ACRONYM_EXCEPTIONS = [
    length_exceptions
]


def _acronym(x, lengths=None, sep='', exceptions=ACRONYM_EXCEPTIONS):
    """Create an acronym from a single string.

    .. note: Add variable to choose whether we want to keep
             the whole word if the split length is 1.

    Parameters
    ----------
    x: String
        The string to create the acronym/code from.

    lengths: tuple or list (default None)
        The number of letters to use from each word after the
        string has been split on whitespace. If None, one letter
        per word is used. Words and lengths are paired in order;
        if there are fewer lengths than words the remaining words
        are left out of the acronym.

    sep: String (default '')
        The separator to include between the acronym components. For
        instance, the sep values '' and '_' for the string
        'artificial intelligence' would lead to 'AI' and 'A_I'
        respectively.

    exceptions: list of functions (default ACRONYM_EXCEPTIONS)
        Each function receives the list of words obtained from the
        split and returns either the list of lengths to use or None
        if it does not apply. Only the first function returning a
        value other than None is used, and its result overrides the
        lengths parameter. None or an empty list disables exceptions.

    Returns
    --------
    string
        The acronym (upper case).
    """
    # Splits
    split = x.split()

    # Define default lengths
    if lengths is None:
        lengths = [1] * len(split)

    # Exceptions (the first one that applies wins)
    for expt in exceptions or ():
        aux = expt(split)
        if aux is not None:
            lengths = aux
            break

    # Compute acronym
    return sep.join(c[:l].upper() for c, l in zip(split, lengths))



def acronym_series_unique(series, split_n=1,
                          exclude_acronyms=None,
                          loops_strategy=None,
                          verbose=10,
                          kwgs_acronym=None):
    """Computes unique acronyms.

    The acronyms are computed iteratively. On each iteration, the
    entries that still have no acronym (or whose acronym repeats an
    earlier one) are recomputed using the next combination of lengths
    given by the loops strategy. It stops when every acronym is unique
    or when there are no more combinations to try, so uniqueness is
    not guaranteed.

    Parameters
    ----------
    series: pd.Series
        The series with the names to convert in acronyms.

    split_n: int
        Currently not used.

    exclude_acronyms: list (default None)
        The acronyms that are not allowed. Entries producing any of
        them are left empty and retried on the next iteration.

    loops_strategy: function (default None)
        The function that receives the series and returns the list
        of length combinations to try. If None, the method
        _loops_strategy_lengths is used.

    verbose: int
        Level of verbosity (>0 summary, >5 iteration, >7 details).

    kwgs_acronym: dict (default None)
        The extra parameters to pass to the acronym function.

    Returns
    --------
    pd.Series
        The acronyms, named 'acronym' and indexed as the series.
        Entries for which no acronym was produced are pd.NA.
    """
    # Avoid shared mutable defaults
    if exclude_acronyms is None:
        exclude_acronyms = []
    if kwgs_acronym is None:
        kwgs_acronym = {}

    # Set default none acronym
    acronym = \
        pd.Series(index=series.index, data=pd.NA, name='acronym')

    # Define loops strategy
    if loops_strategy is None:
        loops_strategy = _loops_strategy_lengths

    # Find loops
    loops = loops_strategy(series)

    # Loop
    for i, l in enumerate(loops):

        # Find duplicates or empty values
        idxs = acronym.duplicated(keep='first') | acronym.isna()

        # Break clause
        if idxs.sum() == 0:
            break

        # Show information
        if verbose > 5:
            print("%s/%s. lengths=%s" % (i, len(loops), l))
        if verbose > 7:
            df = pd.concat([series[idxs], acronym[idxs]], axis=1)
            print("%s\n" % df.sort_values(by='acronym'))

        # Create acronyms
        aux = series[idxs].apply(_acronym, lengths=l, **kwgs_acronym) # Create new
        aux[aux.isin(exclude_acronyms)] = pd.NA                       # Exclude
        acronym[idxs] = aux[idxs]                                     # Update

    # Show warning. The pending entries are recomputed here so that
    # the count reflects the last update and is also defined when the
    # strategy returned no loops at all.
    if verbose > 0:
        pending = acronym.duplicated(keep='first') | acronym.isna()
        print("There are %s repeated acronyms!\n" % pending.sum())

    # Return
    return acronym


def acronym_series(series, unique_acronyms=False, **kwargs):
    """Creates the acronyms for all the names in the series.

    Parameters
    ----------
    series: pd.Series
        The series with the names to convert in acronyms.

    unique_acronyms: boolean (default False)
        Whether to try to make the acronyms unique. If the series
        contains identical values, a message is printed and this
        option is disabled.

    **kwargs:
        The parameters passed to acronym_series_unique (see below).
        They are only used when unique acronyms are computed; when
        unique_acronyms is False (or has been disabled) they are
        ignored and the default acronym is computed for each name.

    exclude_acronyms: list
        The acronyms that are not allowed.

    split_n: int

    verbose: int
        Level of verbosity

    loops_strategy: function
        The function to indicate what length combinations should
        be used on each iteration. By default it will use the
        default method _loops_strategy_lengths. The signature of
        the function to pass as loops_strategy is as follows:

        :param x: series
        :return: list (array of lengths)

    kwgs_acronym: dict
        The parameters to pass to the acronym function.

    Returns
    -------
    pd.Series
        The acronym series

    """
    # Basic check
    if unique_acronyms and series.duplicated().any():

        # show warning
        repeated = series.value_counts()
        print("\nThe series has the following identical values and \n"
              "therefore they cannot or shouldn't be expressed with \n"
              "different acronyms. The unique_acronyms parameter has \n"
              "been set to 'False'.\n\n{0}" \
                .format(repeated[repeated > 1]))
        # Set unique acronyms to false
        unique_acronyms = False

    # Return acronyms
    if not unique_acronyms:
        return series.apply(_acronym)

    # Return
    return acronym_series_unique(series, **kwargs)


def invert(d):
    """Return a new dictionary with the keys and values swapped.

    If several keys share the same value, the last one iterated
    is the one kept. The values must be hashable.
    """
    return {v: k for k, v in d.items()}


def create_registry(data, keyword=None, keep=None):
    """Creates registry from data.

    Parameters
    ----------
    data: pd.DataFrame
        The data

    keyword: string (default None)
        The keyword for the columns. All columns starting
        with such keyword and ending with 'name', 'code' or
        'description' will be kept and used for the registry.
        If None, no prefix is required. It is only used when
        keep is None.

    keep: list (default None)
        The list of columns to keep for the registry. If None,
        the columns are selected using the keyword.

    Returns
    -------
    pd.DataFrame
        A copy of the selected columns without duplicated rows
        and with a new consecutive index.
    """
    # Columns to keep
    if keep is None:
        # An empty prefix matches every column name.
        prefix = '' if keyword is None else keyword
        suffixes = ('name', 'code', 'description')
        # Column labels that are not strings cannot match.
        keep = [c for c in data.columns
            if isinstance(c, str) and
                c.startswith(prefix) and
                c.endswith(suffixes)]

    # Copy data
    reg = data[keep].copy(deep=True)
    reg = reg.drop_duplicates()
    reg = reg.reset_index(drop=True)

    # Return
    return reg


# -----------------------------------------------------
# Microorganism
# -----------------------------------------------------
# The regexp map
# Maps regular expressions (keys) to their replacement (values). The
# patterns are raw strings so that the backslashes reach the regexp
# engine untouched. The order of the entries is kept on purpose.
REGEX_MAP_MICROORGANISM = {
    r'\([^)]*\)': '',      # Remove everything between ()
    r'(\s)*\-(\s)*': '-',  # Remove spaces before/after hyphen
    r'species': '',        # Remove the word species
    r'o157': '',           # Remove the o157 tag (e.g. escherichia coli o157)
    r'sp(\.)*(\s|$)+': ' ',  # Remove sp (and trailing dots) from word.
    r'strep(\.|\s|$)': 'streptococcus ',   # Complete
    r'staph(\.|\s|$)': 'staphylococcus ',  # Complete
    r'staphylococci': 'staphylococcus',    # Correction (add mixed? group?)
    r'streptococci': 'streptococcus',      # Correction (add mixed? group?)
    r'\s+': ' ',           # Remove duplicated spaces.
}

# The words passed (one at a time) to hyphen_before when cleaning
# microorganism names.
HYPHENS = ['haemolytic']

# The words to move to the beginning (passed to word_to_start
# with pos='start').
MOVE_TO_START = ['enterococcus',
                 'staphylococcus',
                 'streptococcus',
                 'coliform']

# The words to move to the end (passed to word_to_start
# with pos='end').
MOVE_TO_END = ['methicillin resistant',
               'vancomycin resistant',
               'mixed']


def _clean_microorganism(series,
                         hyphens=HYPHENS,
                         move_to_start=MOVE_TO_START,
                         move_to_end=MOVE_TO_END):
    """Cleans the microorganism names.

    The names are lowercased, rewritten with the regexp map
    REGEX_MAP_MICROORGANISM, passed through the hyphen and word
    reordering helpers, rewritten again with the same regexp map
    and finally stripped of leading/trailing whitespace.

    .. todo: Put everything below as our own defined function
             but allow users to pass their own functions.

    Parameters
    ----------
    series: pd.Series
        The series with the name of the organisms. Ideally it
        should represent the binomial nomenclature including
        genus and specie respectively.

    hyphens: list
        The words passed to hyphen_before (parameter w).

    move_to_start: list
        The words passed to word_to_start with pos='start'.

    move_to_end: list
        The words passed to word_to_start with pos='end'.

    Returns
    -------
    pd.Series
        The cleaned names. The input series is not modified.
    """
    # Libraries
    from pyamr.datasets.clean import hyphen_before, word_to_start

    # Work on a lowercase copy and apply the regexp mapping
    cleaned = series.copy(deep=True).str.lower()
    cleaned = cleaned.replace(regex=REGEX_MAP_MICROORGANISM)

    # Correct hyphens
    for word in hyphens:
        cleaned = cleaned.transform(hyphen_before, w=word)

    # Correct order of genus (start) first and tags (end) afterwards
    for position, words in (('start', move_to_start),
                            ('end', move_to_end)):
        for word in words:
            cleaned = cleaned.transform(word_to_start, w=word, pos=position)

    # Apply the regexp mapping again and strip
    return cleaned.replace(regex=REGEX_MAP_MICROORGANISM).str.strip()


def clean_specimen(series):
    r"""Placeholder to clean the specimen names (not implemented).

    The function has no body other than this docstring, so it
    always returns None.

    Notes kept from the original draft (intended rules):

        \sr\s = right -> end
        \sl\s = left -> end

    :param series:
    :return: None
    """

# ---------------------------------------------------
# Registry Base
# ---------------------------------------------------
class Registry:
    """This is basically a lookup table.

    The registry is a DataFrame (REG) created from the name, code
    and description columns of the data (see fit). It contains an
    integer id, the cleaned name and the original name, and can be
    used to add the id to the data (see transform).
    """

    # The order of the columns within the registry
    ORDER = ['id', 'name', 'code', 'description', 'original']
    # The subset to use to drop duplicates
    SUBSET = ['name', 'original']
    # The function to clean the names
    FCLEAN = {}
    # The dictionary to rename columns
    RENAME_COLUMNS = {}
    # Registry dataframe
    REG = None
    # Marks a constructor parameter that was not given, so that the
    # class-level value (possibly redefined in a subclass) is kept.
    # None cannot be used because it is a valid value (e.g. subset).
    _UNSET = object()


    def __init__(self, keyword='',
                       order=_UNSET,
                       subset=_UNSET,
                       fclean=_UNSET):
        """Constructor

        .. note: Raise type errors
        .. note: Allow to load from file

        Parameters
        ----------
        keyword: string (default '')
            The prefix of the columns (e.g. <keyword>_name).
        order: list
            The columns of the registry, in order. If not given,
            the class attribute ORDER is used.
        subset: list
            The columns used to drop duplicates. If not given,
            the class attribute SUBSET is used.
        fclean: dict
            The cleaning function (value) to apply to each column
            (key). If not given, the class attribute FCLEAN is used.
        """
        # Set parameters
        self.keyword = keyword

        # Only override the class-level values when explicitly given.
        if order is not self._UNSET:
            self.ORDER = order
        if subset is not self._UNSET:
            self.SUBSET = subset
        if fclean is not self._UNSET:
            self.FCLEAN = fclean


    def _strip_keyword(self, column):
        """Removes the keyword prefix from a column name."""
        prefix = '%s_' % self.keyword
        # Remove the whole keyword, which might contain underscores.
        if self.keyword != '' and column.startswith(prefix):
            return column[len(prefix):]
        # Otherwise remove everything up to the first underscore.
        return column.split("_", 1)[-1]


    def getr(self, prepend=False):
        """Returns the registry DataFrame

        Parameters
        ----------
        prepend: boolean (default False)
            Whether to prepend '<keyword>_' to the column names. It
            has no effect if the keyword is the empty string.

        Returns
        -------
        pd.DataFrame
            A copy of the registry. The registry must have been
            created previously (see fit).
        """
        # Get registry
        aux = self.REG.copy(deep=True)

        # Prepend
        if prepend and self.keyword!='':
            aux.columns = ['%s_%s' % (self.keyword, c)
                for c in aux.columns]

        # Return
        return aux


    def fit(self, data):
        """Creates the registry from the data.

        .. note: It assumes name exists...

        Parameters
        ----------
        data: pd.DataFrame
            The DataFrame expects to have the code, the
            name and the description. Specially the name.
            If only one of name or code exists, the other
            is created as a copy of it.

        Returns
        -------
        Registry
            The object itself.

        Raises
        ------
        ValueError
            If the data has neither name nor code.
        """
        # Create registry
        aux = create_registry(data, keyword=self.keyword)

        # Keep only last label
        aux.columns = [self._strip_keyword(c) for c in aux.columns]

        # Raise error
        if not set(['name', 'code']).intersection(aux.columns):
           raise ValueError("Missing either name or code")

        # Add missing columns
        if 'code' not in aux:
            aux['code'] = aux['name']
        if 'name' not in aux:
            aux['name'] = aux['code']

        # Backup original
        original = aux['name']

        # Put everything to lowercase (only the strings)
        aux = aux.apply(lambda column: column.map(
            lambda x: x.lower() if isinstance(x, str) else x))

        # Restore original
        aux['original'] = original

        # Apply cleaning
        for column, cleaner in self.FCLEAN.items():
            if column not in aux.columns:
                continue
            if not callable(cleaner):
                continue
            aux[column] = cleaner(aux[column])

        # Final formatting
        aux = aux.drop_duplicates(subset=self.SUBSET)
        aux['name'] = aux['name'].astype(str)
        aux = aux.sort_values(by='name')
        # The previous index is kept as a column named 'index'; it is
        # discarded below unless it has been included in ORDER.
        aux = aux.reset_index()
        aux['id'] = aux.index + 1

        # Keep
        keep = [c for c in self.ORDER if c in aux.columns]

        # Order information
        aux = aux[keep]

        # Set registry
        self.REG = aux

        # Return
        return self

    def replace(self, series, key='original', value='name'):
        """Maps the values of a series using two registry columns.

        Parameters
        ----------
        series: pd.Series
            The series with the values to replace.
        key: string (default 'original')
            The registry column with the values to look for.
        value: string (default 'name')
            The registry column with the new values.

        Returns
        -------
        pd.Series
            The mapped series. Those values that do not appear
            in the key column become missing (NaN).
        """
        tup = zip(self.REG[key], self.REG[value])
        return series.map(dict(tup))

    def transform(self, data, replace=None, include_id=True):
        """Transform data

        .. note: The data is modified in place.

        Parameters
        ----------
        data: pd.DataFrame
            The data to transform. It must contain the column
            '<keyword>_name' when include_id is True.
        replace:
            Currently not used.
        include_id: boolean (default True)
            Whether to add the column '<keyword>_id' with the
            registry id that corresponds to '<keyword>_name'.

        Returns
        -------
        pd.DataFrame
            The data transformed (same object received).
        """
        # Include the id.
        if include_id:
            data['%s_id' % self.keyword] = \
                self.replace(data['%s_name' % self.keyword],
                    key='name', value='id')

        # Return
        return data


    def fit_transform(self, data, **kwargs):
        """Fits and transforms"""
        self.fit(data)
        return self.transform(data, **kwargs)

    def combine(self, data):
        """Placeholder (does nothing and returns None)."""
        pass

    def clean(self, series):
        """Placeholder (does nothing and returns None)."""
        pass


# ---------------------------------------------------
# Registry Microorganism
# ---------------------------------------------------
class MicroorganismRegistry(Registry):
    """Registry for microorganisms"""

    # The taxonomic ranks
    taxonomy = ['domain',
                'class',
                'order',
                'family',
                'genus',
                'species',
                'subspecies']

    # The external registry, loaded the first time combine is called
    reg = None

    def __init__(self, **kwargs):
        """Constructor

        Parameters
        ----------
        **kwargs:
            The parameters passed to the Registry constructor. Note
            that the cleaning dictionary (fclean) is always replaced
            by the one defined below.
        """
        # Super
        super().__init__(**kwargs)

        # Set cleaning dictionary. It is set on the instance after
        # calling the parent constructor so that it takes precedence
        # over any value set there.
        self.FCLEAN = {
            'name': _clean_microorganism
        }


    def combine(self, dataframe, on='name'):
        """Combines an external dataframe with the registry.

        The column <on> is split in genus (first word) and species
        (remainder), which are used to merge the dataframe with the
        registry returned by load_registry_microorganisms.

        .. note: The registry loaded is expected to contain the
                 columns genus, species and acronym.

        Parameters
        ----------
        dataframe: pd.DataFrame
            The DataFrame to combine. It is not modified.

        on: string (default 'name')
            The column with the name of the microorganism.

        Returns
        --------
        pd.DataFrame
            A copy of the dataframe with the columns genus, species,
            exists_in_registry and those from the registry.
        """
        # Load
        from pyamr.datasets.load import load_registry_microorganisms

        # Copy DataFrame
        aux = dataframe.copy(deep=True)

        # Create genus and species. The split returns a single column
        # when no name has a second word, thus both columns are forced
        # to exist (object dtype keeps the .str accessor usable).
        parts = aux[on] \
            .str.capitalize() \
            .str.split(expand=True, n=1) \
            .reindex(columns=[0, 1]) \
            .astype(object)

        # Format
        aux['genus'] = parts[0].str.capitalize()
        aux['species'] = parts[1].str.lower()

        # Load registry information
        if self.reg is None:
            self.reg = load_registry_microorganisms()

        # --------------
        # Step 1
        # --------------
        # First, merge those rows that have equal genus and
        # species. Note that we are only merging if both values
        # exist and are equal.
        # Merge
        aux = pd.merge(aux, self.reg,
            how='left',
            left_on=['genus', 'species'],
            right_on=['genus', 'species']
        )

        # Those merged exist in registry.
        aux['exists_in_registry'] = aux.acronym.notna()

        # --------------
        # Step 2
        # --------------
        # Second, for those values whose registry-related columns
        # are null, use the information based only on the genus
        # (first registry row found for each genus).
        # Create aux
        aux_step2 = pd.merge(aux[['genus']],
            self.reg.drop(columns=['species', 'acronym']) \
                .drop_duplicates() \
                .groupby('genus') \
                .head(1),
            how='left',
            left_on=['genus'],
            right_on=['genus']
        )

        # Update dataframe. Only the null values are filled so that
        # the information obtained in step 1 is not overwritten.
        aux.update(aux_step2, overwrite=False)

        # Return
        return aux

    def binomial_name(self):
        """Placeholder (does nothing and returns None)."""
        pass

    def uuid(self):
        """Placeholder (does nothing and returns None)."""
        pass


# ---------------------------------------------------
# Registry Antimicrobial
# ---------------------------------------------------
class AntimicrobialRegistry(Registry):
    """Registry for antimicrobials"""

    # The external registry, loaded the first time combine is called
    reg = None

    def combine(self, dataframe, on='name'):
        """Combines an external dataframe with the registry.

        The column <on> is capitalized and used to merge the
        dataframe with the column 'name' of the registry returned
        by load_registry_antimicrobials.

        .. note: I am assuming the columns that exist in dataframe...

        Parameters
        ----------
        dataframe: pd.DataFrame
            The DataFrame to combine. It is not modified.

        on: string (default 'name')
            The column with the name of the antimicrobial.

        Returns
        --------
        pd.DataFrame
            A copy of the dataframe (with the column <on> capitalized)
            merged with the registry. Those rows without a match
            have the registry columns empty.
        """
        # Load
        from pyamr.datasets.load import load_registry_antimicrobials

        # Copy DataFrame
        aux = dataframe.copy(deep=True)

        # Format
        aux[on] = aux[on].str.capitalize()

        # Load registry information
        if self.reg is None:
            self.reg = load_registry_antimicrobials()

        # Merge (left) those rows whose capitalized name is
        # equal to the name in the registry.
        aux = pd.merge(aux, self.reg,
            how='left',
            left_on=[on],
            right_on=['name']
        )

        # Return
        return aux








if __name__ == '__main__':

    # Libraries
    import pandas as pd

    def len1(x):
        """Example condition: whether the split has a single word.

        :param x: array obtained from split
        :return: boolean
        """
        return len(x) == 1

    def wordl(x):
        """Example lengths: the length of the first word.

        :param x: array obtained from split
        :return: array with lengths
        """
        return [len(x[0])]

    # Create series with sample data.
    series = pd.Series([
        'Pseudomonas Aeruginosa',
        'Pseudomonas Aeruginose',   # minor variant
        'Pseudomonas Aeruginosi',   # minor variant
        'Pseudomonas',              # genus 1
        'Enterococcus',             # genus 2
        'Enterococcus vagus',
        'Staphylococcus Beta-Haemolytic Group A',
        'This is another example',  # Long
        'Pseu Aeru',                # Extreme
        'Enterococcus vagus',       # repeated
        'Pseudomonas',              # repeated
    ])

    # Columns of the final summary (in order)
    columns = {'series': series}

    # Basic acronyms, without and with separator
    columns['acronyms_1'] = series.apply(_acronym, sep='')
    columns['acronyms_2'] = series.apply(_acronym, sep='_')

    # Unique acronyms requested on a series with repeated values
    columns['acronyms_3'] = acronym_series(series,
        exclude_acronyms=[],
        unique_acronyms=True,
        verbose=0)

    # -----------------------------
    # Creating unique acronyms
    # -----------------------------
    # Create loops from default loop_strategy
    loops = _loops_strategy_lengths(series)

    # Unique acronyms leaving out the last three entries
    columns['acronyms_4'] = acronym_series(series[:-3],
        exclude_acronyms=['PSEU_AERU'],
        unique_acronyms=True,
        verbose=10,
        kwgs_acronym={
            'sep': '_'
        })

    # Create result
    result = pd.DataFrame()
    for label, values in columns.items():
        result[label] = values

    # Show
    print("Results:")
    print(result)
    print("Loop Strategy:")
    print(pd.DataFrame(loops))