# Source code for pyamr.core.acsi

# Libraries
import numpy as np
import pandas as pd


def create_combinations_v1(d, groupby,
                           col_spe='SPECIMEN',
                           col_lab='LAB_NUMBER',
                           col_org='MICROORGANISM',
                           col_abx='ANTIMICROBIAL',
                           col_sns='SENSITIVITY'):
    """Creates the DataFrame with all pairwise antimicrobial combinations.

    The records are grouped by ``groupby``. Within each group the rows
    are sorted by antimicrobial and every unordered pair of rows yields
    one output record with both antimicrobials, both sensitivity
    outcomes and the concatenation of the outcomes (column ``class``,
    e.g. 'SR').

    .. note:: There might be an issue if there are two different outcomes
              for the same record. For example, a susceptibility test
              record for penicillin (APEN) with R and another one with
              S. This function does not detect nor warn about it.

    .. note:: If the data is right and the laboratory numbers are unique per
              isolate then the date is not necessary. However, to keep it
              in the output it must be included in ``groupby``. Groupby
              should at least contain: specimen, microorganism and lab_id.

    Parameters
    ----------
    d: pd.DataFrame
        The susceptibility test records (one row per test).

    groupby: str or list of str
        The column name(s) identifying one isolate. They are copied
        into the output records.

    col_spe, col_lab, col_org: str
        Not used by this implementation; kept so that existing callers
        passing them keep working.

    col_abx: str
        The column name with the antimicrobial.

    col_sns: str
        The column name with the sensitivity outcome.

    Returns
    --------
    pd.DataFrame
        One row per pair with the columns ``groupby``, ``<col_abx>_x``,
        ``<col_abx>_y``, ``<col_sns>_x``, ``<col_sns>_y`` and ``class``.
        It is empty (but has all the columns) if no group contains at
        least two records.
    """
    # Libraries
    from itertools import combinations

    # Accept a single column name as well as a list of names.
    keys = [groupby] if isinstance(groupby, str) else list(groupby)

    # Output column names
    abx_x, abx_y = '%s_x' % col_abx, '%s_y' % col_abx
    sns_x, sns_y = '%s_x' % col_sns, '%s_y' % col_sns

    # Initialize
    records = []

    # Loop
    for key, group in d.groupby(keys):
        # Depending on the pandas version a single grouping key is
        # returned as a scalar rather than a 1-tuple. Zipping a scalar
        # string would pair the column name with its first character.
        if not isinstance(key, tuple):
            key = (key,)

        # Pair the rows by position so that a non-unique index in the
        # input does not break the lookups.
        group = group.sort_values(by=col_abx)
        rows = list(zip(group[col_abx], group[col_sns]))

        for (ax, sx), (ay, sy) in combinations(rows, 2):
            record = dict(zip(keys, key))
            record.update({abx_x: ax, abx_y: ay, sns_x: sx, sns_y: sy})
            records.append(record)

    # Create DataFrame. The columns are passed explicitly so that they
    # exist even when there are no records.
    columns = list(dict.fromkeys(keys + [abx_x, abx_y, sns_x, sns_y]))
    c = pd.DataFrame(records, columns=columns)

    # Add class
    c['class'] = c[sns_x] + c[sns_y]

    # Return
    return c






def mutual_info_matrix_v3(x=None, y=None, ct=None):
    """Compute the component information score.

    Each cell (i, j) of the result contains the contribution of that
    cell of the contingency matrix to the mutual information, that is
    ``pxy * log(pxy / (pi * pj))`` where ``pxy`` is the joint relative
    frequency and ``pi``/``pj`` the row/column marginals. Cells whose
    score is undefined (e.g. a zero count) are set to 0.

    .. note: In order to be able to compute the mutual
             information score it is necessary to have
             variation within the variable. If there is
             only one class all the scores are 0.

    Parameters
    ----------
    x: list, np.ndarray, pd.Series or pd.DataFrame
        The classes of the first variable. Only used if ct is None.
    y: list, np.ndarray, pd.Series or pd.DataFrame
        The classes of the second variable. Only used if ct is None.
    ct: list, np.ndarray, pd.Series or pd.DataFrame
        The 2D contingency matrix. If given, x and y are ignored.

    Returns
    -------
    np.ndarray
        A float matrix with the same shape as the contingency matrix.

    Raises
    ------
    ValueError
        If any input has an unsupported type or if neither ct nor
        both x and y are provided.
    """
    def _check_nparray(obj, param_name):
        """Return obj as np.ndarray (None is passed through)."""
        if obj is None:
            return None
        if isinstance(obj, np.ndarray):
            return obj
        if isinstance(obj, (pd.Series, pd.DataFrame)):
            return obj.to_numpy()
        if isinstance(obj, list):
            return np.array(obj)
        raise ValueError(
            "The input parameter '{0}' is of type '{1}' which is not "
            "supported. Please ensure it is a np.ndarray, pd.Series, "
            "pd.DataFrame or list.".format(param_name, type(obj)))

    # Ensure they are all np arrays
    x = _check_nparray(x, 'x')
    y = _check_nparray(y, 'y')
    ct = _check_nparray(ct, 'ct')

    # Compute contingency
    if ct is None:
        if x is None or y is None:
            raise ValueError(
                "Either 'ct' or both 'x' and 'y' must be provided.")

        # Imported here so that scipy is only needed for this path.
        from scipy.stats.contingency import crosstab

        c = crosstab(x, y)
        if isinstance(c, tuple):
            ct = c[-1]   # older scipy
        else:
            ct = c.count # newer scipy

    # Work on floats so that integer and object inputs behave alike.
    ct = np.asarray(ct, dtype=float)

    # Divisions by zero and log(0) are expected for empty cells and
    # are handled below, thus silence the floating point warnings.
    with np.errstate(all='ignore'):
        n = ct.sum()
        pi = np.ravel(ct.sum(axis=1)) / n
        pj = np.ravel(ct.sum(axis=0)) / n
        pxy = ct / n
        m = pxy * np.log(pxy / np.outer(pi, pj))

    # Fill with na (lim x->0 => 0)
    m[np.isnan(m)] = 0

    # Return
    return m


def collateral_resistance_index(m):
    """Collateral Resistance Index

    The collateral resistance index is based on the mutual
    information matrix. This implementation assumes there
    are only two classes resistant (R) and sensitive (S).
    It is the sum of the diagonal scores minus the sum of
    the off-diagonal scores.

    .. warning:: Only works for a 2x2 matrix. For larger matrices
                 only the top-left 2x2 cells are used.

    Parameters
    ----------
    m: np.array
        A 2x2 array-like (np.ndarray or nested list) with the
        mutual information matrix.

    Returns
    -------
    float
        (m[0, 0] + m[1, 1]) - (m[0, 1] + m[1, 0])
    """
    # Allow nested lists in addition to numpy arrays.
    m = np.asarray(m)
    return (m[0, 0] + m[1, 1]) - (m[0, 1] + m[1, 0])


def CRI(x, func_mis=mutual_info_matrix_v3):
    """Collateral resistance index

    Builds the 2x2 contingency matrix [[SS, SR], [RS, RR]] from the
    counts in x, computes the mutual information matrix with func_mis
    and returns its collateral resistance index.

    Parameters
    ----------
    x: pd.Series
        Contains the counts of the combinations classes accessible as
        attributes (SS, SR, RS, RR). A class that is not present is
        counted as 0.

    func_mis: function
        The function to use to compute the mutual information matrix
        from the contingency matrix. It is called as func_mis(ct=ct).
        By default it uses the function mutual_info_matrix_v3.

    Returns
    -------
    The value returned by collateral_resistance_index.
    """
    # A missing class means that the combination was never observed.
    ct = np.array([
        [getattr(x, 'SS', 0), getattr(x, 'SR', 0)],
        [getattr(x, 'RS', 0), getattr(x, 'RR', 0)]
    ])
    m = func_mis(ct=ct)
    return collateral_resistance_index(m)



class ACSI:
    """Antimicrobial Collateral Sensitivity Index.

    Other possible names for this index...

       a) Antimicrobial Disjoint Resistance Index
       b) Antimicrobial Collateral Resistance Index
       c) Antimicrobial Collateral Sensitivity Index
       d) Antimicrobial Collateral Resistance Index

    """

    # Attributes (canonical column names used internally)
    c_spe = 'SPECIMEN'
    c_org = 'MICROORGANISM'
    c_abx = 'ANTIMICROBIAL'
    c_sns = 'SENSITIVITY'
    c_lab = 'LAB_NUMBER'
    c_dat = 'DATE'

    def __init__(self,
                 column_specimen=c_spe,
                 column_microorganism=c_org,
                 column_antimicrobial=c_abx,
                 column_sensitivity=c_sns,
                 column_laboratory=c_lab):
        """The constructor.

        Parameters
        ----------
        column_specimen: string
            The column name with the specimen

        column_antimicrobial: string
            The column name with the antimicrobial

        column_microorganism: string
            The column name with the microorganism

        column_sensitivity: string
            The column name with the sensitivity

        column_laboratory: string
            The column name with the laboratory id

        Returns
        -------
        none
        """

        # Create dictionary to rename the user columns to the
        # canonical names defined as class attributes.
        self.rename_columns = {column_specimen: self.c_spe,
                               column_antimicrobial: self.c_abx,
                               column_microorganism: self.c_org,
                               column_sensitivity: self.c_sns,
                               column_laboratory: self.c_lab}

    def combinations(self, dataframe, **kwargs):
        """Creates the combinations.

        Thin wrapper around create_combinations_v1; kwargs are
        forwarded unchanged.

        .. note:: In theory the combinations only need to be grouped
                  by laboratory number, however, in order to maintain
                  the rest of the information (like date) we need to
                  pass all of them.
        """
        return create_combinations_v1(dataframe, **kwargs)

    def compute_from_contingency(self):
        """Placeholder; not implemented (returns None)."""
        pass

    def compute_from_combinations(self):
        """Placeholder; not implemented (returns None)."""
        pass

    def compute(self, dataframe,
                flag_combinations=False,
                flag_contingency=False,
                groupby=None,
                func_mis=None,
                return_combinations=False):
        """Computes the Antimicrobial Collateral Sensitivity.

        .. note:: The lab number should be used only to compute
                  the combinations. It should not be used when
                  computing the ACSI.

        Parameters
        ----------
        dataframe: pd.DataFrame
            The susceptibility test records (flag_combinations=False)
            or the antimicrobial combinations with their class
            (flag_combinations=True).

        flag_combinations: boolean
            Indicates whether dataframe contains susceptibility
            test records (False) or the antimicrobial combinations
            and the class (True).

        flag_contingency: boolean
            Currently not used.

        groupby: str, list of str or None
            The columns to group the index by. By default it uses
            date, specimen and microorganism.

        func_mis: function
            The function to use to compute the mutual information
            matrix from the contingency matrix. By default it uses
            the function mutual_info_matrix_v3.

        return_combinations: boolean
            Whether to return the combinations too.

        Returns
        -------
        pd.DataFrame or (pd.DataFrame, pd.DataFrame)
            The contingency DataFrame with one column per class and
            the column 'acsi'. If return_combinations is True, the
            tuple (contingency, combinations).
        """
        # Set default groupby and ensure it is a list, because it is
        # concatenated with other lists below.
        if groupby is None:
            groupby = [
                self.c_dat,
                self.c_spe,
                self.c_org,
            ]
        elif isinstance(groupby, str):
            groupby = [groupby]
        else:
            groupby = list(groupby)

        # Set default function
        if func_mis is None:
            func_mis = mutual_info_matrix_v3

        # Rename columns
        aux = dataframe.copy(deep=True) \
            .rename(columns=self.rename_columns)

        # Create combinations. The laboratory number is added because
        # the pairs must be created within a single isolate.
        if flag_combinations:
            combinations = aux
        else:
            combinations = self.combinations(aux,
                groupby=groupby + [self.c_lab])

        # Create contingency DataFrame (one column per class)
        contingency = combinations.groupby(
            by=groupby + [
                self.c_abx + '_x',
                self.c_abx + '_y',
                'class'])\
            .size().unstack()

        # Ensure that variables in CRI needed exist
        for s in ['SS', 'RS', 'SR', 'RR']:
            if s not in contingency:
                contingency[s] = 0

        # Compute CRI
        contingency['acsi'] = contingency.fillna(0) \
            .apply(CRI, args=(func_mis,), axis=1)

        # Return
        if return_combinations:
            return contingency, combinations
        return contingency





# Demo script: builds a small set of susceptibility test records and
# prints the combinations, contingency tables and ACSI values computed
# for several groupings.
if __name__ == '__main__':

    # Libraries
    # .. note: pandas is already imported at module level; this
    #          import is redundant but harmless.
    import pandas as pd

    # ----------------------------------
    # Create data
    # ----------------------------------
    # Define susceptibility test records. Each record contains
    # date, lab number, specimen, microorganism, antimicrobial
    # and sensitivity (see the column names below).
    susceptibility_records = [
        ['2021-01-01', 'LAB_1', 'BLDCUL', 'ECOL', 'AAUG', 'sensitive'],
        ['2021-01-01', 'LAB_1', 'BLDCUL', 'ECOL', 'ACIP', 'sensitive'],

        ['2021-01-01', 'LAB_2', 'BLDCUL', 'ECOL', 'AAUG', 'sensitive'],
        ['2021-01-01', 'LAB_2', 'BLDCUL', 'ECOL', 'ACIP', 'resistant'],

        ['2021-01-01', 'LAB_3', 'BLDCUL', 'ECOL', 'AAUG', 'sensitive'],
        ['2021-01-01', 'LAB_3', 'BLDCUL', 'ECOL', 'ACIP', 'resistant'],

        ['2021-01-01', 'LAB_4', 'BLDCUL', 'ECOL', 'AAUG', 'resistant'],
        ['2021-01-01', 'LAB_4', 'BLDCUL', 'ECOL', 'ACIP', 'resistant'],

        ['2021-01-02', 'LAB_5', 'BLDCUL', 'ECOL', 'AAUG', 'sensitive'],
        ['2021-01-02', 'LAB_5', 'BLDCUL', 'ECOL', 'ACIP', 'sensitive'],

        ['2021-01-02', 'LAB_6', 'BLDCUL', 'ECOL', 'AAUG', 'sensitive'],
        ['2021-01-02', 'LAB_6', 'BLDCUL', 'ECOL', 'ACIP', 'resistant'],

        ['2021-01-02', 'LAB_7', 'BLDCUL', 'ECOL', 'AAUG', 'resistant'],
        ['2021-01-02', 'LAB_7', 'BLDCUL', 'ECOL', 'ACIP', 'resistant'],

        # Record with an 'intermediate' outcome (class 'SI').
        ['2021-01-03', 'LAB_8', 'BLDCUL', 'ECOL', 'AAUG', 'sensitive'],
        ['2021-01-03', 'LAB_8', 'BLDCUL', 'ECOL', 'ACIP', 'intermediate'],

        # The two SAUR records below are identical (same antimicrobial
        # twice) -- presumably deliberate to exercise duplicates.
        ['2021-01-03', 'LAB_9', 'BLDCUL', 'ECOL', 'AAUG', 'resistant'],
        ['2021-01-03', 'LAB_9', 'BLDCUL', 'ECOL', 'ACIP', 'resistant'],
        ['2021-01-03', 'LAB_9', 'BLDCUL', 'SAUR', 'ACIP', 'resistant'],
        ['2021-01-03', 'LAB_9', 'BLDCUL', 'SAUR', 'ACIP', 'resistant'],

        ['2021-01-04', 'LAB_10', 'URICUL', 'ECOL', 'AAUG', 'resistant'],
        ['2021-01-04', 'LAB_10', 'URICUL', 'ECOL', 'ACIP', 'sensitive'],
        ['2021-01-04', 'LAB_10', 'URICUL', 'SAUR', 'AAUG', 'resistant'],
        ['2021-01-04', 'LAB_10', 'URICUL', 'SAUR', 'APEN', 'resistant'],
    ]

    # Create DataFrames
    susceptibility = pd.DataFrame(susceptibility_records,
        columns=['DATE',
                 'LAB_NUMBER',
                 'SPECIMEN',
                 'MICROORGANISM',
                 'ANTIMICROBIAL',
                 'SENSITIVITY'])

    # Format DataFrame: map the outcomes to single letters so that
    # their concatenation gives the classes (e.g. 'SR').
    susceptibility.SENSITIVITY = \
        susceptibility.SENSITIVITY.replace({
            'resistant': 'R',
            'intermediate': 'I',
            'sensitive': 'S'
    })

    # Show
    print("\nSusceptibility:")
    print(susceptibility)

    # .. note: It is important to ensure that there are no susceptibility
    #          test records with contradicting results. For example being
    #          resistant and sensitive at the same time. Integrate this
    #          check in the computation!

    # ---------------------------
    # Create combinations
    # ---------------------------
    # Create combinations
    c = create_combinations_v1(susceptibility,
        groupby=[
            'DATE',
            'LAB_NUMBER',
            'SPECIMEN',
            'MICROORGANISM'
        ])

    print("\nCombinations:")
    print(c)

    # Build contingency: count the combinations per class. The lab
    # number is not part of the grouping here.
    r = c.groupby([
        'DATE',
        'SPECIMEN',
        'MICROORGANISM',
        'ANTIMICROBIAL_x',
        'ANTIMICROBIAL_y',
        'class']).size().unstack()

    print("Contingency:")
    print(r)

    # Compute CRI
    r['MIS'] = r.fillna(0) \
        .apply(CRI, args=(mutual_info_matrix_v3,), axis=1)

    # Show
    print("\nResult")
    print(r)


    # ------------------------------------------
    # Computes ACSI using class
    # ------------------------------------------

    def show(combinations, contingency, title=None, n=100):
        """Helper function to display outcomes.

        Prints a banner of width n with the title, the number of
        rows in combinations, the sum of all contingency columns
        except the last one (NaN ignored) and both DataFrames. If
        title is None it is built from the contingency index names
        excluding the last two levels.
        """
        # Variables
        n_comb = combinations.shape[0]
        # The last column is skipped; for the outputs of ACSI.compute
        # it is the 'acsi' column rather than a count.
        n_cont = np.nansum(contingency.to_numpy()[:, :-1])

        if title is None:
            title = 'Grouped By: %s' % str(contingency.index.names[:-2])

        # Display
        print("\n" + "="*n + '\n%s\n'%title + "="*n)
        print("Total combinations: %s" % int(n_comb))
        print("Total contingency:  %s" % int(n_cont))
        print("\nCombinations:")
        print(combinations)
        print("\nContingency:")
        print(contingency)


    # Create ACSI instance
    acsi = ACSI()

    # ---------------
    # Compute overall
    # ---------------
    # .. note:: Open question: why does removing LAB_NUMBER return
    #           only the first letter...?
    # Compute index
    contingency, combinations = \
        acsi.compute(susceptibility,
                     groupby=[],
                     return_combinations=True)

    # Show
    show(combinations, contingency, title='Overall')


    # ---------------
    # Compute by date
    # ---------------
    # Compute index
    contingency, combinations = \
        acsi.compute(susceptibility,
                     groupby=['DATE'],
                     return_combinations=True)

    # Show
    show(combinations, contingency, title='By <DATE>')


    # ----------------
    # Compute by pairs
    # ----------------
    # Compute index
    contingency, combinations = \
        acsi.compute(susceptibility,
            groupby=[
                'SPECIMEN',
                'MICROORGANISM'
            ],
            return_combinations=True)

    # Show
    show(combinations, contingency, title='By <SPECIMEN, MICROORGANISM>')

    # -------------------------
    # Compute by date and pairs
    # -------------------------
    # .. note:: It seems that it is important to include all the
    #           parameters when computing the combinations. Otherwise
    #           it might create ill-defined combinations. Think this
    #           through...

    # Compute index
    contingency, combinations = \
        acsi.compute(susceptibility,
            groupby=[
                 'DATE',
                 'SPECIMEN',
                 'MICROORGANISM'
            ],
            return_combinations=True)

    # Show
    show(combinations, contingency, title=None)

    # Compute contingency reusing combinations. Only contingency is
    # reassigned; the combinations shown are those computed above.
    contingency = acsi.compute(combinations.reset_index(),
        groupby=['SPECIMEN'],
        flag_combinations=True,
        return_combinations=False)
    show(combinations, contingency, title=None)

    # Compute contingency reusing combinations (same combinations
    # as above, grouped by microorganism this time).
    contingency = acsi.compute(combinations.reset_index(),
        groupby=['MICROORGANISM'],
        flag_combinations=True,
        return_combinations=False)
    show(combinations, contingency, title=None)

    # -------------------------------------------------------------------------
    # Testing
    # -------------------------------------------------------------------------
    # .. note: The sections below are placeholders; apart from the two
    #          print statements they contain no code.

    # ---------------------------------------------------------------------
    # Success
    # ---------------------------------------------------------------------
    # .. note: All these examples should succeed. At the moment the code
    #          breaks if gram is not included. This is because the data
    #          we have created has duplicated values for each gram.
    #          Should we consider this within the ASAI?
    # NOTE(review): 'gram' and 'ASAI' do not appear anywhere else in this
    #               module; this note was presumably copied from another
    #               module -- confirm or remove.

    # ---------------------------------------------------------------------
    # Errors
    # ---------------------------------------------------------------------
    # .. note: In the examples below, the method acsi is meant to raise
    #          an error either because a required column is missing or
    #          because the configuration is not correct.
    print("\n\nHandling errors:")

    # ---------------------------------------------------------------------
    # Warnings
    # ---------------------------------------------------------------------
    # .. note: In the examples below, the method acsi is meant to show a
    #          warning message either because no threshold has been
    #          specified or because thresholds have been specified twice.
    # NOTE(review): no threshold parameter exists in this module;
    #               presumably copied from another module -- confirm.
    print("\n\nShow warnings:")