Source code for pyamr.core.mari

################################################################################
# Author:
# Date:
# Description:
#
#
#
# Copyright:
#
#
################################################################################
# Import libraries
import pandas as pd

# Import sari
from pyamr.core.sari import sari




# -------------------------------------------------------------------------
#                            MARI class
# -------------------------------------------------------------------------
class MARI:
    """Multiple Antimicrobial Resistance Index.

    The index is obtained by computing a resistance index (SARI) for each
    individual isolate (laboratory sample) and then averaging those values
    within each group (by default specimen and microorganism), either
    overall, in independent time groups or in rolling time windows.
    """

    # Default column names.
    c_lab = 'LAB_NUMBER'
    c_spe = 'SPECIMEN'
    c_org = 'MICROORGANISM'
    c_dat = 'DATE'
    c_out = 'SENSITIVITY'

    # Aggregation shared by <rolling>, <grouping> and <compute_v2>. It
    # collapses the per-isolate table into the sums required to compute
    # the index afterwards (mari = total / n_samples).
    _FREQ_AGG = dict(
        intermediate=('intermediate', 'sum'),
        resistant=('resistant', 'sum'),
        sensitive=('sensitive', 'sum'),
        n_records=('freq', 'sum'),
        n_samples=('sari', 'count'),
        total=('sari', 'sum')
    )

    def __init__(self, groupby=None):
        """Constructor.

        Parameters
        ----------
        groupby: list, default=None
            The labels of the columns to groupby. The name of the columns
            it should include is as follows:
                [ COLUMN_SPECIMEN,
                  COLUMN_MICROORGANISM,
                  COLUMN_LABORATORY_NUMBER,
                  COLUMN_OUTCOME ]
            If None, the class defaults [c_spe, c_org, c_lab, c_out]
            are used.

        Returns
        --------
        MARI instance
        """
        # A list literal as default value would be shared by all the
        # instances; thus build a fresh list for each instance.
        if groupby is None:
            groupby = [self.c_spe, self.c_org, self.c_lab, self.c_out]
        self.groupby = list(groupby)

    def compute_v1(self, dataframe, shift=None, period=None, cdate=None,
                   return_frequencies=True, return_isolates=True,
                   **kwargs):  # pragma: no cover
        """Compute MARI v1.

        .. note: Legacy implementation kept for reference; <compute>
                 delegates to <compute_v2>.

        Parameters
        ----------
        dataframe: pd.DataFrame
            One row per susceptibility test record. It must contain the
            columns listed in self.groupby (and cdate if given).
        shift: str, default=None
            Frequency value to pass to pd.Grouper.
        period: str or int, default=None
            Window value to pass to rolling (only used with shift).
        cdate: str, default=None
            The column that will be used as date.
        return_frequencies: boolean, default=True
            Whether to return the frequencies or just the 'mari' column.
        return_isolates: boolean, default=True
            Whether to also return the per-isolate table.
        **kwargs:
            Arguments forwarded to the sari function.

        Returns
        -------
        dataframe or (dataframe, isolates)
        """
        # Copy DataFrame
        aux = dataframe.copy(deep=True)

        # Create grouper
        grouper = []
        if shift is not None:
            grouper = [pd.Grouper(freq=shift, key=cdate)]
        grouper = grouper + self.groupby

        # Format as datetime
        if cdate is not None:
            aux[cdate] = pd.to_datetime(aux[cdate])

        # Names for the aggregated columns
        rn = {
            'mean': 'mari',
            'count': 'freq'
        }

        # Count outcomes per isolate (one column per outcome value)
        isolates = aux.groupby(grouper) \
            .size().unstack().fillna(0)

        # Include mari by isolate
        isolates['mari'] = sari(isolates, **kwargs)

        if shift is None:
            # Average over all the index levels but the last one
            dataframe = isolates.mari \
                .groupby(level=isolates.index.names[:-1]) \
                .agg(['count', 'mean'])
        else:
            # Create sum and count per shift
            aux = isolates.reset_index() \
                .groupby([cdate] + grouper[1:-2]) \
                .mari.agg(['sum', 'count']) \
                .reset_index().groupby(grouper[1:-2])

            # Compute sums
            sums = aux.rolling(window=period,
                min_periods=1, on=cdate)['sum'].sum()

            # Compute isolate counts
            counts = aux.rolling(window=period,
                min_periods=1, on=cdate)['count'].sum()

            # Rolling mean as rolling sum over rolling count
            series = (sums / counts).rename('mari')
            dataframe = pd.concat([sums, counts, series], axis=1)

            # NOTE: An earlier attempt (rolling mean of the means per
            #       date) was flagged as wrong in the original code and
            #       it is intentionally not used.

        # Rename columns
        dataframe = dataframe.rename(columns=rn)

        # Remove frequencies
        if not return_frequencies:
            dataframe = dataframe['mari']

        # Return
        if return_isolates:
            return dataframe, isolates
        return dataframe

    def rolling(self, dataframe, period, cdate, shift=None):
        """Compute metric using rolling approach.

        Parameters
        ----------
        dataframe: pd.DataFrame
            Per-isolate table with the columns 'intermediate',
            'resistant', 'sensitive', 'freq' and 'sari', the date column
            <cdate> and the columns in self.groupby[:-2].
        period: str
            Window value to pass to rolling.
        cdate: str
            The column that will be used as date.
        shift: str, default=None
            Frequency value to pass to pd.Grouper. If None, a warning
            is issued and <period> is used instead.

        Returns
        -------
        pd.DataFrame
            The rolling sums of the aggregated frequencies.
        """
        if shift is None:
            # Imported locally because the module does not import
            # <warnings> at the top level (only within __main__).
            import warnings
            warnings.warn("""
                 The input parameter <shift> is None. Thus, the value
                 of the input parameter <period> (%s) has been used."""
                    % period)
            shift = period

        # Grouper (date bins first, then all but the last two labels)
        grouper = [pd.Grouper(freq=shift, key=cdate)]
        grouper = grouper + self.groupby[:-2]

        # Compute frequencies
        freqs = dataframe.groupby(grouper) \
            .agg(**self._FREQ_AGG) \
            .reset_index() \
            .set_index(cdate).groupby(grouper[1:]) \
            .rolling(window=period, min_periods=1) \
            .sum().fillna(0)

        # Return
        return freqs

    def grouping(self, dataframe, period, cdate):
        """Compute metric with independent groups.

        Parameters
        ----------
        dataframe: pd.DataFrame
            Per-isolate table with the columns 'intermediate',
            'resistant', 'sensitive', 'freq' and 'sari', the datetime
            column <cdate> and the columns in self.groupby[:-2].
        period: str
            Either the name of an attribute of the datetime accessor
            (e.g. year) or a frequency value to pass to pd.Grouper.
        cdate: str
            The column that will be used as date.

        Returns
        -------
        pd.DataFrame
            The aggregated frequencies for each independent group.
        """
        # Create grouper. Use the datetime attribute (e.g. dt.year) when
        # it exists, otherwise consider period a frequency string.
        if hasattr(dataframe[cdate].dt, str(period)):
            grouper = [getattr(dataframe[cdate].dt, period)]
        else:
            grouper = [pd.Grouper(freq=period, key=cdate)]
        grouper = self.groupby[:-2] + grouper

        # Compute
        freqs = dataframe.groupby(grouper).agg(**self._FREQ_AGG)

        # Return
        return freqs

    def compute_v2(self, dataframe, shift=None, period=None, cdate=None,
                   return_frequencies=True, return_isolates=True,
                   **kwargs):
        """Compute MARI v2.

        Parameters
        ----------
        dataframe: pd.DataFrame
            One row per susceptibility test record. The input is not
            modified.
        shift: str, default=None
            Frequency value to pass to pd.Grouper.
        period: str, default=None
            Window (with shift) or independent grouping (without shift).
        cdate: str, default=None
            The column that will be used as date.
        return_frequencies: boolean, default=True
            Whether to return the frequencies or just the 'mari' column.
        return_isolates: boolean, default=True
            Whether to return a tuple. Note that the second element is
            always an empty DataFrame in this version.
        **kwargs:
            Arguments forwarded to SARI.compute.

        Returns
        -------
        freqs or (freqs, pd.DataFrame())

        Raises
        ------
        ValueError
            If period is neither None nor a string.
        """
        # Libraries
        from pyamr.core.sari import SARI

        # Not allowing period to be a number. The main reason is that the
        # most common interpretation is that scenarios with shift=1D
        # period=2D and shift=1D period=2 should be the same. However, the
        # results are actually different. Because period=2 in rolling will
        # use two adjacent rows without considering time. This introduces
        # inconsistencies where there are time gaps without data.
        if period is not None:
            if not isinstance(period, str):
                raise ValueError("""
                    The input parameter <period> cannot be of %s. Ensure
                    it is either None or a valid string such as 2D or
                    year. """ % type(period))

        # Compute frequencies
        if period is None and shift is None:
            # Compute freqs from sari
            freqs = SARI(groupby=self.groupby) \
                .compute(dataframe, **kwargs) \
                .groupby(level=[0, 1]) \
                .agg(**self._FREQ_AGG)
        else:
            # Format as datetime. Work on a copy so that the DataFrame
            # of the caller is not modified by the assignment.
            dataframe = dataframe.copy(deep=True)
            dataframe[cdate] = pd.to_datetime(dataframe[cdate])

            # Define new group by with date
            groupby = self.groupby.copy()
            groupby.insert(3, cdate)

            # Compute mari (sari per sample)
            iso = SARI(groupby=groupby) \
                .compute(dataframe, **kwargs) \
                .reset_index()

            if shift is None:
                freqs = self.grouping(dataframe=iso,
                    period=period, cdate=cdate)
            else:
                freqs = self.rolling(dataframe=iso,
                    period=period, shift=shift, cdate=cdate)

        # Add mari
        freqs['mari'] = freqs.total / freqs.n_samples

        # Remove frequencies
        if not return_frequencies:
            freqs = freqs['mari']

        # Return
        if return_isolates:
            return freqs, pd.DataFrame()
        return freqs

    def compute_v3(self, dataframe, shift=None, period=None, cdate=None,
                   return_frequencies=True, return_isolates=True,
                   **kwargs):  # pragma: no cover
        """Compute MARI v3.

        .. note: It does not work properly.
        .. note: It is considerably slower. Possibly because it is
                 executing the rolling method twice. Try to remove
                 the first rolling as it is not needed.

        .. note: The parameter <shift> is accepted but not used, and the
                 column names are hard-coded instead of being taken
                 from self.groupby.

        Returns
        -------
        aux or (aux, isolates)
        """
        # Libraries
        from pyamr.core.sari import SARI

        # Format as datetime (on a copy to keep the input untouched)
        if cdate is not None:
            dataframe = dataframe.copy(deep=True)
            dataframe[cdate] = pd.to_datetime(dataframe[cdate])

        # Create object (named so that it does not shadow the
        # module-level <sari> function).
        sari_obj = SARI(groupby=['SPECIMEN',
                                 'MICROORGANISM',
                                 'LAB_NUMBER',
                                 'DATE',
                                 'SENSITIVITY'])

        # Compute mari (sari per sample)
        isolates = sari_obj.compute(dataframe,
            return_frequencies=return_frequencies,
            **kwargs)

        # Labels to aggregate the isolates by
        by = ['SPECIMEN', 'MICROORGANISM']
        if cdate is not None:
            by = by + [cdate]

        aux = isolates.reset_index() \
            .groupby(by) \
            .agg(intermediate=('intermediate', 'sum'),
                 resistant=('resistant', 'sum'),
                 sensitive=('sensitive', 'sum'),
                 n_samples=('sari', 'count'),
                 total=('sari', 'sum'))

        if period is not None:
            # Compute rolling
            aux = aux \
                .reset_index() \
                .groupby(by[:2]) \
                .rolling(window=period,
                     min_periods=1, on=cdate).agg('sum')

            # Format result (drop the positional level added by the
            # rolling and restore the <by> labels as index).
            aux.index = aux.index.droplevel(2)
            aux = aux.reset_index() \
                .set_index(by)

        # Add mari
        aux['mari'] = aux.total / aux.n_samples

        # Add number of records
        aux['n_records'] = \
            aux.resistant + \
            aux.sensitive + \
            aux.intermediate

        # Return
        if return_isolates:
            return aux, isolates
        return aux

    def compute_v4(self, dataframe, shift=None, period=None, cdate=None,
                   return_frequencies=True, return_isolates=True,
                   **kwargs):  # pragma: no cover
        """Compute MARI v4.

        .. note: The rolling branch restores the index using the
                 hard-coded labels ['SPECIMEN', 'MICROORGANISM', 'DATE'].

        .. todo: Warning if dates NaN
        .. todo: Warning if elements in groupby any all NaN!

        Returns
        -------
        dataframe or (dataframe, isolates)
        """
        # Copy DataFrame
        aux = dataframe.copy(deep=True)

        # Create grouper
        grouper = []
        if shift is not None:
            grouper = [pd.Grouper(freq=shift, key=cdate)]
        grouper = grouper + self.groupby

        # Format as datetime
        if cdate is not None:
            aux[cdate] = pd.to_datetime(aux[cdate])

        # Count outcomes per isolate (one column per outcome value)
        isolates = aux.groupby(grouper) \
            .size().unstack().fillna(0)

        # Include mari which is the SARI per isolate
        isolates['mari'] = sari(isolates, **kwargs)

        if shift is None:
            # Aggregate over all the index levels but the last one
            dataframe = isolates \
                .groupby(level=isolates.index.names[:-1]) \
                .agg(intermediate=('intermediate', 'sum'),
                     resistant=('resistant', 'sum'),
                     sensitive=('sensitive', 'sum'),
                     n_samples=('mari', 'count'),
                     total=('mari', 'sum'))
        else:
            # Aggregate per date bin and roll within each group
            dataframe = isolates.reset_index() \
                .groupby([cdate] + grouper[1:-2]) \
                .agg(intermediate=('intermediate', 'sum'),
                     resistant=('resistant', 'sum'),
                     sensitive=('sensitive', 'sum'),
                     n_samples=('mari', 'count'),
                     total=('mari', 'sum')) \
                .reset_index().groupby(grouper[1:-2]) \
                .rolling(window=period,
                     min_periods=1, on=cdate).agg('sum')

            # Format result.
            dataframe.index = dataframe.index.droplevel(2)
            dataframe = dataframe.reset_index() \
                .set_index(['SPECIMEN', 'MICROORGANISM', 'DATE'])

        # Add mari
        dataframe['mari'] = dataframe.total / dataframe.n_samples

        # Add number of records
        dataframe['n_records'] = \
            dataframe.resistant + \
            dataframe.sensitive + \
            dataframe.intermediate

        # Remove frequencies
        if not return_frequencies:
            dataframe = dataframe['mari']

        # Return
        if return_isolates:
            return dataframe, isolates
        return dataframe

    def compute(self, dataframe, **kwargs):
        """Compute the Multiple Antimicrobial Resistance Index.

        .. note: It delegates to <compute_v2>.
        .. note: The compute_v3 does not work properly

        .. todo: Add parameters to rolling!
        .. todo: Place value at the left, center, right of window?
        .. todo: Ensure that works when time gaps present!
        .. todo: Careful with various indexes!
        .. todo: Warning if dates NaN
        .. todo: Warning if elements in groupby any all NaN!
        .. todo: Warning if not all samples have been tested
                 with same antimicrobials

        Parameters
        ----------
        dataframe: pd.DataFrame
            A DataFrame with the susceptibility test interpretations
            as columns. The default strategies used (see below) expect
            the following columns ['sensitive', 'intermediate',
            'resistant'] and if they do not appear they will be set
            to zeros.

        shift: str
            Frequency value to pass to pd.Grouper.

        period: str
            Window value to pass to pd.rolling (when shift is given) or
            the independent grouping to use (e.g. year). Values that
            are not strings are rejected with a ValueError.

        cdate: string, default=None
            The column that will be used as date.

        return_frequencies: boolean, default=True
            Whether to return the frequencies (isolates) or just the
            resistance index.

        return_isolates: boolean, default=True
            Whether to return the resistance index for each individual
            isolate.

        strategy: string or func, default='hard'
            The method used to compute sari. The possible options
            are 'soft', 'medium' and 'hard'. In addition, a function
            with the following signature func(dataframe, **kwargs)
            can be passed.

                (i) ``soft``   as R / R+I+S
                (ii) ``medium`` as R / R+S
                (iii) ``hard``  as R+I / R+I+S
                (iv) ``other``  as R+0.5I / R+0.5I+S [Not yet]

        **kwargs: arguments to pass the strategy function.

        Returns
        -------
        dataframe: pd.Series or pd.DataFrame
            The resistance index (pd.Series) or a pd.Dataframe with the
            resistance index (mari), the sums and the frequencies.

        isolates: pd.DataFrame
            The resistance index and each of the sensitivity value
            counts for each individual isolate.
        """
        return self.compute_v2(dataframe, **kwargs)
if __name__ == '__main__':  # pragma: no cover

    # Import libraries
    import time
    import warnings
    import pandas as pd

    from timeit import default_timer as timer

    # Import specific libraries
    from pyamr.core.mari import MARI

    # Silence the user warnings raised during the demo
    warnings.filterwarnings("ignore", category=UserWarning)

    # ---------------------
    # Create data
    # ---------------------
    # Load the fixture and create the MARI instance
    data = pd.read_csv('../fixtures/indexes/fixture_mari.csv')

    mari = MARI(groupby=['SPECIMEN',
                         'MICROORGANISM',
                         'LAB_NUMBER',
                         'SENSITIVITY'])

    # MARI overall
    mari_overall, isolates = mari.compute(data,
        return_frequencies=True,
        return_isolates=True)

    # MARI temporal with independent time intervals (ITI)
    mari_iti_1d_1d = mari.compute(data, shift='1D',
        period='1D', cdate='DATE', return_isolates=False)

    mari_iti_2d_2d = mari.compute(data, shift='2D',
        period='2D', cdate='DATE', return_isolates=False)

    mari_iti_year = mari.compute(data, period='year',
        cdate='DATE', return_isolates=False)

    # MARI temporal with overlapping time intervals (OTI)
    mari_oti_1d_2d = mari.compute(data, shift='1D',
        period='2D', cdate='DATE', return_isolates=False)

    mari_oti_2d_4d = mari.compute(data, shift='2D',
        period='4D', cdate='DATE', return_isolates=False)

    # Show every result preceded by its title
    report = [
        ("\nIsolates:", isolates),
        ("\n\n\nMARI (overall):", mari_overall),
        ("\n\n\nMARI (iti) | 1D_1D:", mari_iti_1d_1d),
        ("\n\n\nMARI (iti) | 2D_2D:", mari_iti_2d_2d),
        ("\n\n\nMARI (iti) | year:", mari_iti_year),
        ("\n\n\nMARI (oti) | 1D_2D:", mari_oti_1d_2d),
        ("\n\n\nMARI (oti) | 2D_4D:", mari_oti_2d_4d),
    ]
    for title, result in report:
        print(title)
        print(result)

    # -----------------------
    # Full test
    # -----------------------
    from itertools import product

    # All the pairs (shift, period) built from the possible values
    values = [2, '2D', None, 'year']
    combos = list(product(values, values))

    # Show
    print("\n\nCombinations of params <shift> and <period>:")

    # Run every implementation for each pair and compare the outputs.
    # The failures are reported and do not stop the loop.
    for number, (shift, period) in enumerate(combos, start=1):
        header = "%2s/%2s. Computing... shift=%-5s | period=%-5s ==> " % \
            (number, len(combos), shift, period)
        print(header, end="")

        try:
            s00, s01 = mari.compute(data, shift=shift,
                period=period, cdate='DATE')
            s10, s11 = mari.compute_v2(data, shift=shift,
                period=period, cdate='DATE')
            s20, s21 = mari.compute_v3(data, shift=shift,
                period=period, cdate='DATE')
            print("Ok! equals_1=%s equals_2=%s" %
                  (s00.equals(s10), s10.equals(s20)))
        except Exception as e:
            print(e)

    # ---------------------------------------------------
    # Test timings
    # ---------------------------------------------------
    # Libraries
    from pyamr.datasets.load import make_susceptibility

    # Map the dataset columns to the names used by MARI
    rename = {
        'date_received': 'DATE',
        'laboratory_number': 'LAB_NUMBER',
        'specimen_code': 'SPECIMEN',
        'microorganism_code': 'MICROORGANISM',
        'antimicrobial_code': 'ANTIMICROBIAL',
        'sensitivity': 'SENSITIVITY'
    }

    # Load data
    data = make_susceptibility()
    data = data.rename(columns=rename)

    print("\n\nComparing execution times:")

    # One entry per example: overall, ITI (1D, 1D) and OTI (1D, 2D)
    examples = [
        dict(return_frequencies=True, return_isolates=True),
        dict(shift='1D', period='1D', cdate='DATE',
             return_isolates=False),
        dict(shift='1D', period='2D', cdate='DATE',
             return_isolates=False),
    ]

    for params in examples:
        # Timestamps taken before the first call and after each call
        stamps = [timer()]
        for method in (mari.compute, mari.compute_v2, mari.compute_v3):
            method(data, **params)
            stamps.append(timer())

        # Elapsed time of each implementation
        elapsed = tuple(end - start
            for start, end in zip(stamps, stamps[1:]))
        print("%.10f | %.10f | %.10f " % elapsed)