Source code for pyamr.core.freq

################################################################################
# Author:
# Date:
# Description:
#
# Copyright:
# 
################################################################################
from __future__ import division

# Libraries
import sys
import numpy as np
import pandas as pd


[docs]class Frequency(): # pragma: no cover """ """ # Attributes c_abx = 'ANTIBIOTIC' c_org = 'SPECIE' c_dat = 'DATE' c_out = 'SENSITIVITY' # ------------------------------------------------------------------------- # # ------------------------------------------------------------------------- def __init__(self, column_antibiotic=c_abx, column_organism=c_org, column_date=c_dat, column_outcome=c_out, column_labnumber=None, dfmt='%Y-%M-%d'): """The constructor. Parameters ---------- column_antibiotic : string The column name with the the antibiotic values column_organism : string The column name with the organism values column_date : string The column name with the dates column_labnumber : string The column name with the laboraory number Returns ------- """ # Create dictionary to rename columns self.rename_columns = {column_antibiotic: self.c_abx, column_organism: self.c_org, column_date: self.c_dat, column_outcome: self.c_out} # ------------------------------------------------------------------------- # # ------------------------------------------------------------------------- def _by_category_groupby(self, by_category='pairs'): """This method returns the grouping list. Parameters ---------- by_category : string The category to group by from pairs, organisms or antibiotics. Returns ------- list """ # Define how to return the overall results. if by_category == 'organisms': return [self.c_org, self.c_out] elif by_category == 'antibiotics': return [self.c_abx, self.c_out] elif by_category == 'pairs': return [self.c_org, self.c_abx, self.c_out] else: raise ValueError("The by_category parameter select must be one of" "the following [pairs, organisms, antibiotics]; " "the value <%s> was found." % by_category)
[docs] def fit(self): """ """ pass
def _compute_overall(self, dataframe, by_category='pairs'): """This method computes the overall frequency count. Parameters ---------- dataframe : dataframe-like The dataframe with the microbiology data by_category Returns ------- dataframe """ # Get the definition of groupby groupby = self._by_category_groupby(by_category) # Compute results return dataframe.groupby(groupby).size().unstack().fillna(0) def _compute_independent(self, dataframe, by_category='pairs', fs='1D'): """This method computes the independent time intervals frequency. Parameters ---------- dataframe: dataframe-like The microbiology dataframe with the following columns. by_category: string The category to group the outcomes. The outcomes are grouped in pairs formed by (organism, antibiotic) by default. However, these can be also grouped by organisms or antibiotics. fs : string The frequency sample (e.g. 1D, 1M, 7D, ...) Returns ------- dataframe """ # Format the dataframe to have datetime index dataframe = dataframe.reset_index() dataframe = dataframe.set_index(self.c_dat) # Get the definition of groupby groupby = [pd.Grouper(freq=fs)] + self._by_category_groupby(by_category) # Compute independent window dataframe = dataframe.groupby(groupby).size().unstack().fillna(0) # Resample dataframe = dataframe.reset_index() dataframe = dataframe.set_index(self.c_dat) \ .groupby([self.c_org, self.c_abx]) \ .resample(fs).mean() \ .fillna(0) # Return return dataframe def _compute_overlapping(self, dataframe, by_category='pairs', wshift='1D', wsize='2D'): """This method computes the overlapping time intervals frequency Parameters ---------- dataframe: dataframe-like The microbiology dataframe with the following columns. by_category: string The category to group the outcomes. The outcomes are grouped in pairs formed by (organism, antibiotic) by default. However, these can be also grouped by organisms or antibiotics. wshift : string The shift between consecutive windows (OTI). wsize : integer The size of the window (OTI) Returns ------- dataframe """ # Format the dataframe to have datetime index dataframe = self._compute_independent(dataframe=dataframe, by_category=by_category, fs=wshift) # Reset index dataframe = dataframe.reset_index() # Compute rolling window dataframe = dataframe.groupby([self.c_org, self.c_abx], as_index=False) \ .apply(lambda x: x.set_index(self.c_dat) \ .rolling(window=wsize).sum()).fillna(0) # Drop index level (which appears when executing previous code) dataframe.index = dataframe.index.droplevel() print(dataframe) print(self.c_dat, self.c_org, self.c_abx) # Resample dataframe = dataframe.reset_index() dataframe = dataframe.set_index(self.c_dat) \ .groupby([self.c_org, self.c_abx]) \ .resample(wshift).mean() \ .fillna(0) # Return return dataframe
[docs] def compute(self, dataframe, strategy='overall', by_category='pairs', fs=None, wshift=None, wsize=None): """This function computes the frequencies. The method allows to compute the overall frequencies, the frequencies for independent time intervals (ITI) such as monthly or yearly and the frequencies for overlapping time intervals (OTI) in which the parameters wshift and wsize need to be specified. Parameters ---------- dataframe: dataframe-like The microbiology dataframe with the following columns. by_category: string The category to group the outcomes. The outcomes are grouped in pairs formed by (organism, antibiotic) by default. However, these can be also grouped by organisms or antibiotics. fs : string The frequency sample (e.g. 1D, 1M, 7D, ...) wshift : integer The shift between consecutive windows (OTI). wsize : integer The size of the window (OTI) Returns ------- dataframe """ # Check that it is a dataframe if not isinstance(dataframe, pd.DataFrame): raise TypeError("The instance passed as argument needs to be a pandas " "DataFrame. Instead, a <%s> was found. Please convert " "the input accordingly." % type(dataframe)) # Rename columns dataframe = dataframe.rename(columns=self.rename_columns, copy=True) # Ensure date columns has date objects. dataframe[self.c_dat] = pd.to_datetime(dataframe[self.c_dat]) # ----------------------- # Compute # ----------------------- if strategy == 'overall': return self._compute_overall(dataframe=dataframe, by_category=by_category) elif strategy == 'ITI': return self._compute_independent(dataframe=dataframe, by_category=by_category, fs=fs) elif strategy == 'OTI': return self._compute_overlapping(dataframe=dataframe, by_category=by_category, wshift=wshift, wsize=wsize)
if __name__ == '__main__': # pragma: no cover """ # Import libraries import sys import matplotlib as mpl import matplotlib.pyplot as plt # Import own module sys.path.append('../../') # Import specific libraries from pyAMR.datasets import load # Set matplotlib mpl.rcParams['xtick.labelsize'] = 9 mpl.rcParams['ytick.labelsize'] = 9 mpl.rcParams['axes.titlesize'] = 11 mpl.rcParams['legend.fontsize'] = 9 # ----------------------- # Load data # ----------------------- # Load sample data data = load.dataset_epicimpoc_susceptibility_year(nrows=1000000) # Keep only relevant columns data = data[['antibioticCode', 'organismCode', 'dateReceived', 'sensitivity']] # Filter for two examples is_org = data['organismCode'] == 'ECOL' is_abx = data['antibioticCode'].isin(['ATAZ']) data = data[is_abx & is_org] # ------------------------- # Create frequency instance # ------------------------- # Create instance freq = Frequency(column_antibiotic='antibioticCode', column_organism='organismCode', column_date='dateReceived', column_outcome='sensitivity') # ------------------------ # Examples compute overall # ------------------------ # Examples compute overall pairs = freq.compute(data, by_category='pairs') antibiotics = freq.compute(data, by_category='antibiotics') organisms = freq.compute(data, by_category='organisms') # Show # print pairs.head(10) # print antibiotics.head(10) # print organisms.head(10) # ------------------------------------------- # Examples compute independent time intervals # ------------------------------------------- # Examples compute ITI daily = freq.compute(data, strategy='ITI', by_category='pairs', fs='1D') monthly = freq.compute(data, strategy='ITI', by_category='pairs', fs='1M') # Show # print daily.head(10) # print monthly.head(10) # ------------------------------------------- # Examples compute overlapping time intervals # ------------------------------------------- # Examples compute OTI (daily) oti_1 = freq.compute(data, strategy='OTI', by_category='pairs', wshift='1D', wsize=5) # Examples compute OTI (monthly) oti_2 = freq.compute(data, strategy='OTI', by_category='pairs', wshift='1M', wsize=2) # ---------------- # Plot # ---------------- # Show comparison for each pair f, axes = plt.subplots(4, 1, figsize=(15, 8)) # Flatten axes axes = axes.flatten() # Plot ITI (daily) for i, (pair, group) in enumerate(daily.groupby(level=[0, 1])): group.index = group.index.droplevel([0, 1]) group.sum(axis=1).plot(marker='o', ms=3, label=pair, linewidth=0.5, markeredgecolor='k', markeredgewidth=0.3, ax=axes[0]) # Plot ITI (monthly) for i, (pair, group) in enumerate(monthly.groupby(level=[0, 1])): group.index = group.index.droplevel([0, 1]) group.sum(axis=1).plot(marker='o', ms=3, label=pair, linewidth=0.5, markeredgecolor='k', markeredgewidth=0.3, ax=axes[1]) # Plot OTI for i, (pair, group) in enumerate(oti_1.groupby(level=[0, 1])): group.index = group.index.droplevel([0, 1]) group.sum(axis=1).plot(marker='o', ms=3, label=pair, linewidth=0.5, markeredgecolor='k', markeredgewidth=0.3, ax=axes[2]) # Plot OTI for i, (pair, group) in enumerate(oti_2.groupby(level=[0, 1])): group.index = group.index.droplevel([0, 1]) group.sum(axis=1).plot(marker='o', ms=3, label=pair, linewidth=0.5, markeredgecolor='k', markeredgewidth=0.3, ax=axes[3]) # Set legend for ax in axes: ax.legend() ax.set_xlabel('') ax.grid(True) # Set titles axes[0].set_ylabel('Daily') axes[1].set_ylabel('Monthly') axes[2].set_ylabel('OTI(1D,5)') axes[3].set_ylabel('OTI(1M,2)') # Show plt.show() """