# Source code for pyamr.datasets.load

# Division
from __future__ import division

# Generic libraries
import os
import sys
import glob
import warnings
import numpy as np
import pandas as pd
# import cPickle as pickle # not needed in python 3.x

# Import specific
from os.path import dirname
from pathlib import Path

# Import own module
sys.path.append('../../')

# Import libraries
#import pyamr.utils.io.read as pd_read

# --------------------------------------
# DEFINITION OF DATABASE PATHS
# --------------------------------------
# These paths should be relative to the datasets folder in which they
# are contained. Otherwise they will not work.

# Directory that contains this module. Note that this rebinds the name
# ``dirname`` (imported above from os.path) from the function to the
# resulting string, so from here on ``dirname`` is a path string and
# the function is no longer reachable under that name.
dirname = dirname(__file__)

# ---------
# nhs
# ---------
# Antibiotics
epicimpoc_antibiotics = './nhs/antibiotics/antibiotics.csv'

# Organisms
epicimpoc_organisms = './nhs/organisms/organisms.csv'

# Profiles (one location per way of splitting the susceptibility data)
epicimpoc_susceptibility_comp = './nhs/susceptibility/complete'
epicimpoc_susceptibility_year = './nhs/susceptibility/by_year'
epicimpoc_susceptibility_type = './nhs/susceptibility/by_cultures'

# Microbiology data


# Other
other_shampoo_sales = './other/shampoo_sales.csv'




# -----------------------------------------------------------------------------
#                              HELPER METHODS
# -----------------------------------------------------------------------------
def make_timeseries():
    """Build a small synthetic time series with hard-coded parameters.

    The series has 100 samples. The endogenous variable is made of two
    noisy linear segments of 50 samples each (a steep one followed by a
    flatter one at a higher level). The frequencies are drawn uniformly
    in three consecutive blocks of 35, 30 and 35 samples lying within
    [50, 100), [100, 150) and [150, 200) respectively.

    Returns
    -------
    x, y, f:
        The x values, the y values and the frequencies.
    """
    # Exogenous variable: plain sample index
    x = np.arange(100)

    # Endogenous variable: both segments share the same base ramp and
    # each one adds its own gaussian noise (drawn in this order).
    ramp = np.arange(50)
    first_segment = ramp * 10 + np.random.randn(50) * 20 + 40
    second_segment = ramp * 2 + np.random.randn(50) * 20 + 400
    y = np.concatenate((first_segment, second_segment))

    # Frequency variable: (block size, offset) for each uniform block
    blocks = ((35, 50), (30, 100), (35, 150))
    f = np.concatenate(
        [np.random.rand(size) * 50 + offset for size, offset in blocks])

    # Return
    return x, y, f
def make_susceptibility():
    """Return the anonymised susceptibility sample data.

    The data is read through :func:`fixture` from the file
    ``./nhs/nhs-susceptibility-2009-anonymised.csv``.
    """
    # Location of the sample within the fixtures folder
    sample = './nhs/nhs-susceptibility-2009-anonymised.csv'

    # Return
    return fixture(name=sample)
def load_registry_microorganisms():
    """Return the microorganisms registry as a DataFrame.

    The registry is read from ``registry/registry_microorganisms.csv``
    inside the directory that contains this module.
    """
    # Build the path to the registry file
    registry_file = Path(dirname).joinpath(
        'registry', 'registry_microorganisms.csv')

    # Return
    return pd.read_csv(registry_file)
def load_registry_antimicrobials():
    """Return the antimicrobials registry as a DataFrame.

    The registry is read from ``registry/registry_antimicrobials.csv``
    inside the directory that contains this module.
    """
    # Build the path to the registry file
    registry_file = Path(dirname).joinpath(
        'registry', 'registry_antimicrobials.csv')

    # Return
    return pd.read_csv(registry_file)
def load_microbiology_folder(path, folder,
        glob_pattern='susceptibility-*.csv', **kwargs):
    """Load the susceptibility data and its registries from a folder.

    .. note:: It assumes all the susceptibility data is stored in csv
              files whose names match ``glob_pattern`` (by default
              those starting with 'susceptibility-'). In addition, it
              assumes that the additional information is available in
              files named 'antimicrobials.csv' and 'microorganisms.csv'
              within the same folder.

    Parameters
    ----------
    path: string
        The path where the folder is located. It is appended to the
        directory that contains this module.
    folder: string
        Name of the folder with the data.
    glob_pattern: string
        Glob pattern (relative to the folder) used to select the
        susceptibility csv files.
    kwargs:
        Arguments to pass to pd.read_csv when reading the
        susceptibility files (the registries are read with defaults).

    Returns
    -------
    susceptibility
        The susceptibility test data (all matching files concatenated).
    db_abxs
        The registry with the antimicrobials.
    db_orgs
        The registry with the microorganisms.

    Raises
    ------
    ValueError
        If no file in the folder matches ``glob_pattern``.
    """
    # Define paths
    root = Path("{0}/{1}".format(dirname, path))
    path_sus = root / folder
    path_abx = path_sus / 'antimicrobials.csv'
    path_org = path_sus / 'microorganisms.csv'

    # Find the susceptibility files. The order returned by glob depends
    # on the file system, thus sort them so that the row order of the
    # concatenated data is reproducible.
    files = sorted(glob.glob(str(path_sus / glob_pattern)))

    # Fail with an informative message instead of letting pd.concat
    # complain about an empty list (it raises ValueError too).
    if not files:
        raise ValueError(
            "No susceptibility files matching '{0}' were found in <{1}>."
                .format(glob_pattern, path_sus))

    # Load data
    data = pd.concat([
        pd.read_csv(f, parse_dates=['date_received'], **kwargs)
            for f in files])

    # The previous parse dates should work but it fails if any value
    # cannot be represented as an array of datetimes. For that reason,
    # we ensure non-standard datetime parsing below.
    data.date_received = pd.to_datetime(data.date_received)

    # Load databases (registries)
    db_abxs = pd.read_csv(path_abx)
    db_orgs = pd.read_csv(path_org)

    # Return
    return data, db_abxs, db_orgs
def load_data_nhs(folder='susceptibility-v0.0.2', **kwargs):
    """Load the NHS susceptibility data.

    Thin wrapper around :func:`load_microbiology_folder` that fixes the
    path to ``./microbiology/nhs/aggregated/``.

    Parameters
    ----------
    folder: string
        Name of the folder with the data.
    kwargs:
        Arguments forwarded to load_microbiology_folder.

    Returns
    -------
    The value returned by load_microbiology_folder.
    """
    # Location of the aggregated NHS data
    nhs_path = './microbiology/nhs/aggregated/'

    # Return
    return load_microbiology_folder(path=nhs_path, folder=folder, **kwargs)
def load_data_mimic(folder='susceptibility-v0.0.1', **kwargs):
    """Load the MIMIC susceptibility data.

    Thin wrapper around :func:`load_microbiology_folder` that fixes the
    path to ``./microbiology/mimic/aggregated/``.

    Parameters
    ----------
    folder: string
        Name of the folder with the data.
    kwargs:
        Arguments forwarded to load_microbiology_folder.

    Returns
    -------
    The value returned by load_microbiology_folder.
    """
    # Location of the aggregated MIMIC data
    mimic_path = './microbiology/mimic/aggregated/'

    # Return
    return load_microbiology_folder(path=mimic_path, folder=folder, **kwargs)
# --------------------------------------
# METHODS TO LOAD DATABASES
# --------------------------------------
# NOTE: The loaders below are disabled legacy code kept for reference
# only. Some of them rely on ``pd_read`` (pyamr.utils.io.read), whose
# import is commented out at the top of this module.
# NOTE(review): the first two loaders forward ``*kwargs`` (single
# star); this looks like a typo for ``**kwargs`` - confirm before
# re-enabling them.
#
# -----------------
# epic impoc basic
# -----------------
# def dataset_epicimpoc_antibiotics(**kwargs):
#     return pd.read_csv('%s/%s' % (dirname, epicimpoc_antibiotics), *kwargs)
#
# def dataset_epicimpoc_organisms(**kwargs):
#     return pd.read_csv('%s/%s' % (dirname, epicimpoc_organisms), *kwargs)
#
# -----------------------------------
# epic impoc susceptibility test data
# -----------------------------------
# def dataset_epicimpoc_susceptibility(**kwargs):
#     return pd_read.read_csv('%s/%s' % \
#         (dirname, epicimpoc_susceptibility_comp), **kwargs)
#
# def dataset_epicimpoc_susceptibility_year(year='2014', **kwargs):
#     return pd_read.read_csv('%s/%s/%s' % \
#         (dirname, epicimpoc_susceptibility_year, str(year)), **kwargs)
#
# def dataset_epicimpoc_susceptibility_culture(cultures=['bldcul']):
#     pass
#
# def dataset_shampoo_sales(**kwargs):
#     return pd.read_csv('%s/%s' % (dirname, other_shampoo_sales), **kwargs)
def fixture(name, **kwargs):
    """Load a csv fixture as a DataFrame.

    The file is looked up in the ``fixtures`` folder located next to
    the directory that contains this module (i.e. in its parent).

    Parameters
    ----------
    name: string
        The name of the file within the fixtures folder.
    kwargs:
        Arguments to pass to pd.read_csv

    Returns
    --------
    pd.DataFrame
        The loaded data. If it has a column named 'DATE', such column
        is converted with pd.to_datetime.
    """
    # Library
    from pathlib import Path

    # Load data
    fixtures_dir = Path(dirname).parent / 'fixtures'
    frame = pd.read_csv(fixtures_dir / name, **kwargs)

    # Nothing to format
    if 'DATE' not in frame:
        return frame

    # Format
    frame.DATE = pd.to_datetime(frame.DATE)

    # Return
    return frame
if __name__ == '__main__':

    # Suppress warnings (the module is already imported at file level)
    warnings.simplefilter('ignore')

    # Set numpy options so that arrays are printed in full. Recent
    # numpy versions reject NaN as threshold (ValueError) and suggest
    # sys.maxsize to obtain an untruncated representation.
    np.set_printoptions(threshold=sys.maxsize)

    # -----------------------------------
    # Loading default datasets
    # -----------------------------------
    # The calls below target loaders that are currently disabled in
    # this module, thus they are kept commented out.

    # Load antibiotics
    #antibiotics = dataset_epicimpoc_antibiotics()

    # Load organisms
    #organisms = dataset_epicimpoc_organisms()

    # Load profiles
    #microbiology = dataset_epicimpoc_susceptibility_year(year=2014)

    # Show information
    #print(antibiotics.head(5))
    #print(organisms.head(5))
    #print(len(microbiology))
    #print(dataset_shampoo_sales())