# Source code for pyamr.datasets.clean

# Libraries
import re
import collections
import numpy as np
import pandas as pd

# -------------------------------------------------------------------
# Constants
# -------------------------------------------------------------------
# Corrections for sensitivity codes (wrong code -> right code).
SENSITIVITY_CODE_REPLACE = {"ss": "s"}

# Corrections for antimicrobial codes (wrong code -> right code).
ANTIMICROBIAL_CODE_REPLACE = {"AAUGU": "AAUG"}

# Corrections for microorganism codes (wrong code -> right code).
#
# NOTE(review): the keys here are upper-case, whereas clean_common
#               lower-cases every string before applying this map;
#               it looks like these entries cannot match there.
#               Confirm the intended casing.
MICROORGANISM_CODE_REPLACE = {
    # Numbered variants
    "ACINE2": "ACINE",
    "CNS2": "CNS",
    "CNS3": "CNS",
    "ECOL2": "ECOL",
    "KPN2": "KPNE",
    "PAER2": "PAER",
    "SAUR2": "SAUR",
    "LFC2": "LFC",
    "COLIF2": "COLIF",
    "COLI2": "COLIF",
    "ENTAE2": "EAER",
    "ENTAE": "EAER",
    # Alternative codes
    "VRE": "ENTC",
    "MRSA": "SAUR",
    "MCNS": "CNS",
    # Codes prefixed with A_
    "A_ECOLI": "ECOL",
    "A_ENTEROBA": "ENTB",
    "A_SVIRIDANS": "VIRST",
    "A_CLUSITANI": "CLUS",
    "A_CDUBLINIE": "CDUB",
    "A_CPSEUDODI": "CPSEU",
}

# Patterns (key) and their replacements (value) for antimicrobial
# names. Raw strings are used so that the backslashes reach the
# regular expression engine untouched (no invalid escape sequences).
ANTIMICROBIAL_NAME_MAP = {
    r'\([^)]*\)': '',  # Remove everything between ()
    r'\s{2,}': ' ',  # Remove duplicated spaces
    'gentamicin 200': 'gentamicin'
}

# Patterns (key) and their replacements (value) for microorganism
# names.
#
# NOTE(review): the dots in 'sp.($| )' and 'sp(.)?($| )' are not
#               escaped, so as regular expressions they match any
#               character. Left untouched; confirm the intent.
MICROORGANISM_NAME_MAP = {
    # Replace basic
    r'strep\.': 'streptococcus ',
    r'staph\.': 'staphylococcus ',
    'species': 'sp.',
    'sp.($| )': '',
    'sp(.)?($| )': ' ',
    'second': '',
    'third': '',
    '2nd': '',
    '3rd': '',
    # Specific
    r'\*\*\* mrsa \*\*\* isolated': 'staphylococcus aureus',

}

def invert(d):
    """Returns a new dictionary with keys and values swapped.

    When several keys share the same value, the key that
    appears last in the iteration order of d is kept.

    Parameters
    ----------
    d: dict-like
        The mapping to invert. Its values must be hashable.

    Returns
    -------
    dict
        The mapping value -> key.
    """
    return {value: key for key, value in d.items()}

#def invert(d):
#    return reversed(list(d.items()))

# .. note: Keep everything lowercase because all the
#          columns are str.lower() before doing any
#          formatting/replacement in clean_common.
#
# .. note: Using an ordered dict. Thus, when inverting
#          the dictionaries, for repeated entries
#          (e.g. sensitive SS and S) the latter will
#          be used.
# Specimen code corrections.
#
# NOTE(review): the keys of SPECIMEN_CODE_MAP and METHOD_CODE_MAP are
#               upper-case, whereas clean_common lower-cases every
#               string before using them; it looks like they cannot
#               match there. Confirm the intended casing.
SPECIMEN_CODE_MAP = {"URNCUL": "URICUL"}

# Specimen name corrections.
SPECIMEN_NAME_MAP = {"Urine Micro": "Urine Culture"}

# Candidate entries that are currently disabled.
# '9MRSN': 'MRSCUL',
# 'URINE CULTURE': 'URICUL',
# 'WOUND CULTURE': 'WOUCUL',
# 'BLOOD CULTURE': 'BLDCUL',
# 'SPUTUM CULTURE': 'SPTCUL',
##'CSF CULTURE': 'CSFCUL',
# 'EYE CULTURE': 'EYECUL',
# 'GENITALCUL': 'GENCUL',
# 'NEONATAL SCREEN': 'NEOCUL',

# Method code -> method name.
METHOD_CODE_MAP = {
    "DD": "Disk Difussion",
    "PHO": "Public Health Laboratory",
    "MIC": "Minimum Inhibitory Concentration",
    "MASTU": "Microscopy-Based Antimicrobial Susceptibility Testing",
}

# Sensitivity code -> sensitivity name. The insertion order matters
# when the mapping is inverted: for repeated names (ss and s both
# map to sensitive) the last code listed is the one that is kept.
SENSITIVITY_CODE_MAP = collections.OrderedDict([
    ("ss", "sensitive"),
    ("s", "sensitive"),
    ("r", "resistant"),
    ("i", "intermediate"),
    ("nd", "not done"),
    ("hr", "highly resistant"),
    ("<<do not report>>", "hide"),
    ("hide", "<<do not report>>"),
    ("validation fix entry", "fix"),
    ("fix", "validation fix entry"),
])

# Note that the expressions are executed in order and thus
# order matters. The changes made by one expression affect
# the cells seen by the following expressions.
#
# Raw strings are used so that the backslashes reach the regular
# expression engine untouched (no invalid escape sequences).
REGEX_MAP = {
    r'\([^)]*\)': '',        # Remove everything between ()
    r'(\s)?\-(\s)?': '-',    # Remove spaces before after hyphen
    'species': '',           # Rename species for next regexp
    'o157': '',              # Remove scherichia coli o157
    r'sp(\.)?(\s|$)+': ' ',  # Remove sp from word.
    # Remove the literal 'sp..'. The dots must be escaped; otherwise
    # the pattern matches 'sp' followed by ANY two characters and
    # mangles words such as 'aspergillus' or 'sputum'.
    r'sp\.\.': ' ',
    r'strep(\.|\s|$)': 'streptococcus ',  # Complete
    r'staph(\.|\s|$)': 'staphylococcus ', # Complete
    'staphylococci': 'staphylococcus',    # Correction (add mixed? group?)
    'streptococci': 'streptococcus',      # Correction (add mixed? group?)
    r'\s+': ' ',             # Remove duplicated spaces.
}

REGEX_MAP_BASIC = {
    r'\s+': ' ',  # Remove duplicated spaces.
}


# -----------------------------------------------------------
# Helper methods
# -----------------------------------------------------------

def hyphen_before(x, w):
    """Ensures hyphen between words is correct.

    The whitespace between the word w and the token that precedes
    it is replaced by a hyphen (e.g. 'beta haemolytic strep' becomes
    'beta-haemolytic strep'). Any run of non-word characters that
    follows w is collapsed into a single space. The word is also
    hyphenated when it is the last token of the string, in which
    case no trailing space is added.

    Parameters
    ----------
    x: string
        The string to format. Values that are not strings
        (e.g. NaN or None) are returned untouched.
    w: string
        The word preceded by hyphen. It is inserted verbatim
        into a regular expression.

    Returns
    -------
    string
        The formatted string
    """
    # Ensure it is a string
    if not isinstance(x, str):
        return x

    # Create expression. The tail is either a run of non-word
    # characters or the end of the string.
    regexp = re.compile(r'(\S*)(\s+)(%s)(?P<tail>\W+|$)' % w)

    def _join(match):
        """Builds '<previous>-<word>' plus a space if a tail existed."""
        separator = ' ' if match.group('tail') else ''
        return '%s-%s%s' % (match.group(1), match.group(3), separator)

    # Return
    return regexp.sub(_join, x)


def word_to_start(x, w, pos='start', verbose=0):
    """Moves the word within the string.

    If w appears in x as a whole word (the check ignores case),
    every occurrence of w is removed from x (this removal is case
    sensitive) and w is added once at the requested position,
    separated by a single space. Surrounding spaces are not
    collapsed.

    Parameters
    ----------
    x: string
        The string to format. Values that are not strings
        (e.g. NaN or None) are returned untouched.
    w: word
        The word to relocate within the string. It is inserted
        verbatim into a regular expression.
    pos: string, default start
        The position to insert the word. The possible options
        are start (at the beginning) or end (at the end) of the
        string.
    verbose: int
        Level of verbosity (currently unused).

    Returns
    -------
    string
        Formatted string

    Raises
    ------
    ValueError
        If the word has to be moved and pos is neither 'start'
        nor 'end'.
    """
    # Ensure it is a string
    if not isinstance(x, str):
        return x

    # Create regular expression
    regexp = re.compile(
        r'(.*|^)(\W|^)%s(\W|$)(.*|$)' % w,
        flags=re.IGNORECASE)

    # Return value if it does not fit.
    if not regexp.match(x):
        return x

    # String without the word
    remainder = x.replace(w, '')

    # Return
    if pos == 'start':
        return '%s ' % w + remainder
    if pos == 'end':
        return remainder + ' %s' % w

    # Fail loudly instead of silently returning None.
    raise ValueError(
        "Unknown pos %r; expected 'start' or 'end'." % (pos,))

def string_replace(series, remove=None):
    """This method corrects the strings.

    The strings are put to lowercase, the replacements are
    applied in the iteration order of remove and finally the
    leading/trailing whitespace is stripped.

    Parameters
    ----------
    series: pd.Series
        The series with the strings to correct.
    remove: dict-like, default None
        The replacements as pattern -> replacement. The patterns
        are regular expressions. If None, nothing is replaced.

    Returns
    -------
    pd.Series
        The corrected series.
    """
    # Avoid a mutable default argument
    if remove is None:
        remove = {}

    # Format (lower)
    series = series.str.lower()

    # Do str replacements. The regex flag is explicit because the
    # default of Series.str.replace changed to False in pandas 2.0.
    for pattern, replacement in remove.items():
        series = series.str.replace(pattern, replacement, regex=True)

    # Format (strip) and return
    return series.str.strip()



# ----------------------------------------------------
# Main cleaners
# ----------------------------------------------------
def clean_basic(data):
    """Performs the basic cleaning.

    1. Everything to lowercase
    2. Remove spaces begin/end (strip)
    3. Remove duplicates (rows)

    .. note: Duplicated spaces within the strings are not
             collapsed by this method.

    Parameters
    ----------
    data: pd.DataFrame
        The data to clean. It is not modified.

    Returns
    -------
    pd.DataFrame
        The cleaned data
    """
    def _normalise(value):
        """Lowercases and strips strings; other values pass through."""
        return value.lower().strip() if isinstance(value, str) else value

    # DataFrame.applymap was renamed to DataFrame.map (and then
    # deprecated); use whichever is available. The lookup is done
    # on the class so that a column named 'map' cannot interfere.
    elementwise = getattr(pd.DataFrame, 'map', None) \
        or pd.DataFrame.applymap

    # Copy dataframe and format every cell
    data = elementwise(data.copy(deep=True), _normalise)

    # Drop duplicated rows and return
    return data.drop_duplicates()


def clean_format(data):
    """Final formatting...

    Works on a deep copy of data. The columns listed below are
    only formatted if they are present.

    1. Title case: patient_name, patient_surname
    2. Lowercase: antimicrobial/microorganism/sensitivity/method names
    3. Uppercase: antimicrobial/microorganism/sensitivity/method codes
    4. Strip the strings of the object columns
    5. Convert date_received and date_outcome to datetime
       (values that cannot be parsed become NaT).
    6. Remove duplicated rows

    Parameters
    ----------
    data: pd.DataFrame
        The data to format. It is not modified.

    Returns
    -------
    pd.DataFrame
        The formatted data.
    """
    aux = data.copy(deep=True)

    # Format title
    for c in ['patient_name',
              'patient_surname']:
        if c in aux:
            aux[c] = aux[c].str.title()

    # Format lower (not needed)
    for c in ['antimicrobial_name',
              'microorganism_name',
              'sensitivity_name',
              'method_name']:
        if c in aux:
            aux[c] = aux[c].str.lower()

    # Format upper
    for c in ['antimicrobial_code',
              'microorganism_code',
              'sensitivity_code',
              'method_code']:
        if c in aux:
            aux[c] = aux[c].str.upper()

    # Strip strings
    aux = aux.apply(lambda x: x.str.strip()
        if x.dtype == "object" else x)

    # Format date-times
    for c in ['date_received', 'date_outcome']:
        if c in aux:
            aux[c] = pd.to_datetime(aux[c], errors='coerce')

    # Drop duplicates
    aux = aux.drop_duplicates()

    # Return (the formatted frame was previously discarded)
    return aux


def clean_clwsql008(data, clean_microorganism=True):
    """Performs cleaning for clwsql008 data

    1. rename columns
    2. clean basic
    3. correct issue with sensitivities
    4. format the microorganism names (optional)
    5. create date_received (from received_date and received_time)
       and convert date_outcome to datetime.

    Parameters
    ----------
    data: pd.DataFrame
        The data to clean.
    clean_microorganism: boolean, default True
        Whether to format the microorganism names using the
        MicroorganismRegistry from pyamr.datasets.registries.

    Returns
    -------
    pd.DataFrame
        The cleaned data
    """

    # ---------------------------------
    # Constants
    # ---------------------------------
    # Rename columns
    rename = {
        'DiagnosticTestID': 'uuid',
        'ReceiveDate': 'received_date',
        'ReceiveTime': 'received_time',
        'PtNumber': 'patient_hos_number',
        'AccNumber': 'laboratory_number',
        'BatTstCode': 'specimen_code',
        'OrderName': 'specimen_name',
        'SpecType': 'specimen_description',
        'OrgPieceCounter': 'microorganism_piece_counter',
        'OrgCode': 'microorganism_code',
        'Organism': 'microorganism_name',
        'DrugCode': 'antimicrobial_code',
        'AntiBiotic Name': 'antimicrobial_name',
        'SensMethod': 'method_code',
        'Sensitivity': 'sensitivity_name',
        'MIC': 'mic',
        'Reported': 'reported',
        'FinalDate': 'date_outcome'
    }

    # There are both code and names merged.
    sensitivity_code_map = {
        's': 'sensitive',
        'ss': 'sensitive',
        'r': 'resistant',
        'i': 'intermediate',
        'nd': 'not done',
        'hr': 'highly resistant',
        'hide': 'hide',
        'fix': 'validation fix entry'
    }

    sensitivity_name_map = {
        'sensitive': 's',
        'resistant': 'r',
        'intermediate': 'i',
        'not done': 'nd',
        'highly resistant': 'hr',
        'hide': 'hide',
        '<<do not report>>': 'hide',
        'validation fix entry': 'fix'
    }

    # --------------------------
    # Method
    # --------------------------
    # The method codes are given but the method names are not
    # included. We could use this opportunity to set their
    # values
    data = data.rename(columns=rename)
    data = clean_basic(data)
    #data = data.convert_dtypes()

    # --------------------------
    # Correct sensitivities
    # --------------------------
    # Replace codes with the names
    data['sensitivity_name'] = \
        data.sensitivity_name.replace(sensitivity_code_map)
    # Map names to corresponding codes (unknown names become NaN)
    data['sensitivity_code'] = \
        data.sensitivity_name.map(sensitivity_name_map)

    # Add columns (will be replaced later)
    data['sensitivity'] = data.sensitivity_code

    # --------------------------
    # Add method name
    # --------------------------
    #data['method_name'] = data.method_code

    # --------------------------
    # Clean microorganism
    # --------------------------
    if clean_microorganism:
        # Import registry
        from pyamr.datasets.registries import MicroorganismRegistry

        # Create registry
        rego = MicroorganismRegistry(keyword='microorganism').fit(data)

        # Format microorganism name
        data.microorganism_name = \
            rego.replace(data.microorganism_name,
                key='original', value='name')

    # --------------------------
    # Add date
    # --------------------------
    # Add new column date. The columns are addressed by the names
    # set in rename above (ReceiveDate -> received_date and
    # ReceiveTime -> received_time).
    if 'received_date' in data and 'received_time' in data:
        data['date_received'] = pd.to_datetime(
            data['received_date'] + ' ' + data['received_time'],
            errors='coerce')

    # Format date-times (FinalDate was renamed to date_outcome)
    for c in ['date_received', 'date_outcome']:
        if c in data:
            data[c] = pd.to_datetime(data[c], errors='coerce')

    # Return data
    return data


def clean_legacy(data, clean_microorganism=True, verbose=10):
    """This method cleans microbiology data from legacy.

    1. Rename columns
    2. clean basic
    3. Add sensitivity code
    4. Correct specimen issue
    5. Add empty method_name and method_code columns
    6. Format the microorganism names (optional)
    7. Convert date_received and date_outcome to datetime

    Parameters
    ----------
    data: pd.DataFrame
        The data to clean
    clean_microorganism: boolean, default True
        Whether to format the microorganism names using the
        MicroorganismRegistry from pyamr.datasets.registries.
    verbose: int
        Level of verbosity (currently unused).

    Returns
    -------
    pd.DataFrame
        The cleaned data
    """
    # ---------------------------------
    # Constants
    # ---------------------------------
    # Rename columns
    rename = {
        'dateReceived': 'date_received',
        'age': 'age',
        'gender': 'gender',
        'patNumber': 'patient_hos_number',
        'labNumber': 'laboratory_number',
        'orderCode': 'specimen_code',
        'orderName': 'specimen_name',
        'specimenType': 'specimen_description',
        'OrgPieceCounter': 'microorganism_piece_counter',
        'organismCode': 'microorganism_code',
        'organismNameOrig': 'microorganism_name',
        'antibioticCode': 'antimicrobial_code',
        'antibioticName': 'antimicrobial_name',
        'sensitivity': 'sensitivity_name'
    }

    # Map sensitivity names with codes.
    sensitivity_name_map = {
        'sensitive': 's',
        'resistant': 'r',
        'intermediate': 'i',
        'not done': 'nd',
        'highly resistant': 'hr',
        '<<do not report>>': 'hide',
        'validation fix entry': 'fix'
    }

    # --------------------------
    # Method
    # --------------------------
    # Rename and apply the basic cleaning (lower, strip and
    # drop duplicates).
    data = data.rename(columns=rename)
    data = clean_basic(data)
    #data = data.convert_dtypes() # issue with np.nan in replace

    # --------------------------
    # Correct sensitivities
    # --------------------------
    # Create column code (unknown names become NaN)
    data['sensitivity_code'] = \
        data.sensitivity_name.map(sensitivity_name_map)

    # --------------------------
    # Correct specimens
    # --------------------------
    # Get those with both name and code
    pairs = data[['specimen_code', 'specimen_name']]
    pairs = pairs.dropna(how='any').drop_duplicates()
    name_to_code = dict(zip(pairs.specimen_name, pairs.specimen_code))
    code_to_name = dict(zip(pairs.specimen_code, pairs.specimen_name))

    # Replace names that appear in code
    data.specimen_code = data.specimen_code.replace(name_to_code)

    # Fill missing (NaN) names
    data.specimen_name = data.specimen_name \
        .fillna(data.specimen_code.replace(code_to_name))

    # --------------------------
    # Add method code/name
    # --------------------------
    data['method_name'] = None
    data['method_code'] = None

    # --------------------------
    # Clean microorganism
    # --------------------------
    if clean_microorganism:
        # Import registry
        from pyamr.datasets.registries import MicroorganismRegistry

        # Create registry
        rego = MicroorganismRegistry(keyword='microorganism').fit(data)

        # Could I do it with fit_transform(data)

        # Format microorganism name
        data.microorganism_name = \
            rego.replace(data.microorganism_name,
                key='original', value='name')

    # Format date-times
    for c in ['date_received', 'date_outcome']:
        if c in data:
            data[c] = pd.to_datetime(data[c], errors='coerce')

    # Return
    return data







































def clean_microorganism(data):
    """This method creates a table with the clean microorganism names.

    The microorganism names are put to lowercase, formatted with
    REGEX_MAP, hyphen_before and word_to_start and stripped. The
    rows are then de-duplicated by clean name and sorted.

    Parameters
    ----------
    data: pd.DataFrame
        The data with the columns microorganism_name and
        microorganism_code. It is not modified.

    Returns
    -------
    pd.DataFrame
        The dataframe with the columns microorganism_id (the
        position after sorting), microorganism_name (clean),
        microorganism_code (lowercase) and
        microorganism_name_original (as received).
    """
    # Copy data
    aux = data.copy(deep=True)

    # Keep the names as received (before any formatting)
    microorganism_name_original = \
        aux.microorganism_name

    # DataFrame.applymap was renamed to DataFrame.map (and then
    # deprecated); use whichever is available.
    elementwise = getattr(pd.DataFrame, 'map', None) \
        or pd.DataFrame.applymap

    # Put everything to lowercase
    aux = elementwise(aux, lambda x: x.lower()
        if isinstance(x, str) else x)

    # Save microorganism name original
    aux['microorganism_name_original'] = \
        microorganism_name_original

    # Apply regexp mapping
    name = aux.microorganism_name.replace(regex=REGEX_MAP)

    # Correct hyphens
    for hp in ['haemolytic']:
        name = name.str.lower() \
            .transform(hyphen_before, w=hp)

    # Correct order of genus
    for sp in ['enterococcus',
               'staphylococcus',
               'streptococcus',
               'coliform']:
        name = name.str.lower() \
            .transform(word_to_start, w=sp, pos='start')

    # Correct order of tags
    for sp in ['methicillin resistant',
               'vancomycin resistant',
               'mixed']:
        name = name.str.lower() \
            .transform(word_to_start, w=sp, pos='end')

    # Apply regexp mapping again, only to the name. Applying it
    # to the whole frame would also rewrite the codes and the
    # backup of the original names.
    name = name.replace(regex=REGEX_MAP)

    # Strip
    aux.microorganism_name = name.str.strip()

    keep = ['microorganism_name',
            'microorganism_code',
            'microorganism_name_original']

    # Sort
    aux = aux[keep]
    aux = aux.drop_duplicates(subset=['microorganism_name'])
    aux = aux.sort_values(by='microorganism_name')
    aux = aux.reset_index(drop=True)
    aux.insert(0, 'microorganism_id', aux.index)

    # Return
    return aux



def clean_common(data, verbose=10):
    """This method cleans the microbiology data.

    It assumes the following columns are imputed:
       date_received
       date_outcome
       microorganism_code
       microorganism_name (required = True)
       antimicrobial_code
       antimicrobial_name
       method_code
       method_name
       sensitivity_code
       sensitivity_name

    The required columns that are missing are created empty.

    Parameters
    ----------
    data: pd.DataFrame
        The dataframe to clean. It is not modified.
    verbose: int
        Level of verbosity. Progress messages are printed
        when it is greater than 5.

    Returns
    -------
    pd.DataFrame
        The cleaned dataframe
    """
    # ------------------------------
    # Constants
    # ------------------------------
    # Create required columns
    required = [
        'date_received',
        'date_outcome',
        'specimen_name',
        'microorganism_name',
        'antimicrobial_name',
        'method_name',
        'sensitivity_name']

    # Copy data
    aux = data.copy(deep=True)

    # Add required columns
    for c in required:
        if c not in aux.columns:
            aux[c] = None

    # ------------------------------
    # Missing columns and replace
    # ------------------------------
    # Add backup columns
    # NOTE(review): these backups are also lower-cased and passed
    #               through REGEX_MAP below since both operations
    #               are applied to the whole frame; confirm intent.
    aux['microorganism_name_original'] = \
        aux.microorganism_name
    aux['antimicrobial_name_original'] = \
        aux.antimicrobial_name

    # DataFrame.applymap was renamed to DataFrame.map (and then
    # deprecated); use whichever is available.
    elementwise = getattr(pd.DataFrame, 'map', None) \
        or pd.DataFrame.applymap

    # Put everything to lowercase
    aux = elementwise(aux, lambda x: x.lower()
        if isinstance(x, str) else x)

    for c in ['specimen', 'method', 'sensitivity']:
        c_name = '%s_name' % c
        c_code = '%s_code' % c
        if c_code not in aux:
            aux[c_code] = aux[c_name]
        # Assign the result back; an inplace fillna on the selected
        # column does not update the frame under copy-on-write.
        aux[c_name] = aux[c_name].fillna(aux[c_code])

    # Verbose
    if verbose > 5:
        print("Formatting... specimen/method/sensitivity.")

    # .. note: It would be also possible to forget about the
    #          codes and create our own acronyms ensuring that
    #          they are unique.
    # Fix codes (ss -> s)
    # NOTE(review): the keys of MICROORGANISM_CODE_REPLACE and
    #               ANTIMICROBIAL_CODE_REPLACE are upper-case while
    #               the values were lower-cased above; it looks like
    #               these two replacements cannot match. Confirm.
    if 'sensitivity_code' in aux:
        aux.sensitivity_code = \
            aux.sensitivity_code.replace(SENSITIVITY_CODE_REPLACE)
    if 'microorganism_code' in aux:
        aux.microorganism_code = \
            aux.microorganism_code.replace(MICROORGANISM_CODE_REPLACE)
    if 'antimicrobial_code' in aux:
        aux.antimicrobial_code = \
            aux.antimicrobial_code.replace(ANTIMICROBIAL_CODE_REPLACE)

    # Replace
    aux = aux.replace({
        'specimen_name': SPECIMEN_CODE_MAP,
        'specimen_code': invert(SPECIMEN_CODE_MAP),
        'method_name': METHOD_CODE_MAP,
        'method_code': invert(METHOD_CODE_MAP),
        'sensitivity_name': SENSITIVITY_CODE_MAP,
        'sensitivity_code': invert(SENSITIVITY_CODE_MAP)
    })

    # ------------------------------
    # Fixing orgs/abxs names
    # ------------------------------
    # Verbose
    if verbose > 5:
        print("Formatting... microorganism/antimicrobials.")

    # Apply regexp mapping (to every column)
    aux = aux.replace(regex=REGEX_MAP)

    # Correct hyphens
    for hp in ['haemolytic']:
        aux.microorganism_name = \
            aux.microorganism_name.str.lower() \
                .transform(hyphen_before, w=hp)

    # Correct order of genus
    for sp in ['enterococcus',
               'staphylococcus',
               'streptococcus',
               'coliform']:
        aux.microorganism_name = \
            aux.microorganism_name.str.lower() \
                .transform(word_to_start, w=sp)

    # Apply regexp mapping (to every column)
    aux = aux.replace(regex=REGEX_MAP)

    # ------------------------------
    # String formatting
    # ------------------------------
    # Verbose
    if verbose > 5:
        print("Formatting... lower/title/upper.")

    # Format title
    for c in ['patient_name',
              'patient_surname']:
        if c in aux:
            aux[c] = aux[c].str.title()

    # Format lower (not needed)
    for c in ['antimicrobial_name',
              'microorganism_name',
              'sensitivity_name',
              'method_name']:
        if c in aux:
            aux[c] = aux[c].str.lower()

    # Format upper
    for c in ['antimicrobial_code',
              'microorganism_code',
              'sensitivity_code',
              'method_code']:
        if c in aux:
            aux[c] = aux[c].str.upper()

    # Strip strings
    aux = aux.apply(lambda x: x.str.strip()
        if x.dtype == "object" else x)

    # ------------------------------
    # Time formatting
    # ------------------------------
    # Verbose
    if verbose > 5:
        print("Formatting... date_received/date_outcome.")

    # Format date-times
    for c in ['date_received', 'date_outcome']:
        if c in aux:
            aux[c] = pd.to_datetime(aux[c], errors='coerce')

    # We could also use the convert_dtypes, however there is a big
    # issue with the pd.NA values. I think that issue is only if
    # we want to apply replace and there are pd.NA (only works with
    # np.nan) but it should be fine now that all that has been done.
    #aux = aux.convert_dtypes()

    # Remove empty (sensitivity, date_received, ...?)
    #aux = aux.dropna(how='any', subset=[])

    # Drop duplicates
    aux = aux.drop_duplicates()

    # Return
    return aux


def clean_clwsql008_old(data, verbose=10):
    """This method cleans microbiology data from clwsql008.

    The duplicated rows are dropped, the columns are renamed,
    the column date_received is created from received_date and
    received_time and the result is passed to clean_common.

    .. notes: CLW-SQL-008
      - The sensitivity values found are
        { 'SS', 'R', 'ND', 'I', 'HIDE', 'HR' }

    .. note: Using UTC=True gives an error when using
             django-import-export to import the data in
             the databases (commented).

    Parameters
    ----------
    data: pd.DataFrame
        The dataframe with the data
    verbose: int
        Level of verbosity. The renamed columns are printed
        when it is greater than 5.

    Returns
    -------
    pd.DataFrame
        The cleaned dataframe.
    """
    # ---------------------------------
    # Constants
    # ---------------------------------
    # Rename columns
    rename = {
        'DiagnosticTestID': 'uuid',
        'ReceiveDate': 'received_date',
        'ReceiveTime': 'received_time',
        'PtNumber': 'patient_hos_number',
        'AccNumber': 'laboratory_number',
        'BatTstCode': 'specimen_code',
        'OrderName': 'specimen_name',
        'SpecType': 'specimen_description',
        'OrgPieceCounter': 'microorganism_piece_counter',
        'OrgCode': 'microorganism_code',
        'Organism': 'microorganism_name',
        'DrugCode': 'antimicrobial_code',
        'AntiBiotic Name': 'antimicrobial_name',
        'SensMethod': 'method_code',
        'Sensitivity': 'sensitivity_code',
        'MIC': 'mic',
        'Reported': 'reported',
        'FinalDate': 'date_outcome'
    }

    # --------------------------
    # Method
    # --------------------------
    # Drop duplicates and rename
    data = data.drop_duplicates()
    data = data.rename(columns=rename)
    #data = data.convert_dtypes()

    # Show
    if verbose > 5:
        print("\n")
        print(data.columns)

    # Add new columns (values that cannot be parsed become NaT)
    data['date_received'] = pd.to_datetime(
        data.received_date + ' ' + data.received_time, errors='coerce')

    # Final formatting
    data = clean_common(data, verbose)

    # Return
    return data



def clean_legacy_old(data, verbose=10):
    """This method cleans microbiology data from legacy.

    The duplicated rows are dropped, the columns are renamed
    and the result is passed to clean_common.

    .. notes: LEGACY
      - The sensitivities found are ...

    Parameters
    ----------
    data: pd.DataFrame
        The dataframe with the data
    verbose: int
        Level of verbosity. The renamed columns are printed
        when it is greater than 5.

    Returns
    -------
    pd.DataFrame
        The cleaned dataframe.
    """
    # ---------------------------------
    # Constants
    # ---------------------------------
    # Rename columns
    rename = {
        'dateReceived': 'date_received',
        'age': 'age',
        'gender': 'gender',
        'patNumber': 'patient_hos_number',
        'labNumber': 'laboratory_number',
        'orderCode': 'specimen_code',
        'orderName': 'specimen_name',
        'specimenType': 'specimen_description',
        'OrgPieceCounter': 'microorganism_piece_counter',
        'organismCode': 'microorganism_code',
        'organismNameOrig': 'microorganism_name',
        'antibioticCode': 'antimicrobial_code',
        'antibioticName': 'antimicrobial_name',
        'sensitivity': 'sensitivity_name'
    }

    # --------------------------
    # Method
    # --------------------------
    # Drop duplicates and rename
    data = data.drop_duplicates()
    data = data.rename(columns=rename)
    #data = data.convert_dtypes() # issue with np.nan in replace

    # Show
    if verbose > 5:
        print("\n")
        print(data.columns)

    # Final formatting
    data = clean_common(data, verbose)

    # Return
    return data


def clean_mimic(data):
    """This method cleans microbiology data from mimic.

    The columns are renamed, the microorganism and antimicrobial
    names are capitalized and also used as codes, and the
    specimen description is used as specimen code. If the column
    date_received is present, it is converted to datetime and
    the rows without sensitivity result are removed.

    .. note: Need to merge datetime for date and datetime for time
             as done in the datablend package if want full info.

    Parameters
    ----------
    data: pd.DataFrame
        The dataframe with the data

    Returns
    -------
    pd.DataFrame
        The cleaned dataframe.
    """
    # ---------------------------------
    # Constants
    # ---------------------------------
    # Rename columns
    rename = {
        'subject_id': 'patient_hos_number',
        'micro_specimen_id': 'laboratory_number',
        'spec_type_desc': 'specimen_description',
        'test_seq': 'microorganism_piece_counter',
        'org_name': 'microorganism_name',
        'ab_name': 'antimicrobial_name',
        'test_name' : 'method',
        'interpretation': 'sensitivity_name',
        'chartdate': 'date_received',
        'storedate': 'date_outcome'
    }

    # Replace values
    # NOTE(review): the replacement below is not done with
    #               regex=True, thus the patterns included in
    #               MICROORGANISM_NAME_MAP only replace cells that
    #               are exactly equal to them; confirm intent.
    replace = {
        #'sensitivity': SENSITIVITY_MAP,
        'microorganism_code': MICROORGANISM_CODE_REPLACE,
        'microorganism_name': MICROORGANISM_NAME_MAP,
        'antimicrobial_code': ANTIMICROBIAL_CODE_REPLACE
    }

    # Rename
    data = data.rename(columns=rename)

    # Replace
    data = data.replace(replace)

    # Format
    data.microorganism_name = data.microorganism_name.str.capitalize()
    data.antimicrobial_name = data.antimicrobial_name.str.capitalize()
    data['microorganism_code'] = data.microorganism_name
    data['antimicrobial_code'] = data.antimicrobial_name
    data['specimen_code'] = data.specimen_description

    # Format date
    if 'date_received' in data:

        # Convert to datetime.
        data.date_received = \
            pd.to_datetime(data.date_received, errors='coerce')

        # Ignore those without result. The column interpretation
        # is renamed to sensitivity_name above; a column named
        # sensitivity is still honoured if that one is missing.
        result = next((c for c in ('sensitivity_name', 'sensitivity')
                       if c in data), None)
        if result is not None:
            data = data[data[result].notna()]

    # Return
    return data