.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "_examples/pandas/format06_stacked_oucru_v2.py"
.. LINE NUMBERS ARE GIVEN BELOW.

.. only:: html

    .. note::
        :class: sphx-glr-download-link-note

        Click :ref:`here <sphx_glr_download__examples_pandas_format06_stacked_oucru_v2.py>`
        to download the full example code

.. rst-class:: sphx-glr-example-title

.. _sphx_glr__examples_pandas_format06_stacked_oucru_v2.py:


02. Clean OUCRU v2.0
=====================

.. GENERATED FROM PYTHON SOURCE LINES 5-223

.. code-block:: default
   :lineno-start: 6


    # Libraries
    import pandas as pd

    # -----------------------------
    # Helper methods
    # -----------------------------
    def create_ids(series):
        """This method creates the ids"""
        # Unique patient numbers
        unique = series.unique()
        # Patient mapping
        mapping = dict(zip(unique, range(len(unique))))
        # Result (mysql pk ids from 1)
        aux = series.map(mapping) + 1
        # Return
        return aux

    def keep_by_table(df, tablename, remove_prefix=True):
        """Keep columns starting with table name"""
        # Define tag
        tag = '%s_' % tablename
        # Find columns with tag
        cols = [c for c in df.columns if c.startswith(tag)]
        # Keep only those columns
        aux = df[cols].copy(deep=True)
        aux = aux.drop_duplicates()
        aux = aux.reset_index(drop=True)
        # Remove prefix
        if remove_prefix:
            aux.columns = [c.replace(tag, '') for c in cols]
        # Return
        return aux


    # -----------------------------
    # Load data
    # -----------------------------
    # Load data
    data = pd.read_csv('./data/stacked-oucru/combined_stacked.csv',
        #nrows=1000,
        parse_dates=['date'])

    # Show
    print(data)
    print(data.columns)

    # -----------------------------
    # Format
    # -----------------------------
    # Drop
    drop = [
        'Unnamed: 0',
        'Unnamed: 0.1',
        'result_old',
        'date_old',
        'dsource'
    ]

    # Rename
    rename = {
        'study_no': 'patient_nhs_number',
        'date': 'date_collected',
        'column': 'laboratory_code',
    }

    # Replace
    replace = {
        'result': {
            'True': 1,
            'False': 0
        }
    }

    # Basic format
    data = data.drop(columns=drop)
    data = data.rename(columns=rename)

    # Boolean to number
    data = data.replace(replace)

    # Keep only those whose result can be cast to number
    data.result = pd.to_numeric(data.result, errors='coerce')

    # These columns are required
    data = data.dropna(how='any',
        subset=['date_collected', 'result'])

    # Reset index
    data = data.reset_index()

    # Show types
    print(data.dtypes)


    # ---------------------------------
    # Create patients
    # ---------------------------------
    # Create ids
    data['patient_id'] = create_ids(data.patient_nhs_number)

    # Create DataFrame
    patient = keep_by_table(data, tablename='patient')

    # Add dates
    patient['date_created'] = pd.to_datetime('today').normalize()
    patient['date_updated'] = pd.to_datetime('today').normalize()
    patient['name'] = ''
    patient['surname'] = ''
    patient['ext_number'] = patient.nhs_number # can be null or unique
    patient['hos_number'] = patient.nhs_number # can be null or unique
    patient['dob'] = pd.to_datetime('today').normalize()      # can be null
    patient['gender'] = 0

    # Order
    patient = patient[['id',
                       'date_created',
                       'date_updated',
                       'name',
                       'surname',
                       'ext_number',
                       'nhs_number',
                       'hos_number',
                       'dob',
                       'gender']]

    # Show
    print("\nPatient table:")
    print(patient)

    # ---------------------------------
    # Create laboratory code
    # ---------------------------------
    # Create ids
    data['laboratory_id'] = create_ids(data.laboratory_code)

    # Create DataFrame
    laboratory_codes = \
        keep_by_table(data, tablename='laboratory')

    # Add dates
    laboratory_codes['date_created'] = pd.to_datetime('today').normalize()
    laboratory_codes['date_updated'] = pd.to_datetime('today').normalize()
    laboratory_codes['name'] = laboratory_codes.code
    laboratory_codes['description'] = ''
    laboratory_codes['is_visible'] = 1

    # Order
    laboratory_codes = laboratory_codes[['id',
                                         'date_created',
                                         'date_updated',
                                         'code',
                                         'name',
                                         'description',
                                         'is_visible']]

    # Show
    print("\nLaboratory codes table:")
    print(laboratory_codes)

    # ---------------------------------
    # Create laboratory test
    # ---------------------------------
    # Drop columns
    data = data.drop(columns=[
        'patient_nhs_number',
        'laboratory_code'
    ])

    # Rename columns
    data = data.rename(columns={
        'laboratory_id': 'code_id',
    })

    # Add columns
    data['id'] = data.index.values + 1
    data['uuid'] = data.id
    data['date_outcome'] = data.date_collected
    data['date_created'] = data.date_collected
    data['date_updated'] = data.date_collected
    data['unit_range'] = ''
    data['result_status'] = 'UNKNOWN'
    data['abnormal_status'] = ''

    # Order
    data = data[['id',
                 'uuid',
                 'date_created',
                 'date_updated',
                 'date_collected',
                 'date_outcome',
                 'result',
                 'unit',
                 'unit_range',
                 'result_status',
                 'abnormal_status',
                 'code_id',
                 'patient_id']]


    # Show
    print("\nLaboratory tests table:")
    print(data)

    # ---------------------------------
    # Save
    # ---------------------------------
    # Save
    patient.to_csv('./outputs/patient.csv',
        date_format='%Y-%m-%d %H:%M:%S', index=False)
    laboratory_codes.to_csv('./outputs/laboratory_code.csv',
        date_format='%Y-%m-%d %H:%M:%S', index=False)
    data.to_csv('./outputs/laboratory_tests.csv',
        date_format='%Y-%m-%d %H:%M:%S', index=False)


.. rst-class:: sphx-glr-timing

   **Total running time of the script:** ( 0 minutes  0.000 seconds)


.. _sphx_glr_download__examples_pandas_format06_stacked_oucru_v2.py:


.. only :: html

 .. container:: sphx-glr-footer
    :class: sphx-glr-footer-example


  .. container:: sphx-glr-download sphx-glr-download-python

     :download:`Download Python source code: format06_stacked_oucru_v2.py <format06_stacked_oucru_v2.py>`


  .. container:: sphx-glr-download sphx-glr-download-jupyter

     :download:`Download Jupyter notebook: format06_stacked_oucru_v2.ipynb <format06_stacked_oucru_v2.ipynb>`


.. only:: html

 .. rst-class:: sphx-glr-signature

    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_