05. Summary plot

 # Libraries
 import shap
 import pandas as pd

 import matplotlib.pyplot as plt


 # Detect whether the code runs from a file (script/module) or from an
 # interactive session: evaluating ``__file__`` raises NameError when the
 # name is not defined. Only that exception is caught so that unrelated
 # errors (e.g. KeyboardInterrupt) are not silently swallowed.
 try:
     __file__
     TERMINAL = True
 except NameError:
     TERMINAL = False


 # ------------------------
 # Methods
 # ------------------------
 def load_shap_file(filepath=None):
     """Load the shap values stored in a CSV file.

     The first column of the file is discarded (presumably the index
     saved alongside the data -- to be confirmed against the file).
     The original ``timestep`` column is renamed to ``indice`` and a
     new ``timestep`` column is computed as
     ``indice - (number of distinct indices - 1)``, so that indices
     0..T-1 become time steps -(T-1)..0.

     .. note:: The ``timestep`` column stored in the file does not
               indicate the time step but the matrix index. Since
               the time steps started at a negative value (t=-T)
               and ended at t=0, the transformation above has to
               be taken into account.

     Parameters
     ----------
     filepath: str or path-like, optional
         Location of the CSV file. When None (default), the file
         ``../../datasets/shap/shap.csv`` is loaded.

     Returns
     -------
     pd.DataFrame
         The loaded data including the ``indice`` column and the
         shifted ``timestep`` column.
     """
     from pathlib import Path

     # Fall back to the dataset shipped with the examples.
     if filepath is None:
         filepath = Path('../../datasets/shap/') / 'shap.csv'

     # Load data (dropping the first column)
     data = pd.read_csv(Path(filepath))
     data = data.iloc[:, 1:]

     # Keep the raw matrix index and derive the shifted time step
     data = data.rename(columns={'timestep': 'indice'})
     data['timestep'] = data.indice - (data.indice.nunique() - 1)
     return data


 # -----------------------------------------------------
 #                       Main
 # -----------------------------------------------------
 # Read the shap table from disk. The commented lines are optional
 # alternatives kept as hints (synthetic data / subset of samples).
 # data = create_random_shap(10, 6, 4)
 data = load_shap_file()
 #data = data[data['sample'] < 100]

 # Reshape into wide tables: one row per (sample, timestep) pair
 # and one column per feature.
 shap_values = data.pivot_table(
     values='shap_values',
     index=['sample', 'timestep'],
     columns=['features'],
 )

 feature_values = data.pivot_table(
     values='feature_values',
     index=['sample', 'timestep'],
     columns=['features'],
 )

 # Print the three tables only when running from a file.
 if TERMINAL:
     print("\nShow:")
     print(data, shap_values, feature_values, sep="\n")

Let’s see what data looks like

 # Preview the first 10 rows of data.
 data.head(10)

	features	shap_values	timestep
0	Ward Lactate	0.000652	-6
1	Ward Glucose	-0.000596	-6
2	Ward sO2	0.000231	-6
3	White blood cell count, blood	0.000582	-6
4	Platelets	-0.001705	-6
5	Haemoglobin	-0.000918	-6
6	Mean cell volume, blood	-0.000654	-6
7	Haematocrit	-0.000487	-6
8	Mean cell haemoglobin conc, blood	0.000090	-6
9	Mean cell haemoglobin level, blood	-0.000296	-6

Let’s see what shap_values looks like

 # Preview the first 10 rows and first 5 columns of shap_values.
 shap_values.iloc[:10, :5]

	features	Alanine Transaminase	Albumin	Alkaline Phosphatase	Bilirubin	C-Reactive Protein
sample	timestep
0	-6	-0.001809	0.000411	0.000486	0.000500	0.010186
	-5	-0.001363	0.000563	0.000803	-0.000133	0.005363
	-4	0.001180	0.000101	0.000859	-0.001680	-0.016017
	-3	0.004938	-0.001043	0.000570	-0.003175	-0.044723
	-2	0.006206	-0.001760	0.000382	-0.003976	-0.062485
	-1	-0.001391	-0.004886	0.002457	0.010031	0.056280
	0	0.003583	0.023502	0.000534	0.001672	-0.010238
1	-6	0.000325	-0.000812	-0.000210	-0.000157	0.000971
	-5	0.000247	-0.002281	-0.000301	-0.000036	-0.000035
	-4	-0.000316	-0.000034	-0.000307	0.000464	-0.009348

Let’s see what feature_values looks like

 # Preview the first 10 rows and first 5 columns of feature_values.
 feature_values.iloc[:10, :5]

	features	Alanine Transaminase	Albumin	Alkaline Phosphatase	Bilirubin	C-Reactive Protein
sample	timestep
0	-6	0.000000	0.000000	0.000000	0.000000	0.000000
	-5	0.000000	0.000000	0.000000	0.000000	0.000000
	-4	0.000000	0.000000	0.000000	0.000000	0.000000
	-3	0.000000	0.000000	0.000000	0.000000	0.000000
	-2	0.000000	0.000000	0.000000	0.000000	0.000000
	-1	0.000000	0.000000	0.000000	0.000000	0.000000
	0	-0.982956	0.237113	-0.956016	-0.982152	-0.726284
1	-6	-0.994370	-0.587629	-0.956533	-0.988451	-0.398008
	-5	-0.993445	-0.587629	-0.954463	-0.990551	-0.190805
	-4	-0.994370	-0.628866	-0.963260	-0.990551	-0.307893

Display using `shap.summary_plot`

The first option is to use the shap library to plot the results.

 # Constants and ranges reused by the plots below.
 N = 10  # maximum number of features shown in the per-feature loop

 # Number of distinct timesteps and samples in the pivoted index.
 TIMESTEPS = shap_values.index.unique(level='timestep').size
 SAMPLES = shap_values.index.unique(level='sample').size

 # Global shap range, used to share the x-axis limits across figures.
 shap_min, shap_max = data['shap_values'].min(), data['shap_values'].max()

Now, let’s display the shap values for all features in each timestep.

 # For each timestep (visualise all features)
 # The timestep of every row is looked up once, outside the loop.
 timestep_of_row = shap_values.index.get_level_values('timestep')
 steps = timestep_of_row.unique()
 for step in steps:
     # Boolean mask selecting the rows that belong to this timestep
     indice = timestep_of_row == step

     # Create auxiliary matrices. The mask built from shap_values is
     # also applied positionally to feature_values, which assumes
     # both tables share the same row order -- TODO confirm.
     shap_aux = shap_values.iloc[indice]
     feat_aux = feature_values.iloc[indice]

     # Display (one figure per timestep, shared x-axis limits)
     plt.figure()
     plt.title("Timestep: %s" % step)
     shap.summary_plot(shap_aux.to_numpy(), feat_aux, show=False)
     plt.xlim(shap_min, shap_max)

Now, let’s display the shap values for all timesteps of each feature.

 # For each feature (visualise all time-steps)
 # Column labels for the reshaped matrices: 'timestep -(T-1)' ... 'timestep 0'
 timestep_labels = ['timestep %s' % j for j in range(-TIMESTEPS + 1, 1)]

 for f in shap_values.columns[:N]:
     # Create auxiliary matrices (select feature and reshape).
     # The feature is selected by label in both tables so that the
     # shap values and the feature values always refer to the same
     # feature, even if the column order of the tables differs.
     # The reshape assumes every sample has exactly TIMESTEPS
     # consecutive rows -- TODO confirm for incomplete data.
     shap_aux = shap_values[f].to_numpy().reshape(-1, TIMESTEPS)
     feat_aux = pd.DataFrame(
         feature_values[f].to_numpy().reshape(-1, TIMESTEPS),
         columns=timestep_labels,
     )

     # Show (one figure per feature, timesteps kept in given order)
     plt.figure()
     plt.title("Feature: %s" % f)
     shap.summary_plot(shap_aux, feat_aux, sort=False, show=False, plot_type='violin')
     plt.xlim(shap_min, shap_max)
     plt.gca().invert_yaxis()

 # Show
 plt.show()