05. Summary plot

 6 # Libraries
 7 import shap
 8 import pandas as pd
 9
10 import matplotlib.pyplot as plt
11
12
# Detect whether the code runs from a file (``__file__`` is defined) or
# from an interactive session where the name does not exist. Only the
# NameError raised by the missing name is caught, so unrelated errors
# (e.g. KeyboardInterrupt) are no longer silently swallowed.
try:
    __file__
except NameError:
    TERMINAL = False
else:
    TERMINAL = True
18
19
20 # ------------------------
21 # Methods
22 # ------------------------
def load_shap_file(path=None):
    """Load the shap file.

    .. note:: The ``timestep`` column stored in the file does not
              indicate a time step but a matrix index. Since the
              matrix index for time steps started at t=-T and ended
              at t=0, the index is shifted so that this
              transformation is taken into account.

    Parameters
    ----------
    path: str or path-like, optional
        Location of the csv file. When omitted, the file
        ``../../datasets/shap/shap.csv`` (relative to the current
        working directory) is read.

    Returns
    -------
    pd.DataFrame
        The file content without its first column, with the original
        ``timestep`` column renamed to ``indice`` and a new
        ``timestep`` column holding the shifted index.
    """
    from pathlib import Path

    # Fall back to the location used by the gallery example
    if path is None:
        path = Path('../../datasets/shap/') / 'shap.csv'

    # Load data
    data = pd.read_csv(path)

    # Drop the first column by position (presumably the row index that
    # was saved along with the data -- TODO confirm against the file).
    data = data.iloc[:, 1:]

    # Keep the raw matrix index under a different name
    data = data.rename(columns={'timestep': 'indice'})

    # Shift by the number of distinct indices minus one; assuming the
    # indices run from 0 to T-1, the largest one maps to timestep 0.
    data['timestep'] = data['indice'] - (data['indice'].nunique() - 1)
    return data
40
41
42 # -----------------------------------------------------
43 #                       Main
44 # -----------------------------------------------------
# Read the long-format table (one row per sample, timestep and feature).
# A synthetic alternative is: data = create_random_shap(10, 6, 4)
data = load_shap_file()
# To work on fewer samples: data = data[data['sample'] < 100]

# Both pivots share the same layout: rows are indexed by
# (sample, timestep) and there is one column per feature.
_pivot_layout = dict(index=['sample', 'timestep'], columns=['features'])

shap_values = pd.pivot_table(data, values='shap_values', **_pivot_layout)
feature_values = pd.pivot_table(data, values='feature_values', **_pivot_layout)

# Dump the three tables when running from a file
if TERMINAL:
    print("\nShow:")
    print(data, shap_values, feature_values, sep='\n')

Let’s see what the data looks like

69 data.head(10)
sample indice features feature_values shap_values timestep
0 0 0 Ward Lactate 0.0 0.000652 -6
1 0 0 Ward Glucose 0.0 -0.000596 -6
2 0 0 Ward sO2 0.0 0.000231 -6
3 0 0 White blood cell count, blood 0.0 0.000582 -6
4 0 0 Platelets 0.0 -0.001705 -6
5 0 0 Haemoglobin 0.0 -0.000918 -6
6 0 0 Mean cell volume, blood 0.0 -0.000654 -6
7 0 0 Haematocrit 0.0 -0.000487 -6
8 0 0 Mean cell haemoglobin conc, blood 0.0 0.000090 -6
9 0 0 Mean cell haemoglobin level, blood 0.0 -0.000296 -6


Let’s see what shap_values looks like

73 shap_values.iloc[:10, :5]
features Alanine Transaminase Albumin Alkaline Phosphatase Bilirubin C-Reactive Protein
sample timestep
0 -6 -0.001809 0.000411 0.000486 0.000500 0.010186
-5 -0.001363 0.000563 0.000803 -0.000133 0.005363
-4 0.001180 0.000101 0.000859 -0.001680 -0.016017
-3 0.004938 -0.001043 0.000570 -0.003175 -0.044723
-2 0.006206 -0.001760 0.000382 -0.003976 -0.062485
-1 -0.001391 -0.004886 0.002457 0.010031 0.056280
0 0.003583 0.023502 0.000534 0.001672 -0.010238
1 -6 0.000325 -0.000812 -0.000210 -0.000157 0.000971
-5 0.000247 -0.002281 -0.000301 -0.000036 -0.000035
-4 -0.000316 -0.000034 -0.000307 0.000464 -0.009348


Let’s see what feature_values looks like

77 feature_values.iloc[:10, :5]
features Alanine Transaminase Albumin Alkaline Phosphatase Bilirubin C-Reactive Protein
sample timestep
0 -6 0.000000 0.000000 0.000000 0.000000 0.000000
-5 0.000000 0.000000 0.000000 0.000000 0.000000
-4 0.000000 0.000000 0.000000 0.000000 0.000000
-3 0.000000 0.000000 0.000000 0.000000 0.000000
-2 0.000000 0.000000 0.000000 0.000000 0.000000
-1 0.000000 0.000000 0.000000 0.000000 0.000000
0 -0.982956 0.237113 -0.956016 -0.982152 -0.726284
1 -6 -0.994370 -0.587629 -0.956533 -0.988451 -0.398008
-5 -0.993445 -0.587629 -0.954463 -0.990551 -0.190805
-4 -0.994370 -0.628866 -0.963260 -0.990551 -0.307893


Display using shap.summary_plot

The first option is to use the shap library to plot the results.

# Constants and ranges reused by the plotting code.
N = 10  # cap used to limit how many items a loop visits

# Number of distinct values in each level of the (sample, timestep) index
TIMESTEPS = shap_values.index.unique(level='timestep').size
SAMPLES = shap_values.index.unique(level='sample').size

# Global range of the shap values in the long-format table
shap_min = data['shap_values'].min()
shap_max = data['shap_values'].max()

Now, let’s display the shap values for all features in each timestep.

 98 # For each timestep (visualise all features)
 99 steps = shap_values.index.get_level_values('timestep').unique()
100 for i, step in enumerate(steps):
101     # Get interesting indexes
102     indice = shap_values.index.get_level_values('timestep') == step
103
104     # Create auxiliary matrices
105     shap_aux = shap_values.iloc[indice]
106     feat_aux = feature_values.iloc[indice]
107
108     # Display
109     plt.figure()
110     plt.title("Timestep: %s" % step)
111     shap.summary_plot(shap_aux.to_numpy(), feat_aux, show=False)
112     plt.xlim(shap_min, shap_max)
  • Timestep: -6
  • Timestep: -5
  • Timestep: -4
  • Timestep: -3
  • Timestep: -2
  • Timestep: -1
  • Timestep: 0

Now, let’s display the shap values for all timesteps of each feature.

# One violin summary plot per feature (first N columns only), where
# each row of the plot corresponds to one timestep.
timestep_labels = ['timestep %s' % j for j in range(-TIMESTEPS+1, 1)]

for position, feature in enumerate(shap_values.columns[:N]):
    # Pick the feature column by position and fold it into a matrix
    # with TIMESTEPS columns.
    # NOTE(review): this assumes every sample contributes exactly
    # TIMESTEPS consecutive rows -- confirm for new datasets.
    shap_matrix = shap_values.iloc[:, position].to_numpy()
    shap_matrix = shap_matrix.reshape(-1, TIMESTEPS)

    feat_matrix = feature_values.iloc[:, position].to_numpy()
    feat_matrix = feat_matrix.reshape(-1, TIMESTEPS)
    feat_frame = pd.DataFrame(feat_matrix, columns=timestep_labels)

    # Draw on a fresh, titled figure
    plt.figure()
    plt.title("Feature: %s" % feature)
    shap.summary_plot(shap_matrix,
                      feat_frame,
                      sort=False,
                      show=False,
                      plot_type='violin')

    # Same x-axis limits on every figure; flip the y-axis
    plt.xlim(shap_min, shap_max)
    plt.gca().invert_yaxis()

# Show
plt.show()
  • Feature: Alanine Transaminase
  • Feature: Albumin
  • Feature: Alkaline Phosphatase
  • Feature: Bilirubin
  • Feature: C-Reactive Protein
  • Feature: Chloride
  • Feature: Creatinine
  • Feature: D-Dimer
  • Feature: Eosinophils
  • Feature: Ferritin

Total running time of the script: ( 0 minutes 7.590 seconds)

Gallery generated by Sphinx-Gallery