Note
Click here to download the full example code
UKVI trips visualisation
Out:
Outbound Date Inbound Date Outbound Ports Inbound Ports Days Difference Voyage Code
0 2018-07-27 11:00:00 2018-07-30 21:25:00 LHR-FRA FRA-LHR 2 BA0904
1 2018-08-11 08:45:00 2018-08-25 13:25:00 STN-BLQ SKG-LGW 13 FR0194
2 2018-09-29 11:30:00 2018-10-03 16:40:00 LHR-FRA FRA-LHR 3 LH0905
3 2018-11-03 07:30:00 2018-11-06 14:45:00 LHR-FRA FRA-LHR 2 LH0923
4 2018-11-25 10:40:00 2018-12-01 13:50:00 STN-RAK RAK-STN 5 FR3556
5 2018-12-22 11:50:00 2018-12-26 22:40:00 STN-AOI AOI-STN 3 FR0124
6 2019-01-04 08:00:00 2019-01-08 11:30:00 STN-FRA FRA-LHR 3 FR1687
7 2019-01-21 08:00:00 2019-01-24 18:35:00 STN-FRA FRA-STN 2 FR1687
8 2019-02-09 08:05:00 2019-02-15 07:40:00 STN-BLQ BLQ-STN 4 FR0194
9 2019-02-20 17:35:00 2019-02-21 21:55:00 LGW-MXP MXP-LTN 0 U28197
10 2019-03-01 18:05:00 2019-03-15 06:25:00 LHR-JNB WDH-JNB 12 SA0235
11 2019-03-16 18:40:00 2019-03-20 07:40:00 STN-AOI BLQ-STN 2 FR0124
12 2019-03-26 08:05:00 2019-03-30 23:55:00 STN-BLQ AOI-STN 3 FR0194
13 2019-04-18 09:25:00 2019-04-22 20:21:00 LTN-AMS AMS-LDN 3 U22157
14 2019-04-27 06:25:00 2019-04-30 23:30:00 STN-AOI BLQ-STN 2 FR0124
15 2019-06-09 06:20:00 2019-06-15 22:00:00 STN-AOI RMI-STN 5 FR0124
16 2019-07-11 07:10:00 2019-07-15 13:30:00 LGW-BLQ BLQ-STN 3 U28989
17 2019-08-16 07:10:00 2019-08-22 08:05:00 LGW-BLQ BLQ-STN 5 U28989
18 2019-08-23 06:25:00 2019-09-02 23:55:00 STN-AOI BLQ-STN 9 FR0124
19 2019-11-16 11:35:00 2019-11-19 14:15:00 STN-BGY BLQ-LTN 2 FR4219
20 2019-12-22 08:05:00 2020-01-08 19:55:00 STN-BLQ BLQ-LHR 16 FR0194
21 2020-02-08 06:05:00 2020-02-10 21:30:00 LGW-ZRH ZRH-LTN 1 U28113
22 2020-02-20 09:15:00 2020-02-24 14:15:00 LTN-BLQ BLQ-LTN 3 FR3406
23 2020-03-04 06:30:00 2020-03-07 23:15:00 STN-ALC ALC-STN 2 FR8382
24 2020-07-26 14:35:00 2020-08-17 20:10:00 LHR-BLQ BLQ-LHR 21 BA0542
25 2020-09-06 14:35:00 2021-02-08 11:05:00 LHR-BLQ FCO-LHR 153 BA0542
26 2021-06-17 07:00:00 2021-09-25 12:05:00 STN-BLQ AOI-STN 99 FR0194
27 2021-11-15 17:25:00 2021-11-29 13:05:00 LTN-BLQ PEG-STN 12 FR3406
28 2022-01-04 08:05:00 2022-01-09 22:55:00 STN-BLQ AOI-STN 4 FR0194
29 2022-02-10 11:50:00 2022-02-22 07:40:00 STN-AOI BLQ-STN 10 FR0124
30 2022-03-24 09:50:00 2022-03-30 19:05:00 STN-TFS TFS-LGW 5 LS1663
31 2022-03-31 06:25:00 2022-04-03 11:25:00 STN-AOI AOI-STN 2 FR0124
32 2022-04-30 20:10:00 2022-05-02 21:20:00 LHR-BSL BSL-LHR 1 BA0756
33 2022-06-03 12:55:00 2022-06-14 12:05:00 STN-MAD PMI-STN 9 FR5996
34 2022-07-05 13:05:00 2022-07-27 15:45:00 STN-AOI BRI-LGW 21 FR0124
35 2022-09-13 13:05:00 2022-09-21 11:25:00 STN-AOI AOI-STN 6 FR0124
36 2022-10-06 16:15:00 2022-10-08 10:15:00 LGW-VRN VRN-LGW 0 U28449
37 2022-10-26 13:05:00 2022-12-26 15:25:00 STN-MAD KUL-LHR 60 FR5996
38 2023-01-13 06:15:00 2023-01-16 11:55:00 STN-BLQ AOI-STN 2 FR0194
39 2023-02-04 06:45:00 2023-02-06 16:10:00 LTN-ATH ATH-LGW 1 W94467
40 2023-02-21 16:05:00 2023-03-01 11:40:00 LTN-BLQ PEG-STN 6 FR3406
41 2023-04-21 18:35:00 2023-04-24 16:00:00 STN-AOI PEG-STN 1 FR0261
42 2023-04-28 14:45:00 2023-05-09 16:35:00 STN-LPA LPA-STN 10 FR2842
43 2023-05-28 17:10:00 2023-05-31 22:10:00 LHR-ZRH ZRH-LGW 2 LX0325
44 2023-06-09 11:10:00 2023-06-15 20:35:00 LGW-FCO AOI-STN 5 W45781
7 import pdfplumber
8 import re
9 import pandas as pd
10 import matplotlib.dates as mdates
11 import matplotlib.pyplot as plt
12
13 from matplotlib.dates import DateFormatter
14 from pathlib import Path
15
16
17
18
19 def extract_basic_travel_data(pdf_path, start_page, end_page):
20 """Extract the data from a PDF file.
21
22 Ensure that the format is appropriate, ent the column headers match those
23 included below. Otherwise modify as appropriate.
24
25 Parameters
26 ----------
27 pdf_path: str
28 The path to the file.
29 start_page: int
30 The start page where the table apperas.
31 end_page: int
32 The end page where the table appears.
33
34 Returns
35 -------
36 """
37 # Define the headers for essential data
38 headers = [
39 "Departure Date/Time", "Arrival Date/Time", "Voyage Code", "In/Out",
40 "Dep Port", "Arrival Port"
41 ]
42
43 travel_data = []
44
45 # Regex pattern to capture essential information
46 row_pattern = re.compile(
47 r"(\d{2}/\d{2}/\d{4} \d{2}:\d{2})\s+" # Departure Date/Time
48 r"(\d{2}/\d{2}/\d{4} \d{2}:\d{2})\s+" # Arrival Date/Time
49 r"(\S+)\s+" # Voyage Code
50 r"(Outbound|Inbound)\s+" # In/Out
51 r"(\S+)\s+" # Dep Port
52 r"(\S+)" # Arrival Port
53 )
54
55 # Open the PDF file and iterate over specified pages
56 with pdfplumber.open(pdf_path) as pdf:
57 for page_num in range(start_page - 1, end_page):
58 page = pdf.pages[page_num]
59 text = page.extract_text()
60 if not text:
61 continue
62
63 # Match rows using the regex pattern
64 matches = row_pattern.findall(text)
65 if matches:
66 for match in matches:
67 travel_data.append(dict(zip(headers, match)))
68
69 # Return
70 return travel_data
71
72
73 def combine_outbound_inbound(df):
74 """Combine Outbound-Ibound rows into a single one.
75
76 Paramters
77 ---------
78 df: pd.DataFrame
79 The DataFrame with the data.
80
81 Returns
82 -------
83 pd.DataFrame
84
85 """
86
87 # Convert date columns to datetime format
88 df["Departure Date/Time"] = \
89 pd.to_datetime(df["Departure Date/Time"],
90 format="%d/%m/%Y %H:%M")
91 df["Arrival Date/Time"] = \
92 pd.to_datetime(df["Arrival Date/Time"],
93 format="%d/%m/%Y %H:%M")
94
95 # Sort the DataFrame by "Departure Date/Time"
96 df = df.sort_values(by="Departure Date/Time").reset_index(drop=True)
97
98 # Process the DataFrame
99 result = []
100 for i in range(0, len(df) - 1, 2): # Step by 2 to handle consecutive rows
101 outbound = df.iloc[i]
102 inbound = df.iloc[i + 1]
103
104 # Ensure the pair consists of an outbound followed by an inbound
105 if outbound["In/Out"] == "Outbound" and inbound["In/Out"] == "Inbound":
106 # Calculate the difference in days
107 days_difference = (inbound["Arrival Date/Time"] - outbound["Departure Date/Time"]).days - 1
108
109 # Create a combined row with desired columns
110 combined_row = {
111 "Outbound Date": outbound["Departure Date/Time"],
112 "Inbound Date": inbound["Arrival Date/Time"],
113 "Outbound Ports": outbound["Dep Port"] + '-' + outbound["Arrival Port"],
114 "Inbound Ports": inbound["Dep Port"] + '-' + inbound["Arrival Port"],
115 "Days Difference": days_difference,
116 "Voyage Code": outbound["Voyage Code"]
117 }
118
119 result.append(combined_row)
120
121 # Return
122 return pd.DataFrame(result)
123
124
125
126 def display(df, cmap=None):
127 """Plotting the graph.
128
129 Parameters
130 ----------
131 df: pd.DataFrame
132 The pandas DataFrame.
133
134 Returns
135 -------
136 None
137 """
138 # Set up plot
139 fig, ax = plt.subplots(figsize=(16, 8))
140
141 # Fore each row (voyage)
142 for i, row in df.iterrows():
143
144 if cmap is None:
145 color = 'skyblue'
146 else:
147 cmap.get(row['Outbound Ports'].split('-')[1], 'skyblue')
148
149
150 # Plot each voyage as a horizontal bar with text annotations
151 ax.plot([row["Outbound Date"], row["Inbound Date"]], [i, i], marker='o', color=color, lw=6)
152
153 # Formatting outbound and inbound dates
154 outbound_str = row["Outbound Date"].strftime("%d %b") # Day and abbreviated month
155 inbound_str = row["Inbound Date"].strftime("%d %b") # Day and abbreviated month
156
157 # Adjust the text position to be further right
158 ax.text(row["Inbound Date"] + pd.Timedelta(days=10), i - 0.05, # Increased offset to 10 days
159 f"{row['Outbound Ports']} ({outbound_str}) to {row['Inbound Ports']} ({inbound_str}) | {row['Days Difference']} days",
160 va='center', ha='left', fontsize=9, color="black")
161
162 # Alternate month shading
163 start_date = df["Outbound Date"].min().replace(day=1)
164 end_date = df["Inbound Date"].max()
165 current_date = start_date
166 month = 0
167 while current_date < end_date:
168 next_month = (current_date + pd.DateOffset(months=1)).replace(day=1)
169 ax.axvspan(current_date, next_month, color='gray' if month % 2 == 0 else 'lightgray', alpha=0.2)
170 current_date = next_month
171 month += 1
172
173 # Add horizontal lines for each year
174 years = pd.date_range(start=start_date, end=end_date+pd.DateOffset(years=1), freq='Y')
175 for year in years:
176 ax.axvline(year, color='black', linestyle='--', lw=1) # Vertical line for each year
177 ax.text(year - pd.Timedelta(days=90), len(df) + 0.5, year.year,
178 ha='left', va='center', fontsize=10, color='black') # Year label
179
180 # Setting the x-axis limits to include full years
181 full_start_date = pd.Timestamp(year=start_date.year, month=1, day=1)
182 full_end_date = pd.Timestamp(year=end_date.year + 1, month=1, day=1) # Next January
183 ax.set_xlim(full_start_date, full_end_date)
184
185 # Set x-axis ticks to show full years from January to December
186 ax.xaxis.set_major_locator(mdates.YearLocator()) # Major ticks at the beginning of each year
187 ax.xaxis.set_minor_locator(mdates.MonthLocator()) # Minor ticks for each month
188 ax.xaxis.set_major_formatter(DateFormatter("%Y")) # Year as the format for major ticks
189
190 # Formatting the plot
191 ax.set_yticks(range(len(df)))
192 ax.set_yticklabels(df["Voyage Code"])
193 #ax.set_yticklabels(df['Days Difference'])
194 ax.set_xlabel("Date")
195 ax.set_title("Voyage Durations (total abroad %s days)" % df['Days Difference'].sum())
196
197 # Set x-axis ticks to show abbreviated month names and year
198 ax.xaxis.set_major_locator(mdates.MonthLocator())
199 ax.xaxis.set_major_formatter(DateFormatter("%b %Y")) # Month abbreviation and year
200 plt.xticks(rotation=45)
201
202 plt.grid(axis='x', linestyle='--', alpha=0.5)
203 plt.tight_layout()
204 plt.show()
205
206
207
208 # ------------------------------------------------------
209 # Main
210 # ------------------------------------------------------
211 # Include any missing entry. This could happen if the travel
212 # was done by bus or train, as only flights have been recorded
213 # in the system.
214 MISSING = {
215 'veronica': [
216 {
217 "Departure Date/Time": "22/04/2019 18:21",
218 "Arrival Date/Time": "22/04/2019 20:21",
219 "Voyage Code": "BUS001",
220 "In/Out": "Inbound",
221 "Dep Port": "AMS",
222 "Arrival Port": "LDN"
223 }
224 ]
225 }
226
227 # Include the colors desired for each airport. For example
228 # they could be colored by country.
229 COLORMAP = {
230 'FRA': 'black',
231 'BLQ': 'green',
232 'LHR': 'blue',
233 'LGW': 'blue',
234 'RAK': 'skyblue',
235 'STN': 'blue',
236 'AOI': 'green',
237 'MXP': 'green',
238 'LTN': 'blue',
239 'JNB': 'skyblue',
240 'AMS': 'skyblue',
241 'BGY': 'green',
242 'ZRH': 'skyblue',
243 'ALC': 'yellow',
244 'TFS': 'yellow',
245 'BSL': 'skyblue',
246 'MAD': 'yellow',
247 'VRN': 'green',
248 'ATH': 'skyblue',
249 'LPA': 'yellow',
250 'FCO': 'green',
251 'FRFHN': 'black',
252 'LDN': 'blue'
253 }
254
255
256
257 # Define the PDF file path and page range to extract
258 pdf_path = Path('./data/775243 Final Bundle.pdf')
259 start_page = 6 # Page number where the tables start
260 end_page = 8 # Page number where the tables end
261
262 # Define the JSON file
263 #pdf_path = Path('./data/bernard-2024.json')
264
265 # Load DataFrame
266 if pdf_path.suffix == '.pdf':
267 trips = extract_basic_travel_data(pdf_path, start_page, end_page)
268 elif pdf_path.suffix == '.json':
269 trips = pd.read_json(pdf_path)
270 else:
271 print('File extension <%s> not supported.' % pdf_path.suffix)
272
273 # Convert to DataFrame
274 df = pd.DataFrame(trips)
275 # Append missing rows using concat
276 df = pd.concat([df, pd.DataFrame(MISSING['veronica'])], ignore_index=True)
277 # Combine consecutive outbound-inbound trips into one row.
278 df_cmb = combine_outbound_inbound(df)
279
280 # Show
281 print(df_cmb)
282
283 # Display
284 display(df_cmb)
Total running time of the script: ( 0 minutes 1.785 seconds)