Hourly Level Data.
In this document, we provide all steps to reproduce the exploratory analysis of our data at the hourly level.
Should you have any questions, need help to reproduce the analysis or find coding errors, please do not hesitate to contact us at leo.zabrocki@gmail.com and marion.leroutier@hhs.se.
We load the following packages:
# load required packages
library(rmarkdown) # for creating the R Markdown document
library(knitr) # for creating the R Markdown document
library(here) # for files paths organization
library(tidyverse) # for data manipulation and visualization
library(dtplyr) # to speed up to dplyr
library(ggridges) # for density ridge plots
library(osmdata) # for retrieving open street map data
library(sf) # for simple features access
library(patchwork) # for combining plots
library(Cairo) # for printing customed police of graphs
We also load our custom ggplot2 theme for graphs:
Finally, we load the data:
# load data
data <- readRDS( here::here(
"inputs",
"1.data",
"1.hourly_data",
"2.data_for_analysis",
"0.main_data",
"data_for_analysis_hourly.RDS"
)
) %>%
as.data.frame()
We first explore the patterns of vessel traffic data at the hourly level and by vessel type (cruise, ferry and other types).
Mean of total tonnage by vessel categories and hour of the day:
# create the graph
graph_total_tonnage_hour_vessel_type <- data %>%
select(
hour,
total_gross_tonnage,
total_gross_tonnage_cruise,
total_gross_tonnage_ferry,
total_gross_tonnage_other_vessels
) %>%
rename(
"Total Tonnage" = total_gross_tonnage,
"Cruise Tonnage" = total_gross_tonnage_cruise,
"Ferry Tonnage" = total_gross_tonnage_ferry,
"Other Vessel Types Tonnage" = total_gross_tonnage_other_vessels
) %>%
pivot_longer(cols = -c(hour),
names_to = "tonnage_type",
values_to = "tonnage") %>%
group_by(tonnage_type, hour) %>%
summarise(mean_tonnage = mean(tonnage) %>%
ifelse(is.na(.), 0, .)) %>%
ggplot(., aes(x = as.factor(hour), y = mean_tonnage, group = "l")) +
geom_line(colour = my_blue) +
facet_wrap( ~ tonnage_type) +
xlab("Hour") + ylab("Average Gross Tonnage") +
theme_tufte()
# display the graph
graph_total_tonnage_hour_vessel_type
ggsave(
graph_total_tonnage_hour_vessel_type,
filename = here::here(
"inputs",
"3.outputs",
"1.hourly_analysis",
"1.eda",
"graph_total_tonnage_hour_vessel_type.pdf"
),
width = 30,
height = 15,
units = "cm",
device = cairo_pdf
)
Mean of entering and exiting total tonnage by vessel categories and hour of the day:
# create the graph
graph_entering_exiting_total_tonnage_hour_vessel_type <- data %>%
select(
hour,
total_gross_tonnage_entry:total_gross_tonnage_entry_other_vessels,
total_gross_tonnage_exit:total_gross_tonnage_exit_other_vessels
) %>%
pivot_longer(cols = -c(hour),
names_to = "tonnage_type",
values_to = "tonnage") %>%
mutate(
entry_exit = ifelse(str_detect(tonnage_type, "entry"), "Arrival", "Departure"),
vessel_type = "Total Tonnage" %>%
ifelse(str_detect(tonnage_type, "cruise"), "Cruise", .) %>%
ifelse(str_detect(tonnage_type, "ferry"), "Ferry", .) %>%
ifelse(str_detect(tonnage_type, "other"), "Other Vessel Types", .)
) %>%
group_by(vessel_type, entry_exit, hour) %>%
summarise(mean_tonnage = mean(tonnage) %>%
ifelse(is.na(.), 0, .)) %>%
ggplot(.,
aes(
x = as.factor(hour),
y = mean_tonnage,
group = entry_exit,
colour = entry_exit
)) +
geom_line(size = 1.1) +
scale_colour_manual(values = c(my_blue, my_orange)) +
facet_wrap( ~ vessel_type) +
xlab("Hour") + ylab("Average Gross Tonnage") +
labs(colour = "Vessel Status") +
theme_tufte() +
theme(
axis.title.y = element_text(size = 20),
axis.title.x = element_text(size = 20),
strip.text.x = element_text(size = 18),
legend.title = element_text(size = 20),
legend.text = element_text(size = 18)
)
# display the graph
graph_entering_exiting_total_tonnage_hour_vessel_type
ggsave(
graph_entering_exiting_total_tonnage_hour_vessel_type,
filename = here::here(
"inputs",
"3.outputs",
"1.hourly_analysis",
"1.eda",
"graph_entering_exiting_total_tonnage_hour_vessel_type.pdf"
),
width = 30,
height = 15,
units = "cm",
device = cairo_pdf
)
Mean of total tonnage by vessel categories, season and hour of the day:
# create the graph
graph_total_tonnage_season_hour_vessel_type <- data %>%
mutate(
season = recode(
season,
"1" = "January-March",
"2" = "April-June",
"3" = "July-September",
"4" = "October-December"
) %>% fct_relevel(
.,
"January-March",
"April-June",
"July-September",
"October-December"
)
) %>%
select(
hour,
season,
total_gross_tonnage_cruise,
total_gross_tonnage_ferry,
total_gross_tonnage_other_vessels
) %>%
rename("Cruise" = total_gross_tonnage_cruise,
"Ferry" = total_gross_tonnage_ferry,
"Other Vessel Types" = total_gross_tonnage_other_vessels) %>%
pivot_longer(
cols = -c(hour, season),
names_to = "tonnage_type",
values_to = "tonnage"
) %>%
group_by(tonnage_type, season, hour) %>%
summarise(mean_tonnage = mean(tonnage) %>%
ifelse(is.na(.), 0, .)) %>%
ggplot(., aes(x = as.factor(hour), y = mean_tonnage, group = "l")) +
geom_line(colour = my_blue) +
facet_grid(tonnage_type ~ season, scales = "free_y") +
xlab("Hour") + ylab("Average Gross Tonnage") +
theme_tufte()
# display the graph
graph_total_tonnage_season_hour_vessel_type
# save the graph
ggsave(
graph_total_tonnage_season_hour_vessel_type,
filename = here::here(
"inputs",
"3.outputs",
"1.hourly_analysis",
"1.eda",
"graph_total_tonnage_season_hour_vessel_type.pdf"
),
width = 90,
height = 36,
units = "cm",
device = cairo_pdf
)
One issue of for our matching analysis is that vessel traffic is very regular. For instance, cruise vessels dock and leave the port for specific days and hours so that it is hard to find similar pairs of hours with and without traffic. We plot below the hourly tonnage of cruise docking the port for mondays belonging to July and August months of 2012:
# create the data
data_tonnage_cruise_traffic_monday_season_3_2012 <- data %>%
filter(month %in% c("July", "August") &
year == 2012 & weekday == "Monday") %>%
mutate(julian_date = lubridate::yday(date)) %>%
mutate(julian_date = map(julian_date, ~ paste("Day", ., sep = " "))) %>%
unnest(julian_date) %>%
select(julian_date, hour, total_gross_tonnage_entry_cruise)
# graph entering tonnage cruise
graph_tonnage_variation_cruise <-
data_tonnage_cruise_traffic_monday_season_3_2012 %>%
ggplot(.,
aes(x = as.factor(hour), y = total_gross_tonnage_entry_cruise, group = julian_date)) +
geom_line(colour = my_blue) +
facet_wrap( ~ julian_date) +
scale_y_continuous(
labels = function(x)
format(x, scientific = FALSE)
) +
xlab("Hour") + ylab("Average Gross Tonnage") +
theme_tufte()
# display the graph entering tonnage cruise
graph_tonnage_variation_cruise
ggsave(
graph_tonnage_variation_cruise,
filename = here::here(
"inputs",
"3.outputs",
"1.hourly_analysis",
"1.eda",
"graph_tonnage_variation_cruise.pdf"
),
width = 84,
height = 35,
units = "cm",
device = cairo_pdf
)
We explore here the hourly patterns of pollutant concentrations. We plot below the average concentration by hour for each pollutant:
# create the graph of pollutants' concentration by hour of the day
graph_pollutant_hour <- data %>%
select(hour, mean_no2_l:mean_pm10_sl) %>%
rename(
"NO2 Longchamp" = mean_no2_l,
"O3 Longchamp" = mean_o3_l,
"PM2.5 Longchamp" = mean_pm25_l,
"PM10 Longchamp" = mean_pm10_l,
"SO2 Longchamp" = mean_so2_l,
"NO2 Saint-Louis" = mean_no2_sl,
"PM10 Saint-Louis" = mean_pm10_sl
) %>%
pivot_longer(cols = -c(hour),
names_to = "pollutant",
values_to = "concentration") %>%
group_by(hour, pollutant) %>%
summarise(mean_concentration = mean(concentration)) %>%
ggplot(., aes(x = as.factor(hour), y = mean_concentration, group = "l")) +
geom_line(colour = my_blue) +
facet_wrap( ~ pollutant, scales = "free", ncol = 4) +
xlab("Hour") + ylab("Concentration (µg/m³)") +
theme_tufte()
# display the graph
graph_pollutant_hour
# save the graph
ggsave(
graph_pollutant_hour,
filename = here::here(
"inputs",
"3.outputs",
"1.hourly_analysis",
"1.eda",
"graph_pollutant_hour.pdf"
),
width = 100,
height = 30,
units = "cm",
device = cairo_pdf
)
We explore here the hourly patterns of weather parameters. We plot the hourly distribution of weather parameters:
# make the graph
graph_continuous_weather_hour <- data %>%
select(hour,
temperature_average,
rainfall_height_dummy,
wind_speed,
humidity_average) %>%
rename(
"Average Temperature (°C)" = temperature_average,
"Rainfall Dummy (%)" = rainfall_height_dummy,
"Wind Speed (m/s)" = wind_speed,
"Average Humidity (%)" = humidity_average
) %>%
pivot_longer(cols = -c(hour),
names_to = "weather_parameter",
values_to = "values") %>%
group_by(weather_parameter, hour) %>%
summarise(values = mean(values)) %>%
ungroup() %>%
mutate(values = ifelse(weather_parameter == "Rainfall Dummy (%)", values *
100, values)) %>%
ggplot(., aes(x = as.factor(hour), y = values, group = "l")) +
geom_line(colour = my_blue) +
facet_wrap( ~ weather_parameter, scales = "free_y") +
xlab("Hour") + ylab("Value") +
theme_tufte()
# display the graph
graph_continuous_weather_hour
# save the graph
ggsave(
graph_continuous_weather_hour,
filename = here::here(
"inputs",
"3.outputs",
"1.hourly_analysis",
"1.eda",
"graph_continuous_weather_hour.pdf"
),
width = 60,
height = 25,
units = "cm",
device = cairo_pdf
)
We plot the hourly distribution of wind direction categories:
# make the graph
graph_wind_direction_hour <- data %>%
select(hour, wind_direction) %>%
mutate(
wind_direction_categories = cut(
wind_direction,
breaks = seq(0, 360, by = 90),
include.lowest = TRUE
) %>%
recode(
.,
"[0,90]" = "North-East",
"(90,180]" = "South-East",
"(180,270]" = "South-West",
"(270,360]" = "North-West"
)
) %>%
mutate(hour = as.factor(hour)) %>%
group_by(hour, wind_direction_categories) %>%
# compute the number of observations
summarise(n = n()) %>%
# compute the proportion
mutate(freq = round(n / sum(n) * 100, 0)) %>%
ggplot(., aes(x = hour, y = freq, group = "l")) +
geom_line(colour = my_blue) +
facet_wrap( ~ wind_direction_categories, ncol = 4) +
coord_flip() +
xlab("Hour") + ylab("Proportion (%)") +
theme_tufte()
# display the graph
graph_wind_direction_hour
# save the graph
ggsave(
graph_wind_direction_hour,
filename = here::here(
"inputs",
"3.outputs",
"1.hourly_analysis",
"1.eda",
"graph_wind_direction_hour.pdf"
),
width = 40,
height = 22,
units = "cm",
device = cairo_pdf
)
We explore here the hourly patterns of the flow of vehicles. We plot below the average flow of vehicles by hour of the day:
# create the graph of the flow of vehicles by hour of the day
graph_road_traffic_hour <- data %>%
group_by(hour) %>%
summarise(mean_road_traffic_flow = mean(road_traffic_flow, na.rm = TRUE)) %>%
ungroup() %>%
ggplot(., aes(x = as.factor(hour), y = mean_road_traffic_flow, group = "l")) +
geom_line(colour = my_blue) +
xlab("Hour") + ylab("Average Road Traffic Flow\n (Number of Vehicles)") +
theme_tufte()
# display the graph
graph_road_traffic_hour
# save the graph
ggsave(
graph_road_traffic_hour,
filename = here::here(
"inputs",
"3.outputs",
"1.hourly_analysis",
"1.eda",
"graph_road_traffic_hour.pdf"
),
width = 30,
height = 15,
units = "cm",
device = cairo_pdf
)
We downloaded from data.gouv.fr a shape file of Marseille’s districts. We load these data and compute the union of the districts to have the boundaries of the city:
Using the osmdata
package, we retrieve the boundaries of
Marseille’s port:
# building the query to get boundaries of the port
harbour_query <- getbb("Marseille") %>%
opq() %>%
add_osm_feature("harbour")
# retrieve the data
harbour_data <- osmdata_sf(harbour_query)
Using the osmdata
package, we retrieve Marseille’s
network of roads. This query takes a bit of time to run:
# building the query to get roads data
roads_query <- getbb("Marseille") %>%
opq() %>%
add_osm_feature("highway")
# retrieve the data
roads_data <- osmdata_sf(roads_query)
We retrieve the coordinates of Longchamp and Saint Louis stations from AtmoSud website:
# create air pollution measuring stations data
data_stations <-
tibble(
latitude = c(43.34811, 43.30610),
longitude = c(5.36012, 5.39526),
name = c("Saint Louis", "Longchamp")
)
# set the crs
data_stations <- st_as_sf(data_stations,
coords = c('longitude', 'latitude'),
crs = "+init=epsg:4326")
# retrieve station coordinates for labelling their names on the map
stations_coordinates <-
as.data.frame(st_transform(data_stations, 2154) %>%
st_coordinates(.))
stations_coordinates$name <- data_stations$name
# we first store a vector of colors to create the legend
colors <- c("Harbour" = my_orange, "Stations" = my_blue)
# make the map
map_marseille <- ggplot() +
# city's boundaries
geom_sf(
data = marseille_borders,
colour = "black",
fill = "white",
size = 0.1
) +
# roads
geom_sf(data = roads_data$osm_lines,
colour = "gray80",
size = 0.2) +
# station coordinates
geom_sf(
data = data_stations,
aes(colour = "Stations"),
shape = 3,
stroke = 1,
size = 3, key_glyph = draw_key_label
) +
# city's boundaries
geom_sf(
data = marseille_borders,
colour = "black",
fill = NA,
size = 0.1
) +
# port's boundaries
geom_sf(data = harbour_data$osm_lines,
aes(colour = "Harbour"),
size = 0.5, key_glyph = draw_key_label) +
geom_sf(
data = harbour_data$osm_polygons,
aes(colour = "Harbour"),
fill = NA,
size = 0.3, key_glyph = draw_key_label
) +
# station labels
ggrepel::geom_text_repel(
data = stations_coordinates,
aes(X, Y, label = name),
hjust = 1,
segment.color = NA,
size = 4,
fontface = 'bold'
) +
# axes labels
xlab('Longitude') + ylab('Latitude') +
# add scale
ggspatial::annotation_scale(
location = "br",
line_width = 0.5,
height = unit(0.2, "cm")
) +
# add north arrow
ggspatial::annotation_north_arrow(
location = "tr",
which_north = "true",
height = unit(1, "cm"),
width = unit(1, "cm")
) +
# zoom on the map
coord_sf(
crs = st_crs(2154),
xlim = c(885000, 898500),
ylim = c(6245000, 6254000),
expand = FALSE
) +
scale_color_manual(name = "Legend:", values = colors) +
theme_tufte() +
# theme options
theme(
panel.border = element_rect(color = "black", fill = NA, size = 0.8),
panel.background = element_rect(fill = "aliceblue"),
panel.grid.major = element_blank())
# save the map
ggsave(
map_marseille,
filename = here::here(
"inputs",
"3.outputs",
"1.hourly_analysis",
"1.eda",
"map_marseille.pdf"
),
width = 18,
height = 12,
units = "cm",
device = cairo_pdf
)
If you see mistakes or want to suggest changes, please create an issue on the source repository.