Exploratory Data Analysis

Hourly Level Data.

Marie-Abèle Bind https://scholar.harvard.edu/marie-abele (Biostatistics Center, Massachusetts General Hospital)https://biostatistics.massgeneral.org/faculty/marie-abele-bind-phd/ , Marion Leroutier https://www.parisschoolofeconomics.eu/en/leroutier-marion/work-in-progress/ (Misum, Stockholm School of Economics)https://www.hhs.se/en/persons/l/leroutier-marion/ , Léo Zabrocki https://lzabrocki.github.io/ (RFF-CMCC EIEE)https://www.eiee.org/
2022-08-03

In this document, we provide all steps to reproduce the exploratory analysis of our data at the hourly level.

Should you have any questions, need help to reproduce the analysis or find coding errors, please do not hesitate to contact us at and .

Required Packages

We load the following packages:

# load required packages
library(rmarkdown) # for creating the R Markdown document
library(knitr) # for creating the R Markdown document
library(here) # for files paths organization
library(tidyverse) # for data manipulation and visualization
library(dtplyr) # to speed up to dplyr
library(ggridges) # for density ridge plots
library(osmdata) # for retrieving open street map data
library(sf) # for simple features access
library(patchwork) # for combining plots
library(Cairo) # for printing customed police of graphs

We also load our custom ggplot2 theme for graphs:

# load ggplot custom theme
source(here::here(
  "inputs",
  "2.functions",
  "script_theme_tufte.R"
))
# define nice colors
my_blue <- "#0081a7"
my_orange <- "#fb8500"

Finally, we load the data:

# load data
data <- readRDS(  here::here(
    "inputs",
    "1.data",
    "1.hourly_data",
    "2.data_for_analysis",
    "0.main_data",
    "data_for_analysis_hourly.RDS"
  )
) %>%
  as.data.frame()

Vessel Traffic Data

We first explore the patterns of vessel traffic data at the hourly level and by vessel type (cruise, ferry and other types).

Hourly Patterns of Gross Tonnage

Mean of total tonnage by vessel categories and hour of the day:

Please show me the code!
# create the graph
graph_total_tonnage_hour_vessel_type <- data %>%
  select(
    hour,
    total_gross_tonnage,
    total_gross_tonnage_cruise,
    total_gross_tonnage_ferry,
    total_gross_tonnage_other_vessels
  ) %>%
  rename(
    "Total Tonnage" = total_gross_tonnage,
    "Cruise Tonnage" = total_gross_tonnage_cruise,
    "Ferry Tonnage" = total_gross_tonnage_ferry,
    "Other Vessel Types Tonnage" = total_gross_tonnage_other_vessels
  ) %>%
  pivot_longer(cols = -c(hour),
               names_to = "tonnage_type",
               values_to = "tonnage") %>%
  group_by(tonnage_type, hour) %>%
  summarise(mean_tonnage = mean(tonnage) %>%
              ifelse(is.na(.), 0, .)) %>%
  ggplot(., aes(x = as.factor(hour), y = mean_tonnage, group = "l")) +
  geom_line(colour = my_blue) +
  facet_wrap( ~ tonnage_type) +
  xlab("Hour") + ylab("Average Gross Tonnage") +
  theme_tufte()

# display the graph
graph_total_tonnage_hour_vessel_type
Please show me the code!
ggsave(
  graph_total_tonnage_hour_vessel_type,
  filename = here::here(
    "inputs",
    "3.outputs",
    "1.hourly_analysis",
    "1.eda",
    "graph_total_tonnage_hour_vessel_type.pdf"
  ),
  width = 30,
  height = 15,
  units = "cm",
  device = cairo_pdf
)

Mean of entering and exiting total tonnage by vessel categories and hour of the day:

Please show me the code!
# create the graph
graph_entering_exiting_total_tonnage_hour_vessel_type <- data %>%
  select(
    hour,
    total_gross_tonnage_entry:total_gross_tonnage_entry_other_vessels,
    total_gross_tonnage_exit:total_gross_tonnage_exit_other_vessels
  ) %>%
  pivot_longer(cols = -c(hour),
               names_to = "tonnage_type",
               values_to = "tonnage") %>%
  mutate(
    entry_exit = ifelse(str_detect(tonnage_type, "entry"), "Arrival", "Departure"),
    vessel_type = "Total Tonnage" %>%
      ifelse(str_detect(tonnage_type, "cruise"), "Cruise", .) %>%
      ifelse(str_detect(tonnage_type, "ferry"), "Ferry", .) %>%
      ifelse(str_detect(tonnage_type, "other"), "Other Vessel Types", .)
  ) %>%
  group_by(vessel_type, entry_exit, hour) %>%
  summarise(mean_tonnage = mean(tonnage) %>%
              ifelse(is.na(.), 0, .)) %>%
  ggplot(.,
         aes(
           x = as.factor(hour),
           y = mean_tonnage,
           group = entry_exit,
           colour = entry_exit
         )) +
  geom_line(size = 1.1) +
  scale_colour_manual(values = c(my_blue, my_orange)) +
  facet_wrap( ~ vessel_type) +
  xlab("Hour") + ylab("Average Gross Tonnage") +
  labs(colour = "Vessel Status") +
  theme_tufte() +
  theme(
    axis.title.y = element_text(size = 20),
    axis.title.x = element_text(size = 20),
    strip.text.x = element_text(size = 18),
    legend.title = element_text(size = 20),
    legend.text = element_text(size = 18)
  )

# display the graph
graph_entering_exiting_total_tonnage_hour_vessel_type
Please show me the code!
ggsave(
  graph_entering_exiting_total_tonnage_hour_vessel_type,
  filename = here::here(
    "inputs",
    "3.outputs",
    "1.hourly_analysis",
    "1.eda",
    "graph_entering_exiting_total_tonnage_hour_vessel_type.pdf"
  ),
  width = 30,
  height = 15,
  units = "cm",
  device = cairo_pdf
)

Hourly Patterns of total Gross Tonnage by Season

Mean of total tonnage by vessel categories, season and hour of the day:

Please show me the code!
# create the graph
graph_total_tonnage_season_hour_vessel_type <- data %>%
  mutate(
    season = recode(
      season,
      "1" = "January-March",
      "2" = "April-June",
      "3" = "July-September",
      "4" = "October-December"
    ) %>% fct_relevel(
      .,
      "January-March",
      "April-June",
      "July-September",
      "October-December"
    )
  ) %>%
  select(
    hour,
    season,
    total_gross_tonnage_cruise,
    total_gross_tonnage_ferry,
    total_gross_tonnage_other_vessels
  ) %>%
  rename("Cruise" = total_gross_tonnage_cruise,
         "Ferry" = total_gross_tonnage_ferry,
         "Other Vessel Types" = total_gross_tonnage_other_vessels) %>%
  pivot_longer(
    cols = -c(hour, season),
    names_to = "tonnage_type",
    values_to = "tonnage"
  ) %>%
  group_by(tonnage_type, season, hour) %>%
  summarise(mean_tonnage = mean(tonnage) %>%
              ifelse(is.na(.), 0, .)) %>%
  ggplot(., aes(x = as.factor(hour), y = mean_tonnage, group = "l")) +
  geom_line(colour = my_blue) +
  facet_grid(tonnage_type ~ season, scales = "free_y") +
  xlab("Hour") + ylab("Average Gross Tonnage") +
  theme_tufte()

# display the graph
graph_total_tonnage_season_hour_vessel_type
Please show me the code!
# save the graph
ggsave(
  graph_total_tonnage_season_hour_vessel_type,
  filename = here::here(
    "inputs",
    "3.outputs",
    "1.hourly_analysis",
    "1.eda",
    "graph_total_tonnage_season_hour_vessel_type.pdf"
  ),
  width = 90,
  height = 36,
  units = "cm",
  device = cairo_pdf
)

Lack of Variation in Vessel Traffic

One issue of for our matching analysis is that vessel traffic is very regular. For instance, cruise vessels dock and leave the port for specific days and hours so that it is hard to find similar pairs of hours with and without traffic. We plot below the hourly tonnage of cruise docking the port for mondays belonging to July and August months of 2012:

Please show me the code!
# create the data
data_tonnage_cruise_traffic_monday_season_3_2012 <- data %>%
  filter(month %in% c("July", "August") &
           year == 2012 & weekday == "Monday") %>%
  mutate(julian_date = lubridate::yday(date)) %>%
  mutate(julian_date = map(julian_date, ~ paste("Day", ., sep = " "))) %>%
  unnest(julian_date) %>%
  select(julian_date, hour, total_gross_tonnage_entry_cruise)

# graph entering tonnage cruise
graph_tonnage_variation_cruise <-
  data_tonnage_cruise_traffic_monday_season_3_2012 %>%
  ggplot(.,
         aes(x = as.factor(hour), y = total_gross_tonnage_entry_cruise, group = julian_date)) +
  geom_line(colour = my_blue) +
  facet_wrap( ~ julian_date) +
  scale_y_continuous(
    labels = function(x)
      format(x, scientific = FALSE)
  ) +
  xlab("Hour") + ylab("Average Gross Tonnage") +
  theme_tufte()

# display the graph entering tonnage cruise
graph_tonnage_variation_cruise
Please show me the code!
ggsave(
  graph_tonnage_variation_cruise,
  filename = here::here(
    "inputs",
    "3.outputs",
    "1.hourly_analysis",
    "1.eda",
    "graph_tonnage_variation_cruise.pdf"
  ),
  width = 84,
  height = 35,
  units = "cm",
  device = cairo_pdf
)

Pollution Data

We explore here the hourly patterns of pollutant concentrations. We plot below the average concentration by hour for each pollutant:

Please show me the code!
# create the graph of pollutants' concentration by hour of the day
graph_pollutant_hour <- data %>%
  select(hour, mean_no2_l:mean_pm10_sl) %>%
  rename(
    "NO2 Longchamp" = mean_no2_l,
    "O3 Longchamp" = mean_o3_l,
    "PM2.5 Longchamp" = mean_pm25_l,
    "PM10 Longchamp" = mean_pm10_l,
    "SO2 Longchamp" = mean_so2_l,
    "NO2 Saint-Louis" = mean_no2_sl,
    "PM10 Saint-Louis" = mean_pm10_sl
  ) %>%
  pivot_longer(cols = -c(hour),
               names_to = "pollutant",
               values_to = "concentration") %>%
  group_by(hour, pollutant) %>%
  summarise(mean_concentration = mean(concentration)) %>%
  ggplot(., aes(x = as.factor(hour), y = mean_concentration, group = "l")) +
  geom_line(colour = my_blue) +
  facet_wrap( ~ pollutant, scales = "free", ncol = 4) +
  xlab("Hour") + ylab("Concentration (µg/m³)") +
  theme_tufte()

# display the graph
graph_pollutant_hour
Please show me the code!
# save the graph
ggsave(
  graph_pollutant_hour,
  filename = here::here(
    "inputs",
    "3.outputs",
    "1.hourly_analysis",
    "1.eda",
    "graph_pollutant_hour.pdf"
  ),
  width = 100,
  height = 30,
  units = "cm",
  device = cairo_pdf
)

Weather Data

We explore here the hourly patterns of weather parameters. We plot the hourly distribution of weather parameters:

Please show me the code!
# make the graph
graph_continuous_weather_hour <- data %>%
  select(hour,
         temperature_average,
         rainfall_height_dummy,
         wind_speed,
         humidity_average) %>%
  rename(
    "Average Temperature (°C)" = temperature_average,
    "Rainfall Dummy (%)" = rainfall_height_dummy,
    "Wind Speed (m/s)" =          wind_speed,
    "Average Humidity (%)" = humidity_average
  ) %>%
  pivot_longer(cols = -c(hour),
               names_to = "weather_parameter",
               values_to = "values") %>%
  group_by(weather_parameter, hour) %>%
  summarise(values = mean(values)) %>%
  ungroup() %>%
  mutate(values = ifelse(weather_parameter == "Rainfall Dummy (%)", values *
                           100, values)) %>%
  ggplot(., aes(x = as.factor(hour), y = values, group = "l")) +
  geom_line(colour = my_blue) +
  facet_wrap( ~ weather_parameter, scales = "free_y") +
  xlab("Hour") + ylab("Value") +
  theme_tufte()

# display the graph
graph_continuous_weather_hour
Please show me the code!
# save the graph
ggsave(
  graph_continuous_weather_hour,
  filename = here::here(
    "inputs",
    "3.outputs",
    "1.hourly_analysis",
    "1.eda",
    "graph_continuous_weather_hour.pdf"
  ),
  width = 60,
  height = 25,
  units = "cm",
  device = cairo_pdf
)

We plot the hourly distribution of wind direction categories:

Please show me the code!
# make the graph
graph_wind_direction_hour <- data %>%
  select(hour, wind_direction) %>%
  mutate(
    wind_direction_categories = cut(
      wind_direction,
      breaks = seq(0, 360, by  = 90),
      include.lowest = TRUE
    ) %>%
      recode(
        .,
        "[0,90]" = "North-East",
        "(90,180]" = "South-East",
        "(180,270]" = "South-West",
        "(270,360]" = "North-West"
      )
  ) %>%
  mutate(hour = as.factor(hour)) %>%
  group_by(hour, wind_direction_categories) %>%
  # compute the number of observations
  summarise(n = n()) %>%
  # compute the proportion
  mutate(freq = round(n / sum(n) * 100, 0)) %>%
  ggplot(., aes(x = hour, y = freq, group = "l")) +
  geom_line(colour = my_blue) +
  facet_wrap( ~ wind_direction_categories, ncol = 4) +
  coord_flip() +
  xlab("Hour") + ylab("Proportion (%)") +
  theme_tufte()

# display the graph
graph_wind_direction_hour
Please show me the code!
# save the graph
ggsave(
  graph_wind_direction_hour,
  filename = here::here(
    "inputs",
    "3.outputs",
    "1.hourly_analysis",
    "1.eda",
    "graph_wind_direction_hour.pdf"
  ),
  width = 40,
  height = 22,
  units = "cm",
  device = cairo_pdf
)

Road Traffic Data

We explore here the hourly patterns of the flow of vehicles. We plot below the average flow of vehicles by hour of the day:

Please show me the code!
# create the graph of the flow of vehicles by hour of the day
graph_road_traffic_hour <- data %>%
  group_by(hour) %>%
  summarise(mean_road_traffic_flow = mean(road_traffic_flow, na.rm = TRUE)) %>%
  ungroup() %>%
  ggplot(., aes(x = as.factor(hour), y = mean_road_traffic_flow, group = "l")) +
  geom_line(colour = my_blue) +
  xlab("Hour") + ylab("Average Road Traffic Flow\n (Number of Vehicles)") +
  theme_tufte()

# display the graph
graph_road_traffic_hour
Please show me the code!
# save the graph
ggsave(
  graph_road_traffic_hour,
  filename = here::here(
    "inputs",
    "3.outputs",
    "1.hourly_analysis",
    "1.eda",
    "graph_road_traffic_hour.pdf"
  ),
  width = 30,
  height = 15,
  units = "cm",
  device = cairo_pdf
)

Map of Marseille

City’s boundaries

We downloaded from data.gouv.fr a shape file of Marseille’s districts. We load these data and compute the union of the districts to have the boundaries of the city:

# load the city's boundaries
marseille_borders <- st_read(
  here::here(
    "inputs",
    "1.data",
    "1.hourly_data",
    "1.raw_data",
    "6.marseille_map_data",
    "districts_marseille.shp"
  )
) %>%
  st_union()

Port’s boundaries

Using the osmdata package, we retrieve the boundaries of Marseille’s port:

# building the query to get boundaries of the port
harbour_query <- getbb("Marseille") %>%
  opq() %>%
  add_osm_feature("harbour")

# retrieve the data
harbour_data <- osmdata_sf(harbour_query)

Roads data

Using the osmdata package, we retrieve Marseille’s network of roads. This query takes a bit of time to run:

# building the query to get roads data
roads_query <- getbb("Marseille") %>%
  opq() %>%
  add_osm_feature("highway")

# retrieve the data
roads_data <- osmdata_sf(roads_query)

Air pollution measuring stations data

We retrieve the coordinates of Longchamp and Saint Louis stations from AtmoSud website:

# create air pollution measuring stations data
data_stations <-
  tibble(
    latitude = c(43.34811,  43.30610),
    longitude = c(5.36012, 5.39526),
    name = c("Saint Louis", "Longchamp")
  )

# set the crs
data_stations <- st_as_sf(data_stations,
                          coords = c('longitude', 'latitude'),
                          crs = "+init=epsg:4326")

# retrieve station coordinates for labelling their names on the map
stations_coordinates <-
  as.data.frame(st_transform(data_stations, 2154) %>%
                  st_coordinates(.))
stations_coordinates$name <- data_stations$name

Making the map

# we first store a vector of colors to create the legend
colors <- c("Harbour" = my_orange, "Stations" = my_blue)

# make the map
map_marseille <- ggplot() +
    # city's boundaries
  geom_sf(
    data = marseille_borders,
    colour = "black",
    fill = "white",
    size = 0.1
  ) +
  
  # roads
  geom_sf(data = roads_data$osm_lines,
          colour = "gray80",
          size = 0.2) +
  # station coordinates
  geom_sf(
    data = data_stations,
    aes(colour = "Stations"),
    shape = 3,
    stroke = 1,
    size = 3, key_glyph = draw_key_label
  ) +
  # city's boundaries
  geom_sf(
    data = marseille_borders,
    colour = "black",
    fill = NA,
    size = 0.1
  ) +
  # port's boundaries
  geom_sf(data = harbour_data$osm_lines,
          aes(colour = "Harbour"),
              size = 0.5, key_glyph = draw_key_label) +
  geom_sf(
    data = harbour_data$osm_polygons,
    aes(colour = "Harbour"),
    fill = NA,
    size = 0.3, key_glyph = draw_key_label
  ) +
  # station labels
  ggrepel::geom_text_repel(
    data = stations_coordinates,
    aes(X, Y, label = name),
    hjust = 1,
    segment.color = NA,
    size = 4,
    fontface = 'bold'
  ) +
  # axes labels
  xlab('Longitude') + ylab('Latitude') +
  # add scale
  ggspatial::annotation_scale(
    location = "br",
    line_width = 0.5,
    height = unit(0.2, "cm")
  ) +
  # add north arrow
  ggspatial::annotation_north_arrow(
    location = "tr",
    which_north = "true",
    height = unit(1, "cm"),
    width = unit(1, "cm")
  ) +
  # zoom on the map
  coord_sf(
    crs = st_crs(2154),
    xlim = c(885000, 898500),
    ylim = c(6245000, 6254000),
    expand = FALSE
  ) +
  scale_color_manual(name = "Legend:", values = colors) +
  theme_tufte() +
  # theme options
  theme(
    panel.border = element_rect(color = "black", fill = NA, size = 0.8),
    panel.background = element_rect(fill = "aliceblue"),
    panel.grid.major = element_blank())

# save the map
ggsave(
  map_marseille,
  filename = here::here(
    "inputs",
    "3.outputs",
    "1.hourly_analysis",
    "1.eda",
    "map_marseille.pdf"
  ),
  width = 18,
  height = 12,
  units = "cm",
  device = cairo_pdf
)

Corrections

If you see mistakes or want to suggest changes, please create an issue on the source repository.