We adjust for calendar calendar indicator and weather confounding factors.

Should you have any questions, need help to reproduce the analysis or find coding errors, please do not hesitate to contact us at leo.zabrocki@gmail.com and marion.leroutier@hhs.se.

Required Packages

We load the following packages:

# load required packages
library(knitr) # for creating the R Markdown document
library(here) # for files paths organization
library(tidyverse) # for data manipulation and visualization
library(dtplyr) # to speed up to dplyr
library(randChecks) # for randomization check
library(ggridges) # for ridge density plots
library(Cairo) # for printing custom police of graphs
library(patchwork) # combining plots
library(kableExtra) # for table formatting

We also load our custom ggplot2 theme for graphs:

# load ggplot custom theme
source(here::here(
  "inputs",
  "2.functions",
  "script_theme_tufte.R"
))
# define nice colors
my_blue <- "#0081a7"
my_orange <- "#fb8500"

Preparing the Data

We load the matching and matched data and bind them together:

# load Initial Data
data_matching <-
  readRDS(
    here::here(
      "inputs",
      "1.data",
      "1.hourly_data",
      "2.data_for_analysis",
      "1.matched_data",
      "matching_data.rds"
    )
  ) %>%
  mutate(dataset = "Initial Data")

# load matched data
data_matched <-
  readRDS(
    here::here(
      "inputs",
      "1.data",
      "1.hourly_data",
      "2.data_for_analysis",
      "1.matched_data",
      "matched_data_entry_cruise.rds"
    )
  ) %>%
  mutate(dataset = "Matched Data")

# bind the two datasets
data <- bind_rows(data_matched, data_matching) %>% dplyr::as_data_frame()

We also load the initial data to select the lags of variables at the daily level:

# load initial data
data_daily <-
  readRDS(
    here::here(
      "inputs",
      "1.data",
      "1.hourly_data",
      "2.data_for_analysis",
      "0.main_data",
      "data_for_analysis_hourly.RDS"
    )
  ) %>%
  select(
    date,
    daily_mean_no2_sl_lag_1:daily_mean_o3_l_lag_2,
    daily_rainfall_height_dummy_lag_1:daily_humidity_average_lag_2
  ) %>% dplyr::as_data_frame()

We merge the matched_data with the data_daily :

data <- data %>%
  left_join(., data_daily, by = "date")

We change labels of the is_treated variable :

data <- data %>%
  mutate(is_treated = ifelse(is_treated == "TRUE", "True", "False"))

Love Plots

Continuous Weather Covariates

Please show me the code!

# compute figures for the love plot
data_weather_continuous <- data %>%
  select(
    dataset,
    is_treated,
    pair_number,
    contains("temperature"),
    contains("humidity"),
    contains("wind_speed")
  ) %>%
  pivot_longer(
    cols = -c(pair_number, is_treated, dataset),
    names_to = "variable",
    values_to = "values"
  ) %>%
  mutate(
    weather_variable = NA %>%
      ifelse(
        str_detect(variable, "temperature_average"),
        "Average Temperature",
        .
      ) %>%
      ifelse(
        str_detect(variable, "humidity_average"),
        "Humidity Average",
        .
      ) %>%
      ifelse(str_detect(variable, "wind_speed"), "Wind Speed", .)
  ) %>%
  mutate(
    time = 0 %>%
      ifelse(str_detect(variable, "lag_1"),-1, .) %>%
      ifelse(str_detect(variable, "lag_2"),-2, .) %>%
      ifelse(str_detect(variable, "lag_3"),-3, .) %>%
      ifelse(str_detect(variable, "lead_1"), 1, .) %>%
      ifelse(str_detect(variable, "lead_2"), 2, .) %>%
      ifelse(str_detect(variable, "lead_3"), 3, .)
  ) %>%
  filter(time<= 0) %>%
  mutate(level = "Hourly" %>%
           ifelse(str_detect(variable, "daily"), "Daily", .)) %>%
  filter(!is.na(time)) %>%
  mutate(level_time = paste(level, "Lag", time, sep = " ")) %>%
  select(dataset,
         pair_number,
         is_treated,
         weather_variable,
         level_time,
         values)

data_abs_difference_continuous_weather <-
  data_weather_continuous %>%
  group_by(dataset, weather_variable, level_time, is_treated) %>%
  summarise(mean_values = mean(values, na.rm = TRUE)) %>%
  summarise(abs_difference = abs(mean_values[2] - mean_values[1]))

data_sd_weather_continuous <-  data_weather_continuous %>%
  filter(is_treated == "True" & dataset == "Initial Data") %>%
  group_by( weather_variable, level_time, is_treated) %>%
  summarise(sd_treatment = sd(values, na.rm = TRUE)) %>%
  ungroup() %>%
  select(weather_variable, level_time, sd_treatment)

data_love_continuous_weather <-
  left_join(
    data_abs_difference_continuous_weather,
    data_sd_weather_continuous,
    by = c("weather_variable", "level_time")
  ) %>%
  mutate(standardized_difference = abs_difference / sd_treatment) %>%
  select(-c(abs_difference, sd_treatment)) %>%
  mutate(level_time = fct_relevel(level_time, "Daily Lag -1", "Daily Lag -2", "Hourly Lag 0", "Hourly Lag -1", "Hourly Lag -2", "Hourly Lag -3"))



# make the graph
graph_love_plot_continuous_weather <-
  ggplot(
    data_love_continuous_weather,
    aes(
      y = fct_rev(level_time),
      x = standardized_difference,
      colour = fct_rev(dataset),
      shape = fct_rev(dataset),
    )
  ) +
  geom_vline(xintercept = 0) +
  geom_vline(xintercept = 0.1,
             color = "black",
             linetype = "dashed") +
  geom_point(
    size = 2,
    alpha = 0.8
  ) +
  scale_colour_manual(name = "Dataset: ", values = c(my_blue, my_orange)) +
  scale_shape_manual(name = "Dataset: ", values = c(16, 17)) +
  facet_wrap( ~ weather_variable, scales = "free_y") +
  xlab("Standardized Mean Differences") +
  ylab("") +
  theme_tufte()

# plot the graph
graph_love_plot_continuous_weather

Please show me the code!

# save the graph
ggsave(
  graph_love_plot_continuous_weather,
  filename = here::here(
    "inputs",
    "3.outputs",
    "1.hourly_analysis",
    "2.experiment_cruise",
    "1.checking_matching_procedure",
    "graph_love_plot_continuous_weather.pdf"
  ),
  width = 55,
  height = 20,
  units = "cm",
  device = cairo_pdf
)

Categorical Weather Covariates

Please show me the code!

# compute figures for the love plot
data_weather_categorical <- data %>%
  select(
    dataset,
    is_treated,
    contains("rainfall_height_dummy"),
    contains("wind_direction_east_west")
  ) %>%
  mutate_at(vars(contains("rainfall")), ~ ifelse(. == 1, "True", "False")) %>%
  mutate_all( ~ as.character(.)) %>%
  pivot_longer(
    cols = -c(dataset, is_treated),
    names_to = "variable",
    values_to = "values"
  ) %>%
  # group by is_treated, variable and values
  group_by(dataset, is_treated, variable, values) %>%
  # compute the number of observations
  summarise(n = n()) %>%
  # compute the proportion
  mutate(freq = round(n / sum(n) * 100, 0)) %>%
  ungroup() %>%
  mutate(
    weather_variable = NA %>%
      ifelse(str_detect(variable, "wind"), "Wind Direction", .) %>%
      ifelse(str_detect(variable, "rainfall"), "Rainfall Dummy", .)
  ) %>%
  mutate(
    time = 0 %>%
      ifelse(str_detect(variable, "lag_1"),-1, .) %>%
      ifelse(str_detect(variable, "lag_2"),-2, .) %>%
      ifelse(str_detect(variable, "lag_3"),-3, .) %>%
      ifelse(str_detect(variable, "lead_1"), 1, .) %>%
      ifelse(str_detect(variable, "lead_2"), 2, .) %>%
      ifelse(str_detect(variable, "lead_3"), 3, .)
  ) %>%
  filter(time<= 0) %>%
  mutate(level = "Hourly" %>%
           ifelse(str_detect(variable, "daily"), "Daily", .)) %>%
  filter(!is.na(time)) %>%
  mutate(variable = paste(level, weather_variable, "\nLag", time, sep = " ")) %>%
  select(dataset, is_treated, weather_variable, variable, values, freq) %>%
  pivot_wider(names_from = is_treated, values_from = freq) %>%
  mutate(abs_difference = abs(`True` - `False`)) %>%
  filter(values != "False" & values != "West") 

# create the figure for wind direction
graph_love_plot_wind_direction <- data_weather_categorical %>%
  filter(weather_variable == "Wind Direction") %>%
  mutate(variable = fct_relevel(variable, "Daily Wind Direction \nLag -1", "Daily Wind Direction \nLag -2", "Hourly Wind Direction \nLag 0", "Hourly Wind Direction \nLag -1", "Hourly Wind Direction \nLag -2", "Hourly Wind Direction \nLag -3")) %>%
  ggplot(., aes(
    y = fct_rev(values),
    x = abs_difference,
    colour = fct_rev(dataset),
    shape = fct_rev(dataset)
  )) +
  geom_vline(xintercept = 0) +
  geom_point(
    size = 2,
    alpha = 0.8
  ) +
  scale_colour_manual(name = "Dataset: ", values = c(my_blue, my_orange)) +
  scale_shape_manual(name = "Dataset: ", values = c(16, 17)) +
  facet_wrap( ~ variable, scales = "free_y", ncol = 4) +
  xlab("Absolute Difference in Percentage Points") +
  ylab("") +
  theme_tufte()

# print the figure for wind direction
graph_love_plot_wind_direction

Please show me the code!

# save the figure for wind direction
ggsave(
  graph_love_plot_wind_direction,
  filename = here::here(
      "inputs",
      "3.outputs",
      "1.hourly_analysis",
      "2.experiment_cruise",
      "1.checking_matching_procedure",
    "graph_love_plot_wind_direction.pdf"
  ),
  width = 60,
  height = 30,
  units = "cm",
  device = cairo_pdf
)

# create the figure for rainfall dummy
graph_love_plot_rainfall <- data_weather_categorical %>%
  filter(weather_variable == "Rainfall Dummy") %>%
  mutate(variable = fct_relevel(variable, "Daily Rainfall Dummy \nLag -1", "Daily Rainfall Dummy \nLag -2", "Hourly Rainfall Dummy \nLag 0", "Hourly Rainfall Dummy \nLag -1", "Hourly Rainfall Dummy \nLag -2", "Hourly Rainfall Dummy \nLag -3")) %>%
  ggplot(., aes(
    y = fct_rev(variable),
    x = abs_difference,
    colour = fct_rev(dataset),
    shape = fct_rev(dataset)
  )) +
  geom_vline(xintercept = 0) +
  geom_point(
    size = 2,
    alpha = 0.8
  ) +
  scale_colour_manual(name = "Dataset: ", values = c(my_blue, my_orange)) +
  scale_shape_manual(name = "Dataset: ", values = c(16, 17)) +
  xlab("Absolute Difference in Percentage Points") +
  ylab("") +
  theme_tufte()

# print the figure for rainfall dummy
graph_love_plot_rainfall

Please show me the code!

# save the figure for rainfall dummy
ggsave(
  graph_love_plot_rainfall,
  filename = here::here(
    "inputs",
    "3.outputs",
    "1.hourly_analysis",
    "2.experiment_cruise",
    "1.checking_matching_procedure",
    "graph_love_plot_rainfall.pdf"
  ),
  width = 30,
  height = 30,
  units = "cm",
  device = cairo_pdf
)

Pollutants

Please show me the code!

# compute figures for the love plot
data_pollutants <- data %>%
  select(
    dataset,
    is_treated,
    pair_number,
    contains("no2"),
    contains("o3"),
    contains("pm10"),
    contains("pm25"),
    contains("so2")
  ) %>%
  pivot_longer(
    cols = -c(pair_number, is_treated, dataset),
    names_to = "variable",
    values_to = "values"
  ) %>%
  mutate(
    pollutant = NA %>%
      ifelse(str_detect(variable, "no2_l"), "NO2 Longchamp", .) %>%
      ifelse(str_detect(variable, "no2_sl"), "NO2 Saint-Louis", .) %>%
      ifelse(str_detect(variable, "o3"), "O3 Lonchamp", .) %>%
      ifelse(str_detect(variable, "pm10_l"), "PM10 Longchamp", .) %>%
      ifelse(str_detect(variable, "pm10_sl"), "PM10 Saint-Louis", .) %>%
      ifelse(str_detect(variable, "pm25"), "PM2.5 Longchamp", .) %>%
      ifelse(str_detect(variable, "so2"), "SO2 Longchamp", .)
  ) %>%
  mutate(
    time = 0 %>%
      ifelse(str_detect(variable, "lag_1"),-1, .) %>%
      ifelse(str_detect(variable, "lag_2"),-2, .) %>%
      ifelse(str_detect(variable, "lag_3"),-3, .) %>%
      ifelse(str_detect(variable, "lead_1"), 1, .) %>%
      ifelse(str_detect(variable, "lead_2"), 2, .) %>%
      ifelse(str_detect(variable, "lead_3"), 3, .)
  ) %>%
  filter(time<= 0) %>%
  mutate(level = "Hourly" %>%
           ifelse(str_detect(variable, "daily"), "Daily", .)) %>%
  filter(!is.na(time)) %>%
  mutate(level_time = paste(level, "Lag", time, sep = " ")) %>%
  select(dataset,
         pair_number,
         is_treated,
         pollutant,
         level_time,
         values)

data_abs_difference_pollutants <- data_pollutants %>%
  group_by(dataset, pollutant, level_time, is_treated) %>%
  summarise(mean_values = mean(values, na.rm = TRUE)) %>%
  summarise(abs_difference = abs(mean_values[2] - mean_values[1]))

data_sd_pollutants <-  data_pollutants %>%
  filter(is_treated == "True" & dataset == "Initial Data") %>%
  group_by(pollutant, level_time, is_treated) %>%
  summarise(sd_treatment = sd(values, na.rm = TRUE)) %>%
  ungroup() %>%
  select(pollutant, level_time, sd_treatment)

data_love_pollutants <-
  left_join(data_abs_difference_pollutants,
            data_sd_pollutants,
            by = c("pollutant", "level_time")) %>%
  mutate(standardized_difference = abs_difference / sd_treatment) %>%
  select(-c(abs_difference, sd_treatment)) %>%
  mutate(level_time = fct_relevel(level_time, "Daily Lag -1", "Daily Lag -2", "Hourly Lag 0", "Hourly Lag -1", "Hourly Lag -2", "Hourly Lag -3"))

# create the graph
graph_love_plot_pollutants <-
  ggplot(data_love_pollutants,
         aes(
           y = fct_rev(level_time),
           x = standardized_difference,
           colour = fct_rev(dataset),
           shape = fct_rev(dataset)
         )) +
  geom_vline(xintercept = 0) +
  geom_vline(xintercept = 0.1,
             color = "black",
             linetype = "dashed") +
  geom_point(
    size = 2,
    alpha = 0.8
  ) +
  scale_colour_manual(name = "Dataset: ", values = c(my_blue, my_orange)) +
  scale_shape_manual(name = "Dataset: ", values = c(16, 17)) +
  facet_wrap( ~ pollutant, scales = "free_y") +
  xlab("Standardized Mean Differences") +
  ylab("") +
  theme_tufte()

# print the graph
graph_love_plot_pollutants

Please show me the code!

# save the graph
ggsave(
  graph_love_plot_pollutants,
  filename = here::here(
    "inputs",
    "3.outputs",
    "1.hourly_analysis",
    "2.experiment_cruise",
    "1.checking_matching_procedure",
    "graph_love_plot_pollutants.pdf"
  ),
  width = 60,
  height = 40,
  units = "cm",
  device = cairo_pdf
)

Vessels Traffic

Please show me the code!

# compute figures for the love plot
data_tonnage <-  data %>%
  # select relevant variables
  select(
    dataset,
    pair_number,
    is_treated,
    contains("total_gross_tonnage_entry_cruise"),
    contains("total_gross_tonnage_exit_cruise"),
    contains("total_gross_tonnage_entry_ferry"),
    contains("total_gross_tonnage_exit_ferry"),
    contains("total_gross_tonnage_entry_other_vessels"),
    contains("total_gross_tonnage_exit_other_vessels")
  ) %>%
  # transform data in long format
  pivot_longer(
    cols = -c(dataset, pair_number, is_treated),
    names_to = "variable",
    values_to = "tonnage"
  ) %>%
  # create vessel type variable
  mutate(
    vessel_type = NA %>%
      ifelse(str_detect(variable, "cruise"), "Cruise", .) %>%
      ifelse(str_detect(variable, "ferry"), "Ferry", .) %>%
      ifelse(str_detect(variable, "other_vessels"), "Other Vessels", .)
  ) %>%
  # create the day variable
  mutate(entry_exit = NA %>%
           ifelse(str_detect(variable, "entry"), "Entry", .) %>%
           ifelse(str_detect(variable, "exit"), "Exit", .)) %>%
  mutate(
    time = 0 %>%
      ifelse(str_detect(variable, "lag_1"),-1, .) %>%
      ifelse(str_detect(variable, "lag_2"),-2, .) %>%
      ifelse(str_detect(variable, "lag_3"),-3, .) %>%
      ifelse(str_detect(variable, "lead_1"), 1, .) %>%
      ifelse(str_detect(variable, "lead_2"), 2, .) %>%
      ifelse(str_detect(variable, "lead_3"), 3, .)
  ) %>%
  select(dataset,
         pair_number,
         vessel_type,
         is_treated,
         entry_exit,
         time,
         tonnage)

data_abs_difference_tonnage <- data_tonnage %>%
  group_by(dataset, vessel_type, entry_exit, time, is_treated) %>%
  summarise(mean_tonnage = mean(tonnage, na.rm = TRUE)) %>%
  summarise(abs_difference = abs(mean_tonnage[2] - mean_tonnage[1]))

data_sd_tonnage <-  data_tonnage %>%
  filter(is_treated == "True" & dataset == "Initial Data") %>%
  group_by(vessel_type, entry_exit, time, is_treated) %>%
  summarise(sd_treatment = sd(tonnage, na.rm = TRUE)) %>%
  ungroup() %>%
  select(vessel_type, entry_exit, time, sd_treatment)

data_love_tonnage <-
  left_join(
    data_abs_difference_tonnage,
    data_sd_tonnage,
    by = c("vessel_type", "entry_exit", "time")
  ) %>%
  mutate(standardized_difference = abs_difference / sd_treatment) %>%
  select(-c(abs_difference, sd_treatment)) %>%
  filter(!(vessel_type == "Cruise" & time == 0))

# create the graph
graph_love_plot_tonnage <-
  ggplot(data_love_tonnage,
         aes(
           x = standardized_difference,
           y = as.factor(time),
           colour = fct_rev(dataset),
           shape = fct_rev(dataset)
         )) +
  geom_vline(xintercept = 0) +
  geom_vline(xintercept = 0.1,
             color = "black",
             linetype = "dashed") +
  geom_point(
    size = 2,
    alpha = 0.8
  ) +
  scale_colour_manual(name = "Dataset: ", values = c(my_blue, my_orange)) +
  scale_shape_manual(name = "Dataset: ", values = c(16, 17)) +
  facet_grid(entry_exit ~ vessel_type) +
  xlab("Standardized Mean Differences") +
  ylab("Hour") +
  theme_tufte()

# print the graph
graph_love_plot_tonnage

Please show me the code!

# save the graph
ggsave(
  graph_love_plot_tonnage,
  filename = here::here(
    "inputs",
    "3.outputs",
    "1.hourly_analysis",
    "2.experiment_cruise",
    "1.checking_matching_procedure",
    "graph_love_plot_tonnage.pdf"
  ),
  width = 40,
  height = 40,
  units = "cm",
  device = cairo_pdf
)

Road Traffic

Please show me the code!

# compute figures for the love plot
data_road <- data %>%
  # select relevant variables
  select(dataset,
         pair_number,
         is_treated,
         contains("road_traffic_flow")) %>%
  # transform data in long format
  pivot_longer(
    cols = -c(dataset, pair_number, is_treated),
    names_to = "road_traffic",
    values_to = "value"
  ) %>%
  mutate(
    time = 0 %>%
      ifelse(str_detect(road_traffic, "lag_1"),-1, .) %>%
      ifelse(str_detect(road_traffic, "lag_2"),-2, .) %>%
      ifelse(str_detect(road_traffic, "lag_3"),-3, .) %>%
      ifelse(str_detect(road_traffic, "lead_1"), 1, .) %>%
      ifelse(str_detect(road_traffic, "lead_2"), 2, .) %>%
      ifelse(str_detect(road_traffic, "lead_3"), 3, .)
  ) %>%
  select(dataset, pair_number, road_traffic, is_treated, time, value)

data_abs_difference_road <- data_road %>%
  group_by(dataset, road_traffic, time, is_treated) %>%
  summarise(mean_value = mean(value, na.rm = TRUE)) %>%
  summarise(abs_difference = abs(mean_value[2] - mean_value[1]))

data_sd_tonnage <-  data_road %>%
  filter(is_treated == "True" & dataset == "Initial Data") %>%
  group_by(road_traffic, time, is_treated) %>%
  summarise(sd_treatment = sd(value, na.rm = TRUE)) %>%
  ungroup() %>%
  select(road_traffic, time, sd_treatment)

data_love_road <-
  left_join(data_abs_difference_road,
            data_sd_tonnage,
            by = c("road_traffic", "time")) %>%
  mutate(standardized_difference = abs_difference / sd_treatment) %>%
  select(-c(abs_difference, sd_treatment))


# create the graph
graph_love_plot_road <-
  ggplot(
    data_love_road,
    aes(
      x = standardized_difference,
      y = as.factor(time),
      colour = fct_rev(dataset),
      shape = fct_rev(dataset)
    )
  ) +
  geom_vline(xintercept = 0) +
  geom_vline(xintercept = 0.1,
             color = "black",
             linetype = "dashed") +
  geom_point(size = 2, alpha = 0.8) +
  scale_colour_manual(name = "Dataset:", values = c(my_blue, my_orange)) +
  scale_shape_manual(name = "Dataset:", values = c(16, 17)) +
  xlab("Standardized Mean Differences") +
  ylab("Hour") +
  theme_tufte() 

# print the graph
graph_love_plot_road

Please show me the code!

# save the graph
ggsave(
  graph_love_plot_road,
  filename = here::here(
    "inputs",
    "3.outputs",
    "1.hourly_analysis",
    "2.experiment_cruise",
    "1.checking_matching_procedure",
    "graph_love_plot_road.pdf"
  ),
  width = 25,
  height = 20,
  units = "cm",
  device = cairo_pdf
)

Calendar Indicators

Create the relevant data:

Please show me the code!

# compute figures for the love plot
data_calendar <- data %>%
  mutate(weekday = lubridate::wday(date, abbr = FALSE, label = TRUE)) %>%
  select(dataset,
         is_treated,
         hour,
         weekday,
         holidays_dummy,
         bank_day_dummy,
         month,
         year) %>%
  mutate_at(vars(holidays_dummy, bank_day_dummy),
            ~ ifelse(. == 1, "True", "False")) %>%
  mutate_all( ~ as.character(.)) %>%
  pivot_longer(
    cols = -c(dataset, is_treated),
    names_to = "variable",
    values_to = "values"
  ) %>%
  # group by is_treated, variable and values
  group_by(dataset, is_treated, variable, values) %>%
  # compute the number of observations
  summarise(n = n()) %>%
  # compute the proportion
  mutate(freq = round(n / sum(n) * 100, 0)) %>%
  ungroup() %>%
  mutate(
    calendar_variable = NA %>%
      ifelse(str_detect(variable, "hour"), "Hour", .) %>%
      ifelse(str_detect(variable, "weekday"), "Day of the Week", .) %>%
      ifelse(str_detect(variable, "holidays_dummy"), "Holidays", .) %>%
      ifelse(str_detect(variable, "bank_day_dummy"), "Bank Day", .) %>%
      ifelse(str_detect(variable, "month"), "Month", .) %>%
      ifelse(str_detect(variable, "year"), "Year", .)
  ) %>%
  select(dataset, is_treated, calendar_variable, values, freq) %>%
  pivot_wider(names_from = is_treated, values_from = freq) %>%
  mutate(abs_difference = abs(`True` - `False`)) %>%
  filter(values != "False")

Plot for hours:

Please show me the code!

# graph for hour
graph_love_plot_hour <- data_calendar %>%
  filter(calendar_variable == "Hour") %>%
  ggplot(., aes(
    y = as.factor(as.numeric(values)),
    x = abs_difference,
    colour = fct_rev(dataset),
    shape = fct_rev(dataset)
  )) +
  geom_vline(xintercept = 0) +
  geom_point(size = 2, alpha = 0.8) +
  scale_colour_manual(name = "Dataset:", values = c(my_blue, my_orange)) +
  scale_shape_manual(name = "Dataset:", values = c(16, 17)) +
  xlab("Absolute Difference in Percentage Points") +
  ylab("") +
  theme_tufte()

# print the plot
graph_love_plot_hour

Please show me the code!

# save the plot
ggsave(
  graph_love_plot_hour,
  filename = here::here(
    "inputs",
    "3.outputs",
    "1.hourly_analysis",
    "2.experiment_cruise",
    "1.checking_matching_procedure",
    "graph_love_plot_hour.pdf"
  ),
  width = 30,
  height = 30,
  units = "cm",
  device = cairo_pdf
)

Plot for bank days and holidays:

Please show me the code!

# graph for bank days and holidays
graph_love_plot_bank_holidays <- data_calendar %>%
  filter(calendar_variable %in% c("Bank Day", "Holidays")) %>%
  ggplot(., aes(
    y = values,
    x = abs_difference,
    colour = fct_rev(dataset),
    shape = fct_rev(dataset)
  )) +
  geom_vline(xintercept = 0) +
  geom_point(size = 2, alpha = 0.8) +
  scale_colour_manual(name = "Dataset:", values = c(my_blue, my_orange)) +
  scale_shape_manual(name = "Dataset:", values = c(16, 17)) +
  facet_wrap( ~ calendar_variable) +
  xlab("Absolute Difference in Percentage Points") +
  ylab("") +
  theme_tufte() 

# print the plot
graph_love_plot_bank_holidays

Please show me the code!

# save the plot
ggsave(
  graph_love_plot_bank_holidays,
  filename = here::here(
    "inputs",
    "3.outputs",
    "1.hourly_analysis",
    "2.experiment_cruise",
    "1.checking_matching_procedure",
    "graph_love_plot_bank_holidays.pdf"
  ),
  width = 30,
  height = 20,
  units = "cm",
  device = cairo_pdf
)

Plot for days of the week:

Please show me the code!

# graph for weekdays
graph_love_plot_weekday <- data_calendar %>%
  filter(calendar_variable == "Day of the Week") %>%
  mutate(
    values = fct_relevel(
      values,
      "Monday",
      "Tuesday",
      "Wednesday",
      "Thursday",
      "Friday",
      "Saturday",
      "Sunday"
    )
  ) %>%
  ggplot(., aes(
    y = fct_rev(values),
    x = abs_difference,
    colour = fct_rev(dataset),
    shape = fct_rev(dataset)
  )) +
  geom_vline(xintercept = 0) +
  geom_point(size = 2, alpha = 0.8) +
  scale_colour_manual(name = "Dataset:", values = c(my_blue, my_orange)) +
  scale_shape_manual(name = "Dataset:", values = c(16, 17)) +
  xlab("Absolute Difference in Percentage Points") +
  ylab("") +
  theme_tufte() 

# print the plot
graph_love_plot_weekday

Please show me the code!

# save the plot
ggsave(
  graph_love_plot_weekday,
  filename = here::here(
    "inputs",
    "3.outputs",
    "1.hourly_analysis",
    "2.experiment_cruise",
    "1.checking_matching_procedure",
    "graph_love_plot_weekday.pdf"
  ),
  width = 30,
  height = 20,
  units = "cm",
  device = cairo_pdf
)

Plot for months:

Please show me the code!

# graph for month
graph_love_plot_month <- data_calendar %>%
  filter(calendar_variable == "Month") %>%
  mutate(
    values = fct_relevel(
      values,
      "January",
      "February",
      "March",
      "April",
      "May",
      "June",
      "July",
      "August",
      "September",
      "October",
      "November",
      "December"
    )
  ) %>%
  ggplot(., aes(
    y = fct_rev(values),
    x = abs_difference,
    colour = fct_rev(dataset),
    shape = fct_rev(dataset)
  )) +
  geom_vline(xintercept = 0) +
  geom_point(size = 2, alpha = 0.8) +
  scale_colour_manual(name = "Dataset:", values = c(my_blue, my_orange)) +
  scale_shape_manual(name = "Dataset:", values = c(16, 17)) +
  ggtitle("Month") +
  xlab("Absolute Difference in Percentage Points") +
  ylab("") +
  theme_tufte() 

# print the plot
graph_love_plot_month

Please show me the code!

# save the plot
ggsave(
  graph_love_plot_month,
  filename = here::here(
    "inputs",
    "3.outputs",
    "1.hourly_analysis",
    "2.experiment_cruise",
    "1.checking_matching_procedure",
    "graph_love_plot_month.pdf"
  ),
  width = 30,
  height = 20,
  units = "cm",
  device = cairo_pdf
)

Plot for years:

Please show me the code!

# graph for year
graph_love_plot_year <- data_calendar %>%
  filter(calendar_variable == "Year") %>%
  ggplot(., aes(
    y = as.factor(as.numeric(values)),
    x = abs_difference,
    colour = fct_rev(dataset),
    shape = fct_rev(dataset)
  )) +
  geom_vline(xintercept = 0) +
  geom_point(size = 2, alpha = 0.8) +
  scale_colour_manual(name = "Dataset:", values = c(my_blue, my_orange)) +
  scale_shape_manual(name = "Dataset:", values = c(16, 17)) +
  ggtitle("Year") +
  xlab("Absolute Difference in Percentage Points") +
  ylab("") +
  theme_tufte()

# print the graph
graph_love_plot_year

Please show me the code!

# save the plot
ggsave(
  graph_love_plot_year,
  filename = here::here(
    "inputs",
    "3.outputs",
    "1.hourly_analysis",
    "2.experiment_cruise",
    "1.checking_matching_procedure",
    "graph_love_plot_year.pdf"
  ),
  width = 30,
  height = 20,
  units = "cm",
  device = cairo_pdf
)

Overall Balance Improvement

We finally plot the distribution of standardized mean differences for continuous covariates or the absolute percentage points differences for categorical covariates between treated and control units before and after matching.

Continuous Covariates

Please show me the code!

# we select the dataset indicator and the standardized difference
data_love_tonnage <- data_love_tonnage %>%
  ungroup() %>%
  filter(time < 0) %>%
  select(dataset, standardized_difference)

data_love_pollutants <- data_love_pollutants %>%
  ungroup() %>%
  filter(parse_number(as.character(level_time))  < 0) %>%
  select(dataset, standardized_difference)

data_love_continuous_weather <- data_love_continuous_weather %>%
  ungroup() %>%
  filter(parse_number(as.character(level_time))  <= 0) %>%
  select(dataset, standardized_difference)

data_love_road <- data_love_road %>%
  ungroup() %>%
  filter(time <= 0) %>%
  select(dataset, standardized_difference)

data_continuous_love <-
  bind_rows(data_love_tonnage, data_love_pollutants) %>%
  bind_rows(., data_love_continuous_weather) %>%
  bind_rows(., data_love_road)

# create the graph
graph_boxplot_continuous_balance_improvement <-
  ggplot(data_continuous_love,
         aes(x = dataset, y = standardized_difference)) +
  geom_boxplot(colour = my_blue) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 5)) +
  xlab("Dataset") +
  ylab("Standardized \nMean Differences") +
  theme_tufte()

# print the graph
graph_boxplot_continuous_balance_improvement

Please show me the code!

# save the graph
ggsave(
  graph_boxplot_continuous_balance_improvement,
  filename = here::here(
    "inputs",
    "3.outputs",
    "1.hourly_analysis",
    "2.experiment_cruise",
    "1.checking_matching_procedure",
    "graph_boxplot_continuous_balance_improvement.pdf"
  ),
  width = 30,
  height = 20,
  units = "cm",
  device = cairo_pdf
)

Categorical Covariates

Please show me the code!

# we select the dataset indicator and the standardized difference
data_calendar <- data_calendar %>%
  ungroup() %>%
  # we remove the month variable
  #filter(calendar_variable != "Month") %>%
  select(dataset, abs_difference) %>%
  drop_na()

data_weather_categorical <- data_weather_categorical %>%
  ungroup() %>%
  select(dataset, abs_difference)

data_categorical_love <-
  bind_rows(data_calendar, data_weather_categorical) %>%
  drop_na()

# create the graph
graph_boxplot_categorical_balance_improvement <-
  ggplot(data_categorical_love, aes(x = dataset, y = abs_difference)) +
  geom_boxplot(colour = my_blue) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 5)) +
  xlab("Dataset") +
  ylab("Absolute Difference \nin Percentage Points") +
  theme_tufte()

# print the graph
graph_boxplot_categorical_balance_improvement

Please show me the code!

# save the graph
ggsave(
  graph_boxplot_categorical_balance_improvement,
  filename = here::here(
    "inputs",
    "3.outputs",
    "1.hourly_analysis",
    "2.experiment_cruise",
    "1.checking_matching_procedure",
    "graph_boxplot_categorical_balance_improvement.pdf"
  ),
  width = 30,
  height = 20,
  units = "cm",
  device = cairo_pdf
)

We combine the two previous plots in a single figure:

Please show me the code!

# combine plots
graph_overall_balance <-
  graph_boxplot_continuous_balance_improvement + graph_boxplot_categorical_balance_improvement +
  plot_annotation(tag_levels = 'A') &
  theme(plot.tag = element_text(size = 30, face = "bold"))

# save the plot
ggsave(
  graph_overall_balance,
  filename = here::here(
    "inputs",
    "3.outputs",
    "1.hourly_analysis",
    "2.experiment_cruise",
    "1.checking_matching_procedure",
    "graph_overall_balance_hourly.pdf"
  ),
  width = 20,
  height = 10,
  units = "cm",
  device = cairo_pdf
)

And we compute the overall figures for imbalance before and after matching:

Please show me the code!

# compute average imbalance before and after matching
data_categorical_love <- data_categorical_love %>%
  mutate(Type = "Categorical (Difference in Percentage Points)") %>%
  rename(standardized_difference = abs_difference)
  
data_continuous_love %>%
  mutate(Type = "Continuous (Standardized Difference)") %>%
  bind_rows(data_categorical_love) %>%
  group_by(Type, dataset) %>%
  summarise("Mean Imbalance" = round(mean(standardized_difference), 2)) %>%
  rename(Dataset = dataset) %>%
  knitr::kable(., align = c("l", "l", "c")) %>%
  kable_styling(position = "center")

Type	Dataset	Mean Imbalance
Categorical (Difference in Percentage Points)	Initial Data	4.00
Categorical (Difference in Percentage Points)	Matched Data	0.65
Continuous (Standardized Difference)	Initial Data	0.29
Continuous (Standardized Difference)	Matched Data	0.07

Randomization Check for Covariate Balance

Finally, we implement the procedure of Zach Branson (2021) to carry out a randomization check to test whether daily observations are as-if randomized according to observed covariates.

# we select covariates
data_covs <- data_matched %>%
  select(
    pair_number,
    is_treated,
    total_gross_tonnage_ferry,
    total_gross_tonnage_entry_ferry,
    total_gross_tonnage_entry_other_vessels,
    total_gross_tonnage_exit_cruise,
    total_gross_tonnage_exit_ferry,
    total_gross_tonnage_exit_other_vessels,
    total_gross_tonnage_cruise_lag_1,
    total_gross_tonnage_ferry_lag_1,
    total_gross_tonnage_entry_ferry_lag_1,
    total_gross_tonnage_entry_other_vessels_lag_1,
    total_gross_tonnage_exit_cruise_lag_1,
    total_gross_tonnage_exit_ferry_lag_1,
    total_gross_tonnage_exit_other_vessels_lag_1,
    total_gross_tonnage_cruise_lag_2,
    total_gross_tonnage_ferry_lag_2,
    total_gross_tonnage_entry_ferry_lag_2,
    total_gross_tonnage_entry_other_vessels_lag_2,
    total_gross_tonnage_exit_cruise_lag_2,
    total_gross_tonnage_exit_ferry_lag_2,
    total_gross_tonnage_exit_other_vessels_lag_2,
    total_gross_tonnage_cruise_lag_3,
    total_gross_tonnage_ferry_lag_3,
    total_gross_tonnage_entry_ferry_lag_3,
    total_gross_tonnage_entry_other_vessels_lag_3,
    total_gross_tonnage_exit_cruise_lag_3,
    total_gross_tonnage_exit_ferry_lag_3,
    total_gross_tonnage_exit_other_vessels_lag_3,
    temperature_average:wind_speed,
    wind_direction_east_west,
    temperature_average_lag_1:wind_speed_lag_1,
    wind_direction_east_west_lag_1,
    temperature_average_lag_2:wind_speed_lag_2,
    wind_direction_east_west_lag_2,
    temperature_average_lag_3:wind_speed_lag_3,
    wind_direction_east_west_lag_3,
    hour,
    weekday,
    holidays_dummy,
    bank_day_dummy,
    month,
    year
  ) %>% drop_na()

# recode some variables
data_covs <- data_covs %>%
  mutate(is_treated = ifelse(is_treated == TRUE, 1, 0)) %>%
  mutate_at(
    vars(
      wind_direction_east_west ,
      wind_direction_east_west_lag_1,
      wind_direction_east_west_lag_2,
      wind_direction_east_west_lag_3
    ),
    ~ ifelse(. == "West", 1, 0)
  ) %>%
  fastDummies::dummy_cols(., select_columns = c("year", "month", "weekday", "hour")) %>%
  select(-c("year", "month", "weekday", "hour"))

# format data for asIfRandPlot() function
pair_number <- as.numeric(data_covs$pair_number)
is_treated <- data_covs$is_treated

data_covs <- data_covs %>%
  select(-pair_number, -is_treated) %>%
  select(
    total_gross_tonnage_ferry,
    total_gross_tonnage_entry_ferry,
    total_gross_tonnage_entry_other_vessels,
    total_gross_tonnage_exit_cruise,
    total_gross_tonnage_exit_other_vessels,
    total_gross_tonnage_cruise_lag_1,
    total_gross_tonnage_ferry_lag_1,
    total_gross_tonnage_exit_other_vessels_lag_1,
    total_gross_tonnage_cruise_lag_2,
    total_gross_tonnage_ferry_lag_2,
    total_gross_tonnage_exit_other_vessels_lag_1,
    total_gross_tonnage_cruise_lag_2,
    total_gross_tonnage_ferry_lag_2,
    total_gross_tonnage_entry_ferry_lag_2,
    total_gross_tonnage_entry_ferry_lag_2,
    total_gross_tonnage_entry_ferry_lag_2,
    total_gross_tonnage_entry_other_vessels_lag_2,
    total_gross_tonnage_cruise_lag_3,
    total_gross_tonnage_ferry_lag_3,
    total_gross_tonnage_entry_other_vessels_lag_3,
    total_gross_tonnage_exit_cruise_lag_3,
    total_gross_tonnage_exit_ferry_lag_3,
    total_gross_tonnage_exit_other_vessels_lag_3,
    temperature_average,
    humidity_average,
    wind_speed,
    wind_direction_east_west,
    temperature_average_lag_1:wind_direction_east_west_lag_1,
    temperature_average_lag_2,
    humidity_average_lag_2:wind_direction_east_west_lag_3,-year_2018,
    -month_December,
    -weekday_Sunday,
    -hour_0,
    -bank_day_dummy
  )

# run balance check
asIfRandPlot(
  X.matched = data_covs,
  indicator.matched = is_treated,
  assignment = c("complete", "blocked"),
  subclass = pair_number,
  perms = 1000
)

Checking Balance Improvement

Required Packages

Preparing the Data

Love Plots

Continuous Weather Covariates

Categorical Weather Covariates

Pollutants

Vessels Traffic

Road Traffic

Calendar Indicators

Overall Balance Improvement

Continuous Covariates

Categorical Covariates

Randomization Check for Covariate Balance

Corrections