Comparing hours with entering cruise traffic to hours without. Adjusting for calendar and weather indicators.
We adjust for calendar calendar indicator and weather confounding factors.
Should you have any questions, need help to reproduce the analysis or find coding errors, please do not hesitate to contact us at leo.zabrocki@gmail.com and marion.leroutier@hhs.se.
We load the following packages:
# load required packages
library(knitr) # for creating the R Markdown document
library(here) # for files paths organization
library(tidyverse) # for data manipulation and visualization
library(dtplyr) # to speed up to dplyr
library(randChecks) # for randomization check
library(ggridges) # for ridge density plots
library(Cairo) # for printing custom police of graphs
library(patchwork) # combining plots
library(kableExtra) # for table formatting
We also load our custom ggplot2 theme for graphs:
We load the matching and matched data and bind them together:
# load Initial Data
data_matching <-
readRDS(
here::here(
"inputs",
"1.data",
"1.hourly_data",
"2.data_for_analysis",
"1.matched_data",
"matching_data.rds"
)
) %>%
mutate(dataset = "Initial Data")
# load matched data
data_matched <-
readRDS(
here::here(
"inputs",
"1.data",
"1.hourly_data",
"2.data_for_analysis",
"1.matched_data",
"matched_data_entry_cruise.rds"
)
) %>%
mutate(dataset = "Matched Data")
# bind the two datasets
data <- bind_rows(data_matched, data_matching) %>% dplyr::as_data_frame()
We also load the initial data to select the lags of variables at the daily level:
# load initial data
data_daily <-
readRDS(
here::here(
"inputs",
"1.data",
"1.hourly_data",
"2.data_for_analysis",
"0.main_data",
"data_for_analysis_hourly.RDS"
)
) %>%
select(
date,
daily_mean_no2_sl_lag_1:daily_mean_o3_l_lag_2,
daily_rainfall_height_dummy_lag_1:daily_humidity_average_lag_2
) %>% dplyr::as_data_frame()
We merge the matched_data
with the
data_daily
:
data <- data %>%
left_join(., data_daily, by = "date")
We change labels of the is_treated
variable :
data <- data %>%
mutate(is_treated = ifelse(is_treated == "TRUE", "True", "False"))
# compute figures for the love plot
data_weather_continuous <- data %>%
select(
dataset,
is_treated,
pair_number,
contains("temperature"),
contains("humidity"),
contains("wind_speed")
) %>%
pivot_longer(
cols = -c(pair_number, is_treated, dataset),
names_to = "variable",
values_to = "values"
) %>%
mutate(
weather_variable = NA %>%
ifelse(
str_detect(variable, "temperature_average"),
"Average Temperature",
.
) %>%
ifelse(
str_detect(variable, "humidity_average"),
"Humidity Average",
.
) %>%
ifelse(str_detect(variable, "wind_speed"), "Wind Speed", .)
) %>%
mutate(
time = 0 %>%
ifelse(str_detect(variable, "lag_1"),-1, .) %>%
ifelse(str_detect(variable, "lag_2"),-2, .) %>%
ifelse(str_detect(variable, "lag_3"),-3, .) %>%
ifelse(str_detect(variable, "lead_1"), 1, .) %>%
ifelse(str_detect(variable, "lead_2"), 2, .) %>%
ifelse(str_detect(variable, "lead_3"), 3, .)
) %>%
filter(time<= 0) %>%
mutate(level = "Hourly" %>%
ifelse(str_detect(variable, "daily"), "Daily", .)) %>%
filter(!is.na(time)) %>%
mutate(level_time = paste(level, "Lag", time, sep = " ")) %>%
select(dataset,
pair_number,
is_treated,
weather_variable,
level_time,
values)
data_abs_difference_continuous_weather <-
data_weather_continuous %>%
group_by(dataset, weather_variable, level_time, is_treated) %>%
summarise(mean_values = mean(values, na.rm = TRUE)) %>%
summarise(abs_difference = abs(mean_values[2] - mean_values[1]))
data_sd_weather_continuous <- data_weather_continuous %>%
filter(is_treated == "True" & dataset == "Initial Data") %>%
group_by( weather_variable, level_time, is_treated) %>%
summarise(sd_treatment = sd(values, na.rm = TRUE)) %>%
ungroup() %>%
select(weather_variable, level_time, sd_treatment)
data_love_continuous_weather <-
left_join(
data_abs_difference_continuous_weather,
data_sd_weather_continuous,
by = c("weather_variable", "level_time")
) %>%
mutate(standardized_difference = abs_difference / sd_treatment) %>%
select(-c(abs_difference, sd_treatment)) %>%
mutate(level_time = fct_relevel(level_time, "Daily Lag -1", "Daily Lag -2", "Hourly Lag 0", "Hourly Lag -1", "Hourly Lag -2", "Hourly Lag -3"))
# make the graph
graph_love_plot_continuous_weather <-
ggplot(
data_love_continuous_weather,
aes(
y = fct_rev(level_time),
x = standardized_difference,
colour = fct_rev(dataset),
shape = fct_rev(dataset),
)
) +
geom_vline(xintercept = 0) +
geom_vline(xintercept = 0.1,
color = "black",
linetype = "dashed") +
geom_point(
size = 2,
alpha = 0.8
) +
scale_colour_manual(name = "Dataset: ", values = c(my_blue, my_orange)) +
scale_shape_manual(name = "Dataset: ", values = c(16, 17)) +
facet_wrap( ~ weather_variable, scales = "free_y") +
xlab("Standardized Mean Differences") +
ylab("") +
theme_tufte()
# plot the graph
graph_love_plot_continuous_weather
# save the graph
ggsave(
graph_love_plot_continuous_weather,
filename = here::here(
"inputs",
"3.outputs",
"1.hourly_analysis",
"2.experiment_cruise",
"1.checking_matching_procedure",
"graph_love_plot_continuous_weather.pdf"
),
width = 55,
height = 20,
units = "cm",
device = cairo_pdf
)
# compute figures for the love plot
data_weather_categorical <- data %>%
select(
dataset,
is_treated,
contains("rainfall_height_dummy"),
contains("wind_direction_east_west")
) %>%
mutate_at(vars(contains("rainfall")), ~ ifelse(. == 1, "True", "False")) %>%
mutate_all( ~ as.character(.)) %>%
pivot_longer(
cols = -c(dataset, is_treated),
names_to = "variable",
values_to = "values"
) %>%
# group by is_treated, variable and values
group_by(dataset, is_treated, variable, values) %>%
# compute the number of observations
summarise(n = n()) %>%
# compute the proportion
mutate(freq = round(n / sum(n) * 100, 0)) %>%
ungroup() %>%
mutate(
weather_variable = NA %>%
ifelse(str_detect(variable, "wind"), "Wind Direction", .) %>%
ifelse(str_detect(variable, "rainfall"), "Rainfall Dummy", .)
) %>%
mutate(
time = 0 %>%
ifelse(str_detect(variable, "lag_1"),-1, .) %>%
ifelse(str_detect(variable, "lag_2"),-2, .) %>%
ifelse(str_detect(variable, "lag_3"),-3, .) %>%
ifelse(str_detect(variable, "lead_1"), 1, .) %>%
ifelse(str_detect(variable, "lead_2"), 2, .) %>%
ifelse(str_detect(variable, "lead_3"), 3, .)
) %>%
filter(time<= 0) %>%
mutate(level = "Hourly" %>%
ifelse(str_detect(variable, "daily"), "Daily", .)) %>%
filter(!is.na(time)) %>%
mutate(variable = paste(level, weather_variable, "\nLag", time, sep = " ")) %>%
select(dataset, is_treated, weather_variable, variable, values, freq) %>%
pivot_wider(names_from = is_treated, values_from = freq) %>%
mutate(abs_difference = abs(`True` - `False`)) %>%
filter(values != "False" & values != "West")
# create the figure for wind direction
graph_love_plot_wind_direction <- data_weather_categorical %>%
filter(weather_variable == "Wind Direction") %>%
mutate(variable = fct_relevel(variable, "Daily Wind Direction \nLag -1", "Daily Wind Direction \nLag -2", "Hourly Wind Direction \nLag 0", "Hourly Wind Direction \nLag -1", "Hourly Wind Direction \nLag -2", "Hourly Wind Direction \nLag -3")) %>%
ggplot(., aes(
y = fct_rev(values),
x = abs_difference,
colour = fct_rev(dataset),
shape = fct_rev(dataset)
)) +
geom_vline(xintercept = 0) +
geom_point(
size = 2,
alpha = 0.8
) +
scale_colour_manual(name = "Dataset: ", values = c(my_blue, my_orange)) +
scale_shape_manual(name = "Dataset: ", values = c(16, 17)) +
facet_wrap( ~ variable, scales = "free_y", ncol = 4) +
xlab("Absolute Difference in Percentage Points") +
ylab("") +
theme_tufte()
# print the figure for wind direction
graph_love_plot_wind_direction
# save the figure for wind direction
ggsave(
graph_love_plot_wind_direction,
filename = here::here(
"inputs",
"3.outputs",
"1.hourly_analysis",
"2.experiment_cruise",
"1.checking_matching_procedure",
"graph_love_plot_wind_direction.pdf"
),
width = 60,
height = 30,
units = "cm",
device = cairo_pdf
)
# create the figure for rainfall dummy
graph_love_plot_rainfall <- data_weather_categorical %>%
filter(weather_variable == "Rainfall Dummy") %>%
mutate(variable = fct_relevel(variable, "Daily Rainfall Dummy \nLag -1", "Daily Rainfall Dummy \nLag -2", "Hourly Rainfall Dummy \nLag 0", "Hourly Rainfall Dummy \nLag -1", "Hourly Rainfall Dummy \nLag -2", "Hourly Rainfall Dummy \nLag -3")) %>%
ggplot(., aes(
y = fct_rev(variable),
x = abs_difference,
colour = fct_rev(dataset),
shape = fct_rev(dataset)
)) +
geom_vline(xintercept = 0) +
geom_point(
size = 2,
alpha = 0.8
) +
scale_colour_manual(name = "Dataset: ", values = c(my_blue, my_orange)) +
scale_shape_manual(name = "Dataset: ", values = c(16, 17)) +
xlab("Absolute Difference in Percentage Points") +
ylab("") +
theme_tufte()
# print the figure for rainfall dummy
graph_love_plot_rainfall
# save the figure for rainfall dummy
ggsave(
graph_love_plot_rainfall,
filename = here::here(
"inputs",
"3.outputs",
"1.hourly_analysis",
"2.experiment_cruise",
"1.checking_matching_procedure",
"graph_love_plot_rainfall.pdf"
),
width = 30,
height = 30,
units = "cm",
device = cairo_pdf
)
# compute figures for the love plot
data_pollutants <- data %>%
select(
dataset,
is_treated,
pair_number,
contains("no2"),
contains("o3"),
contains("pm10"),
contains("pm25"),
contains("so2")
) %>%
pivot_longer(
cols = -c(pair_number, is_treated, dataset),
names_to = "variable",
values_to = "values"
) %>%
mutate(
pollutant = NA %>%
ifelse(str_detect(variable, "no2_l"), "NO2 Longchamp", .) %>%
ifelse(str_detect(variable, "no2_sl"), "NO2 Saint-Louis", .) %>%
ifelse(str_detect(variable, "o3"), "O3 Lonchamp", .) %>%
ifelse(str_detect(variable, "pm10_l"), "PM10 Longchamp", .) %>%
ifelse(str_detect(variable, "pm10_sl"), "PM10 Saint-Louis", .) %>%
ifelse(str_detect(variable, "pm25"), "PM2.5 Longchamp", .) %>%
ifelse(str_detect(variable, "so2"), "SO2 Longchamp", .)
) %>%
mutate(
time = 0 %>%
ifelse(str_detect(variable, "lag_1"),-1, .) %>%
ifelse(str_detect(variable, "lag_2"),-2, .) %>%
ifelse(str_detect(variable, "lag_3"),-3, .) %>%
ifelse(str_detect(variable, "lead_1"), 1, .) %>%
ifelse(str_detect(variable, "lead_2"), 2, .) %>%
ifelse(str_detect(variable, "lead_3"), 3, .)
) %>%
filter(time<= 0) %>%
mutate(level = "Hourly" %>%
ifelse(str_detect(variable, "daily"), "Daily", .)) %>%
filter(!is.na(time)) %>%
mutate(level_time = paste(level, "Lag", time, sep = " ")) %>%
select(dataset,
pair_number,
is_treated,
pollutant,
level_time,
values)
data_abs_difference_pollutants <- data_pollutants %>%
group_by(dataset, pollutant, level_time, is_treated) %>%
summarise(mean_values = mean(values, na.rm = TRUE)) %>%
summarise(abs_difference = abs(mean_values[2] - mean_values[1]))
data_sd_pollutants <- data_pollutants %>%
filter(is_treated == "True" & dataset == "Initial Data") %>%
group_by(pollutant, level_time, is_treated) %>%
summarise(sd_treatment = sd(values, na.rm = TRUE)) %>%
ungroup() %>%
select(pollutant, level_time, sd_treatment)
data_love_pollutants <-
left_join(data_abs_difference_pollutants,
data_sd_pollutants,
by = c("pollutant", "level_time")) %>%
mutate(standardized_difference = abs_difference / sd_treatment) %>%
select(-c(abs_difference, sd_treatment)) %>%
mutate(level_time = fct_relevel(level_time, "Daily Lag -1", "Daily Lag -2", "Hourly Lag 0", "Hourly Lag -1", "Hourly Lag -2", "Hourly Lag -3"))
# create the graph
graph_love_plot_pollutants <-
ggplot(data_love_pollutants,
aes(
y = fct_rev(level_time),
x = standardized_difference,
colour = fct_rev(dataset),
shape = fct_rev(dataset)
)) +
geom_vline(xintercept = 0) +
geom_vline(xintercept = 0.1,
color = "black",
linetype = "dashed") +
geom_point(
size = 2,
alpha = 0.8
) +
scale_colour_manual(name = "Dataset: ", values = c(my_blue, my_orange)) +
scale_shape_manual(name = "Dataset: ", values = c(16, 17)) +
facet_wrap( ~ pollutant, scales = "free_y") +
xlab("Standardized Mean Differences") +
ylab("") +
theme_tufte()
# print the graph
graph_love_plot_pollutants
# save the graph
ggsave(
graph_love_plot_pollutants,
filename = here::here(
"inputs",
"3.outputs",
"1.hourly_analysis",
"2.experiment_cruise",
"1.checking_matching_procedure",
"graph_love_plot_pollutants.pdf"
),
width = 60,
height = 40,
units = "cm",
device = cairo_pdf
)
# compute figures for the love plot
data_tonnage <- data %>%
# select relevant variables
select(
dataset,
pair_number,
is_treated,
contains("total_gross_tonnage_entry_cruise"),
contains("total_gross_tonnage_exit_cruise"),
contains("total_gross_tonnage_entry_ferry"),
contains("total_gross_tonnage_exit_ferry"),
contains("total_gross_tonnage_entry_other_vessels"),
contains("total_gross_tonnage_exit_other_vessels")
) %>%
# transform data in long format
pivot_longer(
cols = -c(dataset, pair_number, is_treated),
names_to = "variable",
values_to = "tonnage"
) %>%
# create vessel type variable
mutate(
vessel_type = NA %>%
ifelse(str_detect(variable, "cruise"), "Cruise", .) %>%
ifelse(str_detect(variable, "ferry"), "Ferry", .) %>%
ifelse(str_detect(variable, "other_vessels"), "Other Vessels", .)
) %>%
# create the day variable
mutate(entry_exit = NA %>%
ifelse(str_detect(variable, "entry"), "Entry", .) %>%
ifelse(str_detect(variable, "exit"), "Exit", .)) %>%
mutate(
time = 0 %>%
ifelse(str_detect(variable, "lag_1"),-1, .) %>%
ifelse(str_detect(variable, "lag_2"),-2, .) %>%
ifelse(str_detect(variable, "lag_3"),-3, .) %>%
ifelse(str_detect(variable, "lead_1"), 1, .) %>%
ifelse(str_detect(variable, "lead_2"), 2, .) %>%
ifelse(str_detect(variable, "lead_3"), 3, .)
) %>%
select(dataset,
pair_number,
vessel_type,
is_treated,
entry_exit,
time,
tonnage)
data_abs_difference_tonnage <- data_tonnage %>%
group_by(dataset, vessel_type, entry_exit, time, is_treated) %>%
summarise(mean_tonnage = mean(tonnage, na.rm = TRUE)) %>%
summarise(abs_difference = abs(mean_tonnage[2] - mean_tonnage[1]))
data_sd_tonnage <- data_tonnage %>%
filter(is_treated == "True" & dataset == "Initial Data") %>%
group_by(vessel_type, entry_exit, time, is_treated) %>%
summarise(sd_treatment = sd(tonnage, na.rm = TRUE)) %>%
ungroup() %>%
select(vessel_type, entry_exit, time, sd_treatment)
data_love_tonnage <-
left_join(
data_abs_difference_tonnage,
data_sd_tonnage,
by = c("vessel_type", "entry_exit", "time")
) %>%
mutate(standardized_difference = abs_difference / sd_treatment) %>%
select(-c(abs_difference, sd_treatment)) %>%
filter(!(vessel_type == "Cruise" & time == 0))
# create the graph
graph_love_plot_tonnage <-
ggplot(data_love_tonnage,
aes(
x = standardized_difference,
y = as.factor(time),
colour = fct_rev(dataset),
shape = fct_rev(dataset)
)) +
geom_vline(xintercept = 0) +
geom_vline(xintercept = 0.1,
color = "black",
linetype = "dashed") +
geom_point(
size = 2,
alpha = 0.8
) +
scale_colour_manual(name = "Dataset: ", values = c(my_blue, my_orange)) +
scale_shape_manual(name = "Dataset: ", values = c(16, 17)) +
facet_grid(entry_exit ~ vessel_type) +
xlab("Standardized Mean Differences") +
ylab("Hour") +
theme_tufte()
# print the graph
graph_love_plot_tonnage
# save the graph
ggsave(
graph_love_plot_tonnage,
filename = here::here(
"inputs",
"3.outputs",
"1.hourly_analysis",
"2.experiment_cruise",
"1.checking_matching_procedure",
"graph_love_plot_tonnage.pdf"
),
width = 40,
height = 40,
units = "cm",
device = cairo_pdf
)
# compute figures for the love plot
data_road <- data %>%
# select relevant variables
select(dataset,
pair_number,
is_treated,
contains("road_traffic_flow")) %>%
# transform data in long format
pivot_longer(
cols = -c(dataset, pair_number, is_treated),
names_to = "road_traffic",
values_to = "value"
) %>%
mutate(
time = 0 %>%
ifelse(str_detect(road_traffic, "lag_1"),-1, .) %>%
ifelse(str_detect(road_traffic, "lag_2"),-2, .) %>%
ifelse(str_detect(road_traffic, "lag_3"),-3, .) %>%
ifelse(str_detect(road_traffic, "lead_1"), 1, .) %>%
ifelse(str_detect(road_traffic, "lead_2"), 2, .) %>%
ifelse(str_detect(road_traffic, "lead_3"), 3, .)
) %>%
select(dataset, pair_number, road_traffic, is_treated, time, value)
data_abs_difference_road <- data_road %>%
group_by(dataset, road_traffic, time, is_treated) %>%
summarise(mean_value = mean(value, na.rm = TRUE)) %>%
summarise(abs_difference = abs(mean_value[2] - mean_value[1]))
data_sd_tonnage <- data_road %>%
filter(is_treated == "True" & dataset == "Initial Data") %>%
group_by(road_traffic, time, is_treated) %>%
summarise(sd_treatment = sd(value, na.rm = TRUE)) %>%
ungroup() %>%
select(road_traffic, time, sd_treatment)
data_love_road <-
left_join(data_abs_difference_road,
data_sd_tonnage,
by = c("road_traffic", "time")) %>%
mutate(standardized_difference = abs_difference / sd_treatment) %>%
select(-c(abs_difference, sd_treatment))
# create the graph
graph_love_plot_road <-
ggplot(
data_love_road,
aes(
x = standardized_difference,
y = as.factor(time),
colour = fct_rev(dataset),
shape = fct_rev(dataset)
)
) +
geom_vline(xintercept = 0) +
geom_vline(xintercept = 0.1,
color = "black",
linetype = "dashed") +
geom_point(size = 2, alpha = 0.8) +
scale_colour_manual(name = "Dataset:", values = c(my_blue, my_orange)) +
scale_shape_manual(name = "Dataset:", values = c(16, 17)) +
xlab("Standardized Mean Differences") +
ylab("Hour") +
theme_tufte()
# print the graph
graph_love_plot_road
# save the graph
ggsave(
graph_love_plot_road,
filename = here::here(
"inputs",
"3.outputs",
"1.hourly_analysis",
"2.experiment_cruise",
"1.checking_matching_procedure",
"graph_love_plot_road.pdf"
),
width = 25,
height = 20,
units = "cm",
device = cairo_pdf
)
Create the relevant data:
# compute figures for the love plot
data_calendar <- data %>%
mutate(weekday = lubridate::wday(date, abbr = FALSE, label = TRUE)) %>%
select(dataset,
is_treated,
hour,
weekday,
holidays_dummy,
bank_day_dummy,
month,
year) %>%
mutate_at(vars(holidays_dummy, bank_day_dummy),
~ ifelse(. == 1, "True", "False")) %>%
mutate_all( ~ as.character(.)) %>%
pivot_longer(
cols = -c(dataset, is_treated),
names_to = "variable",
values_to = "values"
) %>%
# group by is_treated, variable and values
group_by(dataset, is_treated, variable, values) %>%
# compute the number of observations
summarise(n = n()) %>%
# compute the proportion
mutate(freq = round(n / sum(n) * 100, 0)) %>%
ungroup() %>%
mutate(
calendar_variable = NA %>%
ifelse(str_detect(variable, "hour"), "Hour", .) %>%
ifelse(str_detect(variable, "weekday"), "Day of the Week", .) %>%
ifelse(str_detect(variable, "holidays_dummy"), "Holidays", .) %>%
ifelse(str_detect(variable, "bank_day_dummy"), "Bank Day", .) %>%
ifelse(str_detect(variable, "month"), "Month", .) %>%
ifelse(str_detect(variable, "year"), "Year", .)
) %>%
select(dataset, is_treated, calendar_variable, values, freq) %>%
pivot_wider(names_from = is_treated, values_from = freq) %>%
mutate(abs_difference = abs(`True` - `False`)) %>%
filter(values != "False")
Plot for hours:
# graph for hour
graph_love_plot_hour <- data_calendar %>%
filter(calendar_variable == "Hour") %>%
ggplot(., aes(
y = as.factor(as.numeric(values)),
x = abs_difference,
colour = fct_rev(dataset),
shape = fct_rev(dataset)
)) +
geom_vline(xintercept = 0) +
geom_point(size = 2, alpha = 0.8) +
scale_colour_manual(name = "Dataset:", values = c(my_blue, my_orange)) +
scale_shape_manual(name = "Dataset:", values = c(16, 17)) +
xlab("Absolute Difference in Percentage Points") +
ylab("") +
theme_tufte()
# print the plot
graph_love_plot_hour
# save the plot
ggsave(
graph_love_plot_hour,
filename = here::here(
"inputs",
"3.outputs",
"1.hourly_analysis",
"2.experiment_cruise",
"1.checking_matching_procedure",
"graph_love_plot_hour.pdf"
),
width = 30,
height = 30,
units = "cm",
device = cairo_pdf
)
Plot for bank days and holidays:
# graph for bank days and holidays
graph_love_plot_bank_holidays <- data_calendar %>%
filter(calendar_variable %in% c("Bank Day", "Holidays")) %>%
ggplot(., aes(
y = values,
x = abs_difference,
colour = fct_rev(dataset),
shape = fct_rev(dataset)
)) +
geom_vline(xintercept = 0) +
geom_point(size = 2, alpha = 0.8) +
scale_colour_manual(name = "Dataset:", values = c(my_blue, my_orange)) +
scale_shape_manual(name = "Dataset:", values = c(16, 17)) +
facet_wrap( ~ calendar_variable) +
xlab("Absolute Difference in Percentage Points") +
ylab("") +
theme_tufte()
# print the plot
graph_love_plot_bank_holidays
# save the plot
ggsave(
graph_love_plot_bank_holidays,
filename = here::here(
"inputs",
"3.outputs",
"1.hourly_analysis",
"2.experiment_cruise",
"1.checking_matching_procedure",
"graph_love_plot_bank_holidays.pdf"
),
width = 30,
height = 20,
units = "cm",
device = cairo_pdf
)
Plot for days of the week:
# graph for weekdays
graph_love_plot_weekday <- data_calendar %>%
filter(calendar_variable == "Day of the Week") %>%
mutate(
values = fct_relevel(
values,
"Monday",
"Tuesday",
"Wednesday",
"Thursday",
"Friday",
"Saturday",
"Sunday"
)
) %>%
ggplot(., aes(
y = fct_rev(values),
x = abs_difference,
colour = fct_rev(dataset),
shape = fct_rev(dataset)
)) +
geom_vline(xintercept = 0) +
geom_point(size = 2, alpha = 0.8) +
scale_colour_manual(name = "Dataset:", values = c(my_blue, my_orange)) +
scale_shape_manual(name = "Dataset:", values = c(16, 17)) +
xlab("Absolute Difference in Percentage Points") +
ylab("") +
theme_tufte()
# print the plot
graph_love_plot_weekday
# save the plot
ggsave(
graph_love_plot_weekday,
filename = here::here(
"inputs",
"3.outputs",
"1.hourly_analysis",
"2.experiment_cruise",
"1.checking_matching_procedure",
"graph_love_plot_weekday.pdf"
),
width = 30,
height = 20,
units = "cm",
device = cairo_pdf
)
Plot for months:
# graph for month
graph_love_plot_month <- data_calendar %>%
filter(calendar_variable == "Month") %>%
mutate(
values = fct_relevel(
values,
"January",
"February",
"March",
"April",
"May",
"June",
"July",
"August",
"September",
"October",
"November",
"December"
)
) %>%
ggplot(., aes(
y = fct_rev(values),
x = abs_difference,
colour = fct_rev(dataset),
shape = fct_rev(dataset)
)) +
geom_vline(xintercept = 0) +
geom_point(size = 2, alpha = 0.8) +
scale_colour_manual(name = "Dataset:", values = c(my_blue, my_orange)) +
scale_shape_manual(name = "Dataset:", values = c(16, 17)) +
ggtitle("Month") +
xlab("Absolute Difference in Percentage Points") +
ylab("") +
theme_tufte()
# print the plot
graph_love_plot_month
# save the plot
ggsave(
graph_love_plot_month,
filename = here::here(
"inputs",
"3.outputs",
"1.hourly_analysis",
"2.experiment_cruise",
"1.checking_matching_procedure",
"graph_love_plot_month.pdf"
),
width = 30,
height = 20,
units = "cm",
device = cairo_pdf
)
Plot for years:
# graph for year
graph_love_plot_year <- data_calendar %>%
filter(calendar_variable == "Year") %>%
ggplot(., aes(
y = as.factor(as.numeric(values)),
x = abs_difference,
colour = fct_rev(dataset),
shape = fct_rev(dataset)
)) +
geom_vline(xintercept = 0) +
geom_point(size = 2, alpha = 0.8) +
scale_colour_manual(name = "Dataset:", values = c(my_blue, my_orange)) +
scale_shape_manual(name = "Dataset:", values = c(16, 17)) +
ggtitle("Year") +
xlab("Absolute Difference in Percentage Points") +
ylab("") +
theme_tufte()
# print the graph
graph_love_plot_year
# save the plot
ggsave(
graph_love_plot_year,
filename = here::here(
"inputs",
"3.outputs",
"1.hourly_analysis",
"2.experiment_cruise",
"1.checking_matching_procedure",
"graph_love_plot_year.pdf"
),
width = 30,
height = 20,
units = "cm",
device = cairo_pdf
)
We finally plot the distribution of standardized mean differences for continuous covariates or the absolute percentage points differences for categorical covariates between treated and control units before and after matching.
# we select the dataset indicator and the standardized difference
data_love_tonnage <- data_love_tonnage %>%
ungroup() %>%
filter(time < 0) %>%
select(dataset, standardized_difference)
data_love_pollutants <- data_love_pollutants %>%
ungroup() %>%
filter(parse_number(as.character(level_time)) < 0) %>%
select(dataset, standardized_difference)
data_love_continuous_weather <- data_love_continuous_weather %>%
ungroup() %>%
filter(parse_number(as.character(level_time)) <= 0) %>%
select(dataset, standardized_difference)
data_love_road <- data_love_road %>%
ungroup() %>%
filter(time <= 0) %>%
select(dataset, standardized_difference)
data_continuous_love <-
bind_rows(data_love_tonnage, data_love_pollutants) %>%
bind_rows(., data_love_continuous_weather) %>%
bind_rows(., data_love_road)
# create the graph
graph_boxplot_continuous_balance_improvement <-
ggplot(data_continuous_love,
aes(x = dataset, y = standardized_difference)) +
geom_boxplot(colour = my_blue) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 5)) +
xlab("Dataset") +
ylab("Standardized \nMean Differences") +
theme_tufte()
# print the graph
graph_boxplot_continuous_balance_improvement
# save the graph
ggsave(
graph_boxplot_continuous_balance_improvement,
filename = here::here(
"inputs",
"3.outputs",
"1.hourly_analysis",
"2.experiment_cruise",
"1.checking_matching_procedure",
"graph_boxplot_continuous_balance_improvement.pdf"
),
width = 30,
height = 20,
units = "cm",
device = cairo_pdf
)
# we select the dataset indicator and the standardized difference
data_calendar <- data_calendar %>%
ungroup() %>%
# we remove the month variable
#filter(calendar_variable != "Month") %>%
select(dataset, abs_difference) %>%
drop_na()
data_weather_categorical <- data_weather_categorical %>%
ungroup() %>%
select(dataset, abs_difference)
data_categorical_love <-
bind_rows(data_calendar, data_weather_categorical) %>%
drop_na()
# create the graph
graph_boxplot_categorical_balance_improvement <-
ggplot(data_categorical_love, aes(x = dataset, y = abs_difference)) +
geom_boxplot(colour = my_blue) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 5)) +
xlab("Dataset") +
ylab("Absolute Difference \nin Percentage Points") +
theme_tufte()
# print the graph
graph_boxplot_categorical_balance_improvement
# save the graph
ggsave(
graph_boxplot_categorical_balance_improvement,
filename = here::here(
"inputs",
"3.outputs",
"1.hourly_analysis",
"2.experiment_cruise",
"1.checking_matching_procedure",
"graph_boxplot_categorical_balance_improvement.pdf"
),
width = 30,
height = 20,
units = "cm",
device = cairo_pdf
)
We combine the two previous plots in a single figure:
# combine plots
graph_overall_balance <-
graph_boxplot_continuous_balance_improvement + graph_boxplot_categorical_balance_improvement +
plot_annotation(tag_levels = 'A') &
theme(plot.tag = element_text(size = 30, face = "bold"))
# save the plot
ggsave(
graph_overall_balance,
filename = here::here(
"inputs",
"3.outputs",
"1.hourly_analysis",
"2.experiment_cruise",
"1.checking_matching_procedure",
"graph_overall_balance_hourly.pdf"
),
width = 20,
height = 10,
units = "cm",
device = cairo_pdf
)
And we compute the overall figures for imbalance before and after matching:
# compute average imbalance before and after matching
data_categorical_love <- data_categorical_love %>%
mutate(Type = "Categorical (Difference in Percentage Points)") %>%
rename(standardized_difference = abs_difference)
data_continuous_love %>%
mutate(Type = "Continuous (Standardized Difference)") %>%
bind_rows(data_categorical_love) %>%
group_by(Type, dataset) %>%
summarise("Mean Imbalance" = round(mean(standardized_difference), 2)) %>%
rename(Dataset = dataset) %>%
knitr::kable(., align = c("l", "l", "c")) %>%
kable_styling(position = "center")
Type | Dataset | Mean Imbalance |
---|---|---|
Categorical (Difference in Percentage Points) | Initial Data | 4.00 |
Categorical (Difference in Percentage Points) | Matched Data | 0.65 |
Continuous (Standardized Difference) | Initial Data | 0.29 |
Continuous (Standardized Difference) | Matched Data | 0.07 |
Finally, we implement the procedure of Zach Branson (2021) to carry out a randomization check to test whether daily observations are as-if randomized according to observed covariates.
# we select covariates
data_covs <- data_matched %>%
select(
pair_number,
is_treated,
total_gross_tonnage_ferry,
total_gross_tonnage_entry_ferry,
total_gross_tonnage_entry_other_vessels,
total_gross_tonnage_exit_cruise,
total_gross_tonnage_exit_ferry,
total_gross_tonnage_exit_other_vessels,
total_gross_tonnage_cruise_lag_1,
total_gross_tonnage_ferry_lag_1,
total_gross_tonnage_entry_ferry_lag_1,
total_gross_tonnage_entry_other_vessels_lag_1,
total_gross_tonnage_exit_cruise_lag_1,
total_gross_tonnage_exit_ferry_lag_1,
total_gross_tonnage_exit_other_vessels_lag_1,
total_gross_tonnage_cruise_lag_2,
total_gross_tonnage_ferry_lag_2,
total_gross_tonnage_entry_ferry_lag_2,
total_gross_tonnage_entry_other_vessels_lag_2,
total_gross_tonnage_exit_cruise_lag_2,
total_gross_tonnage_exit_ferry_lag_2,
total_gross_tonnage_exit_other_vessels_lag_2,
total_gross_tonnage_cruise_lag_3,
total_gross_tonnage_ferry_lag_3,
total_gross_tonnage_entry_ferry_lag_3,
total_gross_tonnage_entry_other_vessels_lag_3,
total_gross_tonnage_exit_cruise_lag_3,
total_gross_tonnage_exit_ferry_lag_3,
total_gross_tonnage_exit_other_vessels_lag_3,
temperature_average:wind_speed,
wind_direction_east_west,
temperature_average_lag_1:wind_speed_lag_1,
wind_direction_east_west_lag_1,
temperature_average_lag_2:wind_speed_lag_2,
wind_direction_east_west_lag_2,
temperature_average_lag_3:wind_speed_lag_3,
wind_direction_east_west_lag_3,
hour,
weekday,
holidays_dummy,
bank_day_dummy,
month,
year
) %>% drop_na()
# recode some variables
data_covs <- data_covs %>%
mutate(is_treated = ifelse(is_treated == TRUE, 1, 0)) %>%
mutate_at(
vars(
wind_direction_east_west ,
wind_direction_east_west_lag_1,
wind_direction_east_west_lag_2,
wind_direction_east_west_lag_3
),
~ ifelse(. == "West", 1, 0)
) %>%
fastDummies::dummy_cols(., select_columns = c("year", "month", "weekday", "hour")) %>%
select(-c("year", "month", "weekday", "hour"))
# format data for asIfRandPlot() function
pair_number <- as.numeric(data_covs$pair_number)
is_treated <- data_covs$is_treated
data_covs <- data_covs %>%
select(-pair_number, -is_treated) %>%
select(
total_gross_tonnage_ferry,
total_gross_tonnage_entry_ferry,
total_gross_tonnage_entry_other_vessels,
total_gross_tonnage_exit_cruise,
total_gross_tonnage_exit_other_vessels,
total_gross_tonnage_cruise_lag_1,
total_gross_tonnage_ferry_lag_1,
total_gross_tonnage_exit_other_vessels_lag_1,
total_gross_tonnage_cruise_lag_2,
total_gross_tonnage_ferry_lag_2,
total_gross_tonnage_exit_other_vessels_lag_1,
total_gross_tonnage_cruise_lag_2,
total_gross_tonnage_ferry_lag_2,
total_gross_tonnage_entry_ferry_lag_2,
total_gross_tonnage_entry_ferry_lag_2,
total_gross_tonnage_entry_ferry_lag_2,
total_gross_tonnage_entry_other_vessels_lag_2,
total_gross_tonnage_cruise_lag_3,
total_gross_tonnage_ferry_lag_3,
total_gross_tonnage_entry_other_vessels_lag_3,
total_gross_tonnage_exit_cruise_lag_3,
total_gross_tonnage_exit_ferry_lag_3,
total_gross_tonnage_exit_other_vessels_lag_3,
temperature_average,
humidity_average,
wind_speed,
wind_direction_east_west,
temperature_average_lag_1:wind_direction_east_west_lag_1,
temperature_average_lag_2,
humidity_average_lag_2:wind_direction_east_west_lag_3,-year_2018,
-month_December,
-weekday_Sunday,
-hour_0,
-bank_day_dummy
)
# run balance check
asIfRandPlot(
X.matched = data_covs,
indicator.matched = is_treated,
assignment = c("complete", "blocked"),
subclass = pair_number,
perms = 1000
)
If you see mistakes or want to suggest changes, please create an issue on the source repository.