Comparing days with cruise traffic to days without. Adjusting for calendar and weather indicators.
In this document, we take great care providing all steps and R codes required to check whether our most matching procedure achieved balance. We compare days where:
We adjust for calendar calendar indicator and weather confounding factors.
Should you have any questions, need help to reproduce the analysis or find coding errors, please do not hesitate to contact us at leo.zabrocki@gmail.com and marion.leroutier@hhs.se.
We load the following packages:
# load required packages
library(knitr) # for creating the R Markdown document
library(here) # for files paths organization
library(tidyverse) # for data manipulation and visualization
library(ggridges) # for ridge density plots
library(Cairo) # for printing costumed police of graphs
library(patchwork) # combining plots
We finally load our custom ggplot2
theme for graphs:
We load the matched data:
We check whether coviariates balance was achieved with the thresholds we defined for our matching procedure. We plot distributions of weather and calendar variables (Lags 0-1) and pollutants (Lag 1) for treated and control groups.
For continuous weather covariates, we draw boxplots for treated and control groups:
# we select control variables and store them in a long dataframe
data_weather_continuous_variables <- data_matched %>%
select(
temperature_average,
temperature_average_lag_1,
humidity_average,
humidity_average_lag_1,
wind_speed,
wind_speed_lag_1,
is_treated
) %>%
pivot_longer(
cols = -c(is_treated),
names_to = "variable",
values_to = "values"
) %>%
mutate(
new_variable = NA %>%
ifelse(
str_detect(variable, "temperature_average"),
"Average Temperature (°C)",
.
) %>%
ifelse(
str_detect(variable, "humidity_average"),
"Humidity Average (%)",
.
) %>%
ifelse(str_detect(variable, "wind_speed"), "Wind Speed (m/s)", .)
) %>%
mutate(time = "in t" %>%
ifelse(str_detect(variable, "lag_1"), "in t-1", .)) %>%
mutate(variable = paste(new_variable, time, sep = " ")) %>%
mutate(is_treated = if_else(is_treated == TRUE, "Treated", "Control"))
graph_boxplot_continuous_weather <-
ggplot(data_weather_continuous_variables,
aes(x = is_treated, y = values, colour = is_treated)) +
geom_violin(size = 1.2) +
geom_boxplot(width = 0.1, outlier.shape = NA) +
scale_color_manual(values = c(my_blue, my_orange)) +
ylab("Covariate Value") +
xlab("") +
labs(colour = "Units Status:") +
facet_wrap(~ variable, scale = "free", ncol = 2) +
theme_tufte()
# we print the graph
graph_boxplot_continuous_weather
# save the graph
ggsave(
graph_boxplot_continuous_weather,
filename = here::here(
"inputs",
"3.outputs",
"2.daily_analysis",
"2.analysis_pollution",
"1.cruise_experiment",
"1.checking_matching_procedure",
"graph_boxplot_continuous_weather.pdf"
),
width = 30,
height = 30,
units = "cm",
device = cairo_pdf
)
For the rainfall dummy and the wind direction categories, we plot the proportions:
# we select the rainfall variables
data_weather_categorical <- data_matched %>%
select(
rainfall_height_dummy,
rainfall_height_dummy_lag_1,
wind_direction_categories,
wind_direction_categories_lag_1,
is_treated
) %>%
mutate_at(
vars(rainfall_height_dummy:rainfall_height_dummy_lag_1),
~ ifelse(. == 1, "True", "False")
) %>%
mutate_all( ~ as.character(.)) %>%
pivot_longer(
cols = -c(is_treated),
names_to = "variable",
values_to = "values"
) %>%
# group by is_treated, variable and values
group_by(is_treated, variable, values) %>%
# compute the number of observations
summarise(n = n()) %>%
# compute the proportion
mutate(freq = round(n / sum(n) * 100, 0)) %>%
ungroup() %>%
filter(!(
variable %in% c("rainfall_height_dummy", "rainfall_height_dummy_lag_1") &
values == "False"
)) %>%
mutate(
new_variable = NA %>%
ifelse(str_detect(variable, "wind"), "Wind Direction", .) %>%
ifelse(str_detect(variable, "rainfall"), "Rainfall Dummy", .)
) %>%
mutate(time = "\nin t" %>%
ifelse(str_detect(variable, "lag_1"), "\nin t-1", .)) %>%
mutate(variable = paste(new_variable, time, sep = " ")) %>%
mutate(is_treated = if_else(is_treated == TRUE, "Treated", "Control"))
# build the graph for wind direction
graph_categorical_wd_weather <- data_weather_categorical %>%
filter(new_variable == "Wind Direction") %>%
ggplot(., aes(x = freq, y = values, fill = is_treated)) +
geom_point(shape = 21,
size = 6,
alpha = 0.8) +
scale_fill_manual(values = c(my_blue, my_orange)) +
facet_wrap( ~ variable, scales = "free") +
ylab("Proportion (%)") +
xlab("") +
labs(fill = "Units Status:") +
theme_tufte()
# we print the graph
graph_categorical_wd_weather
# build the graph for rainfall dummy
graph_categorical_rainfall_weather <- data_weather_categorical %>%
filter(new_variable == "Rainfall Dummy") %>%
ggplot(., aes(x = freq, y = variable, fill = is_treated)) +
geom_point(shape = 21,
size = 6,
alpha = 0.8) +
scale_fill_manual(values = c(my_blue, my_orange)) +
ylab("Proportion (%)") +
xlab("") +
labs(fill = "Units Status:") +
theme_tufte()
# we print the graph
graph_categorical_rainfall_weather
# combine plots
graph_categorical_weather <-
graph_categorical_wd_weather / graph_categorical_rainfall_weather +
plot_annotation(tag_levels = 'A') &
theme(plot.tag = element_text(size = 30, face = "bold"))
# save the graph
ggsave(
graph_categorical_weather,
filename = here::here(
"inputs",
"3.outputs",
"2.daily_analysis",
"2.analysis_pollution",
"1.cruise_experiment",
"1.checking_matching_procedure",
"graph_categorical_weather.pdf"
),
width = 30,
height = 20,
units = "cm",
device = cairo_pdf
)
For pollutants lag 1, we draw boxplots for treated and control groups:
# we select control variables and store them in a long dataframe
data_pollutant_variables <- data_matched %>%
select(mean_no2_l_lag_1:mean_o3_l_lag_1,
is_treated) %>%
# transform the data to long to compute the proportion of observations for each variable
pivot_longer(
cols = -c(is_treated),
names_to = "variable",
values_to = "values"
) %>%
mutate(is_treated = ifelse(is_treated == "TRUE", "Treated", "Control")) %>%
mutate(
pollutant = NA %>%
ifelse(str_detect(variable, "no2_l"), "NO2 Longchamp", .) %>%
ifelse(str_detect(variable, "no2_sl"), "NO2 Saint-Louis", .) %>%
ifelse(str_detect(variable, "o3"), "O3 in Longchamp", .) %>%
ifelse(str_detect(variable, "pm10_l"), "PM10 Longchamp", .) %>%
ifelse(str_detect(variable, "pm10_sl"), "PM10 Saint-Louis", .) %>%
ifelse(str_detect(variable, "pm25"), "PM2.5 Longchamp", .) %>%
ifelse(str_detect(variable, "so2"), "SO2 Longchamp", .)
) %>%
mutate(time = "\nin t-1") %>%
mutate(variable = paste(pollutant, time, sep = " "))
# make graph
graph_boxplot_pollutants <- data_pollutant_variables %>%
ggplot(., aes(x = is_treated, y = values, colour = is_treated)) +
geom_violin(size = 1.2) +
geom_boxplot(width = 0.1, outlier.shape = NA) +
scale_color_manual(values = c(my_blue, my_orange)) +
ylab("Concentration (µg/m³)") +
xlab("") +
labs(colour = "Units Status:") +
facet_wrap( ~ variable, scale = "free", ncol = 4) +
theme_tufte()
# we print the graph
graph_boxplot_pollutants
# save the graph
ggsave(
graph_boxplot_pollutants,
filename = here::here(
"inputs",
"3.outputs",
"2.daily_analysis",
"2.analysis_pollution",
"1.cruise_experiment",
"1.checking_matching_procedure",
"graph_boxplot_pollutants.pdf"
),
width = 40,
height = 25,
units = "cm",
device = cairo_pdf
)
For road traffic flow and occupancy rate, we draw boxplots for treated and control groups over the 0-1 lags:
# we select control variables and store them in a long dataframe
data_road_traffic_variables <- data_matched %>%
select(
road_traffic_flow_all,
road_traffic_flow_all_lag_1,
road_occupancy_rate,
road_occupancy_rate_lag_1,
is_treated
) %>%
pivot_longer(
cols = -c(is_treated),
names_to = "traffic_variable",
values_to = "value"
) %>%
mutate(
traffic_variable = case_when(
traffic_variable == "road_traffic_flow_all" ~ "Road Traffic Flow (N. Vehicles) in t",
traffic_variable == "road_traffic_flow_all_lag_1" ~ "Road Traffic Flow (N. Vehicles) in t-1",
traffic_variable == "road_occupancy_rate" ~ "Road Occupancy Rate (%) in t",
traffic_variable == "road_occupancy_rate_lag_1" ~ "Road Occupancy Rate (%) in t-1"
)
) %>%
mutate(is_treated = ifelse(is_treated == "TRUE", "Treated", "Control"))
# make graph
graph_boxplot_road_traffic <- data_road_traffic_variables %>%
ggplot(., aes(x = is_treated, y = value, colour = is_treated)) +
geom_violin(size = 1.2) +
geom_boxplot(width = 0.1, outlier.shape = NA) +
scale_color_manual(values = c(my_blue, my_orange)) +
ylab("Road Traffic Flow") +
xlab("") +
labs(colour = "Units Status:") +
facet_wrap(~ traffic_variable, scale = "free", ncol = 2) +
theme_tufte()
# we print the graph
graph_boxplot_road_traffic
# save the graph
ggsave(
graph_boxplot_road_traffic,
filename = here::here(
"inputs",
"3.outputs",
"2.daily_analysis",
"2.analysis_pollution",
"1.cruise_experiment",
"1.checking_matching_procedure",
"graph_boxplot_road_traffic.pdf"
),
width = 25,
height = 20,
units = "cm",
device = cairo_pdf
)
For calendar variables such as the day of the week, bank days and holidays we matched strictly. We plot the proportions of observations belonging to each month by treatment status:
# compute the proportions of observations belonging to each month by treatment status
data_month <- data_matched %>%
mutate(is_treated = ifelse(is_treated == "TRUE", "Treated", "Control")) %>%
select(month, is_treated) %>%
mutate(
month = recode(
month,
`1` = "January",
`2` = "February",
`3` = "March",
`4` = "April",
`5` = "May",
`6` = "June",
`7` = "July",
`8` = "August",
`9` = "September",
`10` = "October",
`11` = "November",
`12` = "December"
) %>%
fct_relevel(
.,
"January",
"February",
"March",
"April",
"May",
"June",
"July",
"August",
"September",
"October",
"November",
"December"
)
) %>%
pivot_longer(., -is_treated) %>%
group_by(name, is_treated, value) %>%
summarise(n = n()) %>%
mutate(proportion = round(n / sum(n) * 100, 0)) %>%
ungroup()
# we plot the data using cleveland dot plots
graph_month <-
ggplot(data_month,
aes(
x = as.factor(value),
y = proportion,
colour = is_treated,
group = is_treated
)) +
geom_line(size = 1.2) +
scale_colour_manual(values = c(my_blue, my_orange),
guide = guide_legend(reverse = FALSE)) +
expand_limits(x = 0, y = 0) +
ggtitle("Month") +
ylab("Proportion (%)") +
xlab("") +
labs(colour = "Units Status:") +
theme_tufte()
# we print the graph
graph_month
We plot the proportions of observations belonging to each year by treatment status:
# compute the proportions of observations belonging to each year by treatment status
data_year <- data_matched %>%
mutate(is_treated = ifelse(is_treated == "TRUE", "Treated", "Control")) %>%
select(year, is_treated) %>%
pivot_longer(.,-is_treated) %>%
group_by(name, is_treated, value) %>%
summarise(n = n()) %>%
mutate(proportion = round(n / sum(n) * 100, 0)) %>%
ungroup()
# we plot the data using cleveland dot plots
graph_year <-
ggplot(data_year,
aes(
x = as.factor(value),
y = proportion,
colour = is_treated,
group = is_treated
)) +
geom_line(size = 1.2) +
scale_colour_manual(values = c(my_blue, my_orange),
guide = guide_legend(reverse = FALSE)) +
expand_limits(x = 0, y = 0) +
ggtitle("Year") +
ylab("Proportion (%)") +
xlab("") +
labs(colour = "Units Status:") +
theme_tufte()
# we print the graph
graph_year
We combine and save the two previous plots:
# combine plots
graph_month_year <- graph_month / graph_year +
plot_annotation(tag_levels = 'A') &
theme(plot.tag = element_text(size = 30, face = "bold"))
# save the plot
ggsave(
graph_month_year,
filename = here::here(
"inputs",
"3.outputs",
"2.daily_analysis",
"2.analysis_pollution",
"1.cruise_experiment",
"1.checking_matching_procedure",
"graph_month_year.pdf"
),
width = 30,
height = 25,
units = "cm",
device = cairo_pdf
)
If you see mistakes or want to suggest changes, please create an issue on the source repository.