Comparing Days with Wind Blowing from the North-East to Days with Wind from Other Directions, Adjusting for Calendar Indicators and Other Weather Variables
In this document, we provide all the steps and R code required to estimate the influence of North-East winds on air pollutants. We compare days where winds blow from the North-East to days where they blow from other directions, adjusting for calendar indicators and weather confounding factors.
Should you have any questions, need help reproducing the analysis, or find coding errors, please do not hesitate to contact me at leo.zabrocki@psemail.eu.
To reproduce exactly the 5_script_analyzing_results.html document, we first need to have installed the required software and R packages. We can then open the 5_script_analyzing_results.Rmd file and interact with the R code chunks. Once everything is set up, we have to load the following packages:
# load required packages
library(knitr) # for creating the R Markdown document
library(here) # for file paths organization
library(tidyverse) # for data manipulation and visualization
library(retrodesign) # for assessing Type M and S errors
library(Cairo) # for printing custom fonts in graphs
library(patchwork) # for combining plots
library(DT) # for tables
We finally load our custom ggplot2 theme for graphs:
We load the matched data:
We first compute the differences in a pollutant’s concentration for each pair over time:
data_matched_wide <- data_matched %>%
mutate(is_treated = ifelse(is_treated == TRUE, "treated", "control")) %>%
select(
is_treated,
pair_number,
contains("mean_no2"),
contains("mean_o3"),
contains("mean_pm10"),
contains("mean_pm25")
) %>%
pivot_longer(
cols = -c(pair_number, is_treated),
names_to = "variable",
values_to = "concentration"
) %>%
mutate(
pollutant = NA %>%
ifelse(str_detect(variable, "no2"), "NO2", .) %>%
ifelse(str_detect(variable, "o3"), "O3", .) %>%
ifelse(str_detect(variable, "pm10"), "PM10", .) %>%
ifelse(str_detect(variable, "pm25"), "PM2.5", .)
) %>%
mutate(time = 0 %>%
ifelse(str_detect(variable, "lag_1"), -1, .) %>%
ifelse(str_detect(variable, "lead_1"), 1, .)) %>%
select(-variable) %>%
select(pair_number, is_treated, pollutant, time, concentration) %>%
pivot_wider(names_from = is_treated, values_from = concentration) %>%
unnest(cols = c(treated, control))
data_pair_difference_pollutant <- data_matched_wide %>%
mutate(difference = treated - control) %>%
select(-c(treated, control))
Boxplots for NO2:
# create the graph for no2
graph_boxplot_difference_pollutant_no2 <-
data_pair_difference_pollutant %>%
filter(str_detect(pollutant, "NO2")) %>%
ggplot(., aes(x = as.factor(time), y = difference)) +
geom_boxplot(colour = my_blue, size = 0.3) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 8)) +
facet_wrap( ~ pollutant) +
ylab("Pair Difference in \nConcentration (µg/m3)") + xlab("Day") +
theme_tufte()
# display the graph
graph_boxplot_difference_pollutant_no2
# save the graph
ggsave(
graph_boxplot_difference_pollutant_no2,
filename = here::here(
"inputs", "3.outputs",
"2.matching_analysis",
"graph_boxplot_difference_pollutant_no2.pdf"
),
width = 18,
height = 9,
units = "cm",
device = cairo_pdf
)
Boxplots for O3:
# create the graph for o3
graph_boxplot_difference_pollutant_o3 <-
data_pair_difference_pollutant %>%
filter(str_detect(pollutant, "O3")) %>%
ggplot(., aes(x = as.factor(time), y = difference)) +
geom_boxplot(colour = my_blue, size = 0.3) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 8)) +
facet_wrap( ~ pollutant) +
ylab("Pair Difference in \nConcentration (µg/m3)") + xlab("Day") +
theme_tufte()
# display the graph
graph_boxplot_difference_pollutant_o3
# save the graph
ggsave(
graph_boxplot_difference_pollutant_o3,
filename = here::here(
"inputs", "3.outputs",
"2.matching_analysis",
"graph_boxplot_difference_pollutant_o3.pdf"
),
width = 18,
height = 9,
units = "cm",
device = cairo_pdf
)
Boxplots for PM10:
# create the graph for pm10
graph_boxplot_difference_pollutant_pm10 <-
data_pair_difference_pollutant %>%
filter(str_detect(pollutant, "PM10")) %>%
ggplot(., aes(x = as.factor(time), y = difference)) +
geom_boxplot(colour = my_blue, size = 0.3) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 8)) +
facet_wrap( ~ pollutant) +
ylab("Pair Difference in \nConcentration (µg/m3)") + xlab("Day") +
theme_tufte()
# display the graph
graph_boxplot_difference_pollutant_pm10
# save the graph
ggsave(
graph_boxplot_difference_pollutant_pm10,
filename = here::here(
"inputs", "3.outputs",
"2.matching_analysis",
"graph_boxplot_difference_pollutant_pm10.pdf"
),
width = 18,
height = 9,
units = "cm",
device = cairo_pdf
)
Boxplots for PM2.5:
# create the graph for pm2.5
graph_boxplot_difference_pollutant_pm25 <-
data_pair_difference_pollutant %>%
filter(str_detect(pollutant, "PM2.5")) %>%
ggplot(., aes(x = as.factor(time), y = difference)) +
geom_boxplot(colour = my_blue, size = 0.3) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 8)) +
facet_wrap( ~ pollutant) +
ylab("Pair Difference in \nConcentration (µg/m3)") + xlab("Day") +
theme_tufte()
# display the graph
graph_boxplot_difference_pollutant_pm25
# save the graph
ggsave(
graph_boxplot_difference_pollutant_pm25,
filename = here::here(
"inputs", "3.outputs",
"2.matching_analysis",
"graph_boxplot_difference_pollutant_pm25.pdf"
),
width = 18,
height = 9,
units = "cm",
device = cairo_pdf
)
We compute confidence intervals for the average treatment effect of North-East winds using Neyman’s approach. We use the formula for the standard error of a pair-randomized experiment found in Imbens and Rubin (2015).
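Denoting \(\hat{\tau}_{j}\) the treated-minus-control difference in pair \(j\) and \(\hat{\tau} = \frac{1}{J}\sum_{j=1}^{J}\hat{\tau}_{j}\) the average difference over the \(J\) pairs, the standard error implemented in the code below is:

\[
\widehat{\text{SE}}(\hat{\tau}) = \sqrt{\frac{1}{J(J-1)}\sum_{j=1}^{J}\left(\hat{\tau}_{j} - \hat{\tau}\right)^{2}}
\]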
# we first compute the average treatment effects for each pollutant and day
data_pair_mean_difference <- data_pair_difference_pollutant %>%
filter(pollutant != "PM2.5") %>%
group_by(pollutant, time) %>%
summarise(mean_difference = mean(difference)) %>%
ungroup()
# we store the number of pairs
n_pair <- nrow(data_matched) / 2
# compute the standard error
data_se_neyman_pair <-
left_join(
data_pair_difference_pollutant,
data_pair_mean_difference,
by = c("pollutant", "time")
) %>%
mutate(squared_difference = (difference - mean_difference) ^ 2) %>%
group_by(pollutant, time) %>%
summarise(standard_error = sqrt(1 / (n_pair * (n_pair - 1)) * sum(squared_difference))) %>%
select(pollutant, time, standard_error) %>%
ungroup()
# merge the average treatment effect data with the standard error data
data_neyman <-
left_join(data_pair_mean_difference,
data_se_neyman_pair,
by = c("pollutant", "time")) %>%
# compute the 95% and 99% confidence intervals
mutate(
upper_bound_95 = mean_difference + (-qnorm((1 - 0.95) / 2) * standard_error),
lower_bound_95 = mean_difference - (-qnorm((1 - 0.95) / 2) * standard_error),
upper_bound_99 = mean_difference + (-qnorm((1 - 0.99) / 2) * standard_error),
lower_bound_99 = mean_difference - (-qnorm((1 - 0.99) / 2) * standard_error)
)
We plot below the point estimates and the associated 95% and 99% confidence intervals:
# create an indicator to alternate shading of confidence intervals
data_neyman <- data_neyman %>%
arrange(pollutant, time) %>%
mutate(stripe = ifelse((time %% 2) == 0, "Grey", "White")) %>%
ungroup()
# make the graph
graph_ci <-
ggplot(data_neyman,
aes(x = as.factor(time), y = mean_difference)) +
geom_rect(
aes(fill = stripe),
xmin = as.numeric(as.factor(data_neyman$time)) - 0.42,
xmax = as.numeric(as.factor(data_neyman$time)) + 0.42,
ymin = -Inf,
ymax = Inf,
color = NA,
alpha = 0.4
) +
geom_hline(yintercept = 0,
color = "black",
size = 0.3) +
geom_pointrange(
aes(
x = as.factor(time),
y = mean_difference,
ymin = lower_bound_95 ,
ymax = upper_bound_95
),
colour = my_blue,
fatten = 1.5,
size = 2
) +
geom_pointrange(
aes(
x = as.factor(time),
y = mean_difference,
ymin = lower_bound_99 ,
ymax = upper_bound_99
),
colour = my_blue,
lwd = 0.5
) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 8)) +
facet_wrap( ~ pollutant, ncol = 4) +
scale_fill_manual(values = c('gray96', "white")) +
guides(fill = FALSE) +
ylab("Average Increase in\n Concentrations (µg/m³)") + xlab("Day") +
theme_tufte() +
theme(axis.text.x = element_text(margin = ggplot2::margin(t = 0, unit = "cm")))
# print the graph
graph_ci
# save the graph
ggsave(
graph_ci,
filename = here::here("inputs", "3.outputs", "2.matching_analysis", "graph_ci.pdf"),
width = 16,
height = 8,
units = "cm",
device = cairo_pdf
)
We display below the table with the point estimates and the 95% and 99% confidence intervals:
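The rendered document displays this table interactively with the DT package loaded earlier; a minimal sketch of such display code, assuming the data_neyman object created above:
# hypothetical display code (a sketch, not the original chunk)
data_neyman %>%
  select(pollutant, time, mean_difference, standard_error,
         lower_bound_95, upper_bound_95,
         lower_bound_99, upper_bound_99) %>%
  mutate_if(is.numeric, ~ round(., 1)) %>%
  datatable(.)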
We can finally check whether there is also an effect of North-East winds on PM2.5 concentrations. One issue is that Paris did not have measuring stations for background PM2.5 concentrations from 2009-09-22 to 2010-06-23, that is, 274 days. We did not impute these missing concentrations but can nonetheless assess whether North-East winds influence the observed pollutant concentrations. We proceed as before but only work with pairs of days without missing PM2.5 recordings:
# we first only select pm2.5 pair differences
data_pair_difference_pollutant_pm25 <- data_pair_difference_pollutant %>%
filter(pollutant == "PM2.5")
# we then find pairs with missing PM2.5 concentrations
pairs_to_remove <- data_pair_difference_pollutant_pm25 %>%
filter(is.na(difference)) %>%
distinct(pair_number) %>%
pull(pair_number)
# we remove those pairs
data_pair_difference_pollutant_pm25 <- data_pair_difference_pollutant_pm25 %>%
filter(!(pair_number %in% pairs_to_remove))
# we compute the average treatment effects for pm2.5 and by day
data_pair_mean_difference_pm25 <- data_pair_difference_pollutant_pm25 %>%
group_by(time) %>%
summarise(mean_difference = mean(difference, na.rm = TRUE)) %>%
ungroup()
# we store the number of pairs
n_pair <- length(unique(data_pair_difference_pollutant_pm25$pair_number))
# we compute the standard error
data_se_neyman_pair_pm25 <-
left_join(
data_pair_difference_pollutant_pm25,
data_pair_mean_difference_pm25,
by = c("time")
) %>%
mutate(squared_difference = (difference - mean_difference) ^ 2) %>%
group_by(time) %>%
summarise(standard_error = sqrt(1 / (n_pair * (n_pair - 1)) * sum(squared_difference))) %>%
select(time, standard_error) %>%
ungroup()
# merge the average treatment effect data with the standard error data
data_neyman_pm25 <-
left_join(data_pair_mean_difference_pm25,
data_se_neyman_pair_pm25,
by = c("time")) %>%
# compute the 95% and 99% confidence intervals
mutate(
upper_bound_95 = mean_difference + (-qnorm((1 - 0.95) / 2) * standard_error),
lower_bound_95 = mean_difference - (-qnorm((1 - 0.95) / 2) * standard_error),
upper_bound_99 = mean_difference + (-qnorm((1 - 0.99) / 2) * standard_error),
lower_bound_99 = mean_difference - (-qnorm((1 - 0.99) / 2) * standard_error)
)
We display below the estimates for the ATE and the associated 95% and 99% confidence intervals:
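The rendered document displays this table interactively; a minimal sketch of such display code, assuming the data_neyman_pm25 object created above:
# hypothetical display code (a sketch, not the original chunk)
data_neyman_pm25 %>%
  mutate_if(is.numeric, ~ round(., 1)) %>%
  datatable(.)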
We load non-imputed air pollution data and compute for each pollutant the 0-1 daily lags and leads:
# load non-imputed data
data_not_imputed <-
readRDS(here::here("inputs", "1.data", "4.data_for_analysis", "data_pollutants_not_imputed.rds"))
# merge with matched data
data_missing <- left_join(data_matched, data_not_imputed, by = "date")
# display proportion of missing values
data_missing %>%
pivot_longer(
cols = c(not_imputed_mean_pm25:not_imputed_mean_pm10),
names_to = "Pollutant",
values_to = "concentration"
) %>%
group_by(Pollutant) %>%
summarise("Missing Proportion (%)" = round(sum(is.na(concentration)) /
n() * 100, 1)) %>%
knitr::kable(., align = c("l", "c"))
| Pollutant | Missing Proportion (%) |
|---|:---:|
| not_imputed_mean_no2 | 13.2 |
| not_imputed_mean_o3 | 8.3 |
| not_imputed_mean_pm10 | 7.4 |
| not_imputed_mean_pm25 | 23.1 |
# load non-imputed data
data_not_imputed <-
readRDS(here::here("inputs", "1.data", "4.data_for_analysis", "data_pollutants_not_imputed.rds")) %>%
select(-not_imputed_mean_pm25)
# we first define data_not_imputed_leads and data_not_imputed_lags
# to store leads and lags
data_not_imputed_leads <- data_not_imputed
data_not_imputed_lags <- data_not_imputed
#
# create leads
#
# create a list to store dataframe of leads
leads_list <- vector(mode = "list", length = 1)
names(leads_list) <- c(1)
# create the leads
for(i in 1){
leads_list[[i]] <- data_not_imputed_leads %>%
mutate_at(vars(-date), ~ lead(., n = i, order_by = date)) %>%
rename_at(vars(-date),function(x) paste0(x,"_lead_", i))
}
# merge the dataframes of leads
data_leads <- leads_list %>%
reduce(left_join, by = "date")
# merge the leads with the data_not_imputed_leads
data_not_imputed_leads <- left_join(data_not_imputed_leads, data_leads, by = "date") %>%
select(-c(not_imputed_mean_no2:not_imputed_mean_pm10))
#
# create lags
#
# create a list to store dataframe of lags
lags_list <- vector(mode = "list", length = 1)
names(lags_list) <- c(1)
# create the lags
for(i in 1){
lags_list[[i]] <- data_not_imputed_lags %>%
mutate_at(vars(-date), ~ lag(., n = i, order_by = date)) %>%
rename_at(vars(-date),function(x) paste0(x,"_lag_", i))
}
# merge the dataframes of lags
data_lags <- lags_list %>%
reduce(left_join, by = "date")
# merge the lags with the initial data_not_imputed_lags
data_not_imputed_lags <- left_join(data_not_imputed_lags, data_lags, by = "date")
#
# merge data_not_imputed_leads with data_not_imputed_lags
#
data_not_imputed <- left_join(data_not_imputed_lags, data_not_imputed_leads, by = "date")
We merge these data with the matched data and compute pair differences:
# merge with the matched_data
data_matched_with_missing_pollutants <- left_join(data_matched, data_not_imputed, by = "date")
# compute pair differences
data_matched_wide_missing_pollutants <- data_matched_with_missing_pollutants %>%
mutate(is_treated = ifelse(is_treated == TRUE, "treated", "control")) %>%
select(is_treated, pair_number, contains("not_imputed_mean_no2"), contains("not_imputed_mean_o3"), contains("not_imputed_mean_pm10")) %>%
pivot_longer(cols = -c(pair_number, is_treated), names_to = "variable", values_to = "concentration") %>%
mutate(pollutant = NA %>%
ifelse(str_detect(variable, "no2"), "NO2",.) %>%
ifelse(str_detect(variable, "o3"), "O3",.) %>%
ifelse(str_detect(variable, "pm10"), "PM10",.)) %>%
mutate(time = 0 %>%
ifelse(str_detect(variable, "lag_1"), -1, .) %>%
ifelse(str_detect(variable, "lead_1"), 1, .)) %>%
select(-variable) %>%
select(pair_number, is_treated, pollutant, time, concentration) %>%
pivot_wider(names_from = is_treated, values_from = concentration)
data_missing_pair_difference_pollutant <- data_matched_wide_missing_pollutants %>%
mutate(difference = treated-control) %>%
select(-c(treated, control))
We display below the proportion of missing differences by pollutant and day:
# make the graph
graph_missing_pollutants <- data_missing_pair_difference_pollutant %>%
group_by(pollutant, time) %>%
summarise(proportion_missing = sum(is.na(difference))/n()*100) %>%
ggplot(., aes(x = as.factor(time), y = proportion_missing)) +
geom_segment(aes(x = as.factor(time), xend = as.factor(time), y = 0, yend = proportion_missing)) +
geom_point(shape = 21, size = 4, colour = "black", fill = my_blue) +
ylim(0, 100) +
facet_wrap(~ pollutant, ncol = 4) +
xlab("Day") + ylab("Proportion of Pairs with \nMissing Concentrations (%)") +
theme_tufte()
# display the graph
graph_missing_pollutants
# save the graph
ggsave(
graph_missing_pollutants,
filename = here::here(
"inputs", "3.outputs",
"2.matching_analysis",
"graph_missing_pollutants.pdf"
),
width = 20,
height = 10,
units = "cm",
device = cairo_pdf
)
Clearly, many missing concentrations in the matched pairs were imputed. We compute the point estimates and confidence intervals for the pairs without missing values. Here, we use Wilcoxon’s signed-rank test, which also makes the results less sensitive to potential outliers (note that the point estimate returned by wilcox.test() is the Hodges-Lehmann pseudo-median of the pair differences, not their mean):
# carry out the wilcox.test
data_missing_rank_ci <- data_missing_pair_difference_pollutant %>%
drop_na() %>%
select(- pair_number) %>%
group_by(pollutant, time) %>%
nest() %>%
mutate(mean_difference = map(data, ~ wilcox.test(.$difference, conf.int = TRUE)$estimate),
lower_bound_95 = map(data, ~ wilcox.test(.$difference, conf.int = TRUE)$conf.int[1]),
upper_bound_95 = map(data, ~ wilcox.test(.$difference, conf.int = TRUE)$conf.int[2])) %>%
unnest(cols = c(mean_difference, lower_bound_95, upper_bound_95)) %>%
mutate(data = "Pairs without Missing Concentrations")
# bind with analysis on imputed concentrations
data_imputed_nonimputed <- data_neyman %>%
mutate(data = "Pairs with Imputed Concentrations") %>%
select(pollutant,
time,
mean_difference,
lower_bound_95,
upper_bound_95,
data) %>%
bind_rows(., data_missing_rank_ci)
# create an indicator to alternate shading of confidence intervals
data_imputed_nonimputed <- data_imputed_nonimputed %>%
arrange(pollutant, time) %>%
mutate(stripe = ifelse((time %% 2) == 0, "Grey", "White")) %>%
ungroup()
# make the graph
graph_ci_missing <-
ggplot(data_imputed_nonimputed,
aes(x = as.factor(time), y = mean_difference, ymin = lower_bound_95, ymax = upper_bound_95, colour = data, shape = data)) +
geom_rect(
aes(fill = stripe),
xmin = as.numeric(as.factor(data_imputed_nonimputed$time)) - 0.42,
xmax = as.numeric(as.factor(data_imputed_nonimputed$time)) + 0.42,
ymin = -Inf,
ymax = Inf,
color = NA,
alpha = 0.4
) +
geom_hline(yintercept = 0,
color = "black",
size = 0.3) +
geom_pointrange(position = position_dodge(width = 1), size = 1.2) +
scale_shape_manual(name = "Dataset:", values = c(16, 17)) +
scale_color_manual(name = "Dataset:", values = c(my_orange, my_blue)) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 8)) +
facet_wrap( ~ pollutant, ncol = 4) +
scale_fill_manual(values = c('gray96', "white")) +
guides(fill = FALSE) +
ylab("Average Increase in\n Concentrations (µg/m³)") + xlab("Day") +
theme_tufte() +
theme(axis.text.x = element_text(margin = ggplot2::margin(t = 0, unit = "cm")))
# print the graph
graph_ci_missing
# save the graph
ggsave(
graph_ci_missing,
filename = here::here("inputs", "3.outputs", "2.matching_analysis", "graph_ci_missing.pdf"),
width = 16,
height = 8,
units = "cm",
device = cairo_pdf
)
Results seem relatively robust to the imputation of missing values.
To assess whether the effects of North-East winds on PM\(_{10}\) concentrations could be due to hidden bias, we implement the studentized sensitivity analysis for the ATE developed by Colin B. Fogarty (2019). We first load the relevant functions:
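Briefly, in this family of sensitivity analyses, the parameter \(\Gamma \geq 1\) bounds how far hidden bias can push the within-pair treatment assignment probabilities away from randomization: writing \(\pi_{j}\) for the probability that the first unit of pair \(j\) is the treated one, the model assumes

\[
\frac{1}{1+\Gamma} \leq \pi_{j} \leq \frac{\Gamma}{1+\Gamma},
\]

so that \(\Gamma = 1\) corresponds to a pair-randomized experiment and larger values allow for stronger unobserved confounding.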
# load fogarty's studentized Sensitivity Analysis functions
# retrieved from http://www.mit.edu/~cfogarty/StudentizedSensitivity.R
#' StudentizedSensitivity
#'Function to perform a Studentized Sensitivity analysis on the sample average treatment
#'effect in a paired observational study
#'
#' @param PairedDiff: Vector of treated-minus-control paired differences.
#' @param null: Value of the sample average treatment effect under the null.
#' @param alpha: Desired Type I error rate.
#' @param alternative: Can be "less", "greater", or "two.sided".
#' @param Gamma: Vector of values for Gamma at which to perform the sensitivity
#' analysis.
#' @param nperm: Number of permutations to perform permutation test.
#' @param Changepoint: If true, function returns the maximal Gamma for which the
#' test rejects at level alpha.
#' @param SensitivityInterval: If true, function returns (100-alpha) sensitivity
#' intervals. They will be one-sided if the alternative is less than or greater than,
#' and two-sided if the alternative is two-sided.
#'
#' @return Gamma: Vector of Gammas for which the sensitivity analysis was performed.
#' @return pval: P-values for each value of Gamma.
#' @return GammaPval: Matrix combining Gamma and pval.
#' @return Changepoint: Maximal Gamma for which the test rejected at level alpha.
#' @return SensitivityInterval: Upper and lower bounds for 100(1-alpha) sensitivity
#' intervals for each value of Gamma.
#' @export
StudentizedSensitivity = function(PairedDiff, null = 0, alpha = 0.05,
                                  alternative = "greater", Gamma = 1,
                                  nperm = 50000, Changepoint = T,
                                  SensitivityInterval = T)
{
if(any(Gamma < 1))
{
stop("Values for Gamma must be >= 1")
}
if(alternative!="less" & alternative!= "greater" & alternative != "two.sided")
{
stop("Values for alternative are `less', `greater', or `two.sided'")
}
if(length(null) > 1)
{
stop("Value under the null must be a scalar")
}
if(alpha < 0 | alpha > 0.5)
{
stop("alpha must be between 0 and 0.5")
}
PairedDifftrue <- PairedDiff
alphatrue <- alpha
I <- length(PairedDiff)
Adjust <- PairedDiff - null
if(alternative == "less")
{
Adjust <- -Adjust
}
if(alternative == "two.sided")
{
alpha <- alphatrue/2
if(mean(Adjust) < 0)
{
Adjust <- -Adjust
}
}
pval <- rep(0, length(Gamma))
for(i in 1:length(Gamma))
{
D <- (Adjust) - (Gamma[i]-1)/(1+Gamma[i])*abs(Adjust)
obs <- mean(D)/(sd(D)/sqrt(I))
Adjmat <- matrix(abs(Adjust), I, nperm)
Zmat <- matrix(runif(I*nperm) < Gamma[i]/(1+Gamma[i]), I, nperm)
Dmat <- (2*Zmat-1)*(Adjmat) - (Gamma[i]-1)/(1+Gamma[i])*Adjmat
perm <- colMeans(Dmat)/(sqrt(colVars(Dmat)/I))
pval[i] <- (1+sum(perm>=obs))/(nperm + 1)
}
pvalret = pval
if(alternative == "two.sided")
{
pvalret = 2*pval
}
Pmatrix <- cbind(Gamma, pvalret)
colnames(Pmatrix) <- c("Gamma", "P-value")
if(Changepoint == T)
{
proceed <- StudentizedSensitivity(PairedDifftrue, null, alphatrue, alternative, Gamma=1, nperm,
Changepoint = F, SensitivityInterval = F)$pval <= alphatrue
change <- 1
if(proceed)
{
change <- uniroot(StudentizedChangepoint, interval = c(1, 30), PairedDiff = PairedDifftrue, null = null,
alpha = alphatrue, alternative = alternative, nperm = nperm,
extendInt = "upX")$root
}
}
if(SensitivityInterval == T)
{
lb = rep(-Inf, length(Gamma))
ub = rep(Inf, length(Gamma))
for(i in 1:length(Gamma))
{
# Warm Starts
UB = uniroot(BoundFinder, PairedDifftrue, Gamma[i],
interval = c(mean(PairedDifftrue), mean(PairedDifftrue)+4*sd(PairedDifftrue)/sqrt(I)), extendInt = "yes")$root
LB = -uniroot(BoundFinder, -PairedDifftrue, Gamma[i],
interval = c(-mean(PairedDifftrue)-4*sd(PairedDifftrue)/sqrt(I), -mean(PairedDifftrue)), extendInt = "yes")$root
SUB = Inf
SLB = -Inf
if(alternative == "greater")
{
SLB = uniroot(StudentizedSI, interval = c(UB-4*sd(PairedDifftrue)/sqrt(I), UB), extendInt = "yes",
Gamma = Gamma[i], PairedDiff=PairedDifftrue, alternative = "greater", alpha = alpha, nperm = nperm)$root
}
if(alternative == "less")
{
SUB = uniroot(StudentizedSI, interval = c(LB, LB + 4*sd(PairedDifftrue)/sqrt(I)), extendInt = "yes",
Gamma = Gamma[i], PairedDiff=PairedDifftrue, alternative = "less", alpha = alpha, nperm = nperm)$root
}
if(alternative == "two.sided")
{
SLB = uniroot(StudentizedSI, interval = c(UB-4*sd(PairedDifftrue)/sqrt(I), UB), extendInt = "yes",
Gamma = Gamma[i], PairedDiff=PairedDifftrue, alternative = "greater", alpha = alpha, nperm = nperm)$root
SUB = uniroot(StudentizedSI, interval = c(LB, LB+4*sd(PairedDifftrue)/sqrt(I)), extendInt = "yes",
Gamma = Gamma[i], PairedDiff=PairedDifftrue, alternative = "less", alpha = alpha, nperm = nperm)$root
}
lb[i] = SLB
ub[i] = SUB
}
SImat = cbind(Gamma, lb, ub)
colnames(SImat) = c("Gamma", "Lower Bound", "Upper Bound")
}
if(Changepoint == F & SensitivityInterval == F)
{
return(list(Gamma=Gamma, pval = pvalret, GammaPval = Pmatrix))
}
if(Changepoint == F & SensitivityInterval == T)
{
return(list(Gamma = Gamma, pval = pvalret, GammaPval = Pmatrix, SensitivityInterval = SImat))
}
if(Changepoint == T & SensitivityInterval == F)
{
return(list(Gamma = Gamma, pval = pvalret, GammaPval = Pmatrix, Changepoint = change))
}
if(Changepoint == T & SensitivityInterval == T)
{
return(list(Gamma = Gamma, pval = pvalret, GammaPval = Pmatrix, Changepoint = change,
SensitivityInterval = SImat))
}
}
#### These are auxiliary functions used for root finding and for calculating columnwise variances in StudentizedSensitivity
StudentizedChangepoint = function(Gamma, PairedDiff, null, alternative, alpha, nperm)
{
alphachange = alpha
StudentizedSensitivity(PairedDiff, null, alpha, alternative, Gamma, nperm, Changepoint = F, SensitivityInterval = F)$pval - alphachange
}
StudentizedSI = function(null, Gamma, PairedDiff, alternative, alpha, nperm)
{
StudentizedSensitivity(PairedDiff, null, alpha, alternative, Gamma, nperm, Changepoint = F, SensitivityInterval = F)$pval - alpha
}
BoundFinder = function(null, PairedDiff, Gamma)
{
mean(PairedDiff - null - (Gamma-1)/(1+Gamma)*abs(PairedDiff-null))
}
colVars <- function(x) {
N = nrow(x)
(colSums(x^2) - colSums(x)^2/N) / (N-1)
}
We select the pair differences for PM\(_{10}\) concentrations at time \(t\) and run the function for \(\Gamma = 2\):
# we select the relevant pair differences
PairedDiff <- data_pair_difference_pollutant %>%
filter(pollutant == "PM10" & time == 0) %>%
pull(difference)
# we run the function
data_sensitivity_pm10_0 <- StudentizedSensitivity(
PairedDiff,
null = 0,
alpha = 0.05,
alternative = "two.sided",
Gamma = 2,
nperm = 50000,
Changepoint = T,
SensitivityInterval = T
)$SensitivityInterval %>%
as_tibble() %>%
mutate_all(~ round(., 2))
# save results
saveRDS(data_sensitivity_pm10_0, here::here("inputs", "1.data", "5.matched_data", "data_sensitivity_pm10_0.rds"))
We display below the results:
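A minimal sketch of such display code, assuming the data_sensitivity_pm10_0 object created above:
# hypothetical display code (a sketch, not the original chunk)
knitr::kable(data_sensitivity_pm10_0, align = c("c", "c", "c"))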
We then implement the same procedure but for concentrations at time \(t+1\):
# we select the relevant pair differences
PairedDiff <- data_pair_difference_pollutant %>%
filter(pollutant == "PM10" & time == 1) %>%
pull(difference)
# we run the function
data_sensitivity_pm10_1 <- StudentizedSensitivity(
PairedDiff,
null = 0,
alpha = 0.05,
alternative = "two.sided",
Gamma = 2,
nperm = 50000,
Changepoint = T,
SensitivityInterval = T
)$SensitivityInterval %>%
as_tibble() %>%
mutate_all( ~ round(., 2))
# save results
saveRDS(data_sensitivity_pm10_1, here::here("inputs", "1.data", "5.matched_data", "data_sensitivity_pm10_1.rds"))
We display below the results:
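Again, a minimal sketch of such display code, assuming the data_sensitivity_pm10_1 object created above:
# hypothetical display code (a sketch, not the original chunk)
knitr::kable(data_sensitivity_pm10_1, align = c("c", "c", "c"))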
To check that our pair-matching procedure improves precision, we compare the estimated variance for a paired experiment with that of a completely randomized experiment (the formulas can be found in Imbens and Rubin's (2015) textbook):
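For reference, the Neyman variance estimator for a completely randomized experiment that we implement below is

\[
\widehat{\mathbb{V}}_{\text{complete}} = \frac{s_{t}^{2}}{n_{t}} + \frac{s_{c}^{2}}{n_{c}},
\]

where \(s_{t}^{2}\) and \(s_{c}^{2}\) are the sample variances of the treated and control days' concentrations and \(n_{t}\) and \(n_{c}\) the corresponding numbers of days; the paired standard error computed earlier is its counterpart for a pair-randomized experiment.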
# compute estimates of the sampling variability
# for a complete experiment
sampling_variability_complete <- data_matched %>%
select(is_treated, mean_no2, mean_o3, mean_pm10) %>%
pivot_longer(
cols = c(mean_no2:mean_pm10),
names_to = "pollutant",
values_to = "concentration"
) %>%
group_by(pollutant, is_treated) %>%
mutate(
n_obs = n(),
average_concentration = mean(concentration),
squared_difference = (concentration - average_concentration) ^
2,
variance_group = sum(squared_difference) / (n_obs - 1)
) %>%
summarise(variance_component = mean(variance_group / n_obs)) %>%
group_by(pollutant) %>%
summarise(
variance = sum(variance_component),
standard_error_complete = sqrt(variance)
) %>%
select(-variance) %>%
mutate(
pollutant = case_when(
pollutant == "mean_no2" ~ "NO2",
pollutant == "mean_o3" ~ "O3",
pollutant == "mean_pm10" ~ "PM10"
)
)
# estimates of the sampling variability for a pair experiment
sampling_variability_pair <- data_se_neyman_pair %>%
filter(time == 0 & pollutant != "PM2.5") %>%
select(-time) %>%
rename(standard_error_pair = standard_error)
# merge the two datasets and display results
left_join(sampling_variability_pair,
sampling_variability_complete,
by = c("pollutant")) %>%
mutate(
"Precision Improvement (%)" = (standard_error_pair - standard_error_complete) /
standard_error_complete * 100
) %>%
mutate_at(vars(standard_error_pair, standard_error_complete), ~ round(., 2)) %>%
mutate(`Precision Improvement (%)` = round(`Precision Improvement (%)`, 0)) %>%
rename(
"Pollutant" = pollutant,
"S.E Pair Experiment" = standard_error_pair,
"S.E Complete Experiment" = standard_error_complete
) %>%
datatable(.)
We run a simple time-stratified regression model on the matching data to see how the results differ from those found in our matched-data analysis. We adjust for calendar indicators and for weather covariates measured at times \(t\) and \(t-1\). We load the matching data:
# load matching data
data_matching <-
readRDS(here::here("inputs", "1.data", "5.matched_data", "matching_data.Rds"))
# reshape in long according to pollutants
data_regression_analysis <- data_matching %>%
pivot_longer(
cols = c(
contains("no2"),
contains("o3"),
contains("pm10"),
contains("pm25")
),
names_to = "variable",
values_to = "concentration"
) %>%
mutate(
pollutant = NA %>%
ifelse(str_detect(variable, "no2"), "NO2", .) %>%
ifelse(str_detect(variable, "o3"), "O3", .) %>%
ifelse(str_detect(variable, "pm10"), "PM10", .) %>%
ifelse(str_detect(variable, "pm25"), "PM2.5", .)
) %>%
mutate(time = 0 %>%
ifelse(str_detect(variable, "lag_1"), -1, .) %>%
ifelse(str_detect(variable, "lead_1"), 1, .)) %>%
select(-variable)
# we nest the data by pollutant and time
data_regression_analysis <- data_regression_analysis %>%
group_by(pollutant, time) %>%
nest()
We run the regression model:
# running the model
data_regression_analysis <- data_regression_analysis %>%
mutate(
# regression model
model = map(data, ~lm(concentration ~ is_treated +
temperature_average + I(temperature_average^2) +
temperature_average_lag_1 + I(temperature_average_lag_1^2) +
rainfall_duration + rainfall_duration_lag_1 +
humidity_average + humidity_average_lag_1 +
wind_speed + wind_speed_lag_1 +
weekday + holidays_dummy +
bank_day_dummy + month*as.factor(year), data = .)))
# transform in long according to models
data_regression_analysis <- data_regression_analysis %>%
pivot_longer(cols = c(model), names_to = "model", values_to = "coefficients") %>%
select(-data)
# tidy regression ouputs
data_regression_analysis <- data_regression_analysis %>%
mutate(models_dfs = map(coefficients, ~ broom::tidy(.)))
# unnest results and select the coefficient for the treatment indicator
data_regression_analysis <- data_regression_analysis %>%
unnest(models_dfs) %>%
dplyr::filter(term=="is_treatedTRUE") %>%
select(pollutant, time, estimate, std.error)
# we compute the 95% confidence intervals
interval_95 <- -qnorm((1-0.95)/2)
# compute lower and upper bound of each interval
data_regression_analysis <- data_regression_analysis %>%
mutate(ci_lower_95 = estimate - std.error*interval_95,
ci_upper_95 = estimate + std.error*interval_95)
We draw the graph of results:
# create an indicator to alternate shading of confidence intervals
data_regression_analysis <- data_regression_analysis %>%
arrange(pollutant, time) %>%
mutate(stripe = ifelse((time %% 2) == 0, "Grey", "White")) %>%
ungroup()
# make the graph
graph_regression_ci <-
ggplot(data_regression_analysis,
aes(x = as.factor(time), y = estimate)) +
geom_rect(
aes(fill = stripe),
xmin = as.numeric(as.factor(data_regression_analysis$time)) - 0.42,
xmax = as.numeric(as.factor(data_regression_analysis$time)) + 0.42,
ymin = -Inf,
ymax = Inf,
color = NA,
alpha = 0.4
) +
geom_hline(yintercept = 0,
color = "black",
size = 0.3) +
geom_pointrange(
aes(
x = as.factor(time),
y = estimate,
ymin = ci_lower_95 ,
ymax = ci_upper_95
),
size = 0.5,
colour = my_blue,
lwd = 0.8
) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 8)) +
facet_wrap( ~ pollutant, ncol = 4) +
scale_fill_manual(values = c('gray96', "white")) +
guides(fill = FALSE) +
ylab("Average Increase in\n Concentrations (µg/m³)") + xlab("Day") +
theme_tufte() +
theme(axis.text.x = element_text(margin = ggplot2::margin(t = 0, unit = "cm")))
# print the graph
graph_regression_ci
# save the graph
ggsave(
graph_regression_ci,
filename = here::here("inputs", "3.outputs", "2.matching_analysis", "graph_regression_ci.pdf"),
width = 16,
height = 8,
units = "cm",
device = cairo_pdf
)
We display below the values of the point estimates and 95% confidence intervals:
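The rendered document displays these values interactively; a minimal sketch of such display code, assuming the data_regression_analysis object created above:
# hypothetical display code (a sketch, not the original chunk)
data_regression_analysis %>%
  select(pollutant, time, estimate, std.error, ci_lower_95, ci_upper_95) %>%
  mutate_if(is.numeric, ~ round(., 1)) %>%
  datatable(.)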
Finally, we evaluate whether the matching procedure helped decrease model dependence. We run 7 regression models, with different types of covariate adjustment, on the initial and matched data.
Below is the code for the initial data:
data_matching <- data_matching %>%
select(
date:bank_day_dummy,
mean_pm10,
temperature_average:wind_speed,
temperature_average_lag_1:wind_speed_lag_1,
is_treated_lag_1:is_treated
) %>%
mutate(year = as.factor(year),
temperature_deciles = ntile(temperature_average, 10),
temperature_deciles_lag_1 = ntile(temperature_average_lag_1, 10))
data_model_dep_matching <- data_matching %>%
nest() %>%
mutate(
model_1 = map(data, ~ lm(mean_pm10 ~ is_treated, data = .)),
model_2 = map(
data,
~ lm(
mean_pm10 ~ is_treated + weekday + holidays_dummy +
bank_day_dummy + month * year,
data = .
)
),
model_3 = map(
data,
~ lm(
mean_pm10 ~ is_treated + temperature_average + weekday + holidays_dummy +
bank_day_dummy + month * year,
data = .
)
),
model_4 = map(
data,
~ lm(
mean_pm10 ~ is_treated + temperature_average + I(temperature_average ^ 2) + weekday + holidays_dummy +
bank_day_dummy + month * year,
data = .
)
),
model_5 = map(
data,
~ lm(
mean_pm10 ~ is_treated + temperature_deciles + weekday + holidays_dummy +
bank_day_dummy + month * year,
data = .
)
),
model_6 = map(
data,
~ lm(
mean_pm10 ~ is_treated + temperature_deciles + rainfall_duration + humidity_average + wind_speed + weekday + holidays_dummy +
bank_day_dummy + month * year,
data = .
)
),
model_7 = map(
data,
~ lm(
mean_pm10 ~ is_treated + is_treated_lag_1 + temperature_deciles + temperature_deciles_lag_1 + rainfall_duration + rainfall_duration_lag_1 + humidity_average + humidity_average_lag_1 + wind_speed + wind_speed_lag_1 + weekday + holidays_dummy +
bank_day_dummy + month * year,
data = .
)
)
)
clean_model_output <- function(model_output) {
model_output %>%
broom::tidy(., conf.int = TRUE) %>%
filter(term == "is_treatedTRUE") %>%
select(estimate, conf.low, conf.high)
}
data_model_dep_matching <- data_model_dep_matching %>%
select(-data) %>%
pivot_longer(cols = c(model_1:model_7), names_to = "model", values_to = "model_output") %>%
mutate(model_output = map(model_output, ~ clean_model_output(.))) %>%
unnest(model_output) %>%
mutate_at(vars(estimate:conf.high), ~ ./mean(data_matching$mean_pm10)*100) %>%
mutate(dataset = "Initial Data")
And the code for the matched data:
data_matched <- data_matched %>%
select(
date:bank_day_dummy,
mean_pm10,
temperature_average:wind_speed,
temperature_average_lag_1:wind_speed_lag_1,
is_treated_lag_1:is_treated
) %>%
mutate(time = 1:nrow(.),
year = as.factor(year),
temperature_deciles = ntile(temperature_average, 10),
temperature_deciles_lag_1 = ntile(temperature_average_lag_1, 10))
data_model_dep_matched <- data_matched %>%
nest() %>%
mutate(
model_1 = map(data, ~ lm(mean_pm10 ~ is_treated, data = .)),
model_2 = map(
data,
~ lm(
mean_pm10 ~ is_treated + weekday + holidays_dummy +
bank_day_dummy + month * year,
data = .
)
),
model_3 = map(
data,
~ lm(
mean_pm10 ~ is_treated + temperature_average + weekday + holidays_dummy +
bank_day_dummy + month * year,
data = .
)
),
model_4 = map(
data,
~ lm(
mean_pm10 ~ is_treated + temperature_average + I(temperature_average ^ 2) + weekday + holidays_dummy +
bank_day_dummy + month * year,
data = .
)
),
model_5 = map(
data,
~ lm(
mean_pm10 ~ is_treated + temperature_deciles + weekday + holidays_dummy +
bank_day_dummy + month * year,
data = .
)
),
model_6 = map(
data,
~ lm(
mean_pm10 ~ is_treated + temperature_deciles + rainfall_duration + humidity_average + wind_speed + weekday + holidays_dummy +
bank_day_dummy + month * year,
data = .
)
),
model_7 = map(
data,
~ lm(
mean_pm10 ~ is_treated + is_treated_lag_1 + temperature_deciles + temperature_deciles_lag_1 + rainfall_duration + rainfall_duration_lag_1 + humidity_average + humidity_average_lag_1 + wind_speed + wind_speed_lag_1 + weekday + holidays_dummy +
bank_day_dummy + month * year,
data = .
)
)
)
clean_model_output <- function(model_output) {
model_output %>%
broom::tidy(., conf.int = TRUE) %>%
filter(term == "is_treatedTRUE") %>%
select(estimate, conf.low, conf.high)
}
data_model_dep_matched <- data_model_dep_matched %>%
select(-data) %>%
pivot_longer(cols = c(model_1:model_7), names_to = "model", values_to = "model_output") %>%
mutate(model_output = map(model_output, ~ clean_model_output(.))) %>%
unnest(model_output) %>%
mutate_at(vars(estimate:conf.high), ~ ./mean(data_matching$mean_pm10)*100) %>%
mutate(dataset = "Matched Data")
We display below the results:
data_model_dep_all <- bind_rows(data_model_dep_matching, data_model_dep_matched) %>%
arrange(model) %>%
mutate(stripe = ifelse((readr::parse_number(model) %% 2) == 0, "Grey", "White")) %>%
ungroup() %>%
mutate(model = paste("Model", readr::parse_number(model), sep = " "))
# make the graph
graph_model_dep_all <-
ggplot(data_model_dep_all,
aes(x = estimate, y = model)) +
geom_rect(
aes(fill = stripe),
ymin = as.numeric(as.factor(data_model_dep_all$model)) - 0.42,
ymax = as.numeric(as.factor(data_model_dep_all$model)) + 0.42,
xmin = -Inf,
xmax = Inf,
color = NA,
alpha = 0.4
) +
geom_vline(xintercept = 0,
color = "black",
size = 0.3) +
geom_pointrange(
aes(
y = model,
x = estimate,
xmin = conf.low ,
xmax = conf.high
),
size = 0.5,
colour = my_blue,
lwd = 0.8
) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 8)) +
facet_wrap( ~ dataset, ncol = 4) +
scale_fill_manual(values = c('gray96', "white")) +
guides(fill = FALSE) +
ylab("") + xlab("Relative Increase in Concentrations (%)") +
theme_tufte()
# print the graph
graph_model_dep_all
# save the graph
ggsave(
graph_model_dep_all,
filename = here::here("inputs", "3.outputs", "2.matching_analysis", "graph_model_dep_all.pdf"),
width = 18,
height = 6,
units = "cm",
device = cairo_pdf
)
If you see mistakes or want to suggest changes, please create an issue on the source repository.