Analyzing Results

Comparing days with Wind Blowing from the North-East to Other Directions. Adjusting for calendar indicators and other weather variables.

Léo Zabrocki https://lzabrocki.github.io/ (Paris School of Economics)https://www.parisschoolofeconomics.eu/fr/zabrocki-leo/ , Anna Alari https://scholar.google.com/citations?user=MiFY320AAAAJ&hl=fr (ISGlobal)https://www.isglobal.org/ , Tarik Benmarhnia https://profiles.ucsd.edu/tarik.benmarhnia (UCSD & Scripps Institute)https://benmarhniaresearch.ucsd.edu/
2022-04-12

In this document, we take great care providing all steps and R codes required to estimate the influence of North-East winds on air pollutants. We compare days where:

We adjust for calendar indicators and weather confouding factors.

Should you have any questions, need help to reproduce the analysis or find coding errors, please do not hesitate to contact me at

Required Packages

To reproduce exactly the 5_script_analyzing_results.html document, we first need to have installed:

Once everything is set up, we have to load the following packages:

# load required packages
library(knitr) # for creating the R Markdown document
library(here) # for files paths organization
library(tidyverse) # for data manipulation and visualization
library(retrodesign) # for assessing type m and s errors
library(Cairo) # for printing custom police of graphs
library(patchwork) # combining plots
library(DT) # for tables

We finally load our custom ggplot2 theme for graphs:

# load ggplot custom theme
source(here::here(
  "inputs",
  "2.functions",
  "script_theme_tufte.R"
))
# define nice colors
my_blue <- "#0081a7"
my_orange <- "#fb8500"

Preparing the Data

We load the matched data:

# load matched data
data_matched <-
  readRDS(here::here("inputs", "1.data", "5.matched_data", "matched_data.rds"))

Distribution of the Pair Differences in Concentration between Treated and Control units for each Pollutant

Computing Pairs Differences in Pollutant Concentrations

We first compute the differences in a pollutant’s concentration for each pair over time:

data_matched_wide <- data_matched %>%
  mutate(is_treated = ifelse(is_treated == TRUE, "treated", "control")) %>%
  select(
    is_treated,
    pair_number,
    contains("mean_no2"),
    contains("mean_o3"),
    contains("mean_pm10"),
    contains("mean_pm25")
  ) %>%
  pivot_longer(
    cols = -c(pair_number, is_treated),
    names_to = "variable",
    values_to = "concentration"
  ) %>%
  mutate(
    pollutant = NA %>%
      ifelse(str_detect(variable, "no2"), "NO2", .) %>%
      ifelse(str_detect(variable, "o3"), "O3", .) %>%
      ifelse(str_detect(variable, "pm10"), "PM10", .) %>%
      ifelse(str_detect(variable, "pm25"), "PM2.5", .)
  ) %>%
  mutate(time = 0 %>%
           ifelse(str_detect(variable, "lag_1"),-1, .) %>%
           ifelse(str_detect(variable, "lead_1"), 1, .)) %>%
  select(-variable) %>%
  select(pair_number, is_treated, pollutant, time, concentration) %>%
  pivot_wider(names_from = is_treated, values_from = concentration) %>%
  unnest()

data_pair_difference_pollutant <- data_matched_wide %>%
  mutate(difference = treated - control) %>%
  select(-c(treated, control)) 

Pairs Differences in NO2 Concentrations

Boxplots for NO2:

Please show me the code!
# create the graph for no2
graph_boxplot_difference_pollutant_no2 <-
  data_pair_difference_pollutant %>%
  filter(str_detect(pollutant, "NO2")) %>%
  ggplot(., aes(x = as.factor(time), y = difference)) +
  geom_boxplot(colour = my_blue, size = 0.3) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 8)) +
  facet_wrap( ~ pollutant) +
  ylab("Pair Difference in \nConcentration (µg/m3)") + xlab("Day") +
  theme_tufte()

# display the graph
graph_boxplot_difference_pollutant_no2
Please show me the code!
# save the graph
ggsave(
  graph_boxplot_difference_pollutant_no2,
  filename = here::here(
    "inputs", "3.outputs",
    "2.matching_analysis",
    "graph_boxplot_difference_pollutant_no2.pdf"
  ),
  width = 18,
  height = 9,
  units = "cm",
  device = cairo_pdf
)

Pairs Differences in O3 Concentrations

Boxplots for O3:

Please show me the code!
# create the graph for o3
graph_boxplot_difference_pollutant_o3 <-
  data_pair_difference_pollutant %>%
  filter(str_detect(pollutant, "O3")) %>%
  ggplot(., aes(x = as.factor(time), y = difference)) +
  geom_boxplot(colour = my_blue, size = 0.3) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 8)) +
  ylab("Pair Difference in \nConcentration (µg/m3)") + xlab("Day") +
  theme_tufte()

# display the graph
graph_boxplot_difference_pollutant_o3
Please show me the code!
# save the graph
ggsave(
  graph_boxplot_difference_pollutant_o3,
  filename = here::here(
    "inputs", "3.outputs",
    "2.matching_analysis",
    "graph_boxplot_difference_pollutant_o3.pdf"
  ),
  width = 18,
  height = 9,
  units = "cm",
  device = cairo_pdf
)

Pairs Differences in PM10 Concentrations

Boxplots for PM10:

Please show me the code!
# create the graph for pm10
graph_boxplot_difference_pollutant_pm10 <-
  data_pair_difference_pollutant %>%
  filter(str_detect(pollutant, "PM10")) %>%
  ggplot(., aes(x = as.factor(time), y = difference)) +
  geom_boxplot(colour = my_blue, size = 0.3) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 8)) +
  facet_wrap( ~ pollutant) +
  ylab("Pair Difference in \nConcentration (µg/m3)") + xlab("Day") +
  theme_tufte()

# display the graph
graph_boxplot_difference_pollutant_pm10
Please show me the code!
# save the graph
ggsave(
  graph_boxplot_difference_pollutant_pm10,
  filename = here::here(
    "inputs", "3.outputs",
    "2.matching_analysis",
    "graph_boxplot_difference_pollutant_pm10.pdf"
  ),
  width = 18,
  height = 9,
  units = "cm",
  device = cairo_pdf
)

Pairs Differences in PM2.5 Concentrations

Boxplots for PM2.5:

Please show me the code!
# create the graph for pm10
graph_boxplot_difference_pollutant_pm25 <-
  data_pair_difference_pollutant %>%
  filter(str_detect(pollutant, "PM2.5")) %>%
  ggplot(., aes(x = as.factor(time), y = difference)) +
  geom_boxplot(colour = my_blue, size = 0.3) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 8)) +
  facet_wrap( ~ pollutant) +
  ylab("Pair Difference in \nConcentration (µg/m3)") + xlab("Day") +
  theme_tufte()

# display the graph
graph_boxplot_difference_pollutant_pm25
Please show me the code!
# save the graph
ggsave(
  graph_boxplot_difference_pollutant_pm25,
  filename = here::here(
    "inputs", "3.outputs",
    "2.matching_analysis",
    "graph_boxplot_difference_pollutant_pm25.pdf"
  ),
  width = 18,
  height = 9,
  units = "cm",
  device = cairo_pdf
)

Neymanian Inference: Computing 95% and 99% Confidence Intervals for the Average Treatment Effects

We compute confidence intervals for the average treatment effect of North-East winds using Neyman’s approach. We use the formula for the standard error of pair randomized experiment found in Imbens and Rubin (2015).

# we first compute the average treatment effects for each pollutant and day
data_pair_mean_difference <- data_pair_difference_pollutant %>%
  filter(pollutant != "PM2.5") %>%
  group_by(pollutant, time) %>%
  summarise(mean_difference = mean(difference)) %>%
  ungroup()

# we store the number of pairs
n_pair <- nrow(data_matched) / 2

# compute the standard error
data_se_neyman_pair <-
  left_join(
    data_pair_difference_pollutant,
    data_pair_mean_difference,
    by = c("pollutant", "time")
  ) %>%
  mutate(squared_difference = (difference - mean_difference) ^ 2) %>%
  group_by(pollutant, time) %>%
  summarise(standard_error = sqrt(1 / (n_pair * (n_pair - 1)) * sum(squared_difference))) %>%
  select(pollutant, time, standard_error) %>%
  ungroup()

# merge the average treatment effect data witht the standard error data
data_neyman <-
  left_join(data_pair_mean_difference,
            data_se_neyman_pair,
            by = c("pollutant", "time")) %>%
# compute the 95% and 99% confidence intervals
  mutate(
    upper_bound_95 = mean_difference + (-qnorm((1 - 0.95) / 2) * standard_error),
    lower_bound_95 = mean_difference - (-qnorm((1 - 0.95) / 2) * standard_error),
    upper_bound_99 = mean_difference + (-qnorm((1 - 0.99) / 2) * standard_error),
    lower_bound_99 = mean_difference - (-qnorm((1 - 0.99) / 2) * standard_error)
  )

We plot below the point estimates and the associated 95% and 99% confidence intervals:

Please show me the code!
# create an indicator to alternate shading of confidence intervals
data_neyman <- data_neyman %>%
  arrange(pollutant, time) %>%
  mutate(stripe = ifelse((time %% 2) == 0, "Grey", "White")) %>%
  ungroup()

# make the graph
graph_ci <-
  ggplot(data_neyman,
         aes(x = as.factor(time), y = mean_difference)) +
  geom_rect(
    aes(fill = stripe),
    xmin = as.numeric(as.factor(data_neyman$time)) - 0.42,
    xmax = as.numeric(as.factor(data_neyman$time)) + 0.42,
    ymin = -Inf,
    ymax = Inf,
    color = NA,
    alpha = 0.4
  ) +
  geom_hline(yintercept = 0,
             color = "black",
             size = 0.3) +
  geom_pointrange(
    aes(
      x = as.factor(time),
      y = mean_difference,
      ymin = lower_bound_95 ,
      ymax = upper_bound_95
    ),
    colour = my_blue,
    fatten = 1.5, 
    size = 2
) +
  geom_pointrange(
    aes(
      x = as.factor(time),
      y = mean_difference,
      ymin = lower_bound_99 ,
      ymax = upper_bound_99
    ),
    colour = my_blue,
    lwd = 0.5
  ) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 8)) +
  facet_wrap( ~ pollutant, ncol = 4) +
  scale_fill_manual(values = c('gray96', "white")) +
  guides(fill = FALSE) +
  ylab("Average Increase in\n Concentrations (µg/m³)") + xlab("Day") +
  theme_tufte() +
  theme(axis.text.x = element_text(margin = ggplot2::margin(t = 0, unit = "cm")))


# print the graph
graph_ci
Please show me the code!
# save the graph
ggsave(
  graph_ci,
  filename = here::here("inputs", "3.outputs", "2.matching_analysis", "graph_ci.pdf"),
  width = 16,
  height = 8,
  units = "cm",
  device = cairo_pdf
)

We display below the table with the point estimates and the 95% and 99% confidence intervals:

We can finally check if there is also an effect on North-East winds on PM2.5 concentrations. One issue is that Paris did not have measuring stations for background PM2.5 concentrations from 2009-09-22 to 2010-06-23, that is to say 274 days. We did not impute these missing concentrations but can nonetheless assess if the North-East winds influence the observed pollutant concentrations. We proceed as before but only work with pairs of days without missing PM2.5 recordings:

# we first only select pm2.5 pair differences
data_pair_difference_pollutant_pm25 <- data_pair_difference_pollutant %>%
  filter(pollutant == "PM2.5")

# we then find pairs with missing PM2.5 concentrations
pairs_to_remove <- data_pair_difference_pollutant_pm25 %>%
  filter(is.na(difference)) %>% 
  distinct(pair_number) %>%
  pull(pair_number)

# we remove those pairs
data_pair_difference_pollutant_pm25 <- data_pair_difference_pollutant_pm25 %>%
  filter(!(pair_number %in% pairs_to_remove))

# we compute the average treatment effects for pm2.5 and by day
data_pair_mean_difference_pm25 <-  data_pair_difference_pollutant_pm25 %>%
  group_by(time) %>%
  summarise(mean_difference = mean(difference, na.rm = TRUE)) %>%
  ungroup()

# we store the number of pairs
n_pair <- length(unique(data_pair_difference_pollutant_pm25$pair_number))

# we compute the standard error
data_se_neyman_pair_pm25 <-
  left_join(
    data_pair_difference_pollutant_pm25,
    data_pair_mean_difference_pm25,
    by = c("time")
  ) %>%
  mutate(squared_difference = (difference - mean_difference) ^ 2) %>%
  group_by(time) %>%
  summarise(standard_error = sqrt(1 / (n_pair * (n_pair - 1)) * sum(squared_difference))) %>%
  select(time, standard_error) %>%
  ungroup()

# merge the average treatment effect data witht the standard error data
data_neyman_pm25 <-
  left_join(data_pair_mean_difference_pm25,
            data_se_neyman_pair_pm25,
            by = c("time")) %>%
# compute the 95% and 99% confidence intervals
  mutate(
    upper_bound_95 = mean_difference + (-qnorm((1 - 0.95) / 2) * standard_error),
    lower_bound_95 = mean_difference - (-qnorm((1 - 0.95) / 2) * standard_error),
    upper_bound_99 = mean_difference + (-qnorm((1 - 0.99) / 2) * standard_error),
    lower_bound_99 = mean_difference - (-qnorm((1 - 0.99) / 2) * standard_error)
  )

We display below the estimates for the ATE and the associated 95% and 99% confidence intervals:

Robustness Checks

Missing Concentrations

We load non-imputed air pollution data and compute for each pollutant the 0-1 daily lags and leads:

# load non-imputed data
data_not_imputed <-
  readRDS(here::here("inputs", "1.data", "4.data_for_analysis", "data_pollutants_not_imputed.rds"))

# merge with matched data
data_missing <- left_join(data_matched, data_not_imputed, by = "date")

# display proportion of missing values
data_missing %>%
  pivot_longer(
    cols = c(not_imputed_mean_pm25:not_imputed_mean_pm10),
    names_to = "Pollutant",
    values_to = "concentration"
  ) %>%
  group_by(Pollutant) %>%
  summarise("Missing Proportion (%)" = round(sum(is.na(concentration)) /
                                               n() * 100, 1)) %>%
  knitr::kable(., align = c("l", "c"))
Pollutant Missing Proportion (%)
not_imputed_mean_no2 13.2
not_imputed_mean_o3 8.3
not_imputed_mean_pm10 7.4
not_imputed_mean_pm25 23.1
# load non-imputed data
data_not_imputed <-
  readRDS(here::here("inputs", "1.data", "4.data_for_analysis", "data_pollutants_not_imputed.rds")) %>%
  select(-not_imputed_mean_pm25)

# we first define data_not_imputed_leads and data_not_imputed_lags
# to store leads and lags

data_not_imputed_leads <- data_not_imputed
data_not_imputed_lags <- data_not_imputed

#
# create leads
# 

# create a list to store dataframe of leads
leads_list <- vector(mode = "list", length = 1)
names(leads_list) <- c(1) 

# create the leads
for(i in 1){
  leads_list[[i]] <- data_not_imputed_leads %>%
    mutate_at(vars(-date), ~  lead(., n = i, order_by = date)) %>%
    rename_at(vars(-date),function(x) paste0(x,"_lead_", i))
}

# merge the dataframes of leads
data_leads <- leads_list %>%
  reduce(left_join, by = "date")

# merge the leads with the data_not_imputed_leads
data_not_imputed_leads <- left_join(data_not_imputed_leads, data_leads, by = "date") %>%
  select(-c(not_imputed_mean_no2:not_imputed_mean_pm10))

#
# create lags
# 

# create a list to store dataframe of lags
lags_list <- vector(mode = "list", length = 1)
names(lags_list) <- c(1) 

# create the lags
for(i in 1){
  lags_list[[i]] <- data_not_imputed_lags %>%
    mutate_at(vars(-date), ~  lag(., n = i, order_by = date)) %>%
    rename_at(vars(-date),function(x) paste0(x,"_lag_", i))
}

# merge the dataframes of lags
data_lags <- lags_list %>%
  reduce(left_join, by = "date")

# merge the lags with the initial data_not_imputed_lags
data_not_imputed_lags <- left_join(data_not_imputed_lags, data_lags, by = "date")

#
# merge data_not_imputed_leads with data_not_imputed_lags
#

data_not_imputed <- left_join(data_not_imputed_lags, data_not_imputed_leads, by = "date")

We merge these data with the matched data and compute pair differences:

# merge with the matched_data
data_matched_with_missing_pollutants <- left_join(data_matched, data_not_imputed, by = "date")

# compute pair differences
data_matched_wide_missing_pollutants <- data_matched_with_missing_pollutants %>%
  mutate(is_treated = ifelse(is_treated == TRUE, "treated", "control")) %>%
  select(is_treated, pair_number, contains("not_imputed_mean_no2"), contains("not_imputed_mean_o3"), contains("not_imputed_mean_pm10")) %>%
  pivot_longer(cols = -c(pair_number, is_treated), names_to = "variable", values_to = "concentration") %>%
  mutate(pollutant = NA %>%
           ifelse(str_detect(variable, "no2"), "NO2",.) %>%
           ifelse(str_detect(variable, "o3"), "O3",.) %>%
           ifelse(str_detect(variable, "pm10"), "PM10",.)) %>%
  mutate(time = 0 %>%
           ifelse(str_detect(variable, "lag_1"), -1, .) %>%
           ifelse(str_detect(variable, "lead_1"), 1, .)) %>%
  select(-variable) %>%
  select(pair_number, is_treated, pollutant, time, concentration) %>% 
  pivot_wider(names_from = is_treated, values_from = concentration)

data_missing_pair_difference_pollutant <- data_matched_wide_missing_pollutants %>%
  mutate(difference = treated-control) %>%
  select(-c(treated, control)) 

We display below the proportion of missing differences by pollutant and day:

Please show me the code!
# make the graph
graph_missing_pollutants <- data_missing_pair_difference_pollutant %>%
  group_by(pollutant, time) %>%
  summarise(proportion_missing = sum(is.na(difference))/n()*100) %>%
  ggplot(., aes(x = as.factor(time), y = proportion_missing)) +
  geom_segment(aes(x = as.factor(time), xend = as.factor(time), y = 0, yend = proportion_missing)) +
  geom_point(shape = 21, size = 4, colour = "black", fill = my_blue) +
  ylim(0, 100) +
  facet_wrap(~ pollutant, ncol = 4) +
  xlab("Day") + ylab("Proportion of Pairs with \nMissing Concentrations (%)") +
  theme_tufte()
  
# display the graph
graph_missing_pollutants
Please show me the code!
# save the graph
ggsave(
  graph_missing_pollutants,
  filename = here::here(
    "inputs", "3.outputs",
    "2.matching_analysis",
    "graph_missing_pollutants.pdf"
  ),
  width = 20,
  height = 10,
  units = "cm",
  device = cairo_pdf
)

Clearly, many missing concentrations in the matched pairs were imputed. We compute the point estimates and confidence intervals for the pairs without missing values. Here, we use a Wilcoxon’s test to also make the results less sensitive to potential outliers:

Please show me the code!
# carry out the wilcox.test 
data_missing_rank_ci <- data_missing_pair_difference_pollutant %>%
  drop_na() %>%
  select(- pair_number) %>%
  group_by(pollutant, time) %>%
  nest() %>%
  mutate(mean_difference = map(data, ~ wilcox.test(.$difference, conf.int = TRUE)$estimate),
         lower_bound_95 = map(data, ~ wilcox.test(.$difference, conf.int = TRUE)$conf.int[1]),
         upper_bound_95 = map(data, ~ wilcox.test(.$difference, conf.int = TRUE)$conf.int[2])) %>%
  unnest(cols = c(mean_difference, lower_bound_95, upper_bound_95)) %>%
  mutate(data = "Pairs without Missing Concentrations")

# bind with analysis on imputed concentrations
data_imputed_nonimputed <- data_neyman %>%
  mutate(data = "Pairs with Imputed Concentrations") %>%
  select(pollutant,
         time,
         mean_difference,
         lower_bound_95,
         upper_bound_95,
         data) %>%
  bind_rows(.,  data_missing_rank_ci)

# create an indicator to alternate shading of confidence intervals
data_imputed_nonimputed <- data_imputed_nonimputed %>%
  arrange(pollutant, time) %>%
  mutate(stripe = ifelse((time %% 2) == 0, "Grey", "White")) %>%
  ungroup()

# make the graph
graph_ci_missing <-
  ggplot(data_imputed_nonimputed,
         aes(x = as.factor(time), y = mean_difference, ymin = lower_bound_95, ymax = upper_bound_95, colour = data, shape = data)) +
  geom_rect(
    aes(fill = stripe),
    xmin = as.numeric(as.factor(data_imputed_nonimputed$time)) - 0.42,
    xmax = as.numeric(as.factor(data_imputed_nonimputed$time)) + 0.42,
    ymin = -Inf,
    ymax = Inf,
    color = NA,
    alpha = 0.4
  ) +
  geom_hline(yintercept = 0,
             color = "black",
             size = 0.3) +
  geom_pointrange(position = position_dodge(width = 1), size = 1.2) +
  scale_shape_manual(name = "Dataset:", values = c(16, 17)) +
  scale_color_manual(name = "Dataset:", values = c(my_orange, my_blue)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 8)) +
  facet_wrap( ~ pollutant, ncol = 4) +
  scale_fill_manual(values = c('gray96', "white")) +
  guides(fill = FALSE) +
  ylab("Average Increase in\n Concentrations (µg/m³)") + xlab("Day") +
  theme_tufte() +
  theme(axis.text.x = element_text(margin = ggplot2::margin(t = 0, unit = "cm")))


# print the graph
graph_ci_missing
Please show me the code!
# save the graph
ggsave(
  graph_ci_missing,
  filename = here::here("inputs", "3.outputs", "2.matching_analysis", "graph_ci_missing.pdf"),
  width = 16,
  height = 8,
  units = "cm",
  device = cairo_pdf
)

Results seem relatively robust to the imputation of missing values.

Are results on PM\(_{10}\) sensitive to hidden bias?

To assess whether the effects of North-East winds on PM\(_{10}\) concentrations could be due to hidden bias, we implement the studentized sensitivity analysis for the ATE developed by Colin B. Fogarty (2019). We first load the relevant functions:

Please show me the code!
# load fogarty's studentized Sensitivity Analysis functions
# retrieved from http://www.mit.edu/~cfogarty/StudentizedSensitivity.R

#' StudentizedSensitivity
#'Function to perform a Studentized Sensitivity analysis on the sample average treatment
#'effect in a paired observational study
#'
#' @param PairedDiff: Vector of treated-minus-control paired differences.
#' @param null: Value of the sample average treatment effect under the null.
#' @param alpha: Desired Type I error rate.
#' @param alternative: Can be "less", "greater", or "two.sided".
#' @param Gamma: Vector of values for Gamma at which to perform the sensitivity
#'  analysis.
#' @param nperm: Number of permutations to perform permutation test.
#' @param Changepoint: If true, function returns the maximal Gamma for which the
#' test rejects at level alpha.
#' @param SensitivityInterval: If true, function returns (100-alpha) sensitivity
#' intervals. They will be one-sided if the alternative is less than or greater than,
#' and two-sided if the alternative is two-sided.
#'
#' @return Gamma: Vector of Gammas for which the sensitivity analysis was performed.
#' @return pval: P-values for each value of Gamma.
#' @return GammaPval: Matrix combining Gamma and pval.
#' @return Changepoint: Maximal Gamma for which the test rejected at level alpha.
#' @return SensitivityInterval: Upper and lower bounds for 100(1-alpha) sensitivity
#' intervals for each value of Gamma.
#' @export


StudentizedSensitivity = function(PairedDiff, null = 0, alpha = 0.05, alternative = "greater", Gamma = 1, nperm = 50000, Changepoint = T, SensitivityInterval = T)
{
   if(any(Gamma < 1))
   {
     stop("Values for Gamma must be >= 1")
   }
   if(alternative!="less" & alternative!= "greater" & alternative != "two.sided")
   {
     stop("Values for alternative are `less', `greater', or `two.sided'")
   }
   if(length(null) > 1)
   {
     stop("Value under the null must be a scalar")
   }
      if(alpha < 0 | alpha > 0.5)
   {
     stop("alpha must be between 0 and 0.5")
      }

  PairedDifftrue <- PairedDiff
  alphatrue <- alpha
  I <- length(PairedDiff)
  Adjust <- PairedDiff - null

  if(alternative == "less")
  {
    Adjust <- -Adjust
  }
  if(alternative == "two.sided")
  {
    alpha <- alphatrue/2

    if(mean(Adjust) < 0)
    {
      Adjust <- -Adjust
    }
  }

  pval <- rep(0, length(Gamma))

  for(i in 1:length(Gamma))

  {
  D <- (Adjust) - (Gamma[i]-1)/(1+Gamma[i])*abs(Adjust)
  obs <- mean(D)/(sd(D)/sqrt(I))
  Adjmat <- matrix(abs(Adjust), I, nperm)
  Zmat <- matrix(runif(I*nperm) < Gamma[i]/(1+Gamma[i]), I, nperm)
  Dmat <- (2*Zmat-1)*(Adjmat) - (Gamma[i]-1)/(1+Gamma[i])*Adjmat
  perm <- colMeans(Dmat)/(sqrt(colVars(Dmat)/I))
  pval[i] <- (1+sum(perm>=obs))/(nperm + 1)
  }
  pvalret = pval
  if(alternative == "two.sided")
  {
    pvalret = 2*pval
  }
  Pmatrix <- cbind(Gamma, pvalret)
  colnames(Pmatrix) <- c("Gamma", "P-value")

  if(Changepoint == T)
  {
    proceed <- StudentizedSensitivity(PairedDifftrue, null, alphatrue, alternative, Gamma=1, nperm,
                                      Changepoint = F, SensitivityInterval = F)$pval <= alphatrue

    change <- 1

    if(proceed)
    {
      change <- uniroot(StudentizedChangepoint, interval = c(1, 30), PairedDiff = PairedDifftrue, null = null,
                        alpha = alphatrue, alternative = alternative, nperm = nperm,
                        extendInt = "upX")$root
    }
  }

  if(SensitivityInterval == T)
    {
      lb = rep(-Inf, length(Gamma))
      ub = rep(Inf, length(Gamma))
      for(i in 1:length(Gamma))
      {
        # Warm Starts
      UB = uniroot(BoundFinder, PairedDifftrue, Gamma[i],
                   interval = c(mean(PairedDifftrue), mean(PairedDifftrue)+4*sd(PairedDifftrue)/sqrt(I)), extendInt = "yes")$root
      LB = -uniroot(BoundFinder, -PairedDifftrue, Gamma[i],
                    interval = c(-mean(PairedDifftrue)-4*sd(PairedDifftrue)/sqrt(I), -mean(PairedDifftrue)), extendInt = "yes")$root

      SUB = Inf
      SLB = -Inf

      if(alternative == "greater")
      {
        SLB = uniroot(StudentizedSI, interval = c(UB-4*sd(PairedDifftrue)/sqrt(I), UB), extendInt = "yes",
                      Gamma = Gamma[i], PairedDiff=PairedDifftrue, alternative = "greater", alpha = alpha, nperm = nperm)$root
      }

      if(alternative == "less")
      {
        SUB = uniroot(StudentizedSI, interval = c(LB, LB + 4*sd(PairedDifftrue)/sqrt(I)), extendInt = "yes",
                      Gamma = Gamma[i], PairedDiff=PairedDifftrue, alternative = "less", alpha = alpha, nperm = nperm)$root
      }

      if(alternative == "two.sided")
      {
       SLB = uniroot(StudentizedSI, interval = c(UB-4*sd(PairedDifftrue)/sqrt(I), UB), extendInt = "yes",
                     Gamma = Gamma[i], PairedDiff=PairedDifftrue, alternative = "greater", alpha = alpha, nperm = nperm)$root
       SUB = uniroot(StudentizedSI, interval = c(LB, LB+4*sd(PairedDifftrue)/sqrt(I)), extendInt = "yes",
                     Gamma = Gamma[i], PairedDiff=PairedDifftrue, alternative = "less", alpha = alpha, nperm = nperm)$root
      }

      lb[i] = SLB
      ub[i] = SUB
      }

    SImat = cbind(Gamma, lb, ub)
    colnames(SImat) = c("Gamma", "Lower Bound", "Upper Bound")
    }
    if(Changepoint == F & SensitivityInterval == F)
    {
      return(list(Gamma=Gamma, pval = pvalret, GammaPval = Pmatrix))
    }
    if(Changepoint == F & SensitivityInterval == T)
    {
      return(list(Gamma = Gamma, pval = pvalret, GammaPval = Pmatrix, SensitivityInterval = SImat))
    }
    if(Changepoint == T & SensitivityInterval == F)
    {
      return(list(Gamma = Gamma, pval = pvalret, GammaPval = Pmatrix, Changepoint = change))
    }
    if(Changepoint == T & SensitivityInterval == T)
    {
      return(list(Gamma = Gamma, pval = pvalret, GammaPval = Pmatrix, Changepoint = change,
                  SensitivityInterval = SImat))
    }

}

####These are auxiliary functions used for root finding and for calculating columnwise variances in StudentizedSensitivity
StudentizedChangepoint = function(Gamma, PairedDiff, null, alternative, alpha, nperm)
{
  alphachange = alpha
  StudentizedSensitivity(PairedDiff, null, alpha, alternative, Gamma, nperm, Changepoint = F, SensitivityInterval = F)$pval - alphachange
}

StudentizedSI = function(null,  Gamma, PairedDiff,  alternative, alpha, nperm)
{
  StudentizedSensitivity(PairedDiff, null, alpha, alternative, Gamma, nperm, Changepoint = F, SensitivityInterval = F)$pval - alpha
}

BoundFinder = function(null,  PairedDiff, Gamma)
{
  mean(PairedDiff - null - (Gamma-1)/(1+Gamma)*abs(PairedDiff-null))
}

colVars <- function(x) {
  N = nrow(x)
  (colSums(x^2) - colSums(x)^2/N) / (N-1)
}

We select the pair differences for PM\(_{10}\) concentrations in \(t\) and run the function for \(\Gamma\)=2:

# we select the relevant pair differences
PairedDiff <- data_pair_difference_pollutant %>%
  filter(pollutant == "PM10" & time == 0) %>%
  pull(difference)

# we run the function
data_sensitivity_pm10_0 <- StudentizedSensitivity(
  PairedDiff,
  null = 0,
  alpha = 0.05,
  alternative = "two.sided",
  Gamma = 2,
  nperm = 50000,
  Changepoint = T,
  SensitivityInterval = T
)$SensitivityInterval %>%
  as_tibble() %>%
  mutate_all(~ round(., 2)) 

# save results
saveRDS(data_sensitivity_pm10_0, here::here("inputs", "1.data", "5.matched_data", "data_sensitivity_pm10_0.rds"))

We display below the results:

Please show me the code!
readRDS(
  here::here(
    "inputs",
    "1.data",
    "5.matched_data",
    "data_sensitivity_pm10_0.rds"
  )
) %>%
  datatable(.)

We then implement the same procedure but for concentrations in \(t+1\):

# we select the relevant pair differences
PairedDiff <- data_pair_difference_pollutant %>%
  filter(pollutant == "PM10" & time == 1) %>%
  pull(difference)

# we run the function
data_sensitivity_pm10_1 <- StudentizedSensitivity(
  PairedDiff,
  null = 0,
  alpha = 0.05,
  alternative = "two.sided",
  Gamma = 2,
  nperm = 50000,
  Changepoint = T,
  SensitivityInterval = T
)$SensitivityInterval %>%
  as_tibble() %>%
  mutate_all( ~ round(., 2)) 

# save results
saveRDS(data_sensitivity_pm10_1, here::here("inputs", "1.data", "5.matched_data", "data_sensitivity_pm10_1.rds"))

We display below the results:

Please show me the code!
readRDS(
  here::here(
    "inputs",
    "1.data",
    "5.matched_data",
    "data_sensitivity_pm10_1.rds"
  )
) %>%
  datatable(.)

Does pairing improve precision?

To check that our pair matching procedure improves precision, we compare the estimate of the variance for a pair experiment with the one of a complete experiment (the formula can be found in Imbens & Rubin 2015’s textbook):

Please show me the code!
# compute estimates of the sampling variability
# for a complete experiment
sampling_variability_complete <- data_matched %>%
  select(is_treated, mean_no2, mean_o3, mean_pm10) %>%
  pivot_longer(
    cols = c(mean_no2:mean_pm10),
    names_to = "pollutant",
    values_to = "concentration"
  ) %>%
  group_by(pollutant, is_treated) %>%
  mutate(
    n_obs = n(),
    average_concentration = mean(concentration),
    squared_difference = (concentration - average_concentration) ^
      2,
    variance_group = sum(squared_difference) / (n_obs - 1)
  ) %>%
  summarise(variance_component = mean(variance_group / n_obs)) %>%
  group_by(pollutant) %>%
  summarise(
    variance = sum(variance_component),
    standard_error_complete = sqrt(variance)
  ) %>%
  select(-variance) %>%
  mutate(
    pollutant = case_when(
      pollutant == "mean_no2" ~ "NO2",
      pollutant == "mean_o3" ~ "O3",
      pollutant == "mean_pm10" ~ "PM10"
    )
  )

# estimates of the sampling variability for a pair experiment
sampling_variability_pair <- data_se_neyman_pair %>%
  filter(time == 0 & pollutant != "PM2.5") %>%
  select(-time) %>%
  rename(standard_error_pair = standard_error)

# merge the two datasets and display results
left_join(sampling_variability_pair,
          sampling_variability_complete,
          by = c("pollutant")) %>%
  mutate(
    "Precision Improvement (%)" = (standard_error_pair - standard_error_complete) /
      standard_error_complete * 100
  ) %>%
  mutate_at(vars(standard_error_pair, standard_error_complete), ~ round(., 2)) %>%
  mutate(`Precision Improvement (%)` = round(`Precision Improvement (%)`, 0)) %>%
  rename(
    "Pollutant" = pollutant,
    "S.E Pair Experiment" = standard_error_pair,
    "S.E Complete Experiment" = standard_error_complete
  ) %>%
  datatable(.)

Comparison with an Outcome Regression Analysis on Initial Data

We ran a simple time-stratified regression model on the matching data to see how the results differ with those found in our matched data analysis. We only adjust for calendar indicator and weather covariate measured at time \(t\). We load the matching data:

# load matching data
data_matching <-
  readRDS(here::here("inputs", "1.data", "5.matched_data", "matching_data.Rds"))


# reshape in long according to pollutants
data_regression_analysis <- data_matching %>%
  pivot_longer(
    cols = c(
      contains("no2"),
      contains("o3"),
      contains("pm10"),
      contains("pm25")
    ),
    names_to = "variable",
    values_to = "concentration"
  ) %>%
  mutate(
    pollutant = NA %>%
      ifelse(str_detect(variable, "no2"), "NO2", .) %>%
      ifelse(str_detect(variable, "o3"), "O3", .) %>%
      ifelse(str_detect(variable, "pm10"), "PM10", .) %>%
      ifelse(str_detect(variable, "pm25"), "PM2.5", .)
  ) %>%
  mutate(time = 0 %>%
           ifelse(str_detect(variable, "lag_1"),-1, .) %>%
           ifelse(str_detect(variable, "lead_1"), 1, .)) %>%
  select(-variable)

# we nest the data by pollutant and time
data_regression_analysis <- data_regression_analysis %>%
  group_by(pollutant, time) %>%
  nest()

We run the regression model:

# running the model
data_regression_analysis <- data_regression_analysis %>%
  mutate(
    # regression model
    model = map(data, ~lm(concentration ~ is_treated + 
                                 temperature_average + I(temperature_average^2) + 
                                 temperature_average_lag_1 + I(temperature_average_lag_1^2) +
                                 rainfall_duration + rainfall_duration_lag_1 +
                                 humidity_average + humidity_average_lag_1 +
                                 wind_speed + wind_speed_lag_1 +
                                 weekday + holidays_dummy +
                                 bank_day_dummy + month*as.factor(year), data = .)))

# transform in long according to models
data_regression_analysis <- data_regression_analysis %>%
  pivot_longer(cols = c(model), names_to = "model", values_to = "coefficients") %>%
  select(-data)

# tidy regression ouputs
data_regression_analysis <- data_regression_analysis %>%
  mutate(models_dfs = map(coefficients, ~ broom::tidy(.)))

# unnest results and select coefficient for total gross tonnage
data_regression_analysis <- data_regression_analysis %>%  
  unnest(models_dfs) %>%
  dplyr::filter(term=="is_treatedTRUE") %>%
  select(pollutant, time, estimate, std.error)

# we compute the 95% confidence intervals
interval_95 <- -qnorm((1-0.95)/2)

# compute lower and upper bound of each interval
data_regression_analysis <- data_regression_analysis %>%
  mutate(ci_lower_95 = estimate - std.error*interval_95,
         ci_upper_95 = estimate + std.error*interval_95)

We draw the graph of results:

Please show me the code!
# create an indicator to alternate shading of confidence intervals
data_regression_analysis <- data_regression_analysis %>%
  arrange(pollutant, time) %>%
  mutate(stripe = ifelse((time %% 2) == 0, "Grey", "White")) %>%
  ungroup()

# make the graph
graph_regression_ci <-
  ggplot(data_regression_analysis,
         aes(x = as.factor(time), y = estimate)) +
  geom_rect(
    aes(fill = stripe),
    xmin = as.numeric(as.factor(data_regression_analysis$time)) - 0.42,
    xmax = as.numeric(as.factor(data_regression_analysis$time)) + 0.42,
    ymin = -Inf,
    ymax = Inf,
    color = NA,
    alpha = 0.4
  ) +
  geom_hline(yintercept = 0,
             color = "black",
             size = 0.3) +
  geom_pointrange(
    aes(
      x = as.factor(time),
      y = estimate,
      ymin = ci_lower_95 ,
      ymax = ci_upper_95
    ),
    size = 0.5,
    colour = my_blue,
    lwd = 0.8
  ) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 8)) +
  facet_wrap( ~ pollutant, ncol = 4) +
  scale_fill_manual(values = c('gray96', "white")) +
  guides(fill = FALSE) +
  ylab("Average Increase in\n Concentrations (µg/m³)") + xlab("Day") +
  theme_tufte() +
  theme(axis.text.x = element_text(margin = ggplot2::margin(t = 0, unit = "cm")))


# print the graph
graph_regression_ci
Please show me the code!
# save the graph
ggsave(
  graph_regression_ci,
  filename = here::here("inputs", "3.outputs", "2.matching_analysis", "graph_regression_ci.pdf"),
  width = 16,
  height = 8,
  units = "cm",
  device = cairo_pdf
)

We display below the values of the point estimates and 95% confidence intervals:

Model Dependance

Finally, we evaluate whether the matching procedure helped decrease the model dependence. We run 6 regression models—with different types of covariate adjustements—on the initial and matched data

Below is the code for the initial data:

Please show me the code!
data_matching <- data_matching %>%
  select(
    date:bank_day_dummy,
    mean_pm10,
    temperature_average:wind_speed,
    temperature_average_lag_1:wind_speed_lag_1,
    is_treated_lag_1:is_treated
  ) %>%
  mutate(year = as.factor(year),
         temperature_deciles = ntile(temperature_average, 10),
         temperature_deciles_lag_1 = ntile(temperature_average_lag_1, 10))
  

data_model_dep_matching <- data_matching %>%
  nest() %>%
  mutate(
    model_1 = map(data, ~ lm(mean_pm10 ~ is_treated, data = .)),
    model_2 = map(
      data,
      ~ lm(
        mean_pm10 ~ is_treated + weekday + holidays_dummy +
          bank_day_dummy + month * year,
        data = .
      )
    ),
    model_3 = map(
      data,
      ~ lm(
        mean_pm10 ~ is_treated + temperature_average + weekday + holidays_dummy +
          bank_day_dummy + month * year,
        data = .
      )
    ),
    model_4 = map(
      data,
      ~ lm(
        mean_pm10 ~ is_treated + temperature_average + I(temperature_average ^ 2) + weekday + holidays_dummy +
          bank_day_dummy + month * year,
        data = .
      )
    ),
    model_5 = map(
      data,
      ~ lm(
        mean_pm10 ~ is_treated + temperature_deciles + weekday + holidays_dummy +
          bank_day_dummy + month * year,
        data = .
      )
    ),
    model_6 = map(
      data,
      ~ lm(
        mean_pm10 ~ is_treated + temperature_deciles + rainfall_duration + humidity_average + wind_speed + weekday + holidays_dummy +
          bank_day_dummy + month * year,
        data = .
      )
    ),
    model_7 = map(
      data,
      ~ lm(
        mean_pm10 ~ is_treated + is_treated_lag_1 + temperature_deciles + temperature_deciles_lag_1 + rainfall_duration + rainfall_duration_lag_1 + humidity_average + humidity_average_lag_1 + wind_speed + wind_speed_lag_1 + weekday + holidays_dummy +
          bank_day_dummy + month * year,
        data = .
      )
    )
  )


clean_model_output <- function(model_output) {
  model_output %>%
    broom::tidy(., conf.int = TRUE) %>% 
    filter(term == "is_treatedTRUE") %>% 
    select(estimate, conf.low, conf.high)
}
  

data_model_dep_matching <- data_model_dep_matching %>%
  select(-data) %>%
  pivot_longer(cols = c(model_1:model_7), names_to = "model", values_to = "model_output") %>%
  mutate(model_output = map(model_output, ~ clean_model_output(.))) %>%
  unnest(model_output) %>%
  mutate_at(vars(estimate:conf.high), ~ ./mean(data_matching$mean_pm10)*100) %>%
  mutate(dataset = "Initial Data")

And the code for the matched data:

Please show me the code!
data_matched <- data_matched %>%
  select(
    date:bank_day_dummy,
    mean_pm10,
    temperature_average:wind_speed,
    temperature_average_lag_1:wind_speed_lag_1,
    is_treated_lag_1:is_treated
  ) %>%
  mutate(time = 1:nrow(.),
         year = as.factor(year),
         temperature_deciles = ntile(temperature_average, 10),
         temperature_deciles_lag_1 = ntile(temperature_average_lag_1, 10))
  

data_model_dep_matched <- data_matched %>%
  nest() %>%
  mutate(
    model_1 = map(data, ~ lm(mean_pm10 ~ is_treated, data = .)),
    model_2 = map(
      data,
      ~ lm(
        mean_pm10 ~ is_treated + weekday + holidays_dummy +
          bank_day_dummy + month * year,
        data = .
      )
    ),
    model_3 = map(
      data,
      ~ lm(
        mean_pm10 ~ is_treated + temperature_average + weekday + holidays_dummy +
          bank_day_dummy + month * year,
        data = .
      )
    ),
    model_4 = map(
      data,
      ~ lm(
        mean_pm10 ~ is_treated + temperature_average + I(temperature_average ^ 2) + weekday + holidays_dummy +
          bank_day_dummy + month * year,
        data = .
      )
    ),
    model_5 = map(
      data,
      ~ lm(
        mean_pm10 ~ is_treated + temperature_deciles + weekday + holidays_dummy +
          bank_day_dummy + month * year,
        data = .
      )
    ),
    model_6 = map(
      data,
      ~ lm(
        mean_pm10 ~ is_treated + temperature_deciles + rainfall_duration + humidity_average + wind_speed + weekday + holidays_dummy +
          bank_day_dummy + month * year,
        data = .
      )
    ),
    model_7 = map(
      data,
      ~ lm(
        mean_pm10 ~ is_treated + is_treated_lag_1 + temperature_deciles + temperature_deciles_lag_1 + rainfall_duration + rainfall_duration_lag_1 + humidity_average + humidity_average_lag_1 + wind_speed + wind_speed_lag_1 + weekday + holidays_dummy +
          bank_day_dummy + month * year,
        data = .
      )
    )
  )


clean_model_output <- function(model_output) {
  model_output %>%
    broom::tidy(., conf.int = TRUE) %>% 
    filter(term == "is_treatedTRUE") %>% 
    select(estimate, conf.low, conf.high)
}
  

data_model_dep_matched <- data_model_dep_matched %>%
  select(-data) %>%
  pivot_longer(cols = c(model_1:model_7), names_to = "model", values_to = "model_output") %>%
  mutate(model_output = map(model_output, ~ clean_model_output(.))) %>%
  unnest(model_output) %>%
  mutate_at(vars(estimate:conf.high), ~ ./mean(data_matching$mean_pm10)*100) %>%
  mutate(dataset = "Matched Data")

We displayed the results:

Please show me the code!
data_model_dep_all <- bind_rows(data_model_dep_matching, data_model_dep_matched) %>%
  arrange(model) %>%
  mutate(stripe = ifelse((readr::parse_number(model) %% 2) == 0, "Grey", "White")) %>%
  ungroup() %>%
  mutate(model = paste("Model", readr::parse_number(model), sep = " "))

# make the graph
graph_model_dep_all <-
  ggplot(data_model_dep_all,
         aes(x = estimate, y = model)) +
  geom_rect(
    aes(fill = stripe),
    ymin = as.numeric(as.factor(data_model_dep_all$model)) - 0.42,
    ymax = as.numeric(as.factor(data_model_dep_all$model)) + 0.42,
    xmin = -Inf,
    xmax = Inf,
    color = NA,
    alpha = 0.4
  ) +
  geom_vline(xintercept = 0,
             color = "black",
             size = 0.3) +
  geom_pointrange(
    aes(
      y = model,
      x = estimate,
      xmin = conf.low ,
      xmax = conf.high
    ),
    size = 0.5,
    colour = my_blue,
    lwd = 0.8
  ) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 8)) +
  facet_wrap( ~ dataset, ncol = 4) +
  scale_fill_manual(values = c('gray96', "white")) +
  guides(fill = FALSE) +
  ylab("") + xlab("Relative Increase in Concentrations (%)") +
  theme_tufte() 


# print the graph
graph_model_dep_all
Please show me the code!
# save the graph
ggsave(
  graph_model_dep_all,
  filename = here::here("inputs", "3.outputs", "2.matching_analysis", "graph_model_dep_all.pdf"),
  width = 18,
  height = 6,
  units = "cm",
  device = cairo_pdf
)

Corrections

If you see mistakes or want to suggest changes, please create an issue on the source repository.