Outcome Regression Approach

Comparing days with cruise traffic to days without. Adjusting for calendar and weather indicators.

Marie-Abèle Bind https://scholar.harvard.edu/marie-abele (Biostatistics Center, Massachusetts General Hospital)https://biostatistics.massgeneral.org/faculty/marie-abele-bind-phd/ , Marion Leroutier https://www.parisschoolofeconomics.eu/en/leroutier-marion/work-in-progress/ (Misum, Stockholm School of Economics)https://www.hhs.se/en/persons/l/leroutier-marion/ , Léo Zabrocki https://lzabrocki.github.io/ (RFF-CMCC EIEE)https://www.eiee.org/
2022-08-03

In this document, we compare the results found with our matching analysis with those found with an outcome regression approach.

Should you have any questions, need help to reproduce the analysis or find coding errors, please do not hesitate to contact us at and .

Required Packages and Data

We load the following packages:

# load required packages
library(knitr) # for creating the R Markdown document
library(here) # for files paths organization
library(tidyverse) # for data manipulation and visualization
library(broom) # for cleaning regression outputs
library(randChecks) # for randomization check
library(lmtest) # for modifying regression standard errors
library(sandwich) # for robust and cluster robust standard errors
library(kableExtra) # for table formatting
library(Cairo) # for printing custom police of graphs

We load our custom ggplot2 theme for graphs:

# load ggplot custom theme
source(here::here("inputs",
                  "2.functions",
                  "script_theme_tufte.R"))
# define nice colors
my_blue <- "#0081a7"
my_orange <- "#fb8500"

And we load the matching data:

# load matching data
data_matching <-
  readRDS(
    here::here(
      "inputs",
      "1.data",
      "2.daily_data",
      "2.data_for_analysis",
      "1.cruise_experiment",
      "matching_data.rds"
    )
  ) %>%
  mutate(year = as.factor(year)) %>%
  select(
    is_treated,
    year,
    month,
    weekday,
    holidays_dummy,
    bank_day_dummy,
    total_gross_tonnage_ferry:total_gross_tonnage_other_boat,
    temperature_average:wind_speed,
    wind_direction_east_west,
    holidays_dummy_lag_1,
    bank_day_dummy_lag_1,
    total_gross_tonnage_cruise_lag_1:total_gross_tonnage_other_boat_lag_1,
    temperature_average_lag_1:wind_speed_lag_1,
    wind_direction_east_west_lag_1,
    contains("no2"),
    contains("o3"),
    contains("pm10"),
    contains("pm25"),
    contains("so2")
  )

Balance check

We first evaluate as the overall balance of our treatment before matching by implementing the randomization inference approach proposed by Zach Branson (2021):

# we select covariates
data_covs <- data_matching %>%
  select(
    is_treated,
    year,
    month,
    weekday,
    holidays_dummy,
    bank_day_dummy,
    total_gross_tonnage_ferry:total_gross_tonnage_other_boat,
    temperature_average:wind_speed,
    wind_direction_east_west,
    holidays_dummy_lag_1,
    bank_day_dummy_lag_1,
    total_gross_tonnage_cruise_lag_1:total_gross_tonnage_other_boat_lag_1,
    temperature_average_lag_1:wind_speed_lag_1,
    wind_direction_east_west_lag_1,
    mean_no2_l_lag_1:mean_o3_l_lag_1
  ) %>%
  drop_na()

data_covs <- data_covs %>%
  group_by(year, month) %>%
  mutate(year_month_block = cur_group_id())

# recode some variables
data_covs <- data_covs %>%
  mutate(is_treated = ifelse(is_treated == TRUE, 1, 0)) %>%
  mutate_at(
    vars(wind_direction_east_west , wind_direction_east_west_lag_1),
    ~ ifelse(. == "West", 1, 0)
  ) %>%
  fastDummies::dummy_cols(., select_columns = c("year", "month", "weekday")) %>%
  select(-c("year", "month", "weekday"))

# format data for asIfRandPlot() function
is_treated <- data_covs$is_treated
year_month_block <- data_covs$year_month_block

data_covs <- data_covs %>%
  select(-is_treated, -year_month_block) %>%
  select(-year_2018,-month_December, -weekday_Sunday)

# run balance check
asIfRandPlot(
  X.matched = data_covs,
  indicator.matched = is_treated,
  assignment = c("complete", "blocked"),
  subclass = year_month_block,
  perms = 1000
)

Regression Analysis

We run an outcome regression analysis where we regress the concentration of air pollutants on the treatment indiciator, measures of other vessels traffic, weather parameters and calendar indicators:

Please show me the code!
# define outcome regression function
function_reg_analysis <- function(data) {
  # we fit the regression model
  model <-
    lm(
      concentration ~ is_treated + total_gross_tonnage_ferry + total_gross_tonnage_other_boat +
        total_gross_tonnage_ferry_lag_1 + total_gross_tonnage_other_boat_lag_1 +
        temperature_average + temperature_average_lag_1 +
        humidity_average + humidity_average_lag_1 +
        rainfall_height_dummy + rainfall_height_dummy_lag_1 +
        wind_direction_east_west + wind_direction_east_west_lag_1 +
        wind_speed + wind_speed_lag_1 +
        weekday + holidays_dummy + bank_day_dummy + month * year,
      data = data
    )
  
  # retrieve the estimate and 95% ci
  results_reg <- tidy(coeftest(model, vcov = vcovHC),
                      conf.int = TRUE) %>%
    filter(term == "is_treatedTRUE") %>%
    select(term, estimate, conf.low, conf.high)
  
  # return output
  return(results_reg)
}

# reshape in long according to pollutants
data_reg_analysis <- data_matching %>%
  pivot_longer(cols = c(contains("no2"), contains("o3"), contains("pm10"), contains("pm25"), contains("so2")), names_to = "variable", values_to = "concentration") %>%
  mutate(pollutant = NA %>%
           ifelse(str_detect(variable, "no2_l"), "NO2 Longchamp",.) %>%
           ifelse(str_detect(variable, "no2_sl"), "NO2 Saint-Louis",.) %>%
           ifelse(str_detect(variable, "o3"), "O3 Longchamp",.) %>%
           ifelse(str_detect(variable, "pm10_l"), "PM10 Longchamp",.) %>%
           ifelse(str_detect(variable, "pm10_sl"), "PM10 Saint-Louis",.) %>%
           ifelse(str_detect(variable, "pm25"), "PM2.5 Longchamp",.) %>%
           ifelse(str_detect(variable, "so2"), "SO2 Lonchamp",.)) %>%
  mutate(time = 0 %>%
           ifelse(str_detect(variable, "lag_1"), -1, .) %>%
           ifelse(str_detect(variable, "lead_1"), 1, .)) %>%
  select(-variable)

# we nest the data by pollutant and time
data_reg_analysis <- data_reg_analysis %>% 
  group_by(pollutant, time) %>%
  nest()

# run outcome regression analysis
data_reg_analysis <- data_reg_analysis %>%
  mutate(result = map(data, ~ function_reg_analysis(.))) %>%
  select(-data) %>%
  unnest(cols = c(result)) %>%
  select(-term) %>%
  rename("mean_difference" = "estimate", "ci_lower_95" = "conf.low", "ci_upper_95" = "conf.high") %>%
  mutate(analysis = "Outcome Regression")

# load matching results
data_neyman <- readRDS(
  here(
    "inputs",
    "1.data",
    "2.daily_data",
    "2.data_for_analysis",
    "1.cruise_experiment",
    "data_neyman.rds"
  )
)

data_neyman <- data_neyman %>%
  mutate(analysis = "Constrained Pair Matching")

# merge them with outcome regression results
data_neyman_reg <- bind_rows(data_neyman, data_reg_analysis)

# create an indicator to alternate shading of confidence intervals
data_neyman_reg <- data_neyman_reg %>%
  arrange(pollutant, time) %>%
  mutate(stripe = ifelse((time %% 2) == 0, "Grey", "White")) %>%
  ungroup()

# make the graph
graph_reg_analysis <-
  ggplot(
    data_neyman_reg,
    aes(
      x = as.factor(time),
      y = mean_difference,
      ymin = ci_lower_95,
      ymax = ci_upper_95,
      colour = analysis,
      shape = analysis
    )
  ) +
  geom_rect(
    aes(fill = stripe),
    xmin = as.numeric(as.factor(data_neyman_reg$time)) - 0.42,
    xmax = as.numeric(as.factor(data_neyman_reg$time)) + 0.42,
    ymin = -Inf,
    ymax = Inf,
    color = NA,
    alpha = 0.4
  ) +
  geom_hline(yintercept = 0, color = "black") +
  geom_pointrange(position = position_dodge(width = 1), size = 1.2) +
  scale_shape_manual(name = "Analysis:", values = c(16, 17)) +
  scale_color_manual(name = "Analysis:", values = c(my_orange, my_blue)) +
  facet_wrap( ~ pollutant, ncol = 4) +
  scale_fill_manual(values = c('grey90', "white")) +
  guides(fill = FALSE) +
  ylab("Average Difference \nin Concentrations (µg/m³)") + xlab("Day") +
  theme_tufte()

# display the graph
graph_reg_analysis
Please show me the code!
# save the graph
ggsave(
  graph_reg_analysis + labs(title = NULL),
  filename = here::here(
    "inputs",
    "3.outputs",
    "2.daily_analysis",
    "2.analysis_pollution",
    "1.cruise_experiment",
    "3.outcome_regression",
    "graph_reg_analysis.pdf"
  ),
  width = 30,
  height = 15,
  units = "cm",
  device = cairo_pdf
)

Corrections

If you see mistakes or want to suggest changes, please create an issue on the source repository.