In this document, we take great care providing all steps and R codes required to compare the matched data to the initial data. We compare days where:

treated units are days where winds blow from the North-East in t.
control units are day winds blow from other directions in t.

We adjust for calendar indicators and weather confouding factors.

Should you have any questions, need help to reproduce the analysis or find coding errors, please do not hesitate to contact me at leo.zabrocki@psemail.eu

Required Packages

To reproduce exactly the 2_script_comparing_two_datasets.html document, we first need to have installed:

the R programming language
RStudio, an integrated development environment for R, which will allow you to knit the 2_script_comparing_two_datasets.Rmd file and interact with the R code chunks
the R Markdown package
and the Distill package which provides the template for this document.

Once everything is set up, we have to load the following packages:

# load required packages
library(knitr) # for creating the R Markdown document
library(here) # for files paths organization
library(tidyverse) # for data manipulation and visualization
library(ggridges) # for ridge density plots
library(kableExtra) # for table formatting
library(Cairo) # for printing customed police of graphs
library(patchwork) # combining plots

We finally load our custom ggplot2 theme for graphs:

# load ggplot custom theme
source(here::here(
  "inputs",
  "2.functions",
  "script_theme_tufte.R"
))
# define nice colors
my_blue <- "#0081a7"
my_orange <- "#fb8500"

Comparing Distribution of Covariates in Matched and Initial Datasets

We explore the characteristics of the matched data by comparing the distribution of its covariates to those of the matching data. We load the two datasets and bind them in the data_all object:

# load matching data
data_matching <-
  readRDS(here::here("inputs", "1.data", "5.matched_data", "matching_data.Rds")) %>%
  mutate(dataset = "Initial Data")

# load matched data
data_matched <-
  readRDS(here::here("inputs", "1.data", "5.matched_data", "matched_data.Rds")) %>%
  mutate(dataset = "Matched Data")

# bind the three datasets
data_all <- bind_rows(data_matching, data_matched)

Weather Covariates

We plot below the density distributions of continuous weather covariates for the two datasets:

Please show me the code!

# we select continuous weather variables and store them in a long dataframe
data_continuous_weather_variables <- data_all %>%
  select(temperature_average, wind_speed, humidity_average, dataset) %>%
  pivot_longer(
    .,
    cols =  c(temperature_average:humidity_average),
    names_to = "variable",
    values_to = "values"
  ) %>%
  mutate(
    variable =  factor(
      variable,
      levels = c("temperature_average", "humidity_average", "wind_speed")
    ) %>%
      fct_recode(
        .,
        "Temperature Average (°C)" = "temperature_average",
        "Average Humidity (%)" = "humidity_average",
        "Wind Speed (m/s)" = "wind_speed"
      )
  ) %>%
  mutate(dataset = fct_relevel(dataset, "Initial Data", "Matched Data"))

# we plot the density distributions
graph_density_continuous_weather_variables <-
  ggplot(data_continuous_weather_variables,
         aes(
           x = values,
           y = fct_rev(dataset),
           fill = fct_rev(dataset)
         )) +
  geom_density_ridges(colour = NA) +
  scale_fill_manual(values = c(my_blue, my_orange),
                    guide = guide_legend(reverse = TRUE)) +
  xlab("Covariate Value") + ylab("") +
  labs(fill = "Dataset:") +
  facet_wrap( ~ variable, scale = "free_x", ncol = 3) +
  theme_tufte() +
  theme(
    legend.position = "top",
    legend.justification = "left",
    legend.direction = "horizontal",
    axis.text.x = element_text(
      family = "Roboto Condensed",
      color = "gray18",
      size = 8,
      margin = ggplot2::margin(t = 0, unit = "cm")
    )
  )

# print the graph
graph_density_continuous_weather_variables

We plot the proportion of weather categorical variables for the two datasets

Please show me the code!

# we select categorical weather variables and store them in a long dataframe
data_categorical_weather_variables <- data_all %>%
  # select relevant variables
  select(wind_direction_categories, rainfall_duration, dataset) %>%
  drop_na() %>%
  mutate(rainfall_duration = paste("Rainfall Duration:", rainfall_duration, sep = " "))  %>%
  mutate(
    rainfall_duration = case_when(
      rainfall_duration == "Rainfall Duration:   0" ~ "Rainfall Duration: 0 min",
      rainfall_duration == "Rainfall Duration: [  1,  12)" ~ "Rainfall Duration: [1, 12) min",
      rainfall_duration == "Rainfall Duration: [ 12, 159)" ~ "Rainfall Duration: [12, 159) min",
      rainfall_duration == "Rainfall Duration: [159,1440]" ~ "Rainfall Duration: [159, 1440] min"
    )
  ) %>%
  mutate(
    wind_direction_categories = fct_recode(
      wind_direction_categories,
      "Wind Direction: North-East" = "North-East",
      "Wind Direction: South-East" = "South-East",
      "Wind Direction: South-West" = "South-West",
      "Wind Direction: North-West" = "North-West"
    )
  ) %>%
  # transform variables to character
  mutate_all( ~ as.character(.)) %>%
  # transform the data to long to compute the proportion of observations for each variable
  pivot_longer(cols = -c(dataset),
               names_to = "variable",
               values_to = "values") %>%
  # group by dataset, variable and values
  group_by(dataset, variable, values) %>%
  # compute the number of observations
  summarise(n = n()) %>%
  # compute the proportion
  mutate(freq = round(n / sum(n) * 100, 0)) %>%
  # reorder labels of the dataset variable
  ungroup() %>%
  mutate(dataset = fct_relevel(dataset, "Initial Data", "Matched Data"))

# we plot the cleveland dots plots
graph_categorical_weather_variables <-
  ggplot(data_categorical_weather_variables,
         aes(
           x = freq,
           y = fct_rev(dataset),
           fill =  fct_rev(dataset)
         )) +
  geom_segment(aes(
    x = 0,
    xend = freq,
    y =  fct_rev(dataset),
    yend =  fct_rev(dataset)
  )) +
  geom_point(shape = 21,
             color = "black",
             size = 4) +
  scale_fill_manual(values = c(my_blue, my_orange),
                    guide = guide_legend(reverse = TRUE)) +
  facet_wrap( ~ values, scale = "free_x", ncol = 3) +
  ylab("") +
  xlab("Proportion (%)") +
  labs(fill = "Dataset:") +
  theme_tufte() +
  theme(
    legend.position = "top",
    legend.justification = "left",
    legend.direction = "horizontal",
    axis.text.x = element_text(
      family = "Roboto Condensed",
      color = "gray18",
      size = 8,
      margin = ggplot2::margin(t = 0, unit = "cm")
    )
  )

# print the graph
graph_categorical_weather_variables

We combine the graph_density_continuous_weather_variables and graph_categorical_weather_variables :

Please show me the code!

# combine plots
graph_weather_three_datasets <- graph_density_continuous_weather_variables / graph_categorical_weather_variables +
  plot_annotation(tag_levels = 'A') & theme(plot.tag = element_text(size = 20, face = "bold"))

# display graph
graph_weather_three_datasets

Please show me the code!

# save the plot
ggsave(graph_weather_three_datasets, filename = here::here("inputs", "3.outputs", "2.matching_analysis", "graph_weather_two_datasets.pdf"), 
       width = 35, height = 20, units = "cm", device = cairo_pdf)

Calendar Indicators

We plot the proportions of observations belonging to each day of the week by dataset:

Please show me the code!

# compute the proportions of observations belonging to each day of the week by dataset
data_weekday <- data_all %>%
  mutate(weekday = lubridate::wday(date, abbr = FALSE, label = TRUE)) %>%
  select(weekday, dataset) %>%
  mutate(
    weekday = fct_relevel(
      weekday,
      "Monday",
      "Tuesday",
      "Wednesday",
      "Thursday",
      "Friday",
      "Saturday",
      "Sunday"
    )
  ) %>%
  pivot_longer(.,-dataset) %>%
  group_by(name, dataset, value) %>%
  summarise(n = n()) %>%
  mutate(proportion = round(n / sum(n) * 100, 0)) %>%
  ungroup() %>%
  mutate(dataset = fct_relevel(dataset, "Initial Data", "Matched Data"))

# we plot the data using cleveland dot plots
graph_weekday <-
  ggplot(data_weekday,
         aes(
           x = as.factor(value),
           y = proportion,
           colour = dataset,
           group = dataset
         )) +
  geom_line(size = 1) +
  scale_colour_manual(values = c(my_orange, my_blue),
                      guide = guide_legend(reverse = FALSE)) +
  ggtitle("Day of the Week") +
  ylab("Proportion (%)") +
  xlab("") +
  labs(colour = "Dataset:") +
  theme_tufte() +
  theme(
    legend.position = "top",
    legend.justification = "left",
    legend.direction = "horizontal"
  )

# we print the graph
graph_weekday

We plot the proportions of observations belonging to bank days and holidays by dataset:

Please show me the code!

# compute the proportions of observations belonging to bank days and holidays by dataset
data_bank_holidays <- data_all %>%
  select(bank_day_dummy, holidays_dummy, dataset) %>%
  pivot_longer(.,-dataset) %>%
  mutate(name = recode(name, bank_day_dummy = "Bank Day", holidays_dummy = "Holidays")) %>%
  group_by(name, dataset, value) %>%
  summarise(n = n()) %>%
  mutate(proportion = round(n / sum(n) * 100, 0)) %>%
  ungroup() %>%
  mutate(dataset = fct_relevel(dataset, "Data", "Initial Data", "Matched Data")) %>%
  filter(value == 1) %>%
  mutate(name = paste(name, ": True", sep = ""))

# we plot the data using cleveland dot plots
graph_bank_holidays <-
  ggplot(data_bank_holidays,
         aes(
           x = proportion,
           y = as.factor(dataset),
           fill = dataset
         )) +
  geom_segment(aes(
    x = 0,
    xend = proportion,
    y =  fct_rev(dataset),
    yend =  fct_rev(dataset)
  )) +
  geom_point(shape = 21,
             colour = "black",
             size = 4) +
  scale_fill_manual(values = c(my_orange, my_blue),
                    guide = guide_legend(reverse = FALSE)) +
  facet_wrap( ~ name) +
  ggtitle("Bank Days and Holidays") +
  ylab("Proportion (%)") +
  xlab("") +
  labs(fill = "Dataset:") +
  theme_tufte() +
  theme(
    legend.position = "top",
    legend.justification = "left",
    legend.direction = "horizontal"
  )

# we print the graph
graph_bank_holidays

We plot the proportions of observations belonging to each month by dataset:

Please show me the code!

# compute the proportions of observations belonging to each month by dataset
data_month <- data_all %>%
  select(month, dataset) %>%
  mutate(
    month = recode(
      month,
      `1` = "January",
      `2` = "February",
      `3` = "March",
      `4` = "April",
      `5` = "May",
      `6` = "June",
      `7` = "July",
      `8` = "August",
      `9` = "September",
      `10` = "October",
      `11` = "November",
      `12` = "December"
    ) %>%
      fct_relevel(
        .,
        "January",
        "February",
        "March",
        "April",
        "May",
        "June",
        "July",
        "August",
        "September",
        "October",
        "November",
        "December"
      )
  ) %>%
  pivot_longer(.,-dataset) %>%
  group_by(name, dataset, value) %>%
  summarise(n = n()) %>%
  mutate(proportion = round(n / sum(n) * 100, 0)) %>%
  ungroup() %>%
  mutate(dataset = fct_relevel(dataset, "Initial Data", "Matched Data"))


# we plot the data using cleveland dot plots
graph_month <-
  ggplot(data_month,
         aes(
           x = as.factor(value),
           y = proportion,
           colour = dataset,
           group = dataset
         )) +
  geom_line(size = 1) +
  scale_colour_manual(values = c(my_orange, my_blue),
                      guide = guide_legend(reverse = FALSE)) +
  ggtitle("Month") +
  ylab("Proportion (%)") +
  xlab("") +
  labs(colour = "Dataset:") +
  theme_tufte() +
  theme(
    legend.position = "top",
    legend.justification = "left",
    legend.direction = "horizontal"
  )

# we print the graph
graph_month

We plot the proportions of observations belonging to each year by dataset:

Please show me the code!

# compute the proportions of observations belonging to each year by dataset
data_year <- data_all %>%
  select(year, dataset) %>%
  pivot_longer(.,-dataset) %>%
  group_by(name, dataset, value) %>%
  summarise(n = n()) %>%
  mutate(proportion = round(n / sum(n) * 100, 0)) %>%
  ungroup() %>%
  mutate(dataset = fct_relevel(dataset, "Initial Data", "Matched Data"))


# we plot the data using cleveland dot plots
graph_year <-
  ggplot(data_year,
         aes(
           x = as.factor(value),
           y = proportion,
           colour = dataset,
           group = dataset
         )) +
  geom_line(size = 1) +
  scale_colour_manual(values = c(my_orange, my_blue),
                      guide = guide_legend(reverse = FALSE)) +
  ggtitle("Year") +
  ylab("Proportion (%)") +
  xlab("") +
  labs(colour = "Dataset:") +
  theme_tufte() +
  theme(
    legend.position = "top",
    legend.justification = "left",
    legend.direction = "horizontal"
  )

# we print the graph
graph_year

We combine all plots for calendar variables:

Please show me the code!

# combine plots
graph_calendar_three_datasets <-
  (graph_weekday + graph_bank_holidays) / (graph_month + graph_year) +
  plot_annotation(tag_levels = 'A') &
  theme(plot.tag = element_text(size = 20, face = "bold"))

# display
graph_calendar_three_datasets

Please show me the code!

# save the plot
ggsave(
  graph_calendar_three_datasets,
  filename = here::here(
    "inputs", "3.outputs",
    "2.matching_analysis",
    "graph_calendar_two_datasets.pdf"
  ),
  width = 40,
  height = 20,
  units = "cm",
  device = cairo_pdf
)

Comparing the Matched Data to the Initial Data

Required Packages

Comparing Distribution of Covariates in Matched and Initial Datasets

Weather Covariates

Calendar Indicators

Corrections