Wind Patterns & Air Pollution in Paris, France.
In this document, we carry out an exploratory data analysis to understand the relationships of wind patterns with other variables:
Should you have any questions, need help to reproduce the analysis or find coding errors, please do not hesitate to contact me at leo.zabrocki@psemail.eu
To reproduce exactly the script_eda.html
document, we first need to have installed:
script_eda.Rmd
file and interact with the R code chunksOnce everything is set up, we have to load the following packages:
We load our custom ggplot2
theme for graphs:
We finally load the data:
We explore general patterns related to wind directions.
We plot the distribution of PM10 concentration by wind direction:
# make the graph
graph_wd_pm10 <- data %>%
ggplot(., aes(x = mean_pm10, y = fct_rev(wind_direction_categories))) +
geom_density_ridges(
color = NA,
size = 0.3,
fill = my_blue,
alpha = 0.8
) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 8)) +
xlab("Concentration (µg/m³)") +
ylab("Wind Direction") +
theme_tufte() +
theme(axis.title.y = element_text(angle = 90, hjust = 1))
# display the graph
graph_wd_pm10
# save the graph
ggsave(
graph_wd_pm10,
filename = here::here("inputs", "3.outputs", "1.eda", "graph_wd_pm10.pdf"),
width = 15,
height = 8,
units = "cm",
device = cairo_pdf
)
We plot the distribution of all pollutant concentrations by wind direction:
# make the graph
graph_wd_pollutants <- data %>%
rename(
"PM10" = mean_pm10,
"PM2.5" = mean_pm25,
"NO2" = mean_no2,
"O3" = mean_o3
) %>%
pivot_longer(
cols = c(PM10, PM2.5, NO2, O3),
names_to = "pollutant",
values_to = "concentration"
) %>%
ggplot(., aes(x = concentration, y = fct_rev(wind_direction_categories))) +
geom_density_ridges(
color = NA,
size = 0.3,
fill = my_blue,
alpha = 0.8
) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 8)) +
facet_wrap( ~ pollutant, scales = "free") +
xlab("Concentration (µg/m³)") +
ylab("Wind Direction") +
theme_tufte()
# display the graph
graph_wd_pollutants
# save the graph
ggsave(
graph_wd_pollutants,
filename = here::here( "inputs", "3.outputs", "1.eda", "graph_wd_pollutants.pdf"),
width = 20,
height = 12,
units = "cm",
device = cairo_pdf
)
Using the openair
package, we predict air pollutant concentrations using wind components:
# select relevant variables for the polar plots
data_polar_plots <- data %>%
select(wind_speed,
wind_direction,
mean_no2,
mean_o3,
mean_pm10,
mean_pm25) %>%
rename('Wind Speed' = wind_speed)
# make the polarPlot for all pollutants
no2_plot <-
polarPlot(
data_polar_plots,
pollutant = "mean_no2",
x = "Wind Speed",
wd = "wind_direction",
main = "Average NO2 (' * mu * 'g/m' ^3 *')",
key.header = "",
key.footer = "",
resolution = "fine",
par.settings = list(fontsize = list(text = 8))
)
o3_plot <-
polarPlot(
data_polar_plots,
pollutant = "mean_o3",
x = "Wind Speed",
wd = "wind_direction",
main = "Average O3 (' * mu * 'g/m' ^3 *')",
key.header = "",
key.footer = "",
resolution = "fine",
par.settings = list(fontsize = list(text = 8))
)
pm10_plot <-
polarPlot(
data_polar_plots,
pollutant = "mean_pm10",
x = "Wind Speed",
wd = "wind_direction",
main = "Average PM10 (' * mu * 'g/m' ^3 *')",
key.header = "",
key.footer = "",
resolution = "fine",
par.settings = list(fontsize = list(text = 8))
)
pm25_plot <-
polarPlot(
data_polar_plots,
pollutant = "mean_pm25",
x = "Wind Speed",
wd = "wind_direction",
main = "Average PM2.5 (' * mu * 'g/m' ^3 *')",
key.header = "",
key.footer = "",
resolution = "fine",
par.settings = list(fontsize = list(text = 8))
)
# save the graph
png(
here::here("inputs", "3.outputs", "1.eda", "graph_polar_plots_pollutants.png"),
width = 20,
height = 10,
units = "cm",
res = 1000
)
print(no2_plot, split = c(1, 1, 4, 1), more = TRUE)
print(o3_plot, split = c(2, 1, 4, 1), more = TRUE)
print(pm10_plot, split = c(3, 1, 4, 1), more = TRUE)
print(pm25_plot, split = c(4, 1, 4, 1), more = FALSE)
dev.off()
We plot the distribution of continuous weather parameter by wind direction:
# make the graph
graph_wd_weather <- data %>%
rename(
"Average Temperature (°C)" = temperature_average,
"Average Humidity (%)" = humidity_average,
"Wind Speed (m/s)" = wind_speed
) %>%
pivot_longer(
cols = c(
"Average Temperature (°C)",
"Average Humidity (%)",
"Wind Speed (m/s)"
),
names_to = "weather_parameter",
values_to = "value"
) %>%
mutate(
horizontal_lines = case_when(
weather_parameter == "Average Temperature (°C)" ~ -6,
weather_parameter == "Average Humidity (%)" ~ 25,
weather_parameter == "Wind Speed (m/s)" ~ 0
)
) %>%
ggplot(., aes(x = value, y = wind_direction_categories)) +
geom_density_ridges(
color = NA,
size = 0.3,
fill = my_blue,
alpha = 0.8
) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 8)) +
facet_wrap(~ weather_parameter, scales = "free") +
xlab("") +
ylab("Wind Direction") +
theme_tufte()
# display the graph
graph_wd_weather
# save the graph
ggsave(
graph_wd_weather,
filename = here::here( "inputs", "3.outputs", "1.eda", "graph_wd_weather.pdf"),
width = 30,
height = 10,
units = "cm",
device = cairo_pdf
)
We plot the distribution of rainfall duration by wind direction:
# make the graph
graph_wd_rainfall <- data %>%
select(rainfall_duration, wind_direction_categories) %>%
group_by(wind_direction_categories, rainfall_duration) %>%
summarise(n = n()) %>%
mutate(proportion = n / sum(n) * 100) %>%
ungroup() %>%
mutate(
rainfall_duration = case_when(
rainfall_duration == " 0" ~ "0 minute",
rainfall_duration == "[ 1, 12)" ~ "[1, 12) minutes",
rainfall_duration == "[ 12, 159)" ~ "[12, 159) minutes",
rainfall_duration == "[159,1440]" ~ "[159,1440] minutes"
)
) %>%
mutate(
rainfall_duration = fct_relevel(
rainfall_duration,
"0 minute",
"[1, 12) minutes",
"[12, 159) minutes",
"[159,1440] minutes"
)
) %>%
ggplot(., aes(x = proportion, y = rainfall_duration)) +
geom_point(shape = 16,
colour = my_blue,
size = 4) +
geom_vline(xintercept = 0,
size = 0.3,
colour = "black") +
facet_wrap( ~ wind_direction_categories, ncol = 4) +
xlab("Proportion (%)") + ylab("") +
theme_tufte() +
theme(axis.text.y = element_text(hjust = 1))
# display the graph
graph_wd_rainfall
# save the graph
ggsave(
graph_wd_rainfall,
filename = here::here( "inputs", "3.outputs", "1.eda", "graph_wd_rainfall.pdf"),
width = 25,
height = 10,
units = "cm",
device = cairo_pdf
)
We plot the distribution of wind direction by month:
# make the graph
graph_wd_month <- data %>%
select(month, wind_direction_categories) %>%
group_by(wind_direction_categories, month) %>%
summarise(n = n()) %>%
mutate(proportion = n / sum(n) * 100) %>%
ungroup() %>%
ggplot(., aes(x = month, y = proportion, group = "l")) +
geom_line(colour = "gray80") +
geom_point(colour = my_blue, size = 3) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 7)) +
facet_wrap( ~ wind_direction_categories, ncol = 2) +
xlab("") + ylab("Proportion (%)") +
theme_tufte()
# display the graph
graph_wd_month
# save the graph
ggsave(
graph_wd_month,
filename = here::here( "inputs", "3.outputs", "1.eda", "graph_wd_month.pdf"),
width = 35,
height = 15,
units = "cm",
device = cairo_pdf
)
We plot the distribution of wind direction by year:
# make the graph
graph_wd_year <- data %>%
select(year, wind_direction_categories) %>%
mutate(year = as.factor(year)) %>%
group_by(wind_direction_categories, year) %>%
summarise(n = n()) %>%
mutate(proportion = n / sum(n) * 100) %>%
ungroup() %>%
ggplot(., aes(x = year, y = proportion, group = "l")) +
geom_line(colour = "gray80") +
geom_point(colour = my_blue, size = 3) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 7)) +
facet_wrap( ~ wind_direction_categories, ncol = 2) +
xlab("") + ylab("Proportion (%)") +
theme_tufte()
# display the graph
graph_wd_year
# save the graph
ggsave(
graph_wd_year,
filename = here::here( "inputs", "3.outputs", "1.eda", "graph_wd_year.pdf"),
width = 30,
height = 15,
units = "cm",
device = cairo_pdf
)
We check if covariates are balanced when we define treated days as units with North-East winds and control days as units where winds blow from other directions.
The treatment is defined as follows:
# construct treatment allocation variable
data <- data %>%
mutate(is_treated = ifelse(
wind_direction_categories == "North-East",
"Treated",
"Control"
))
We plot density distributions for continuous weather covariate by treatment status:
# we select control variables and store them in a long dataframe
data_weather_continuous_variables <- data %>%
select(temperature_average,
humidity_average,
wind_speed,
is_treated) %>%
pivot_longer(
cols = -c(is_treated),
names_to = "variable",
values_to = "values"
) %>%
mutate(
variable = NA %>%
ifelse(
str_detect(variable, "temperature_average"),
"Average Temperature (°C)",
.
) %>%
ifelse(
str_detect(variable, "humidity_average"),
"Humidity Average (%)",
.
) %>%
ifelse(str_detect(variable, "wind_speed"), "Wind Speed (m/s)", .)
)
# make the graph
graph_boxplot_continuous_weather <-
ggplot(data_weather_continuous_variables,
aes(x = values, y = is_treated, fill = is_treated)) +
geom_density_ridges(colour = NA) +
scale_fill_manual(values = c(my_blue, my_orange)) +
scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
xlab("Covariate Value") +
ylab("") +
labs(fill = "Units Status:") +
facet_wrap( ~ variable, scale = "free", ncol = 3) +
theme_tufte()
# we print the graph
graph_boxplot_continuous_weather
# save the graph
ggsave(
graph_boxplot_continuous_weather,
filename = here::here( "inputs", "3.outputs", "1.eda", "graph_boxplot_continuous_weather.pdf"),
width = 30,
height = 10,
units = "cm",
device = cairo_pdf
)
We draw love plots which display the standardized mean differences between treated and control days for each weather covariate and by month:
# compute absolute differences
data_abs_difference <- data %>%
select(month,
is_treated,
temperature_average,
humidity_average,
wind_speed) %>%
group_by(month, is_treated) %>%
summarise_all(., ~ mean(., na.rm = TRUE)) %>%
pivot_longer(
cols = -c(month, is_treated),
names_to = "variable",
values_to = "average"
) %>%
arrange(month, variable) %>%
group_by(month, variable) %>%
summarise(abs_difference = abs(average[2] - average[1]))
# we compute the standard deviation of each treatment group covariate
data_sd <- data %>%
select(month,
is_treated,
temperature_average,
humidity_average,
wind_speed) %>%
filter(is_treated == "Treated") %>%
select(-is_treated) %>%
group_by(month) %>%
summarise_all(., ~ sd(., na.rm = TRUE)) %>%
pivot_longer(cols = -c(month),
names_to = "variable",
values_to = "sd_treatment")
# we merge data_abs_difference and data_sd
data_love <-
left_join(data_abs_difference, data_sd, by = c("month", "variable")) %>%
mutate(standardized_difference = abs_difference / sd_treatment) %>%
select(-c(abs_difference, sd_treatment)) %>%
mutate(
variable = case_when(
variable == "temperature_average" ~ "Average Temperature (°C)",
variable == "humidity_average" ~ "Humidity Average (%)",
variable == "wind_speed" ~ "Wind Speed (m/s)"
)
)
# we make the graph
graph_love <-
ggplot(data_love, aes(y = fct_rev(variable), x = standardized_difference)) +
geom_vline(xintercept = 0, size = 0.3) +
geom_vline(xintercept = 0.1,
color = my_orange,
size = 0.3) +
geom_point(size = 2, colour = my_blue) +
facet_wrap( ~ month, ncol = 4) +
xlab("Standardized Mean Differences") +
ylab("") +
theme_tufte() +
theme(
axis.text.y = element_text(hjust = 1, size = 10),
axis.text.x = element_text(size = 10),
panel.spacing = unit(0.5, "cm"),
plot.margin = unit(c(0.5, 0.5, 0.5, -0.5), "cm")
)
# we print the graph
graph_love
# save the graph
ggsave(
graph_love,
filename = here::here( "inputs", "3.outputs", "1.eda", "graph_love_weather.pdf"),
width = 20,
height = 12,
units = "cm",
device = cairo_pdf
)
We can also the imbalance and lack of overlap for the average temperature by drawing a ridgeline plot:
# make ridgeline plot for temperature
graph_ridgeline_temperature <-
ggplot(data, aes(x = temperature_average, y = month, fill = is_treated)) +
ggridges::geom_density_ridges2(alpha = 0.4, colour = NA) +
scale_fill_manual(values = c(my_blue, my_orange)) +
facet_wrap(~ year) +
labs(x = "Average Temperature (°C)", y = "", fill = "Group:") +
theme_tufte()
# we print the graph
graph_ridgeline_temperature
# save the graph
ggsave(
graph_ridgeline_temperature,
filename = here::here( "inputs", "3.outputs", "1.eda", "graph_ridgeline_temperature.pdf"),
width = 30,
height = 20,
units = "cm",
device = cairo_pdf
)
# make ridgeline plot for temperature
# figures for the article
graph_ridgeline_temperature <-
data %>%
filter(year %in% c(2008:2010)) %>%
mutate(is_treated = ifelse(is_treated == "Treated", "North-East Winds", "Other Wind Directions")) %>%
ggplot(., aes(x = temperature_average, y = month, fill = is_treated)) +
ggridges::geom_density_ridges2(alpha = 0.4, colour = NA) +
scale_fill_manual(values = c(my_orange, my_blue)) +
facet_wrap(~ year, nrow = 1) +
labs(x = "Average Temperature (°C)", y = "", fill = "Group:") +
theme_tufte()
# open polar plot
img_polar_lot <- png::readPNG(here::here("inputs", "3.outputs", "1.eda", "graph_polar_plots_pollutants.png"))
img_polar_lot <- ggplot() +
annotation_custom(
grid::rasterGrob(
img_polar_lot,
width =
ggplot2::unit(1, "npc"),
height =
ggplot2::unit(1, "npc")
),-Inf,
Inf,
-Inf,
Inf
)
# load patchwork
library(patchwork)
# combine plots
graph_polar_plots_imbalance_temperature <- img_polar_lot / graph_ridgeline_temperature +
plot_annotation(tag_levels = 'A') & theme(plot.tag = element_text(size = 18, face = "bold"))
# save the plots
ggsave(
graph_polar_plots_imbalance_temperature,
filename = here::here("inputs", "3.outputs", "1.eda", "graph_polar_plots_imbalance_temperature.pdf"),
width = 25,
height = 16,
units = "cm",
device = cairo_pdf
)
We plot the proportion of treated units by month:
# make the graph
graph_treated_month <- data %>%
select(month, is_treated) %>%
group_by(month, is_treated) %>%
summarise(n = n()) %>%
mutate(proportion = n / sum(n) * 100) %>%
ungroup() %>%
filter(is_treated == "Treated") %>%
ggplot(., aes(x = month, y = proportion, group = "l")) +
geom_line(colour = "gray80") +
geom_point(colour = my_blue, size = 3) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 7)) +
xlab("") + ylab("Proportion (%)") +
theme_tufte()
# display the graph
graph_treated_month
# save the graph
ggsave(
graph_treated_month,
filename = here::here( "inputs", "3.outputs", "1.eda", "graph_treated_month.pdf"),
width = 20,
height = 8,
units = "cm",
device = cairo_pdf
)
If you see mistakes or want to suggest changes, please create an issue on the source repository.