# Require some packages
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(knitr)
# Import the air pollution data for the years 1999 and 2012
air99 <- as_tibble(read_csv(url("https://bit.ly/3c4AHbL")))
## Rows: 117421 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): X..RD, Action.Code, State.Code, County.Code, Site.ID
## dbl (7): Parameter, POC, Sample.Duration, Unit, Method, Date, Sample.Value
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
air12 <- as_tibble(read_csv(url("https://bit.ly/3nZicL2")))
## Rows: 1304287 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): X..RD, Action.Code, State.Code, County.Code, Site.ID
## dbl (7): Parameter, POC, Sample.Duration, Unit, Method, Date, Sample.Value
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Combine 2 datasets by using "bind_rows()" first
air_combined <- bind_rows(air99 %>%
mutate(Year = 1999) %>%
rename(PM2.5 = Sample.Value),
air12 %>%
mutate(Year = 2012) %>%
rename(PM2.5 = Sample.Value)
) %>%
drop_na() #Drop any rows with NA values
# Structure validation by using glimpse()
glimpse(air_combined)
## Rows: 1,335,358
## Columns: 13
## $ X..RD <chr> "RD", "RD", "RD", "RD", "RD", "RD", "RD", "RD", "RD", …
## $ Action.Code <chr> "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I",…
## $ State.Code <chr> "01", "01", "01", "01", "01", "01", "01", "01", "01", …
## $ County.Code <chr> "027", "027", "027", "027", "027", "027", "027", "027"…
## $ Site.ID <chr> "0001", "0001", "0001", "0001", "0001", "0001", "0001"…
## $ Parameter <dbl> 88101, 88101, 88101, 88101, 88101, 88101, 88101, 88101…
## $ POC <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Sample.Duration <dbl> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, …
## $ Unit <dbl> 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105,…
## $ Method <dbl> 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120,…
## $ Date <dbl> 19990112, 19990115, 19990118, 19990121, 19990124, 1999…
## $ PM2.5 <dbl> 8.841, 14.920, 3.878, 9.042, 5.464, 20.170, 11.560, 13…
## $ Year <dbl> 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, …
# Calculate the summary statistic of PM2.5 in 1999 and 2012
pm_summary <- air_combined %>%
group_by(Year) %>%
summarise(Mean = mean(PM2.5, na.rm = TRUE),
Median = median(PM2.5, na.rm = TRUE),
Min = min(PM2.5, na.rm = TRUE),
Max = max(PM2.5, na.rm = TRUE)
)
print(pm_summary)
## # A tibble: 2 × 5
## Year Mean Median Min Max
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1999 13.7 11.5 0 157.
## 2 2012 9.14 7.63 -10 909.
# Update and filter the data in "air_combined"
air_combined <- air_combined %>%
filter(PM2.5 > 0)
print(head(air_combined))
## # A tibble: 6 × 13
## X..RD Action.Code State.Code County.Code Site.ID Parameter POC
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 RD I 01 027 0001 88101 1
## 2 RD I 01 027 0001 88101 1
## 3 RD I 01 027 0001 88101 1
## 4 RD I 01 027 0001 88101 1
## 5 RD I 01 027 0001 88101 1
## 6 RD I 01 027 0001 88101 1
## # ℹ 6 more variables: Sample.Duration <dbl>, Unit <dbl>, Method <dbl>,
## # Date <dbl>, PM2.5 <dbl>, Year <dbl>
Answer: According to the boxplot visualization
below, it could be interpreted that: In 1999, the Median Value is higher
than 2012.
library(ggplot2)
ggplot(air_combined,
mapping = aes(x = log2(PM2.5),
y = as.factor(Year),
fill = as.factor(Year))) +
geom_boxplot(alpha = 0.5) +
scale_fill_brewer(palette = "Set1") +
labs(y = "Year",
x = "Log2 PM2.5",
title = "Boxplot of PM Values in 1999 and 2012") +
theme_minimal()
# Subsetting "air_combined" which included "site.code"
ny_data <- air_combined %>%
filter(State.Code == 36) %>%
mutate(site.code = paste0(County.Code, "." ,Site.ID))
print(ny_data)
## # A tibble: 2,258 × 14
## X..RD Action.Code State.Code County.Code Site.ID Parameter POC
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 RD I 36 001 0005 88101 1
## 2 RD I 36 001 0005 88101 1
## 3 RD I 36 001 0005 88101 1
## 4 RD I 36 001 0005 88101 1
## 5 RD I 36 001 0005 88101 1
## 6 RD I 36 001 0005 88101 1
## 7 RD I 36 001 0005 88101 1
## 8 RD I 36 001 0005 88101 1
## 9 RD I 36 001 0005 88101 1
## 10 RD I 36 001 0005 88101 1
## # ℹ 2,248 more rows
## # ℹ 7 more variables: Sample.Duration <dbl>, Unit <dbl>, Method <dbl>,
## # Date <dbl>, PM2.5 <dbl>, Year <dbl>, site.code <chr>
# Group the data by site.code and count the number of distinct years
active_monitors <- ny_data %>%
group_by(site.code) %>%
filter(n_distinct(Year) == 2) %>%
pull(site.code) # Extract the unique site codes
# Store the unique site codes in an object named active_both_year
active_both_year <- unique(active_monitors); active_both_year
## [1] "001.0005" "001.0012" "005.0080" "013.0011" "029.0005" "031.0003"
## [7] "063.2008" "067.1015" "085.0055" "101.0003"
# Since we need to parse the Date column into a date object using lubridate::ymd()
library(lubridate)
# Filter ny_data for monitors in active_both_year including count and arrange observations
monitor_counts <- ny_data %>%
filter(site.code %in% active_both_year) %>%
group_by(site.code) %>%
summarise(observations = n()) %>%
arrange(desc(observations))
# Extract the monitor with the highest number of observations
most_active_monitor <- monitor_counts$site.code[1]; most_active_monitor
## [1] "101.0003"
# Monitor only 101.0003 and extract the day of the year
air101.0003 <- ny_data %>%
filter(site.code == "101.0003") %>%
mutate(Date = lubridate::ymd(Date),
Day_of_Year = yday(Date))
# View the first few rows of air101.0003 to verify
head(air101.0003)
## # A tibble: 6 × 15
## X..RD Action.Code State.Code County.Code Site.ID Parameter POC
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 RD I 36 101 0003 88101 1
## 2 RD I 36 101 0003 88101 1
## 3 RD I 36 101 0003 88101 1
## 4 RD I 36 101 0003 88101 1
## 5 RD I 36 101 0003 88101 1
## 6 RD I 36 101 0003 88101 1
## # ℹ 8 more variables: Sample.Duration <dbl>, Unit <dbl>, Method <dbl>,
## # Date <date>, PM2.5 <dbl>, Year <dbl>, site.code <chr>, Day_of_Year <dbl>
# Plotting a scatter plot by using the dataset of "air101.0003"
ggplot(data = air101.0003,
mapping = aes(x = Day_of_Year,
y = PM2.5)) +
geom_point() +
facet_wrap(vars(Year)) +
labs(x = "Day of Year",
y = "PM 2.5")
# Access the data first
lalonde <- as_tibble(read_csv(url("https://bit.ly/3sJJKuk")))
## Rows: 722 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): data_id
## dbl (9): treat, age, education, black, hispanic, married, nodegree, re75, re78
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(lalonde)
## # A tibble: 6 × 10
## data_id treat age education black hispanic married nodegree re75 re78
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Lalonde Sa… 1 37 11 1 0 1 1 0 9930.
## 2 Lalonde Sa… 1 22 9 0 1 0 1 0 3596.
## 3 Lalonde Sa… 1 30 12 1 0 0 0 0 24909.
## 4 Lalonde Sa… 1 27 11 1 0 0 1 0 7506.
## 5 Lalonde Sa… 1 33 8 1 0 0 1 0 290.
## 6 Lalonde Sa… 1 22 9 1 0 0 1 0 4056.
library(tidyverse)
# Group by control groups and calculate sample averages of covariates
balance_table <- lalonde %>%
group_by(treat) %>%
summarise(Average_Age = mean(age),
Average_Education = mean(education),
Average_Black = mean(black),
Average_Hispanic = mean(hispanic),
Average_Married = mean(married),
Average_Nodegree = mean(nodegree)
)
# Print the table using knitr::kable()
knitr::kable(balance_table, col.names = c("Group", "Average Age", "Average Education",
"Average Black", "Average Hispanic",
"Average Married", "Average No Degree"))
| Group | Average Age | Average Education | Average Black | Average Hispanic | Average Married | Average No Degree |
|---|---|---|---|---|---|---|
| 0 | 24.44706 | 10.18824 | 0.8000000 | 0.1129412 | 0.1576471 | 0.8141176 |
| 1 | 24.62626 | 10.38047 | 0.8013468 | 0.0942761 | 0.1683502 | 0.7306397 |
Verbal Comments on Question 9: As the table above,
ut displays the sample averages of various covariates grouped by
treatment (1) and control (0), which might be some imbalances between
the treatment and control groups. Since the average age in the treatment
group is slightly higher compared to the control group. Also, the
average of people who educated, black, and married seem to be higher in
the treatment group.Thus, these differences suggest that the two groups
may not be entirely balanced in terms of their covariates.
# Calculate the change in earnings between pre- and post-experiment
lalonde <- lalonde %>%
mutate(change = re78 - re75)
# Compute the average change in earnings for the treated group
trt_change <- lalonde %>%
filter(treat == 1) %>%
summarise(Average_Change = mean(change))
# Compute the average change in earnings for the control group
ctr_change <- lalonde %>%
filter(treat == 0) %>%
summarise(Average_Change = mean(change))
# Compute the average treatment effect from two objects
ate <- trt_change - ctr_change
# Report estimates
paste("The average change in earnings for the treated group is", trt_change)
## [1] "The average change in earnings for the treated group is 2910.25383244781"
paste("The average change in earnings for the control group is", ctr_change)
## [1] "The average change in earnings for the control group is 2063.36554507087"
paste("The average treatment effect", ate)
## [1] "The average treatment effect 846.88828737694"
Summary the report estimates in the main text of Question
10:
The average change in earnings for the treated
group is “trt_change” which indicating the average increase in
earnings for individuals who received the treatment.
The
average change in earnings for the control group is
“ctr_change” which showing the average increase in earnings for
individuals in the control group who did not receive the treatment.
The average treatment effect is “ate” representing the
difference in the average change in earnings between the treated and
control groups. This value provides an estimate of the causal effect of
the treatment on earnings. If Positive Values indicate a positive
treatment effect, while Negative Values indicate a negative treatment
effect.
Answer: As an analysis above, each person has two
potential outcomes:
Their change in earnings if they receive the
job training program (treatment), and their change in earnings if they
do not receive the program (control). This potential outcome reflects
how their earnings would change after participating in the program
compared to their earnings before the program. Therefore, the
fundamental problem of causal inference in this context refers to the
challenge of estimating the causal effect of the treatment (job training
program) on individuals’ outcomes (earnings change) when we can only
observe one potential outcome for each individual, depending on whether
they receive the treatment or not.
Answer: Both re75 (earnings before the program) and
re78 (earnings after the program) are valid outcomes to explore. Using
re75 helps understand participants’ initial socioeconomic status and
baseline earnings, influencing potential improvement through the
program. Using re78 directly assesses the program’s impact on
participants’ earnings at the end of the experiment, gauging its
effectiveness and lasting impact. Thus, as re75 and re78, all provide
different perspectives on the impact of the job training program. Both
outcomes offer valuable insights into the program’s effectiveness and
its implications for participants’ socioeconomic outcomes.
# Create a tibble called "ate_dropout" by using group_by(), summarize(), pivot_wider(), and mutate()
ate_dropout <- lalonde %>%
mutate(treat = ifelse(treat == 1, "Treated", "Control"),
nodegree = ifelse(nodegree == 1, "Dropped out", "Finished HS")) %>%
group_by(nodegree, treat) %>%
summarize(Mean_Change = mean(change)) %>%
pivot_wider(names_from = treat, values_from = Mean_Change) %>%
mutate(ATE = Treated - Control)
## `summarise()` has grouped output by 'nodegree'. You can override using the
## `.groups` argument.
# Rename columns for clarity
colnames(ate_dropout) <- c("dropout", "Treated", "Control", "ATE")
# Produce a formatted table by using knitr::kable()
ate_kable <- knitr::kable(ate_dropout, align = "c"); ate_kable
| dropout | Treated | Control | ATE |
|---|---|---|---|
| Dropped out | 2172.871 | 2623.151 | 450.2804 |
| Finished HS | 1583.760 | 3689.019 | 2105.2599 |
# Report ATEs in text
ATE_Dropped_out <- ate_dropout$ATE[ate_dropout$dropout == "Dropped out"]
ATE_Finished_HS <- ate_dropout$ATE[ate_dropout$dropout == "Finished HS"]
# Comments on the evidence of a differential effect of treatment
if (ATE_Dropped_out > ATE_Finished_HS) {
cat("\nAverage Treatment Effect (ATE) for those who dropped out of high school:",
ATE_Dropped_out, "\n")
cat("Average Treatment Effect (ATE) for those who finished high school:", ATE_Finished_HS, "\n")
cat("\nThere is evidence of a differential effect of treatment,
with dropping out of high school associated with a larger average treatment effect.\n")
} else if (ATE_Dropped_out < ATE_Finished_HS) {
cat("\nAverage Treatment Effect (ATE) for those who dropped out of high school:",
ATE_Dropped_out, "\n")
cat("Average Treatment Effect (ATE) for those who finished high school:", ATE_Finished_HS, "\n")
cat("\nThere is evidence of a differential effect of treatment,
with finishing high school associated with a larger average treatment effect.\n")
} else {
cat("\nAverage Treatment Effect (ATE) for those who dropped out of high school:",
ATE_Dropped_out, "\n")
cat("Average Treatment Effect (ATE) for those who finished high school:", ATE_Finished_HS, "\n")
cat("\nThere is no evidence of a differential effect of treatment based on
high school completion status.\n")
}
##
## Average Treatment Effect (ATE) for those who dropped out of high school: 450.2804
## Average Treatment Effect (ATE) for those who finished high school: 2105.26
##
## There is evidence of a differential effect of treatment,
## with finishing high school associated with a larger average treatment effect.
# Create 'age_group' variable first
lalonde <- lalonde %>%
mutate(age_group = case_when(
age <= 30 ~ "30 and under",
between(age, 31, 40) ~ "31 - 40",
TRUE ~ "Over 40"
))
# Create 'ate_age' table
ate_age <- lalonde %>%
mutate(treat = ifelse(treat == 1, "Treated", "Control")) %>%
group_by(age_group, treat) %>%
summarize(Mean_Change = mean(change), .groups = "drop_last") %>%
pivot_wider(names_from = treat, values_from = Mean_Change) %>%
mutate(ATE = Treated - Control)
# Rename the age column for clarity
colnames(ate_age) <- c("age_group", "Treated", "Control", "ATE")
# Pass the table to knitr::kable() for formatting
ate_age_kable <- knitr::kable(ate_age, align = "c"); ate_age_kable
| age_group | Treated | Control | ATE |
|---|---|---|---|
| 30 and under | 2136.730 | 2736.367 | 599.6373 |
| 31 - 40 | 1192.721 | 3950.871 | 2758.1497 |
| Over 40 | 3786.459 | 3914.669 | 128.2100 |
# Report ATEs in text
ATE_30_under <- ate_age$ATE[ate_age$age_group == "30 and under"]
ATE_31_40 <- ate_age$ATE[ate_age$age_group == "31 - 40"]
ATE_over_40 <- ate_age$ATE[ate_age$age_group == "Over 40"]
# Comments on the evidence of a differential effect of treatment
if (ATE_30_under > ATE_31_40 & ATE_30_under > ATE_over_40) {
cat("\nAverage Treatment Effect (ATE) for individuals aged 30 and under:", ATE_30_under, "\n")
cat("Average Treatment Effect (ATE) for individuals aged 31 - 40:", ATE_31_40, "\n")
cat("Average Treatment Effect (ATE) for individuals aged over 40:", ATE_over_40, "\n")
cat("\nThere is evidence of a differential effect of treatment based on age,
with individuals aged 30 and under experiencing the largest average treatment effect.\n")
} else if (ATE_31_40 > ATE_30_under & ATE_31_40 > ATE_over_40) {
cat("\nAverage Treatment Effect (ATE) for individuals aged 30 and under:", ATE_30_under, "\n")
cat("Average Treatment Effect (ATE) for individuals aged 31 - 40:", ATE_31_40, "\n")
cat("Average Treatment Effect (ATE) for individuals aged over 40:", ATE_over_40, "\n")
cat("\nThere is evidence of a differential effect of treatment based on age,
with individuals aged 31 - 40 experiencing the largest average treatment effect.\n")
} else if (ATE_over_40 > ATE_30_under & ATE_over_40 > ATE_31_40) {
cat("\nAverage Treatment Effect (ATE) for individuals aged 30 and under:", ATE_30_under, "\n")
cat("Average Treatment Effect (ATE) for individuals aged 31 - 40:", ATE_31_40, "\n")
cat("Average Treatment Effect (ATE) for individuals aged over 40:", ATE_over_40, "\n")
cat("\nThere is evidence of a differential effect of treatment based on age,
with individuals aged over 40 experiencing the largest average treatment effect.\n")
} else {
cat("\nThere is no evidence of a differential effect of treatment based on age.\n")
}
##
## Average Treatment Effect (ATE) for individuals aged 30 and under: 599.6373
## Average Treatment Effect (ATE) for individuals aged 31 - 40: 2758.15
## Average Treatment Effect (ATE) for individuals aged over 40: 128.21
##
## There is evidence of a differential effect of treatment based on age,
## with individuals aged 31 - 40 experiencing the largest average treatment effect.
Answer: Regarding to the outcomes of systematic
relationships between treatment effects and age are observed in the
barplot. The individuals aged 30 and under tend to have a higher average
treatment effect compared to those aged 31 - 40 and over 40. There is a
slight decrease in the average treatment effect with increasing age,
suggesting that younger individuals may benefit more from the treatment.
library(ggplot2)
# Create barplot of ATE for each age group
age_plot <- ggplot(ate_age, aes(x = age_group, y = ATE, fill = age_group)) +
geom_bar(stat = "identity", position = "dodge") +
labs(x = "Age Group", y = "Average Treatment Effect (ATE)", title = "Average Treatment
Effect by Age Group") + theme_minimal()
# Print the plot
print(age_plot)