PART I: PM2.5

Question 1

Import the air pollution data for the years 1999 and 2012, and combine the 1999 and 2012 datasets into one tibble

# Require some packages
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(knitr)
# Import the air pollution data for the years 1999 and 2012
air99 <- as_tibble(read_csv(url("https://bit.ly/3c4AHbL")))
## Rows: 117421 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): X..RD, Action.Code, State.Code, County.Code, Site.ID
## dbl (7): Parameter, POC, Sample.Duration, Unit, Method, Date, Sample.Value
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
air12 <- as_tibble(read_csv(url("https://bit.ly/3nZicL2")))
## Rows: 1304287 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): X..RD, Action.Code, State.Code, County.Code, Site.ID
## dbl (7): Parameter, POC, Sample.Duration, Unit, Method, Date, Sample.Value
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Combine 2 datasets by using "bind_rows()" first
air_combined <- bind_rows(air99 %>%
                            mutate(Year = 1999) %>%
                            rename(PM2.5 = Sample.Value),
                          air12 %>%
                            mutate(Year = 2012) %>%
                            rename(PM2.5 = Sample.Value)
                          ) %>%
  drop_na() #Drop any rows with NA values

# Structure validation by using glimpse()            
glimpse(air_combined)
## Rows: 1,335,358
## Columns: 13
## $ X..RD           <chr> "RD", "RD", "RD", "RD", "RD", "RD", "RD", "RD", "RD", …
## $ Action.Code     <chr> "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I",…
## $ State.Code      <chr> "01", "01", "01", "01", "01", "01", "01", "01", "01", …
## $ County.Code     <chr> "027", "027", "027", "027", "027", "027", "027", "027"…
## $ Site.ID         <chr> "0001", "0001", "0001", "0001", "0001", "0001", "0001"…
## $ Parameter       <dbl> 88101, 88101, 88101, 88101, 88101, 88101, 88101, 88101…
## $ POC             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Sample.Duration <dbl> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, …
## $ Unit            <dbl> 105, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105,…
## $ Method          <dbl> 120, 120, 120, 120, 120, 120, 120, 120, 120, 120, 120,…
## $ Date            <dbl> 19990112, 19990115, 19990118, 19990121, 19990124, 1999…
## $ PM2.5           <dbl> 8.841, 14.920, 3.878, 9.042, 5.464, 20.170, 11.560, 13…
## $ Year            <dbl> 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, …

Question 2

Calculate the summary statistics (mean, median, min, max) including update, clean and store the data

# Calculate the summary statistic of PM2.5 in 1999 and 2012
pm_summary <- air_combined %>%
  group_by(Year) %>%
  summarise(Mean = mean(PM2.5, na.rm = TRUE),
            Median = median(PM2.5, na.rm = TRUE),
            Min = min(PM2.5, na.rm = TRUE),
            Max = max(PM2.5, na.rm = TRUE)
  )
print(pm_summary)
## # A tibble: 2 × 5
##    Year  Mean Median   Min   Max
##   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1  1999 13.7   11.5      0  157.
## 2  2012  9.14   7.63   -10  909.
# Update and filter the data in "air_combined"
air_combined <- air_combined %>%
  filter(PM2.5 > 0)

print(head(air_combined))
## # A tibble: 6 × 13
##   X..RD Action.Code State.Code County.Code Site.ID Parameter   POC
##   <chr> <chr>       <chr>      <chr>       <chr>       <dbl> <dbl>
## 1 RD    I           01         027         0001        88101     1
## 2 RD    I           01         027         0001        88101     1
## 3 RD    I           01         027         0001        88101     1
## 4 RD    I           01         027         0001        88101     1
## 5 RD    I           01         027         0001        88101     1
## 6 RD    I           01         027         0001        88101     1
## # ℹ 6 more variables: Sample.Duration <dbl>, Unit <dbl>, Method <dbl>,
## #   Date <dbl>, PM2.5 <dbl>, Year <dbl>

Question 3

Create a boxplot to analyze the distribution of PM2.5 values for the years 1999 and 2012

Answer: According to the boxplot visualization below, it could be interpreted that: In 1999, the Median Value is higher than 2012.

library(ggplot2)

ggplot(air_combined,
       mapping = aes(x = log2(PM2.5),
                      y = as.factor(Year),
                     fill = as.factor(Year))) +
         geom_boxplot(alpha = 0.5) + 
  scale_fill_brewer(palette = "Set1") +
  labs(y = "Year", 
  x = "Log2 PM2.5", 
        title = "Boxplot of PM Values in 1999 and 2012") + 
  theme_minimal()

Question 4

Subseting ‘air_combined’ that includes only observations from New York State, and creating ‘site.code’

# Subsetting "air_combined" which included "site.code"
ny_data <- air_combined %>%
  filter(State.Code == 36) %>%
  mutate(site.code = paste0(County.Code, "." ,Site.ID)) 

print(ny_data)
## # A tibble: 2,258 × 14
##    X..RD Action.Code State.Code County.Code Site.ID Parameter   POC
##    <chr> <chr>       <chr>      <chr>       <chr>       <dbl> <dbl>
##  1 RD    I           36         001         0005        88101     1
##  2 RD    I           36         001         0005        88101     1
##  3 RD    I           36         001         0005        88101     1
##  4 RD    I           36         001         0005        88101     1
##  5 RD    I           36         001         0005        88101     1
##  6 RD    I           36         001         0005        88101     1
##  7 RD    I           36         001         0005        88101     1
##  8 RD    I           36         001         0005        88101     1
##  9 RD    I           36         001         0005        88101     1
## 10 RD    I           36         001         0005        88101     1
## # ℹ 2,248 more rows
## # ℹ 7 more variables: Sample.Duration <dbl>, Unit <dbl>, Method <dbl>,
## #   Date <dbl>, PM2.5 <dbl>, Year <dbl>, site.code <chr>

Question 5

Group the data and identify monitors active in both years

# Group the data by site.code and count the number of distinct years
active_monitors <- ny_data %>%
  group_by(site.code) %>%
  filter(n_distinct(Year) == 2) %>%
  pull(site.code) # Extract the unique site codes

# Store the unique site codes in an object named active_both_year
active_both_year <- unique(active_monitors); active_both_year
##  [1] "001.0005" "001.0012" "005.0080" "013.0011" "029.0005" "031.0003"
##  [7] "063.2008" "067.1015" "085.0055" "101.0003"

Question 6

Identify the most active monitor

# Since we need to parse the Date column into a date object using lubridate::ymd()
library(lubridate)

# Filter ny_data for monitors in active_both_year including count and arrange observations
monitor_counts <- ny_data %>%
  filter(site.code %in% active_both_year) %>%
  group_by(site.code) %>%
  summarise(observations = n()) %>%
  arrange(desc(observations))

# Extract the monitor with the highest number of observations
most_active_monitor <- monitor_counts$site.code[1]; most_active_monitor
## [1] "101.0003"

Question 7

Subsetting for monitor 101.0003 ans correctting the date manipulation

# Monitor only 101.0003 and extract the day of the year
air101.0003 <- ny_data %>%
  filter(site.code == "101.0003") %>% 
  mutate(Date = lubridate::ymd(Date),
         Day_of_Year = yday(Date)) 

# View the first few rows of air101.0003 to verify
head(air101.0003)
## # A tibble: 6 × 15
##   X..RD Action.Code State.Code County.Code Site.ID Parameter   POC
##   <chr> <chr>       <chr>      <chr>       <chr>       <dbl> <dbl>
## 1 RD    I           36         101         0003        88101     1
## 2 RD    I           36         101         0003        88101     1
## 3 RD    I           36         101         0003        88101     1
## 4 RD    I           36         101         0003        88101     1
## 5 RD    I           36         101         0003        88101     1
## 6 RD    I           36         101         0003        88101     1
## # ℹ 8 more variables: Sample.Duration <dbl>, Unit <dbl>, Method <dbl>,
## #   Date <date>, PM2.5 <dbl>, Year <dbl>, site.code <chr>, Day_of_Year <dbl>

Question 8

Accurate the scatter plot reproduction for insightful analysis

# Plotting a scatter plot by using the dataset of "air101.0003"
ggplot(data = air101.0003,
         mapping = aes(x = Day_of_Year,
                     y = PM2.5)) +
  geom_point() +
  facet_wrap(vars(Year)) +
  labs(x = "Day of Year",
       y = "PM 2.5")

PART II: The impact of job training programs on employees’ earnings

Question 9

Successful knitting with included correctting “balance_table” object and make the verbal comments

# Access the data first
lalonde <- as_tibble(read_csv(url("https://bit.ly/3sJJKuk")))
## Rows: 722 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): data_id
## dbl (9): treat, age, education, black, hispanic, married, nodegree, re75, re78
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(lalonde)
## # A tibble: 6 × 10
##   data_id     treat   age education black hispanic married nodegree  re75   re78
##   <chr>       <dbl> <dbl>     <dbl> <dbl>    <dbl>   <dbl>    <dbl> <dbl>  <dbl>
## 1 Lalonde Sa…     1    37        11     1        0       1        1     0  9930.
## 2 Lalonde Sa…     1    22         9     0        1       0        1     0  3596.
## 3 Lalonde Sa…     1    30        12     1        0       0        0     0 24909.
## 4 Lalonde Sa…     1    27        11     1        0       0        1     0  7506.
## 5 Lalonde Sa…     1    33         8     1        0       0        1     0   290.
## 6 Lalonde Sa…     1    22         9     1        0       0        1     0  4056.
library(tidyverse)

# Group by control groups and calculate sample averages of covariates
balance_table <- lalonde %>%
  group_by(treat) %>%
  summarise(Average_Age = mean(age),
            Average_Education = mean(education),
            Average_Black = mean(black),
            Average_Hispanic = mean(hispanic),
            Average_Married = mean(married),
            Average_Nodegree = mean(nodegree)
            )

# Print the table using knitr::kable()
knitr::kable(balance_table, col.names = c("Group", "Average Age", "Average Education", 
                                           "Average Black", "Average Hispanic", 
                                           "Average Married", "Average No Degree"))
Group Average Age Average Education Average Black Average Hispanic Average Married Average No Degree
0 24.44706 10.18824 0.8000000 0.1129412 0.1576471 0.8141176
1 24.62626 10.38047 0.8013468 0.0942761 0.1683502 0.7306397

Verbal Comments on Question 9: As the table above, ut displays the sample averages of various covariates grouped by treatment (1) and control (0), which might be some imbalances between the treatment and control groups. Since the average age in the treatment group is slightly higher compared to the control group. Also, the average of people who educated, black, and married seem to be higher in the treatment group.Thus, these differences suggest that the two groups may not be entirely balanced in terms of their covariates.

Question 10

Create the variables and write up the results including report the estimates in the main text

# Calculate the change in earnings between pre- and post-experiment
lalonde <- lalonde %>%
  mutate(change = re78 - re75)

# Compute the average change in earnings for the treated group
trt_change <- lalonde %>%
  filter(treat == 1) %>%
  summarise(Average_Change = mean(change))

# Compute the average change in earnings for the control group
ctr_change <- lalonde %>%
  filter(treat == 0) %>%
  summarise(Average_Change = mean(change))

# Compute the average treatment effect from two objects
ate <- trt_change - ctr_change

# Report estimates
paste("The average change in earnings for the treated group is", trt_change)
## [1] "The average change in earnings for the treated group is 2910.25383244781"
paste("The average change in earnings for the control group is", ctr_change)
## [1] "The average change in earnings for the control group is 2063.36554507087"
paste("The average treatment effect", ate)
## [1] "The average treatment effect 846.88828737694"

Summary the report estimates in the main text of Question 10:
The average change in earnings for the treated group is “trt_change” which indicating the average increase in earnings for individuals who received the treatment.
The average change in earnings for the control group is “ctr_change” which showing the average increase in earnings for individuals in the control group who did not receive the treatment.
The average treatment effect is “ate” representing the difference in the average change in earnings between the treated and control groups. This value provides an estimate of the causal effect of the treatment on earnings. If Positive Values indicate a positive treatment effect, while Negative Values indicate a negative treatment effect.

Question 11

What the potential outcomes for a particular person are in the analysis for the last problem. Substantively, what does the fundamental problem of causal inference refer to in this context?

Answer: As an analysis above, each person has two potential outcomes:
Their change in earnings if they receive the job training program (treatment), and their change in earnings if they do not receive the program (control). This potential outcome reflects how their earnings would change after participating in the program compared to their earnings before the program. Therefore, the fundamental problem of causal inference in this context refers to the challenge of estimating the causal effect of the treatment (job training program) on individuals’ outcomes (earnings change) when we can only observe one potential outcome for each individual, depending on whether they receive the treatment or not.

Question 12

Jimmy Q. Boxplot and Suzy T. Histogram’s studies, are either of these two valid and interesting outcomes to explore in this study?

Answer: Both re75 (earnings before the program) and re78 (earnings after the program) are valid outcomes to explore. Using re75 helps understand participants’ initial socioeconomic status and baseline earnings, influencing potential improvement through the program. Using re78 directly assesses the program’s impact on participants’ earnings at the end of the experiment, gauging its effectiveness and lasting impact. Thus, as re75 and re78, all provide different perspectives on the impact of the job training program. Both outcomes offer valuable insights into the program’s effectiveness and its implications for participants’ socioeconomic outcomes.

Question 13

Does having finished high school influence the effect of job training program on earnings? Report the ATEs in text and comment on whether or not you see evidence of a differential effect of treatment

# Create a tibble called "ate_dropout" by using group_by(), summarize(), pivot_wider(), and mutate()
ate_dropout <- lalonde %>%
  mutate(treat = ifelse(treat == 1, "Treated", "Control"),
         nodegree = ifelse(nodegree == 1, "Dropped out", "Finished HS")) %>%
  group_by(nodegree, treat) %>%
  summarize(Mean_Change = mean(change)) %>%
  pivot_wider(names_from = treat, values_from = Mean_Change) %>%
  mutate(ATE = Treated - Control)
## `summarise()` has grouped output by 'nodegree'. You can override using the
## `.groups` argument.
# Rename columns for clarity
colnames(ate_dropout) <- c("dropout", "Treated", "Control", "ATE")

# Produce a formatted table by using knitr::kable()
ate_kable <- knitr::kable(ate_dropout, align = "c"); ate_kable
dropout Treated Control ATE
Dropped out 2172.871 2623.151 450.2804
Finished HS 1583.760 3689.019 2105.2599
# Report ATEs in text
ATE_Dropped_out <- ate_dropout$ATE[ate_dropout$dropout == "Dropped out"]
ATE_Finished_HS <- ate_dropout$ATE[ate_dropout$dropout == "Finished HS"]

# Comments on the evidence of a differential effect of treatment
if (ATE_Dropped_out > ATE_Finished_HS) {
  cat("\nAverage Treatment Effect (ATE) for those who dropped out of high school:",
      ATE_Dropped_out, "\n")
  cat("Average Treatment Effect (ATE) for those who finished high school:", ATE_Finished_HS, "\n")
  cat("\nThere is evidence of a differential effect of treatment,
      with dropping out of high school associated with a larger average treatment effect.\n")
} else if (ATE_Dropped_out < ATE_Finished_HS) {
  cat("\nAverage Treatment Effect (ATE) for those who dropped out of high school:",
      ATE_Dropped_out, "\n")
  cat("Average Treatment Effect (ATE) for those who finished high school:", ATE_Finished_HS, "\n")
  cat("\nThere is evidence of a differential effect of treatment,
      with finishing high school associated with a larger average treatment effect.\n")
} else {
  cat("\nAverage Treatment Effect (ATE) for those who dropped out of high school:",
      ATE_Dropped_out, "\n")
  cat("Average Treatment Effect (ATE) for those who finished high school:", ATE_Finished_HS, "\n")
  cat("\nThere is no evidence of a differential effect of treatment based on
      high school completion status.\n")
}
## 
## Average Treatment Effect (ATE) for those who dropped out of high school: 450.2804 
## Average Treatment Effect (ATE) for those who finished high school: 2105.26 
## 
## There is evidence of a differential effect of treatment,
##       with finishing high school associated with a larger average treatment effect.

Question 14

Repeat the same analysis as in the previous question but this time with respect to age

# Create 'age_group' variable first
lalonde <- lalonde %>%
  mutate(age_group = case_when(
    age <= 30 ~ "30 and under",
    between(age, 31, 40) ~ "31 - 40",
    TRUE ~ "Over 40"
  ))

# Create 'ate_age' table
ate_age <- lalonde %>%
  mutate(treat = ifelse(treat == 1, "Treated", "Control")) %>%
  group_by(age_group, treat) %>%
  summarize(Mean_Change = mean(change), .groups = "drop_last") %>%
  pivot_wider(names_from = treat, values_from = Mean_Change) %>%
  mutate(ATE = Treated - Control)

# Rename the age column for clarity
colnames(ate_age) <- c("age_group", "Treated", "Control", "ATE")

# Pass the table to knitr::kable() for formatting
ate_age_kable <- knitr::kable(ate_age, align = "c"); ate_age_kable
age_group Treated Control ATE
30 and under 2136.730 2736.367 599.6373
31 - 40 1192.721 3950.871 2758.1497
Over 40 3786.459 3914.669 128.2100
# Report ATEs in text
ATE_30_under <- ate_age$ATE[ate_age$age_group == "30 and under"]
ATE_31_40 <- ate_age$ATE[ate_age$age_group == "31 - 40"]
ATE_over_40 <- ate_age$ATE[ate_age$age_group == "Over 40"]

# Comments on the evidence of a differential effect of treatment
if (ATE_30_under > ATE_31_40 & ATE_30_under > ATE_over_40) {
  cat("\nAverage Treatment Effect (ATE) for individuals aged 30 and under:", ATE_30_under, "\n")
  cat("Average Treatment Effect (ATE) for individuals aged 31 - 40:", ATE_31_40, "\n")
  cat("Average Treatment Effect (ATE) for individuals aged over 40:", ATE_over_40, "\n")
  cat("\nThere is evidence of a differential effect of treatment based on age,
      with individuals aged 30 and under experiencing the largest average treatment effect.\n")
} else if (ATE_31_40 > ATE_30_under & ATE_31_40 > ATE_over_40) {
  cat("\nAverage Treatment Effect (ATE) for individuals aged 30 and under:", ATE_30_under, "\n")
  cat("Average Treatment Effect (ATE) for individuals aged 31 - 40:", ATE_31_40, "\n")
  cat("Average Treatment Effect (ATE) for individuals aged over 40:", ATE_over_40, "\n")
  cat("\nThere is evidence of a differential effect of treatment based on age,
      with individuals aged 31 - 40 experiencing the largest average treatment effect.\n")
} else if (ATE_over_40 > ATE_30_under & ATE_over_40 > ATE_31_40) {
  cat("\nAverage Treatment Effect (ATE) for individuals aged 30 and under:", ATE_30_under, "\n")
  cat("Average Treatment Effect (ATE) for individuals aged 31 - 40:", ATE_31_40, "\n")
  cat("Average Treatment Effect (ATE) for individuals aged over 40:", ATE_over_40, "\n")
  cat("\nThere is evidence of a differential effect of treatment based on age,
      with individuals aged over 40 experiencing the largest average treatment effect.\n")
} else {
  cat("\nThere is no evidence of a differential effect of treatment based on age.\n")
}
## 
## Average Treatment Effect (ATE) for individuals aged 30 and under: 599.6373 
## Average Treatment Effect (ATE) for individuals aged 31 - 40: 2758.15 
## Average Treatment Effect (ATE) for individuals aged over 40: 128.21 
## 
## There is evidence of a differential effect of treatment based on age,
##       with individuals aged 31 - 40 experiencing the largest average treatment effect.

Question 15

Produce a barplot by using ggplot, and answer the question of, Do there appear to be systematic relationships between the treatment effects and age? If so, what patterns do you see?

Answer: Regarding to the outcomes of systematic relationships between treatment effects and age are observed in the barplot. The individuals aged 30 and under tend to have a higher average treatment effect compared to those aged 31 - 40 and over 40. There is a slight decrease in the average treatment effect with increasing age, suggesting that younger individuals may benefit more from the treatment.

library(ggplot2)

# Create barplot of ATE for each age group
age_plot <- ggplot(ate_age, aes(x = age_group, y = ATE, fill = age_group)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "Age Group", y = "Average Treatment Effect (ATE)", title = "Average Treatment
       Effect by Age Group") + theme_minimal()

# Print the plot
print(age_plot)