R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00
options(repos = c(CRAN = "https://cloud.r-project.org"))
library(readr)
## Warning: package 'readr' was built under R version 4.3.3
file_path <- "C:\\Users\\mohua\\OneDrive - EQS\\Programming\\EPAData\\Summer Paper 2024\\2000-2022_epa_qtr_combine_df.csv"


# Load the dataset
epa_df <- read_csv(file_path)
## Rows: 819173 Columns: 44
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (22): state_name, county_name, datum, parameter, sample_duration, sample...
## dbl (21): state_code, county_code, site_number, parameter_code, poc, latitud...
## lgl  (1): estimated_days_gt_std
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the first few rows of the data to understand its structure
head(epa_df)
## # A tibble: 6 × 44
##   state_code county_code state_name county_name site_number parameter_code   poc
##        <dbl>       <dbl> <chr>      <chr>             <dbl>          <dbl> <dbl>
## 1          1           3 Alabama    Baldwin              10          88101     1
## 2          1           3 Alabama    Baldwin              10          88101     1
## 3          1           3 Alabama    Baldwin              10          88101     1
## 4          1           3 Alabama    Baldwin              10          88101     1
## 5          1           3 Alabama    Baldwin              10          88101     1
## 6          1           3 Alabama    Baldwin              10          88101     1
## # ℹ 37 more variables: latitude <dbl>, longitude <dbl>, datum <chr>,
## #   parameter <chr>, sample_duration <chr>, sample_duration_code <chr>,
## #   sample_duration_type <chr>, pollutant_standard <chr>, year <dbl>,
## #   quarter <dbl>, units_of_measure <chr>, event_type <chr>,
## #   observation_count <dbl>, observation_percent <dbl>, arithmetic_mean <dbl>,
## #   minimum_value <dbl>, maximum_value <dbl>, quarterly_criteria_met <chr>,
## #   actual_days_gt_std <dbl>, estimated_days_gt_std <lgl>, …
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
epa_df <- epa_df %>%
  mutate(state_code = sprintf("%02d", state_code), # Format state_code as two digits
         county_code = sprintf("%03d", county_code), # Format county_code as three digits
         county_id = paste0(state_code, county_code)) # Concatenate

# View the modified data frame
head(epa_df)
## # A tibble: 6 × 45
##   state_code county_code state_name county_name site_number parameter_code   poc
##   <chr>      <chr>       <chr>      <chr>             <dbl>          <dbl> <dbl>
## 1 01         003         Alabama    Baldwin              10          88101     1
## 2 01         003         Alabama    Baldwin              10          88101     1
## 3 01         003         Alabama    Baldwin              10          88101     1
## 4 01         003         Alabama    Baldwin              10          88101     1
## 5 01         003         Alabama    Baldwin              10          88101     1
## 6 01         003         Alabama    Baldwin              10          88101     1
## # ℹ 38 more variables: latitude <dbl>, longitude <dbl>, datum <chr>,
## #   parameter <chr>, sample_duration <chr>, sample_duration_code <chr>,
## #   sample_duration_type <chr>, pollutant_standard <chr>, year <dbl>,
## #   quarter <dbl>, units_of_measure <chr>, event_type <chr>,
## #   observation_count <dbl>, observation_percent <dbl>, arithmetic_mean <dbl>,
## #   minimum_value <dbl>, maximum_value <dbl>, quarterly_criteria_met <chr>,
## #   actual_days_gt_std <dbl>, estimated_days_gt_std <lgl>, …
# Number of unique county_id
unique_county_ids <- length(unique(epa_df$county_id))

# Total number of observations
total_observations <- nrow(epa_df)

# Display results
unique_county_ids
## [1] 1006
total_observations
## [1] 819173

House Data

library(readr)

file_path <- "C:\\Users\\mohua\\OneDrive - EQS\\Programming\\EPAData\\Summer Paper 2024\\raw_quarterly_house_data.csv"


# Load the dataset
house_df <- read_csv(file_path)
## Rows: 204218 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): RegionName, RegionType, StateName, State, Metro
## dbl  (8): RegionID, SizeRank, StateCodeFIPS, MunicipalCodeFIPS, Value, Year,...
## date (1): Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the first few rows of the data to understand its structure
head(house_df)
## # A tibble: 6 × 14
##   RegionID SizeRank RegionName RegionType StateName State Metro    StateCodeFIPS
##      <dbl>    <dbl> <chr>      <chr>      <chr>     <chr> <chr>            <dbl>
## 1       66      146 Ada County county     ID        ID    Boise C…            16
## 2       66      146 Ada County county     ID        ID    Boise C…            16
## 3       66      146 Ada County county     ID        ID    Boise C…            16
## 4       66      146 Ada County county     ID        ID    Boise C…            16
## 5       66      146 Ada County county     ID        ID    Boise C…            16
## 6       66      146 Ada County county     ID        ID    Boise C…            16
## # ℹ 6 more variables: MunicipalCodeFIPS <dbl>, Date <date>, Value <dbl>,
## #   Year <dbl>, Month <dbl>, Quarter <dbl>
library(dplyr)


house_df <- house_df %>%
  mutate(state_code = sprintf("%02d", StateCodeFIPS), # Format state_code as two digits
         county_code = sprintf("%03d", MunicipalCodeFIPS), # Format county_code as three digits
         county_id = paste0(state_code, county_code)) # Concatenate

# View the modified data frame
head(house_df)
## # A tibble: 6 × 17
##   RegionID SizeRank RegionName RegionType StateName State Metro    StateCodeFIPS
##      <dbl>    <dbl> <chr>      <chr>      <chr>     <chr> <chr>            <dbl>
## 1       66      146 Ada County county     ID        ID    Boise C…            16
## 2       66      146 Ada County county     ID        ID    Boise C…            16
## 3       66      146 Ada County county     ID        ID    Boise C…            16
## 4       66      146 Ada County county     ID        ID    Boise C…            16
## 5       66      146 Ada County county     ID        ID    Boise C…            16
## 6       66      146 Ada County county     ID        ID    Boise C…            16
## # ℹ 9 more variables: MunicipalCodeFIPS <dbl>, Date <date>, Value <dbl>,
## #   Year <dbl>, Month <dbl>, Quarter <dbl>, state_code <chr>,
## #   county_code <chr>, county_id <chr>
# Number of unique county_id
unique_county_ids <- length(unique(house_df$county_id))

# Total number of observations
total_observations <- nrow(house_df)

# Display results
unique_county_ids
## [1] 3074
total_observations
## [1] 204218

Merge EPA data with House Data:

# Merging two dataframes
combined_epa_house_df <- merge(epa_df, 
                               house_df[, c("Year", "Quarter", "county_id", "Value")], 
                               by.x = c("year", "quarter", "county_id"), 
                               by.y = c("Year", "Quarter", "county_id"), 
                               all.x = TRUE)

# Display the combined dataframe
head(combined_epa_house_df)
##   year quarter county_id state_code county_code state_name county_name
## 1 2000       1     01003         01         003    Alabama     Baldwin
## 2 2000       1     01003         01         003    Alabama     Baldwin
## 3 2000       1     01003         01         003    Alabama     Baldwin
## 4 2000       1     01003         01         003    Alabama     Baldwin
## 5 2000       1     01003         01         003    Alabama     Baldwin
## 6 2000       1     01003         01         003    Alabama     Baldwin
##   site_number parameter_code poc latitude longitude datum
## 1          10          88101   1 30.49748 -87.88026 NAD83
## 2          10          88101   1 30.49748 -87.88026 NAD83
## 3          10          88101   1 30.49748 -87.88026 NAD83
## 4          10          88101   1 30.49748 -87.88026 NAD83
## 5          10          88101   1 30.49748 -87.88026 NAD83
## 6          10          88101   1 30.49748 -87.88026 NAD83
##                  parameter sample_duration sample_duration_code
## 1 PM2.5 - Local Conditions         24 HOUR                    7
## 2 PM2.5 - Local Conditions         24 HOUR                    7
## 3 PM2.5 - Local Conditions         24 HOUR                    7
## 4 PM2.5 - Local Conditions         24 HOUR                    7
## 5 PM2.5 - Local Conditions         24 HOUR                    7
## 6 PM2.5 - Local Conditions         24 HOUR                    7
##   sample_duration_type pollutant_standard            units_of_measure
## 1                    O   PM25 Annual 1997 Micrograms/cubic meter (LC)
## 2                    O  PM25 24-hour 2006 Micrograms/cubic meter (LC)
## 3                    O   PM25 Annual 2012 Micrograms/cubic meter (LC)
## 4                    O  PM25 24-hour 2012 Micrograms/cubic meter (LC)
## 5                    O  PM25 24-hour 1997 Micrograms/cubic meter (LC)
## 6                    O   PM25 Annual 2006 Micrograms/cubic meter (LC)
##   event_type observation_count observation_percent arithmetic_mean
## 1  No Events                16                  52         11.8188
## 2  No Events                16                  52         11.8188
## 3  No Events                16                  52         11.8188
## 4  No Events                16                  52         11.8188
## 5  No Events                16                  52         11.8188
## 6  No Events                16                  52         11.8188
##   minimum_value maximum_value quarterly_criteria_met actual_days_gt_std
## 1           5.9          24.1                      N                 NA
## 2           5.9          24.1                      N                  0
## 3           5.9          24.1                      N                 NA
## 4           5.9          24.1                      N                  0
## 5           5.9          24.1                      N                  0
## 6           5.9          24.1                      N                 NA
##   estimated_days_gt_std valid_samples valid_day_count scheduled_samples
## 1                    NA            16              16                31
## 2                    NA            16              16                31
## 3                    NA            16              16                31
## 4                    NA            16              16                31
## 5                    NA            16              16                31
## 6                    NA            16              16                31
##   percent_days percent_one_value monitoring_agency_code  monitoring_agency
## 1         17.6                18                     13 Al Dept Of Env Mgt
## 2         17.6                18                     13 Al Dept Of Env Mgt
## 3         17.6                18                     13 Al Dept Of Env Mgt
## 4         17.6                18                     13 Al Dept Of Env Mgt
## 5         17.6                18                     13 Al Dept Of Env Mgt
## 6         17.6                18                     13 Al Dept Of Env Mgt
##     local_site_name                                                  address
## 1 FAIRHOPE, Alabama FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE,  ALABAMA
## 2 FAIRHOPE, Alabama FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE,  ALABAMA
## 3 FAIRHOPE, Alabama FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE,  ALABAMA
## 4 FAIRHOPE, Alabama FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE,  ALABAMA
## 5 FAIRHOPE, Alabama FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE,  ALABAMA
## 6 FAIRHOPE, Alabama FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE,  ALABAMA
##     state  county     city tribal_code tribal_land cbsa_code
## 1 Alabama Baldwin Fairhope        <NA>        <NA>     19300
## 2 Alabama Baldwin Fairhope        <NA>        <NA>     19300
## 3 Alabama Baldwin Fairhope        <NA>        <NA>     19300
## 4 Alabama Baldwin Fairhope        <NA>        <NA>     19300
## 5 Alabama Baldwin Fairhope        <NA>        <NA>     19300
## 6 Alabama Baldwin Fairhope        <NA>        <NA>     19300
##                        cbsa date_of_last_change    Value
## 1 Daphne-Fairhope-Foley, AL           5/16/2023 117246.2
## 2 Daphne-Fairhope-Foley, AL           5/16/2023 117246.2
## 3 Daphne-Fairhope-Foley, AL           5/16/2023 117246.2
## 4 Daphne-Fairhope-Foley, AL           5/16/2023 117246.2
## 5 Daphne-Fairhope-Foley, AL           5/16/2023 117246.2
## 6 Daphne-Fairhope-Foley, AL           5/16/2023 117246.2
# Renaming variables
colnames(combined_epa_house_df)[colnames(combined_epa_house_df) == "arithmetic_mean"] <- "AQI"
colnames(combined_epa_house_df)[colnames(combined_epa_house_df) == "Value"] <- "Price"

# Display the updated dataframe
head(combined_epa_house_df)
##   year quarter county_id state_code county_code state_name county_name
## 1 2000       1     01003         01         003    Alabama     Baldwin
## 2 2000       1     01003         01         003    Alabama     Baldwin
## 3 2000       1     01003         01         003    Alabama     Baldwin
## 4 2000       1     01003         01         003    Alabama     Baldwin
## 5 2000       1     01003         01         003    Alabama     Baldwin
## 6 2000       1     01003         01         003    Alabama     Baldwin
##   site_number parameter_code poc latitude longitude datum
## 1          10          88101   1 30.49748 -87.88026 NAD83
## 2          10          88101   1 30.49748 -87.88026 NAD83
## 3          10          88101   1 30.49748 -87.88026 NAD83
## 4          10          88101   1 30.49748 -87.88026 NAD83
## 5          10          88101   1 30.49748 -87.88026 NAD83
## 6          10          88101   1 30.49748 -87.88026 NAD83
##                  parameter sample_duration sample_duration_code
## 1 PM2.5 - Local Conditions         24 HOUR                    7
## 2 PM2.5 - Local Conditions         24 HOUR                    7
## 3 PM2.5 - Local Conditions         24 HOUR                    7
## 4 PM2.5 - Local Conditions         24 HOUR                    7
## 5 PM2.5 - Local Conditions         24 HOUR                    7
## 6 PM2.5 - Local Conditions         24 HOUR                    7
##   sample_duration_type pollutant_standard            units_of_measure
## 1                    O   PM25 Annual 1997 Micrograms/cubic meter (LC)
## 2                    O  PM25 24-hour 2006 Micrograms/cubic meter (LC)
## 3                    O   PM25 Annual 2012 Micrograms/cubic meter (LC)
## 4                    O  PM25 24-hour 2012 Micrograms/cubic meter (LC)
## 5                    O  PM25 24-hour 1997 Micrograms/cubic meter (LC)
## 6                    O   PM25 Annual 2006 Micrograms/cubic meter (LC)
##   event_type observation_count observation_percent     AQI minimum_value
## 1  No Events                16                  52 11.8188           5.9
## 2  No Events                16                  52 11.8188           5.9
## 3  No Events                16                  52 11.8188           5.9
## 4  No Events                16                  52 11.8188           5.9
## 5  No Events                16                  52 11.8188           5.9
## 6  No Events                16                  52 11.8188           5.9
##   maximum_value quarterly_criteria_met actual_days_gt_std estimated_days_gt_std
## 1          24.1                      N                 NA                    NA
## 2          24.1                      N                  0                    NA
## 3          24.1                      N                 NA                    NA
## 4          24.1                      N                  0                    NA
## 5          24.1                      N                  0                    NA
## 6          24.1                      N                 NA                    NA
##   valid_samples valid_day_count scheduled_samples percent_days
## 1            16              16                31         17.6
## 2            16              16                31         17.6
## 3            16              16                31         17.6
## 4            16              16                31         17.6
## 5            16              16                31         17.6
## 6            16              16                31         17.6
##   percent_one_value monitoring_agency_code  monitoring_agency   local_site_name
## 1                18                     13 Al Dept Of Env Mgt FAIRHOPE, Alabama
## 2                18                     13 Al Dept Of Env Mgt FAIRHOPE, Alabama
## 3                18                     13 Al Dept Of Env Mgt FAIRHOPE, Alabama
## 4                18                     13 Al Dept Of Env Mgt FAIRHOPE, Alabama
## 5                18                     13 Al Dept Of Env Mgt FAIRHOPE, Alabama
## 6                18                     13 Al Dept Of Env Mgt FAIRHOPE, Alabama
##                                                    address   state  county
## 1 FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE,  ALABAMA Alabama Baldwin
## 2 FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE,  ALABAMA Alabama Baldwin
## 3 FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE,  ALABAMA Alabama Baldwin
## 4 FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE,  ALABAMA Alabama Baldwin
## 5 FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE,  ALABAMA Alabama Baldwin
## 6 FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE,  ALABAMA Alabama Baldwin
##       city tribal_code tribal_land cbsa_code                      cbsa
## 1 Fairhope        <NA>        <NA>     19300 Daphne-Fairhope-Foley, AL
## 2 Fairhope        <NA>        <NA>     19300 Daphne-Fairhope-Foley, AL
## 3 Fairhope        <NA>        <NA>     19300 Daphne-Fairhope-Foley, AL
## 4 Fairhope        <NA>        <NA>     19300 Daphne-Fairhope-Foley, AL
## 5 Fairhope        <NA>        <NA>     19300 Daphne-Fairhope-Foley, AL
## 6 Fairhope        <NA>        <NA>     19300 Daphne-Fairhope-Foley, AL
##   date_of_last_change    Price
## 1           5/16/2023 117246.2
## 2           5/16/2023 117246.2
## 3           5/16/2023 117246.2
## 4           5/16/2023 117246.2
## 5           5/16/2023 117246.2
## 6           5/16/2023 117246.2
# Display all variable names
colnames(combined_epa_house_df)
##  [1] "year"                   "quarter"                "county_id"             
##  [4] "state_code"             "county_code"            "state_name"            
##  [7] "county_name"            "site_number"            "parameter_code"        
## [10] "poc"                    "latitude"               "longitude"             
## [13] "datum"                  "parameter"              "sample_duration"       
## [16] "sample_duration_code"   "sample_duration_type"   "pollutant_standard"    
## [19] "units_of_measure"       "event_type"             "observation_count"     
## [22] "observation_percent"    "AQI"                    "minimum_value"         
## [25] "maximum_value"          "quarterly_criteria_met" "actual_days_gt_std"    
## [28] "estimated_days_gt_std"  "valid_samples"          "valid_day_count"       
## [31] "scheduled_samples"      "percent_days"           "percent_one_value"     
## [34] "monitoring_agency_code" "monitoring_agency"      "local_site_name"       
## [37] "address"                "state"                  "county"                
## [40] "city"                   "tribal_code"            "tribal_land"           
## [43] "cbsa_code"              "cbsa"                   "date_of_last_change"   
## [46] "Price"
# Selecting specific columns
combined_epa_house_df <- combined_epa_house_df[, c("year", "quarter", "state_code", "county_code", 
                                                   "county_id", "state_name", "county_name", "city",
                                                   "parameter_code", "parameter", "latitude", "longitude", 
                                                   "observation_count", "AQI", "minimum_value", 
                                                   "maximum_value", "Price")]

# Display the updated dataframe
head(combined_epa_house_df)
##   year quarter state_code county_code county_id state_name county_name     city
## 1 2000       1         01         003     01003    Alabama     Baldwin Fairhope
## 2 2000       1         01         003     01003    Alabama     Baldwin Fairhope
## 3 2000       1         01         003     01003    Alabama     Baldwin Fairhope
## 4 2000       1         01         003     01003    Alabama     Baldwin Fairhope
## 5 2000       1         01         003     01003    Alabama     Baldwin Fairhope
## 6 2000       1         01         003     01003    Alabama     Baldwin Fairhope
##   parameter_code                parameter latitude longitude observation_count
## 1          88101 PM2.5 - Local Conditions 30.49748 -87.88026                16
## 2          88101 PM2.5 - Local Conditions 30.49748 -87.88026                16
## 3          88101 PM2.5 - Local Conditions 30.49748 -87.88026                16
## 4          88101 PM2.5 - Local Conditions 30.49748 -87.88026                16
## 5          88101 PM2.5 - Local Conditions 30.49748 -87.88026                16
## 6          88101 PM2.5 - Local Conditions 30.49748 -87.88026                16
##       AQI minimum_value maximum_value    Price
## 1 11.8188           5.9          24.1 117246.2
## 2 11.8188           5.9          24.1 117246.2
## 3 11.8188           5.9          24.1 117246.2
## 4 11.8188           5.9          24.1 117246.2
## 5 11.8188           5.9          24.1 117246.2
## 6 11.8188           5.9          24.1 117246.2
# Number of unique county_id
unique_county_ids <- length(unique(combined_epa_house_df$county_id))

# Total number of observations
total_observations <- nrow(combined_epa_house_df)

# Display results
unique_county_ids
## [1] 1006
total_observations
## [1] 819173

Drop Null Values

# Removing rows with NA values in the Price column
combined_epa_house_df <- combined_epa_house_df[!is.na(combined_epa_house_df$Price), ]

# Display the cleaned dataframe
head(combined_epa_house_df)
##   year quarter state_code county_code county_id state_name county_name     city
## 1 2000       1         01         003     01003    Alabama     Baldwin Fairhope
## 2 2000       1         01         003     01003    Alabama     Baldwin Fairhope
## 3 2000       1         01         003     01003    Alabama     Baldwin Fairhope
## 4 2000       1         01         003     01003    Alabama     Baldwin Fairhope
## 5 2000       1         01         003     01003    Alabama     Baldwin Fairhope
## 6 2000       1         01         003     01003    Alabama     Baldwin Fairhope
##   parameter_code                parameter latitude longitude observation_count
## 1          88101 PM2.5 - Local Conditions 30.49748 -87.88026                16
## 2          88101 PM2.5 - Local Conditions 30.49748 -87.88026                16
## 3          88101 PM2.5 - Local Conditions 30.49748 -87.88026                16
## 4          88101 PM2.5 - Local Conditions 30.49748 -87.88026                16
## 5          88101 PM2.5 - Local Conditions 30.49748 -87.88026                16
## 6          88101 PM2.5 - Local Conditions 30.49748 -87.88026                16
##       AQI minimum_value maximum_value    Price
## 1 11.8188           5.9          24.1 117246.2
## 2 11.8188           5.9          24.1 117246.2
## 3 11.8188           5.9          24.1 117246.2
## 4 11.8188           5.9          24.1 117246.2
## 5 11.8188           5.9          24.1 117246.2
## 6 11.8188           5.9          24.1 117246.2
# Number of unique county_id
unique_county_ids <- length(unique(combined_epa_house_df$county_id))

# Total number of observations
total_observations <- nrow(combined_epa_house_df)

# Display results
unique_county_ids
## [1] 904
total_observations
## [1] 719674
# Saving the dataframe as a CSV file using the specified path
file_path <- "C:\\Users\\mohua\\OneDrive - EQS\\Programming\\EPAData\\Summer Paper 2024\\combined_epa_house_df.csv"

write.csv(combined_epa_house_df, file = file_path, row.names = FALSE)

Decile Mean

library(dplyr)

# Creating decile groups for housing prices
combined_epa_house_df <- combined_epa_house_df %>%
  mutate(Price_Decile = ntile(Price, 10))  # ntile creates decile groups

# Group by decile and summarize to calculate mean Price and AQI
result_decile <- combined_epa_house_df %>%
  group_by(Price_Decile) %>%
  summarise(
    Mean_Housing_Price = mean(Price, na.rm = TRUE),
    Mean_AQI = mean(AQI, na.rm = TRUE),
    .groups = 'drop'
  )

# Print the result
print(result_decile)
## # A tibble: 10 × 3
##    Price_Decile Mean_Housing_Price Mean_AQI
##           <int>              <dbl>    <dbl>
##  1            1             80808.    11.2 
##  2            2            108025.    11.2 
##  3            3            125269.    10.9 
##  4            4            143143.    10.1 
##  5            5            165451.     9.62
##  6            6            191355.     9.23
##  7            7            221213.     8.89
##  8            8            263979.     8.95
##  9            9            340386.     8.93
## 10           10            592108.     8.86

Descriptive Summary:

# Using dplyr and summarise to get a descriptive summary
library(dplyr)

# Descriptive summary for AQI and Price
descriptive_summary <- combined_epa_house_df %>%
  summarise(
    Mean_AQI = mean(AQI, na.rm = TRUE),
    Median_AQI = median(AQI, na.rm = TRUE),
    SD_AQI = sd(AQI, na.rm = TRUE),
    Min_AQI = min(AQI, na.rm = TRUE),
    Max_AQI = max(AQI, na.rm = TRUE),
    Mean_Price = mean(Price, na.rm = TRUE),
    Median_Price = median(Price, na.rm = TRUE),
    SD_Price = sd(Price, na.rm = TRUE),
    Min_Price = min(Price, na.rm = TRUE),
    Max_Price = max(Price, na.rm = TRUE)
  )

# Print the descriptive summary
print(descriptive_summary)
##   Mean_AQI Median_AQI   SD_AQI Min_AQI Max_AQI Mean_Price Median_Price SD_Price
## 1 9.782919     9.1767 4.227191  -2.059  124.85     223173     178188.5 160046.3
##   Min_Price Max_Price
## 1  31718.24   4710886

Basic Regression OLS:

install.packages("plm")
## Installing package into 'C:/Users/mohua/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'plm' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\mohua\AppData\Local\Temp\RtmpSy8WkG\downloaded_packages
library(plm)
## Warning: package 'plm' was built under R version 4.3.3
## 
## Attaching package: 'plm'
## The following objects are masked from 'package:dplyr':
## 
##     between, lag, lead
# Create log(Price)
combined_epa_house_df$log_Price <- log(combined_epa_house_df$Price)

pdata <- pdata.frame(combined_epa_house_df, index = c("county_id", "year"))
## Warning in pdata.frame(combined_epa_house_df, index = c("county_id", "year")): duplicate couples (id-time) in resulting pdata.frame
##  to find out which, use, e.g., table(index(your_pdataframe), useNA = "ifany")
# Run the OLS regression with log_Price as the dependent variable
ols_model <- plm(log_Price ~ AQI, data = pdata, model = "pooling")

# Display the regression results
summary(ols_model)
## Pooling Model
## 
## Call:
## plm(formula = log_Price ~ AQI, data = pdata, model = "pooling")
## 
## Unbalanced Panel: n = 904, T = 2-7567, N = 703555
## 
## Residuals:
##      Min.   1st Qu.    Median   3rd Qu.      Max. 
## -1.738903 -0.386124 -0.063152  0.325696  3.354754 
## 
## Coefficients:
##                Estimate  Std. Error t-value  Pr(>|t|)    
## (Intercept) 12.39491010  0.00166111 7461.83 < 2.2e-16 ***
## AQI         -0.02546050  0.00015587 -163.35 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    223040
## Residual Sum of Squares: 214890
## R-Squared:      0.036539
## Adj. R-Squared: 0.036538
## F-statistic: 26682.1 on 1 and 703553 DF, p-value: < 2.22e-16

Fixed effect Regression:

install.packages("lfe")
## Installing package into 'C:/Users/mohua/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'lfe' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'lfe'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\mohua\AppData\Local\R\win-library\4.3\00LOCK\lfe\libs\x64\lfe.dll to
## C:\Users\mohua\AppData\Local\R\win-library\4.3\lfe\libs\x64\lfe.dll: Permission
## denied
## Warning: restored 'lfe'
## 
## The downloaded binary packages are in
##  C:\Users\mohua\AppData\Local\Temp\RtmpSy8WkG\downloaded_packages
library(lfe)
## Warning: package 'lfe' was built under R version 4.3.3
## Loading required package: Matrix
## 
## Attaching package: 'lfe'
## The following object is masked from 'package:plm':
## 
##     sargan
# Run the fixed-effects model regression

fixed_model <- felm(log_Price ~ AQI | county_id + year, data = combined_epa_house_df)

# Display the summary of the regression model
summary(fixed_model)
## 
## Call:
##    felm(formula = log_Price ~ AQI | county_id + year, data = combined_epa_house_df) 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.65918 -0.06555  0.00121  0.06883  0.64768 
## 
## Coefficients:
##      Estimate Std. Error t value Pr(>|t|)    
## AQI 2.150e-03  4.331e-05   49.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1157 on 702629 degrees of freedom
##   (16119 observations deleted due to missingness)
## Multiple R-squared(full model): 0.9578   Adjusted R-squared: 0.9577 
## Multiple R-squared(proj model): 0.003493   Adjusted R-squared: 0.002181 
## F-statistic(full model):1.724e+04 on 925 and 702629 DF, p-value: < 2.2e-16 
## F-statistic(proj model):  2463 on 1 and 702629 DF, p-value: < 2.2e-16

plm Fixed Model:

# Run the fixed-effects panel regression model
fixed_model_2 <- plm(log_Price ~ AQI, 
                   data = combined_epa_house_df, 
                   index = c("county_id", "year"), # Define panel structure with county_id and year
                   model = "within") # "within" specifies the fixed-effects model
## Warning in pdata.frame(data, index = index, ...): duplicate couples (id-time) in resulting pdata.frame
##  to find out which, use, e.g., table(index(your_pdataframe), useNA = "ifany")
# Display the summary of the fixed-effects model
summary(fixed_model_2)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = log_Price ~ AQI, data = combined_epa_house_df, 
##     model = "within", index = c("county_id", "year"))
## 
## Unbalanced Panel: n = 904, T = 2-7567, N = 703555
## 
## Residuals:
##      Min.   1st Qu.    Median   3rd Qu.      Max. 
## -0.981809 -0.136438 -0.027985  0.116719  1.835867 
## 
## Coefficients:
##        Estimate  Std. Error t-value  Pr(>|t|)    
## AQI -1.4841e-02  7.9067e-05  -187.7 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    41784
## Residual Sum of Squares: 39789
## R-Squared:      0.047745
## Adj. R-Squared: 0.04652
## F-statistic: 35229.9 on 1 and 702650 DF, p-value: < 2.22e-16

Random effect:

# Run the random effects model for the last 10 years
random_model <- plm(log_Price ~ AQI, 
                            data = combined_epa_house_df, 
                            index = c("county_id", "year"), 
                            model = "random")
## Warning in pdata.frame(data, index = index, ...): duplicate couples (id-time) in resulting pdata.frame
##  to find out which, use, e.g., table(index(your_pdataframe), useNA = "ifany")
# Display the summary of the random effects model
summary(random_model)
## Oneway (individual) effect Random Effect Model 
##    (Swamy-Arora's transformation)
## 
## Call:
## plm(formula = log_Price ~ AQI, data = combined_epa_house_df, 
##     model = "random", index = c("county_id", "year"))
## 
## Unbalanced Panel: n = 904, T = 2-7567, N = 703555
## 
## Effects:
##                   var std.dev share
## idiosyncratic 0.05663 0.23796 0.188
## individual    0.24506 0.49504 0.812
## theta:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.6782  0.9817  0.9870  0.9852  0.9908  0.9945 
## 
## Residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.96679 -0.13669 -0.02868  0.00094  0.11794  1.84811 
## 
## Coefficients:
##                Estimate  Std. Error z-value  Pr(>|z|)    
## (Intercept)  1.2170e+01  1.6497e-02  737.73 < 2.2e-16 ***
## AQI         -1.4845e-02  7.9062e-05 -187.77 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    49498
## Residual Sum of Squares: 39841
## R-Squared:      0.19515
## Adj. R-Squared: 0.19515
## Chisq: 35257.5 on 1 DF, p-value: < 2.22e-16

Lagged Model:

# Create a lagged variable for AQI considering quarterly data
combined_epa_house_df <- combined_epa_house_df %>%
  arrange(county_id, year, quarter) %>%  # Ensure data is sorted by county_id, year, and quarter
  group_by(county_id) %>%                # Group by county_id
  mutate(lag_AQI = lag(AQI, n = 1)) %>%  # Create a lagged variable with a lag of 1 quarter
  ungroup()

# Display the first few rows to verify
head(combined_epa_house_df)
## # A tibble: 6 × 20
##    year quarter state_code county_code county_id state_name county_name city    
##   <dbl>   <dbl> <chr>      <chr>       <chr>     <chr>      <chr>       <chr>   
## 1  2000       1 01         003         01003     Alabama    Baldwin     Fairhope
## 2  2000       1 01         003         01003     Alabama    Baldwin     Fairhope
## 3  2000       1 01         003         01003     Alabama    Baldwin     Fairhope
## 4  2000       1 01         003         01003     Alabama    Baldwin     Fairhope
## 5  2000       1 01         003         01003     Alabama    Baldwin     Fairhope
## 6  2000       1 01         003         01003     Alabama    Baldwin     Fairhope
## # ℹ 12 more variables: parameter_code <dbl>, parameter <chr>, latitude <dbl>,
## #   longitude <dbl>, observation_count <dbl>, AQI <dbl>, minimum_value <dbl>,
## #   maximum_value <dbl>, Price <dbl>, Price_Decile <int>, log_Price <dbl>,
## #   lag_AQI <dbl>
# Run the fixed-effects model regression with the lagged AQI variable
lagged_model <- felm(log_Price ~ lag_AQI | county_id + year, data = combined_epa_house_df)

# Display the summary of the regression model
summary(lagged_model)
## 
## Call:
##    felm(formula = log_Price ~ lag_AQI | county_id + year, data = combined_epa_house_df) 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.65918 -0.06555  0.00121  0.06883  0.64768 
## 
## Coefficients:
##          Estimate Std. Error t value Pr(>|t|)    
## lag_AQI 2.150e-03  4.331e-05   49.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1157 on 702629 degrees of freedom
##   (16119 observations deleted due to missingness)
## Multiple R-squared(full model): 0.9578   Adjusted R-squared: 0.9577 
## Multiple R-squared(proj model): 0.003493   Adjusted R-squared: 0.002181 
## F-statistic(full model):1.724e+04 on 925 and 702629 DF, p-value: < 2.2e-16 
## F-statistic(proj model):  2463 on 1 and 702629 DF, p-value: < 2.2e-16

Hausman test

# Run the fixed-effects model
fixed_model <- plm(log_Price ~ AQI, 
                   data = combined_epa_house_df, 
                   index = c("county_id", "year"), 
                   model = "within")
## Warning in pdata.frame(data, index = index, ...): duplicate couples (id-time) in resulting pdata.frame
##  to find out which, use, e.g., table(index(your_pdataframe), useNA = "ifany")
# Run the random-effects model
random_model <- plm(log_Price ~ AQI, 
                    data = combined_epa_house_df, 
                    index = c("county_id", "year"), 
                    model = "random")
## Warning in pdata.frame(data, index = index, ...): duplicate couples (id-time) in resulting pdata.frame
##  to find out which, use, e.g., table(index(your_pdataframe), useNA = "ifany")
# Perform the Hausman test
hausman_test <- phtest(fixed_model, random_model)

# Display the results of the Hausman test
print(hausman_test)
## 
##  Hausman Test
## 
## data:  log_Price ~ AQI
## chisq = 28.969, df = 1, p-value = 7.354e-08
## alternative hypothesis: one model is inconsistent

Annually aggregate to merge with GDP and Unemployment:

# Convert quarterly data to annual data by calculating the mean AQI and Price for each year
annual_epa_house_df <- combined_epa_house_df %>%
  group_by(year, county_id, state_code, state_name, county_code, county_name) %>% # Group by relevant identifiers
  summarise(
    Mean_AQI = mean(AQI, na.rm = TRUE),
    Mean_Price = mean(Price, na.rm = TRUE),
    .groups = 'drop'
  )

# Display the first few rows of the annual dataset
head(annual_epa_house_df)
## # A tibble: 6 × 8
##    year county_id state_code state_name county_code county_name Mean_AQI
##   <dbl> <chr>     <chr>      <chr>      <chr>       <chr>          <dbl>
## 1  2000 01003     01         Alabama    003         Baldwin         14.5
## 2  2000 01049     01         Alabama    049         DeKalb          18.4
## 3  2000 01089     01         Alabama    089         Madison         18.6
## 4  2000 01097     01         Alabama    097         Mobile          15.2
## 5  2000 01101     01         Alabama    101         Montgomery      17.7
## 6  2000 01121     01         Alabama    121         Talladega       20.9
## # ℹ 1 more variable: Mean_Price <dbl>

Load GDP data:

file_path <- "C:\\Users\\mohua\\OneDrive - EQS\\Programming\\EPAData\\Summer Paper 2024\\GDP_2017_2022.csv"


# Load the dataset
gdp_df <- read_csv(file_path)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 19062 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): GeoFIPS, GeoName, TableName, IndustryClassification
## dbl (5): Region, Year, indexes_real_GDP, Current_GDP, Real_GDP
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the first few rows of the data to understand its structure
head(gdp_df)
## # A tibble: 6 × 9
##   GeoFIPS GeoName Region TableName IndustryClassification  Year indexes_real_GDP
##   <chr>   <chr>    <dbl> <chr>     <chr>                  <dbl>            <dbl>
## 1 "\"010… Alabama      5 CAGDP1    ...                     2017             100 
## 2 "\"010… Alabama      5 CAGDP1    ...                     2018             102.
## 3 "\"010… Alabama      5 CAGDP1    ...                     2019             104.
## 4 "\"010… Alabama      5 CAGDP1    ...                     2020             103.
## 5 "\"010… Alabama      5 CAGDP1    ...                     2021             107.
## 6 "\"010… Alabama      5 CAGDP1    ...                     2022             109.
## # ℹ 2 more variables: Current_GDP <dbl>, Real_GDP <dbl>

Merge with GDP data:

# Ensure GeoFIPS is a character without quotes
gdp_df <- gdp_df %>%
  mutate(GeoFIPS = gsub('"', '', GeoFIPS))

# Merge the two dataframes by matching GeoFIPS with county_id and Year with year
combined_annual_df <- annual_epa_house_df %>%
  left_join(gdp_df %>% select(GeoFIPS, Year, Current_GDP, Real_GDP), 
            by = c("county_id" = "GeoFIPS", "year" = "Year"))

# Display the first few rows of the combined dataframe
head(combined_annual_df)
## # A tibble: 6 × 10
##    year county_id state_code state_name county_code county_name Mean_AQI
##   <dbl> <chr>     <chr>      <chr>      <chr>       <chr>          <dbl>
## 1  2000 01003     01         Alabama    003         Baldwin         14.5
## 2  2000 01049     01         Alabama    049         DeKalb          18.4
## 3  2000 01089     01         Alabama    089         Madison         18.6
## 4  2000 01097     01         Alabama    097         Mobile          15.2
## 5  2000 01101     01         Alabama    101         Montgomery      17.7
## 6  2000 01121     01         Alabama    121         Talladega       20.9
## # ℹ 3 more variables: Mean_Price <dbl>, Current_GDP <dbl>, Real_GDP <dbl>

Load Uneployment data:

file_path <- "C:\\Users\\mohua\\OneDrive - EQS\\Programming\\EPAData\\Summer Paper 2024\\Unemployment_long_format.csv"


# Load the dataset
ue_df <- read_csv(file_path)
## Rows: 73892 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): State, Area_Name
## dbl (6): FIPS_Code, year, Civilian_labor_force, Employed, Unemployed, Unempl...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the first few rows of the data to understand its structure
head(ue_df)
## # A tibble: 6 × 8
##   FIPS_Code State Area_Name        year Civilian_labor_force Employed Unemployed
##       <dbl> <chr> <chr>           <dbl>                <dbl>    <dbl>      <dbl>
## 1      1001 AL    Autauga County…  2000                21861    20971        890
## 2      1001 AL    Autauga County…  2001                22081    21166        915
## 3      1001 AL    Autauga County…  2002                22161    21096       1065
## 4      1001 AL    Autauga County…  2003                22695    21557       1138
## 5      1001 AL    Autauga County…  2004                23241    22146       1095
## 6      1001 AL    Autauga County…  2005                23887    22986        901
## # ℹ 1 more variable: Unemployment_rate <dbl>

Merge with combined_annual_df:

# Ensure that FIPS_Code is in character format with leading zeros in unemployment_df
ue_df <- ue_df %>%
  mutate(FIPS_Code = sprintf("%05d", FIPS_Code))

# Merge the two dataframes by matching FIPS_Code with county_id and year
annual_gdp_ue_df <- combined_annual_df %>%
  left_join(ue_df %>% select(FIPS_Code, year, Unemployment_rate),
            by = c("county_id" = "FIPS_Code", "year" = "year"))

# Display the first few rows of the combined dataframe
head(annual_gdp_ue_df)
## # A tibble: 6 × 11
##    year county_id state_code state_name county_code county_name Mean_AQI
##   <dbl> <chr>     <chr>      <chr>      <chr>       <chr>          <dbl>
## 1  2000 01003     01         Alabama    003         Baldwin         14.5
## 2  2000 01049     01         Alabama    049         DeKalb          18.4
## 3  2000 01089     01         Alabama    089         Madison         18.6
## 4  2000 01097     01         Alabama    097         Mobile          15.2
## 5  2000 01101     01         Alabama    101         Montgomery      17.7
## 6  2000 01121     01         Alabama    121         Talladega       20.9
## # ℹ 4 more variables: Mean_Price <dbl>, Current_GDP <dbl>, Real_GDP <dbl>,
## #   Unemployment_rate <dbl>
# Remove rows with any NA values using base R
annual_gdp_ue_df <- annual_gdp_ue_df[complete.cases(annual_gdp_ue_df), ]

# Display the first few rows of the cleaned dataframe
head(annual_gdp_ue_df)
## # A tibble: 6 × 11
##    year county_id state_code state_name county_code county_name Mean_AQI
##   <dbl> <chr>     <chr>      <chr>      <chr>       <chr>          <dbl>
## 1  2017 01003     01         Alabama    003         Baldwin         7.42
## 2  2017 01027     01         Alabama    027         Clay            7.94
## 3  2017 01033     01         Alabama    033         Colbert         7.43
## 4  2017 01049     01         Alabama    049         DeKalb          8.02
## 5  2017 01055     01         Alabama    055         Etowah          9.21
## 6  2017 01069     01         Alabama    069         Houston         8.62
## # ℹ 4 more variables: Mean_Price <dbl>, Current_GDP <dbl>, Real_GDP <dbl>,
## #   Unemployment_rate <dbl>
# Create log(Mean_Price) and em_rate
annual_gdp_ue_df <- annual_gdp_ue_df %>%
  mutate(
    log_price = log(Mean_Price),               # Create the log(Mean_Price) column
    em_rate = 100 - Unemployment_rate               # Create the employment rate (em_rate) column
  )

# Display the first few rows of the updated dataframe
head(annual_gdp_ue_df)
## # A tibble: 6 × 13
##    year county_id state_code state_name county_code county_name Mean_AQI
##   <dbl> <chr>     <chr>      <chr>      <chr>       <chr>          <dbl>
## 1  2017 01003     01         Alabama    003         Baldwin         7.42
## 2  2017 01027     01         Alabama    027         Clay            7.94
## 3  2017 01033     01         Alabama    033         Colbert         7.43
## 4  2017 01049     01         Alabama    049         DeKalb          8.02
## 5  2017 01055     01         Alabama    055         Etowah          9.21
## 6  2017 01069     01         Alabama    069         Houston         8.62
## # ℹ 6 more variables: Mean_Price <dbl>, Current_GDP <dbl>, Real_GDP <dbl>,
## #   Unemployment_rate <dbl>, log_price <dbl>, em_rate <dbl>

Fixed Effect Extended model:

# Run the lfe regression with log_price as the dependent variable
annual_gdp_ue_df_model <- felm(log_price ~ Mean_AQI + Current_GDP + em_rate | county_id + year, data = annual_gdp_ue_df)

# Display the summary of the regression model
summary(annual_gdp_ue_df_model)
## 
## Call:
##    felm(formula = log_price ~ Mean_AQI + Current_GDP + em_rate |      county_id + year, data = annual_gdp_ue_df) 
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.311998 -0.026846 -0.000328  0.026044  0.267790 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## Mean_AQI    -3.446e-03  9.038e-04  -3.813 0.000140 ***
## Current_GDP -4.294e-10  1.653e-10  -2.598 0.009423 ** 
## em_rate      4.717e-03  1.410e-03   3.346 0.000831 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06014 on 2905 degrees of freedom
## Multiple R-squared(full model): 0.9909   Adjusted R-squared: 0.9884 
## Multiple R-squared(proj model): 0.01141   Adjusted R-squared: -0.2557 
## F-statistic(full model):401.8 on 785 and 2905 DF, p-value: < 2.2e-16 
## F-statistic(proj model): 11.18 on 3 and 2905 DF, p-value: 2.726e-07