This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
options(repos = c(CRAN = "https://cloud.r-project.org"))
library(readr)
## Warning: package 'readr' was built under R version 4.3.3
file_path <- "C:\\Users\\mohua\\OneDrive - EQS\\Programming\\EPAData\\Summer Paper 2024\\2000-2022_epa_qtr_combine_df.csv"
# Load the dataset
epa_df <- read_csv(file_path)
## Rows: 819173 Columns: 44
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (22): state_name, county_name, datum, parameter, sample_duration, sample...
## dbl (21): state_code, county_code, site_number, parameter_code, poc, latitud...
## lgl (1): estimated_days_gt_std
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the first few rows of the data to understand its structure
head(epa_df)
## # A tibble: 6 × 44
## state_code county_code state_name county_name site_number parameter_code poc
## <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1 3 Alabama Baldwin 10 88101 1
## 2 1 3 Alabama Baldwin 10 88101 1
## 3 1 3 Alabama Baldwin 10 88101 1
## 4 1 3 Alabama Baldwin 10 88101 1
## 5 1 3 Alabama Baldwin 10 88101 1
## 6 1 3 Alabama Baldwin 10 88101 1
## # ℹ 37 more variables: latitude <dbl>, longitude <dbl>, datum <chr>,
## # parameter <chr>, sample_duration <chr>, sample_duration_code <chr>,
## # sample_duration_type <chr>, pollutant_standard <chr>, year <dbl>,
## # quarter <dbl>, units_of_measure <chr>, event_type <chr>,
## # observation_count <dbl>, observation_percent <dbl>, arithmetic_mean <dbl>,
## # minimum_value <dbl>, maximum_value <dbl>, quarterly_criteria_met <chr>,
## # actual_days_gt_std <dbl>, estimated_days_gt_std <lgl>, …
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
epa_df <- epa_df %>%
mutate(state_code = sprintf("%02d", state_code), # Format state_code as two digits
county_code = sprintf("%03d", county_code), # Format county_code as three digits
county_id = paste0(state_code, county_code)) # Concatenate
# View the modified data frame
head(epa_df)
## # A tibble: 6 × 45
## state_code county_code state_name county_name site_number parameter_code poc
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 01 003 Alabama Baldwin 10 88101 1
## 2 01 003 Alabama Baldwin 10 88101 1
## 3 01 003 Alabama Baldwin 10 88101 1
## 4 01 003 Alabama Baldwin 10 88101 1
## 5 01 003 Alabama Baldwin 10 88101 1
## 6 01 003 Alabama Baldwin 10 88101 1
## # ℹ 38 more variables: latitude <dbl>, longitude <dbl>, datum <chr>,
## # parameter <chr>, sample_duration <chr>, sample_duration_code <chr>,
## # sample_duration_type <chr>, pollutant_standard <chr>, year <dbl>,
## # quarter <dbl>, units_of_measure <chr>, event_type <chr>,
## # observation_count <dbl>, observation_percent <dbl>, arithmetic_mean <dbl>,
## # minimum_value <dbl>, maximum_value <dbl>, quarterly_criteria_met <chr>,
## # actual_days_gt_std <dbl>, estimated_days_gt_std <lgl>, …
# Number of unique county_id
unique_county_ids <- length(unique(epa_df$county_id))
# Total number of observations
total_observations <- nrow(epa_df)
# Display results
unique_county_ids
## [1] 1006
total_observations
## [1] 819173
library(readr)
file_path <- "C:\\Users\\mohua\\OneDrive - EQS\\Programming\\EPAData\\Summer Paper 2024\\raw_quarterly_house_data.csv"
# Load the dataset
house_df <- read_csv(file_path)
## Rows: 204218 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): RegionName, RegionType, StateName, State, Metro
## dbl (8): RegionID, SizeRank, StateCodeFIPS, MunicipalCodeFIPS, Value, Year,...
## date (1): Date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the first few rows of the data to understand its structure
head(house_df)
## # A tibble: 6 × 14
## RegionID SizeRank RegionName RegionType StateName State Metro StateCodeFIPS
## <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 66 146 Ada County county ID ID Boise C… 16
## 2 66 146 Ada County county ID ID Boise C… 16
## 3 66 146 Ada County county ID ID Boise C… 16
## 4 66 146 Ada County county ID ID Boise C… 16
## 5 66 146 Ada County county ID ID Boise C… 16
## 6 66 146 Ada County county ID ID Boise C… 16
## # ℹ 6 more variables: MunicipalCodeFIPS <dbl>, Date <date>, Value <dbl>,
## # Year <dbl>, Month <dbl>, Quarter <dbl>
library(dplyr)
house_df <- house_df %>%
mutate(state_code = sprintf("%02d", StateCodeFIPS), # Format state_code as two digits
county_code = sprintf("%03d", MunicipalCodeFIPS), # Format county_code as three digits
county_id = paste0(state_code, county_code)) # Concatenate
# View the modified data frame
head(house_df)
## # A tibble: 6 × 17
## RegionID SizeRank RegionName RegionType StateName State Metro StateCodeFIPS
## <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 66 146 Ada County county ID ID Boise C… 16
## 2 66 146 Ada County county ID ID Boise C… 16
## 3 66 146 Ada County county ID ID Boise C… 16
## 4 66 146 Ada County county ID ID Boise C… 16
## 5 66 146 Ada County county ID ID Boise C… 16
## 6 66 146 Ada County county ID ID Boise C… 16
## # ℹ 9 more variables: MunicipalCodeFIPS <dbl>, Date <date>, Value <dbl>,
## # Year <dbl>, Month <dbl>, Quarter <dbl>, state_code <chr>,
## # county_code <chr>, county_id <chr>
# Number of unique county_id
unique_county_ids <- length(unique(house_df$county_id))
# Total number of observations
total_observations <- nrow(house_df)
# Display results
unique_county_ids
## [1] 3074
total_observations
## [1] 204218
# Merging two dataframes
combined_epa_house_df <- merge(epa_df,
house_df[, c("Year", "Quarter", "county_id", "Value")],
by.x = c("year", "quarter", "county_id"),
by.y = c("Year", "Quarter", "county_id"),
all.x = TRUE)
# Display the combined dataframe
head(combined_epa_house_df)
## year quarter county_id state_code county_code state_name county_name
## 1 2000 1 01003 01 003 Alabama Baldwin
## 2 2000 1 01003 01 003 Alabama Baldwin
## 3 2000 1 01003 01 003 Alabama Baldwin
## 4 2000 1 01003 01 003 Alabama Baldwin
## 5 2000 1 01003 01 003 Alabama Baldwin
## 6 2000 1 01003 01 003 Alabama Baldwin
## site_number parameter_code poc latitude longitude datum
## 1 10 88101 1 30.49748 -87.88026 NAD83
## 2 10 88101 1 30.49748 -87.88026 NAD83
## 3 10 88101 1 30.49748 -87.88026 NAD83
## 4 10 88101 1 30.49748 -87.88026 NAD83
## 5 10 88101 1 30.49748 -87.88026 NAD83
## 6 10 88101 1 30.49748 -87.88026 NAD83
## parameter sample_duration sample_duration_code
## 1 PM2.5 - Local Conditions 24 HOUR 7
## 2 PM2.5 - Local Conditions 24 HOUR 7
## 3 PM2.5 - Local Conditions 24 HOUR 7
## 4 PM2.5 - Local Conditions 24 HOUR 7
## 5 PM2.5 - Local Conditions 24 HOUR 7
## 6 PM2.5 - Local Conditions 24 HOUR 7
## sample_duration_type pollutant_standard units_of_measure
## 1 O PM25 Annual 1997 Micrograms/cubic meter (LC)
## 2 O PM25 24-hour 2006 Micrograms/cubic meter (LC)
## 3 O PM25 Annual 2012 Micrograms/cubic meter (LC)
## 4 O PM25 24-hour 2012 Micrograms/cubic meter (LC)
## 5 O PM25 24-hour 1997 Micrograms/cubic meter (LC)
## 6 O PM25 Annual 2006 Micrograms/cubic meter (LC)
## event_type observation_count observation_percent arithmetic_mean
## 1 No Events 16 52 11.8188
## 2 No Events 16 52 11.8188
## 3 No Events 16 52 11.8188
## 4 No Events 16 52 11.8188
## 5 No Events 16 52 11.8188
## 6 No Events 16 52 11.8188
## minimum_value maximum_value quarterly_criteria_met actual_days_gt_std
## 1 5.9 24.1 N NA
## 2 5.9 24.1 N 0
## 3 5.9 24.1 N NA
## 4 5.9 24.1 N 0
## 5 5.9 24.1 N 0
## 6 5.9 24.1 N NA
## estimated_days_gt_std valid_samples valid_day_count scheduled_samples
## 1 NA 16 16 31
## 2 NA 16 16 31
## 3 NA 16 16 31
## 4 NA 16 16 31
## 5 NA 16 16 31
## 6 NA 16 16 31
## percent_days percent_one_value monitoring_agency_code monitoring_agency
## 1 17.6 18 13 Al Dept Of Env Mgt
## 2 17.6 18 13 Al Dept Of Env Mgt
## 3 17.6 18 13 Al Dept Of Env Mgt
## 4 17.6 18 13 Al Dept Of Env Mgt
## 5 17.6 18 13 Al Dept Of Env Mgt
## 6 17.6 18 13 Al Dept Of Env Mgt
## local_site_name address
## 1 FAIRHOPE, Alabama FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE, ALABAMA
## 2 FAIRHOPE, Alabama FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE, ALABAMA
## 3 FAIRHOPE, Alabama FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE, ALABAMA
## 4 FAIRHOPE, Alabama FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE, ALABAMA
## 5 FAIRHOPE, Alabama FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE, ALABAMA
## 6 FAIRHOPE, Alabama FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE, ALABAMA
## state county city tribal_code tribal_land cbsa_code
## 1 Alabama Baldwin Fairhope <NA> <NA> 19300
## 2 Alabama Baldwin Fairhope <NA> <NA> 19300
## 3 Alabama Baldwin Fairhope <NA> <NA> 19300
## 4 Alabama Baldwin Fairhope <NA> <NA> 19300
## 5 Alabama Baldwin Fairhope <NA> <NA> 19300
## 6 Alabama Baldwin Fairhope <NA> <NA> 19300
## cbsa date_of_last_change Value
## 1 Daphne-Fairhope-Foley, AL 5/16/2023 117246.2
## 2 Daphne-Fairhope-Foley, AL 5/16/2023 117246.2
## 3 Daphne-Fairhope-Foley, AL 5/16/2023 117246.2
## 4 Daphne-Fairhope-Foley, AL 5/16/2023 117246.2
## 5 Daphne-Fairhope-Foley, AL 5/16/2023 117246.2
## 6 Daphne-Fairhope-Foley, AL 5/16/2023 117246.2
# Renaming variables
colnames(combined_epa_house_df)[colnames(combined_epa_house_df) == "arithmetic_mean"] <- "AQI"
colnames(combined_epa_house_df)[colnames(combined_epa_house_df) == "Value"] <- "Price"
# Display the updated dataframe
head(combined_epa_house_df)
## year quarter county_id state_code county_code state_name county_name
## 1 2000 1 01003 01 003 Alabama Baldwin
## 2 2000 1 01003 01 003 Alabama Baldwin
## 3 2000 1 01003 01 003 Alabama Baldwin
## 4 2000 1 01003 01 003 Alabama Baldwin
## 5 2000 1 01003 01 003 Alabama Baldwin
## 6 2000 1 01003 01 003 Alabama Baldwin
## site_number parameter_code poc latitude longitude datum
## 1 10 88101 1 30.49748 -87.88026 NAD83
## 2 10 88101 1 30.49748 -87.88026 NAD83
## 3 10 88101 1 30.49748 -87.88026 NAD83
## 4 10 88101 1 30.49748 -87.88026 NAD83
## 5 10 88101 1 30.49748 -87.88026 NAD83
## 6 10 88101 1 30.49748 -87.88026 NAD83
## parameter sample_duration sample_duration_code
## 1 PM2.5 - Local Conditions 24 HOUR 7
## 2 PM2.5 - Local Conditions 24 HOUR 7
## 3 PM2.5 - Local Conditions 24 HOUR 7
## 4 PM2.5 - Local Conditions 24 HOUR 7
## 5 PM2.5 - Local Conditions 24 HOUR 7
## 6 PM2.5 - Local Conditions 24 HOUR 7
## sample_duration_type pollutant_standard units_of_measure
## 1 O PM25 Annual 1997 Micrograms/cubic meter (LC)
## 2 O PM25 24-hour 2006 Micrograms/cubic meter (LC)
## 3 O PM25 Annual 2012 Micrograms/cubic meter (LC)
## 4 O PM25 24-hour 2012 Micrograms/cubic meter (LC)
## 5 O PM25 24-hour 1997 Micrograms/cubic meter (LC)
## 6 O PM25 Annual 2006 Micrograms/cubic meter (LC)
## event_type observation_count observation_percent AQI minimum_value
## 1 No Events 16 52 11.8188 5.9
## 2 No Events 16 52 11.8188 5.9
## 3 No Events 16 52 11.8188 5.9
## 4 No Events 16 52 11.8188 5.9
## 5 No Events 16 52 11.8188 5.9
## 6 No Events 16 52 11.8188 5.9
## maximum_value quarterly_criteria_met actual_days_gt_std estimated_days_gt_std
## 1 24.1 N NA NA
## 2 24.1 N 0 NA
## 3 24.1 N NA NA
## 4 24.1 N 0 NA
## 5 24.1 N 0 NA
## 6 24.1 N NA NA
## valid_samples valid_day_count scheduled_samples percent_days
## 1 16 16 31 17.6
## 2 16 16 31 17.6
## 3 16 16 31 17.6
## 4 16 16 31 17.6
## 5 16 16 31 17.6
## 6 16 16 31 17.6
## percent_one_value monitoring_agency_code monitoring_agency local_site_name
## 1 18 13 Al Dept Of Env Mgt FAIRHOPE, Alabama
## 2 18 13 Al Dept Of Env Mgt FAIRHOPE, Alabama
## 3 18 13 Al Dept Of Env Mgt FAIRHOPE, Alabama
## 4 18 13 Al Dept Of Env Mgt FAIRHOPE, Alabama
## 5 18 13 Al Dept Of Env Mgt FAIRHOPE, Alabama
## 6 18 13 Al Dept Of Env Mgt FAIRHOPE, Alabama
## address state county
## 1 FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE, ALABAMA Alabama Baldwin
## 2 FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE, ALABAMA Alabama Baldwin
## 3 FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE, ALABAMA Alabama Baldwin
## 4 FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE, ALABAMA Alabama Baldwin
## 5 FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE, ALABAMA Alabama Baldwin
## 6 FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE, ALABAMA Alabama Baldwin
## city tribal_code tribal_land cbsa_code cbsa
## 1 Fairhope <NA> <NA> 19300 Daphne-Fairhope-Foley, AL
## 2 Fairhope <NA> <NA> 19300 Daphne-Fairhope-Foley, AL
## 3 Fairhope <NA> <NA> 19300 Daphne-Fairhope-Foley, AL
## 4 Fairhope <NA> <NA> 19300 Daphne-Fairhope-Foley, AL
## 5 Fairhope <NA> <NA> 19300 Daphne-Fairhope-Foley, AL
## 6 Fairhope <NA> <NA> 19300 Daphne-Fairhope-Foley, AL
## date_of_last_change Price
## 1 5/16/2023 117246.2
## 2 5/16/2023 117246.2
## 3 5/16/2023 117246.2
## 4 5/16/2023 117246.2
## 5 5/16/2023 117246.2
## 6 5/16/2023 117246.2
# Display all variable names
colnames(combined_epa_house_df)
## [1] "year" "quarter" "county_id"
## [4] "state_code" "county_code" "state_name"
## [7] "county_name" "site_number" "parameter_code"
## [10] "poc" "latitude" "longitude"
## [13] "datum" "parameter" "sample_duration"
## [16] "sample_duration_code" "sample_duration_type" "pollutant_standard"
## [19] "units_of_measure" "event_type" "observation_count"
## [22] "observation_percent" "AQI" "minimum_value"
## [25] "maximum_value" "quarterly_criteria_met" "actual_days_gt_std"
## [28] "estimated_days_gt_std" "valid_samples" "valid_day_count"
## [31] "scheduled_samples" "percent_days" "percent_one_value"
## [34] "monitoring_agency_code" "monitoring_agency" "local_site_name"
## [37] "address" "state" "county"
## [40] "city" "tribal_code" "tribal_land"
## [43] "cbsa_code" "cbsa" "date_of_last_change"
## [46] "Price"
# Selecting specific columns
combined_epa_house_df <- combined_epa_house_df[, c("year", "quarter", "state_code", "county_code",
"county_id", "state_name", "county_name", "city",
"parameter_code", "parameter", "latitude", "longitude",
"observation_count", "AQI", "minimum_value",
"maximum_value", "Price")]
# Display the updated dataframe
head(combined_epa_house_df)
## year quarter state_code county_code county_id state_name county_name city
## 1 2000 1 01 003 01003 Alabama Baldwin Fairhope
## 2 2000 1 01 003 01003 Alabama Baldwin Fairhope
## 3 2000 1 01 003 01003 Alabama Baldwin Fairhope
## 4 2000 1 01 003 01003 Alabama Baldwin Fairhope
## 5 2000 1 01 003 01003 Alabama Baldwin Fairhope
## 6 2000 1 01 003 01003 Alabama Baldwin Fairhope
## parameter_code parameter latitude longitude observation_count
## 1 88101 PM2.5 - Local Conditions 30.49748 -87.88026 16
## 2 88101 PM2.5 - Local Conditions 30.49748 -87.88026 16
## 3 88101 PM2.5 - Local Conditions 30.49748 -87.88026 16
## 4 88101 PM2.5 - Local Conditions 30.49748 -87.88026 16
## 5 88101 PM2.5 - Local Conditions 30.49748 -87.88026 16
## 6 88101 PM2.5 - Local Conditions 30.49748 -87.88026 16
## AQI minimum_value maximum_value Price
## 1 11.8188 5.9 24.1 117246.2
## 2 11.8188 5.9 24.1 117246.2
## 3 11.8188 5.9 24.1 117246.2
## 4 11.8188 5.9 24.1 117246.2
## 5 11.8188 5.9 24.1 117246.2
## 6 11.8188 5.9 24.1 117246.2
# Number of unique county_id
unique_county_ids <- length(unique(combined_epa_house_df$county_id))
# Total number of observations
total_observations <- nrow(combined_epa_house_df)
# Display results
unique_county_ids
## [1] 1006
total_observations
## [1] 819173
# Removing rows with NA values in the Price column
combined_epa_house_df <- combined_epa_house_df[!is.na(combined_epa_house_df$Price), ]
# Display the cleaned dataframe
head(combined_epa_house_df)
## year quarter state_code county_code county_id state_name county_name city
## 1 2000 1 01 003 01003 Alabama Baldwin Fairhope
## 2 2000 1 01 003 01003 Alabama Baldwin Fairhope
## 3 2000 1 01 003 01003 Alabama Baldwin Fairhope
## 4 2000 1 01 003 01003 Alabama Baldwin Fairhope
## 5 2000 1 01 003 01003 Alabama Baldwin Fairhope
## 6 2000 1 01 003 01003 Alabama Baldwin Fairhope
## parameter_code parameter latitude longitude observation_count
## 1 88101 PM2.5 - Local Conditions 30.49748 -87.88026 16
## 2 88101 PM2.5 - Local Conditions 30.49748 -87.88026 16
## 3 88101 PM2.5 - Local Conditions 30.49748 -87.88026 16
## 4 88101 PM2.5 - Local Conditions 30.49748 -87.88026 16
## 5 88101 PM2.5 - Local Conditions 30.49748 -87.88026 16
## 6 88101 PM2.5 - Local Conditions 30.49748 -87.88026 16
## AQI minimum_value maximum_value Price
## 1 11.8188 5.9 24.1 117246.2
## 2 11.8188 5.9 24.1 117246.2
## 3 11.8188 5.9 24.1 117246.2
## 4 11.8188 5.9 24.1 117246.2
## 5 11.8188 5.9 24.1 117246.2
## 6 11.8188 5.9 24.1 117246.2
# Number of unique county_id
unique_county_ids <- length(unique(combined_epa_house_df$county_id))
# Total number of observations
total_observations <- nrow(combined_epa_house_df)
# Display results
unique_county_ids
## [1] 904
total_observations
## [1] 719674
# Saving the dataframe as a CSV file using the specified path
file_path <- "C:\\Users\\mohua\\OneDrive - EQS\\Programming\\EPAData\\Summer Paper 2024\\combined_epa_house_df.csv"
write.csv(combined_epa_house_df, file = file_path, row.names = FALSE)
library(dplyr)
# Creating decile groups for housing prices
combined_epa_house_df <- combined_epa_house_df %>%
mutate(Price_Decile = ntile(Price, 10)) # ntile creates decile groups
# Group by decile and summarize to calculate mean Price and AQI
result_decile <- combined_epa_house_df %>%
group_by(Price_Decile) %>%
summarise(
Mean_Housing_Price = mean(Price, na.rm = TRUE),
Mean_AQI = mean(AQI, na.rm = TRUE),
.groups = 'drop'
)
# Print the result
print(result_decile)
## # A tibble: 10 × 3
## Price_Decile Mean_Housing_Price Mean_AQI
## <int> <dbl> <dbl>
## 1 1 80808. 11.2
## 2 2 108025. 11.2
## 3 3 125269. 10.9
## 4 4 143143. 10.1
## 5 5 165451. 9.62
## 6 6 191355. 9.23
## 7 7 221213. 8.89
## 8 8 263979. 8.95
## 9 9 340386. 8.93
## 10 10 592108. 8.86
# Using dplyr and summarise to get a descriptive summary
library(dplyr)
# Descriptive summary for AQI and Price
descriptive_summary <- combined_epa_house_df %>%
summarise(
Mean_AQI = mean(AQI, na.rm = TRUE),
Median_AQI = median(AQI, na.rm = TRUE),
SD_AQI = sd(AQI, na.rm = TRUE),
Min_AQI = min(AQI, na.rm = TRUE),
Max_AQI = max(AQI, na.rm = TRUE),
Mean_Price = mean(Price, na.rm = TRUE),
Median_Price = median(Price, na.rm = TRUE),
SD_Price = sd(Price, na.rm = TRUE),
Min_Price = min(Price, na.rm = TRUE),
Max_Price = max(Price, na.rm = TRUE)
)
# Print the descriptive summary
print(descriptive_summary)
## Mean_AQI Median_AQI SD_AQI Min_AQI Max_AQI Mean_Price Median_Price SD_Price
## 1 9.782919 9.1767 4.227191 -2.059 124.85 223173 178188.5 160046.3
## Min_Price Max_Price
## 1 31718.24 4710886
install.packages("plm")
## Installing package into 'C:/Users/mohua/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'plm' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\mohua\AppData\Local\Temp\RtmpSy8WkG\downloaded_packages
library(plm)
## Warning: package 'plm' was built under R version 4.3.3
##
## Attaching package: 'plm'
## The following objects are masked from 'package:dplyr':
##
## between, lag, lead
# Create log(Price)
combined_epa_house_df$log_Price <- log(combined_epa_house_df$Price)
pdata <- pdata.frame(combined_epa_house_df, index = c("county_id", "year"))
## Warning in pdata.frame(combined_epa_house_df, index = c("county_id", "year")): duplicate couples (id-time) in resulting pdata.frame
## to find out which, use, e.g., table(index(your_pdataframe), useNA = "ifany")
# Run the OLS regression with log_Price as the dependent variable
ols_model <- plm(log_Price ~ AQI, data = pdata, model = "pooling")
# Display the regression results
summary(ols_model)
## Pooling Model
##
## Call:
## plm(formula = log_Price ~ AQI, data = pdata, model = "pooling")
##
## Unbalanced Panel: n = 904, T = 2-7567, N = 703555
##
## Residuals:
## Min. 1st Qu. Median 3rd Qu. Max.
## -1.738903 -0.386124 -0.063152 0.325696 3.354754
##
## Coefficients:
## Estimate Std. Error t-value Pr(>|t|)
## (Intercept) 12.39491010 0.00166111 7461.83 < 2.2e-16 ***
## AQI -0.02546050 0.00015587 -163.35 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 223040
## Residual Sum of Squares: 214890
## R-Squared: 0.036539
## Adj. R-Squared: 0.036538
## F-statistic: 26682.1 on 1 and 703553 DF, p-value: < 2.22e-16
install.packages("lfe")
## Installing package into 'C:/Users/mohua/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'lfe' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'lfe'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\mohua\AppData\Local\R\win-library\4.3\00LOCK\lfe\libs\x64\lfe.dll to
## C:\Users\mohua\AppData\Local\R\win-library\4.3\lfe\libs\x64\lfe.dll: Permission
## denied
## Warning: restored 'lfe'
##
## The downloaded binary packages are in
## C:\Users\mohua\AppData\Local\Temp\RtmpSy8WkG\downloaded_packages
library(lfe)
## Warning: package 'lfe' was built under R version 4.3.3
## Loading required package: Matrix
##
## Attaching package: 'lfe'
## The following object is masked from 'package:plm':
##
## sargan
# Run the fixed-effects model regression
fixed_model <- felm(log_Price ~ AQI | county_id + year, data = combined_epa_house_df)
# Display the summary of the regression model
summary(fixed_model)
##
## Call:
## felm(formula = log_Price ~ AQI | county_id + year, data = combined_epa_house_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.65918 -0.06555 0.00121 0.06883 0.64768
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## AQI 2.150e-03 4.331e-05 49.63 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1157 on 702629 degrees of freedom
## (16119 observations deleted due to missingness)
## Multiple R-squared(full model): 0.9578 Adjusted R-squared: 0.9577
## Multiple R-squared(proj model): 0.003493 Adjusted R-squared: 0.002181
## F-statistic(full model):1.724e+04 on 925 and 702629 DF, p-value: < 2.2e-16
## F-statistic(proj model): 2463 on 1 and 702629 DF, p-value: < 2.2e-16
# Run the fixed-effects panel regression model
fixed_model_2 <- plm(log_Price ~ AQI,
data = combined_epa_house_df,
index = c("county_id", "year"), # Define panel structure with county_id and year
model = "within") # "within" specifies the fixed-effects model
## Warning in pdata.frame(data, index = index, ...): duplicate couples (id-time) in resulting pdata.frame
## to find out which, use, e.g., table(index(your_pdataframe), useNA = "ifany")
# Display the summary of the fixed-effects model
summary(fixed_model_2)
## Oneway (individual) effect Within Model
##
## Call:
## plm(formula = log_Price ~ AQI, data = combined_epa_house_df,
## model = "within", index = c("county_id", "year"))
##
## Unbalanced Panel: n = 904, T = 2-7567, N = 703555
##
## Residuals:
## Min. 1st Qu. Median 3rd Qu. Max.
## -0.981809 -0.136438 -0.027985 0.116719 1.835867
##
## Coefficients:
## Estimate Std. Error t-value Pr(>|t|)
## AQI -1.4841e-02 7.9067e-05 -187.7 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 41784
## Residual Sum of Squares: 39789
## R-Squared: 0.047745
## Adj. R-Squared: 0.04652
## F-statistic: 35229.9 on 1 and 702650 DF, p-value: < 2.22e-16
# Run the random effects model for the last 10 years
random_model <- plm(log_Price ~ AQI,
data = combined_epa_house_df,
index = c("county_id", "year"),
model = "random")
## Warning in pdata.frame(data, index = index, ...): duplicate couples (id-time) in resulting pdata.frame
## to find out which, use, e.g., table(index(your_pdataframe), useNA = "ifany")
# Display the summary of the random effects model
summary(random_model)
## Oneway (individual) effect Random Effect Model
## (Swamy-Arora's transformation)
##
## Call:
## plm(formula = log_Price ~ AQI, data = combined_epa_house_df,
## model = "random", index = c("county_id", "year"))
##
## Unbalanced Panel: n = 904, T = 2-7567, N = 703555
##
## Effects:
## var std.dev share
## idiosyncratic 0.05663 0.23796 0.188
## individual 0.24506 0.49504 0.812
## theta:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.6782 0.9817 0.9870 0.9852 0.9908 0.9945
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.96679 -0.13669 -0.02868 0.00094 0.11794 1.84811
##
## Coefficients:
## Estimate Std. Error z-value Pr(>|z|)
## (Intercept) 1.2170e+01 1.6497e-02 737.73 < 2.2e-16 ***
## AQI -1.4845e-02 7.9062e-05 -187.77 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 49498
## Residual Sum of Squares: 39841
## R-Squared: 0.19515
## Adj. R-Squared: 0.19515
## Chisq: 35257.5 on 1 DF, p-value: < 2.22e-16
# Create a lagged variable for AQI considering quarterly data
combined_epa_house_df <- combined_epa_house_df %>%
arrange(county_id, year, quarter) %>% # Ensure data is sorted by county_id, year, and quarter
group_by(county_id) %>% # Group by county_id
mutate(lag_AQI = lag(AQI, n = 1)) %>% # Create a lagged variable with a lag of 1 quarter
ungroup()
# Display the first few rows to verify
head(combined_epa_house_df)
## # A tibble: 6 × 20
## year quarter state_code county_code county_id state_name county_name city
## <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 2000 1 01 003 01003 Alabama Baldwin Fairhope
## 2 2000 1 01 003 01003 Alabama Baldwin Fairhope
## 3 2000 1 01 003 01003 Alabama Baldwin Fairhope
## 4 2000 1 01 003 01003 Alabama Baldwin Fairhope
## 5 2000 1 01 003 01003 Alabama Baldwin Fairhope
## 6 2000 1 01 003 01003 Alabama Baldwin Fairhope
## # ℹ 12 more variables: parameter_code <dbl>, parameter <chr>, latitude <dbl>,
## # longitude <dbl>, observation_count <dbl>, AQI <dbl>, minimum_value <dbl>,
## # maximum_value <dbl>, Price <dbl>, Price_Decile <int>, log_Price <dbl>,
## # lag_AQI <dbl>
# Run the fixed-effects model regression with the lagged AQI variable
lagged_model <- felm(log_Price ~ lag_AQI | county_id + year, data = combined_epa_house_df)
# Display the summary of the regression model
summary(lagged_model)
##
## Call:
## felm(formula = log_Price ~ lag_AQI | county_id + year, data = combined_epa_house_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.65918 -0.06555 0.00121 0.06883 0.64768
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## lag_AQI 2.150e-03 4.331e-05 49.63 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1157 on 702629 degrees of freedom
## (16119 observations deleted due to missingness)
## Multiple R-squared(full model): 0.9578 Adjusted R-squared: 0.9577
## Multiple R-squared(proj model): 0.003493 Adjusted R-squared: 0.002181
## F-statistic(full model):1.724e+04 on 925 and 702629 DF, p-value: < 2.2e-16
## F-statistic(proj model): 2463 on 1 and 702629 DF, p-value: < 2.2e-16
# Run the fixed-effects model
fixed_model <- plm(log_Price ~ AQI,
data = combined_epa_house_df,
index = c("county_id", "year"),
model = "within")
## Warning in pdata.frame(data, index = index, ...): duplicate couples (id-time) in resulting pdata.frame
## to find out which, use, e.g., table(index(your_pdataframe), useNA = "ifany")
# Run the random-effects model
random_model <- plm(log_Price ~ AQI,
data = combined_epa_house_df,
index = c("county_id", "year"),
model = "random")
## Warning in pdata.frame(data, index = index, ...): duplicate couples (id-time) in resulting pdata.frame
## to find out which, use, e.g., table(index(your_pdataframe), useNA = "ifany")
# Perform the Hausman test
hausman_test <- phtest(fixed_model, random_model)
# Display the results of the Hausman test
print(hausman_test)
##
## Hausman Test
##
## data: log_Price ~ AQI
## chisq = 28.969, df = 1, p-value = 7.354e-08
## alternative hypothesis: one model is inconsistent
# Convert quarterly data to annual data by calculating the mean AQI and Price for each year
annual_epa_house_df <- combined_epa_house_df %>%
group_by(year, county_id, state_code, state_name, county_code, county_name) %>% # Group by relevant identifiers
summarise(
Mean_AQI = mean(AQI, na.rm = TRUE),
Mean_Price = mean(Price, na.rm = TRUE),
.groups = 'drop'
)
# Display the first few rows of the annual dataset
head(annual_epa_house_df)
## # A tibble: 6 × 8
## year county_id state_code state_name county_code county_name Mean_AQI
## <dbl> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 2000 01003 01 Alabama 003 Baldwin 14.5
## 2 2000 01049 01 Alabama 049 DeKalb 18.4
## 3 2000 01089 01 Alabama 089 Madison 18.6
## 4 2000 01097 01 Alabama 097 Mobile 15.2
## 5 2000 01101 01 Alabama 101 Montgomery 17.7
## 6 2000 01121 01 Alabama 121 Talladega 20.9
## # ℹ 1 more variable: Mean_Price <dbl>
file_path <- "C:\\Users\\mohua\\OneDrive - EQS\\Programming\\EPAData\\Summer Paper 2024\\GDP_2017_2022.csv"
# Load the dataset
gdp_df <- read_csv(file_path)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 19062 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): GeoFIPS, GeoName, TableName, IndustryClassification
## dbl (5): Region, Year, indexes_real_GDP, Current_GDP, Real_GDP
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the first few rows of the data to understand its structure
head(gdp_df)
## # A tibble: 6 × 9
## GeoFIPS GeoName Region TableName IndustryClassification Year indexes_real_GDP
## <chr> <chr> <dbl> <chr> <chr> <dbl> <dbl>
## 1 "\"010… Alabama 5 CAGDP1 ... 2017 100
## 2 "\"010… Alabama 5 CAGDP1 ... 2018 102.
## 3 "\"010… Alabama 5 CAGDP1 ... 2019 104.
## 4 "\"010… Alabama 5 CAGDP1 ... 2020 103.
## 5 "\"010… Alabama 5 CAGDP1 ... 2021 107.
## 6 "\"010… Alabama 5 CAGDP1 ... 2022 109.
## # ℹ 2 more variables: Current_GDP <dbl>, Real_GDP <dbl>
# Ensure GeoFIPS is a character without quotes
gdp_df <- gdp_df %>%
mutate(GeoFIPS = gsub('"', '', GeoFIPS))
# Merge the two dataframes by matching GeoFIPS with county_id and Year with year
combined_annual_df <- annual_epa_house_df %>%
left_join(gdp_df %>% select(GeoFIPS, Year, Current_GDP, Real_GDP),
by = c("county_id" = "GeoFIPS", "year" = "Year"))
# Display the first few rows of the combined dataframe
head(combined_annual_df)
## # A tibble: 6 × 10
## year county_id state_code state_name county_code county_name Mean_AQI
## <dbl> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 2000 01003 01 Alabama 003 Baldwin 14.5
## 2 2000 01049 01 Alabama 049 DeKalb 18.4
## 3 2000 01089 01 Alabama 089 Madison 18.6
## 4 2000 01097 01 Alabama 097 Mobile 15.2
## 5 2000 01101 01 Alabama 101 Montgomery 17.7
## 6 2000 01121 01 Alabama 121 Talladega 20.9
## # ℹ 3 more variables: Mean_Price <dbl>, Current_GDP <dbl>, Real_GDP <dbl>
file_path <- "C:\\Users\\mohua\\OneDrive - EQS\\Programming\\EPAData\\Summer Paper 2024\\Unemployment_long_format.csv"
# Load the dataset
ue_df <- read_csv(file_path)
## Rows: 73892 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): State, Area_Name
## dbl (6): FIPS_Code, year, Civilian_labor_force, Employed, Unemployed, Unempl...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the first few rows of the data to understand its structure
head(ue_df)
## # A tibble: 6 × 8
## FIPS_Code State Area_Name year Civilian_labor_force Employed Unemployed
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1001 AL Autauga County… 2000 21861 20971 890
## 2 1001 AL Autauga County… 2001 22081 21166 915
## 3 1001 AL Autauga County… 2002 22161 21096 1065
## 4 1001 AL Autauga County… 2003 22695 21557 1138
## 5 1001 AL Autauga County… 2004 23241 22146 1095
## 6 1001 AL Autauga County… 2005 23887 22986 901
## # ℹ 1 more variable: Unemployment_rate <dbl>
# Ensure that FIPS_Code is in character format with leading zeros in unemployment_df
ue_df <- ue_df %>%
mutate(FIPS_Code = sprintf("%05d", FIPS_Code))
# Merge the two dataframes by matching FIPS_Code with county_id and year
annual_gdp_ue_df <- combined_annual_df %>%
left_join(ue_df %>% select(FIPS_Code, year, Unemployment_rate),
by = c("county_id" = "FIPS_Code", "year" = "year"))
# Display the first few rows of the combined dataframe
head(annual_gdp_ue_df)
## # A tibble: 6 × 11
## year county_id state_code state_name county_code county_name Mean_AQI
## <dbl> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 2000 01003 01 Alabama 003 Baldwin 14.5
## 2 2000 01049 01 Alabama 049 DeKalb 18.4
## 3 2000 01089 01 Alabama 089 Madison 18.6
## 4 2000 01097 01 Alabama 097 Mobile 15.2
## 5 2000 01101 01 Alabama 101 Montgomery 17.7
## 6 2000 01121 01 Alabama 121 Talladega 20.9
## # ℹ 4 more variables: Mean_Price <dbl>, Current_GDP <dbl>, Real_GDP <dbl>,
## # Unemployment_rate <dbl>
# Remove rows with any NA values using base R
annual_gdp_ue_df <- annual_gdp_ue_df[complete.cases(annual_gdp_ue_df), ]
# Display the first few rows of the cleaned dataframe
head(annual_gdp_ue_df)
## # A tibble: 6 × 11
## year county_id state_code state_name county_code county_name Mean_AQI
## <dbl> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 2017 01003 01 Alabama 003 Baldwin 7.42
## 2 2017 01027 01 Alabama 027 Clay 7.94
## 3 2017 01033 01 Alabama 033 Colbert 7.43
## 4 2017 01049 01 Alabama 049 DeKalb 8.02
## 5 2017 01055 01 Alabama 055 Etowah 9.21
## 6 2017 01069 01 Alabama 069 Houston 8.62
## # ℹ 4 more variables: Mean_Price <dbl>, Current_GDP <dbl>, Real_GDP <dbl>,
## # Unemployment_rate <dbl>
# Create log(Mean_Price) and em_rate
annual_gdp_ue_df <- annual_gdp_ue_df %>%
mutate(
log_price = log(Mean_Price), # Create the log(Mean_Price) column
em_rate = 100 - Unemployment_rate # Create the employment rate (em_rate) column
)
# Display the first few rows of the updated dataframe
head(annual_gdp_ue_df)
## # A tibble: 6 × 13
## year county_id state_code state_name county_code county_name Mean_AQI
## <dbl> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 2017 01003 01 Alabama 003 Baldwin 7.42
## 2 2017 01027 01 Alabama 027 Clay 7.94
## 3 2017 01033 01 Alabama 033 Colbert 7.43
## 4 2017 01049 01 Alabama 049 DeKalb 8.02
## 5 2017 01055 01 Alabama 055 Etowah 9.21
## 6 2017 01069 01 Alabama 069 Houston 8.62
## # ℹ 6 more variables: Mean_Price <dbl>, Current_GDP <dbl>, Real_GDP <dbl>,
## # Unemployment_rate <dbl>, log_price <dbl>, em_rate <dbl>
# Run the lfe regression with log_price as the dependent variable
annual_gdp_ue_df_model <- felm(log_price ~ Mean_AQI + Current_GDP + em_rate | county_id + year, data = annual_gdp_ue_df)
# Display the summary of the regression model
summary(annual_gdp_ue_df_model)
##
## Call:
## felm(formula = log_price ~ Mean_AQI + Current_GDP + em_rate | county_id + year, data = annual_gdp_ue_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.311998 -0.026846 -0.000328 0.026044 0.267790
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## Mean_AQI -3.446e-03 9.038e-04 -3.813 0.000140 ***
## Current_GDP -4.294e-10 1.653e-10 -2.598 0.009423 **
## em_rate 4.717e-03 1.410e-03 3.346 0.000831 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06014 on 2905 degrees of freedom
## Multiple R-squared(full model): 0.9909 Adjusted R-squared: 0.9884
## Multiple R-squared(proj model): 0.01141 Adjusted R-squared: -0.2557
## F-statistic(full model):401.8 on 785 and 2905 DF, p-value: < 2.2e-16
## F-statistic(proj model): 11.18 on 3 and 2905 DF, p-value: 2.726e-07