R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

rm(list = ls())

Loading EPA and housing annual Data

# Load necessary libraries
library(readr)  # For reading CSV files
## Warning: package 'readr' was built under R version 4.3.3
# File path to the data (update this if necessary)

# File path to the data
file_path <- "C:/Users/mferdo2/OneDrive - EQS/Higher study USA/Finance_PhD/First_year Paper/GDP/annual_epa.csv"

# Load the dataset
annual_epa <- read_csv(file_path)
## Rows: 9693 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): state_name, county_name
## dbl (5): county_id, year, parameter_code, AQI, Price
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the first few rows of the data to understand its structure
head(annual_epa)
## # A tibble: 6 × 7
##   county_id  year state_name county_name parameter_code   AQI   Price
##       <dbl> <dbl> <chr>      <chr>                <dbl> <dbl>   <dbl>
## 1      1003  2000 Alabama    Baldwin              88101  14.5 118891.
## 2      1003  2001 Alabama    Baldwin              88101  10.6 125572.
## 3      1003  2002 Alabama    Baldwin              88101  10.4 129665.
## 4      1003  2003 Alabama    Baldwin              88101  12.2 132879.
## 5      1003  2004 Alabama    Baldwin              88101  11.3 140003.
## 6      1003  2005 Alabama    Baldwin              88101  11.7 158390.

Decile Price AQI:

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Creating decile groups for housing prices
annual_epa <- annual_epa %>%
  mutate(Price_Decile = ntile(Price, 10))  # ntile creates decile groups

# Group by decile and summarize to calculate mean Price and AQI
result_decile <- annual_epa %>%
  group_by(Price_Decile) %>%
  summarise(
    Mean_Housing_Price = mean(Price, na.rm = TRUE),
    Mean_AQI = mean(AQI, na.rm = TRUE),
    .groups = 'drop'
  )

# Print the result
print(result_decile)
## # A tibble: 10 × 3
##    Price_Decile Mean_Housing_Price Mean_AQI
##           <int>              <dbl>    <dbl>
##  1            1             78888.    10.9 
##  2            2            104566.    10.4 
##  3            3            122469.    10.3 
##  4            4            140695.     9.61
##  5            5            160367.     9.11
##  6            6            183272.     8.88
##  7            7            211495.     8.54
##  8            8            252544.     8.41
##  9            9            324135.     8.23
## 10           10            607500.     7.94

Descriptive Analysis:

descriptive_stats <- annual_epa %>%
  summarise(
    N_Obs = n(),
    Mean_Price = mean(Price, na.rm = TRUE),
    Median_Price = median(Price, na.rm = TRUE),
    SD_Price = sd(Price, na.rm = TRUE),
    Min_Price = min(Price, na.rm = TRUE),
    Max_Price = max(Price, na.rm = TRUE),
    
    Mean_AQI = mean(AQI, na.rm = TRUE),
    Median_AQI = median(AQI, na.rm = TRUE),
    SD_AQI = sd(AQI, na.rm = TRUE),
    Min_AQI = min(AQI, na.rm = TRUE),
    Max_AQI = max(AQI, na.rm = TRUE)
  )

# Print the result
print(descriptive_stats)
## # A tibble: 1 × 11
##   N_Obs Mean_Price Median_Price SD_Price Min_Price Max_Price Mean_AQI Median_AQI
##   <int>      <dbl>        <dbl>    <dbl>     <dbl>     <dbl>    <dbl>      <dbl>
## 1  9693    218557.      171140.  180530.    38622.  4536232.     9.23       8.87
## # ℹ 3 more variables: SD_AQI <dbl>, Min_AQI <dbl>, Max_AQI <dbl>

Scatter Plot

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
ggplot(annual_epa, aes(x = Price)) +
  geom_histogram(binwidth = 10000, fill = "blue", color = "black") +
  labs(title = "Distribution of House Prices", x = "House Price", y = "Frequency") +
  theme_minimal()

Histogram of House Prices

ggplot(annual_epa, aes(x = AQI)) +
  geom_histogram(binwidth = 1, fill = "green", color = "black") +
  labs(title = "Distribution of Air Quality Index (AQI)", x = "AQI", y = "Frequency") +
  theme_minimal()

Basic Regression

# Convert Price to logPrice by taking the natural log
annual_epa$logPrice <- log(annual_epa$Price)

# Run a linear regression model (Price ~ AQI)
linear_model <- lm(logPrice ~ AQI, data = annual_epa)

# Print summary of the regression model
summary(linear_model)
## 
## Call:
## lm(formula = logPrice ~ AQI, data = annual_epa)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.52271 -0.36251 -0.06753  0.29824  2.80130 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12.644627   0.017564  719.90   <2e-16 ***
## AQI         -0.057842   0.001809  -31.97   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5359 on 9691 degrees of freedom
## Multiple R-squared:  0.09539,    Adjusted R-squared:  0.0953 
## F-statistic:  1022 on 1 and 9691 DF,  p-value: < 2.2e-16
# Plot the relationship between log(Price) and AQI
ggplot(annual_epa, aes(x = AQI, y = logPrice)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Scatter Plot of AQI vs log(House Price)", 
       x = "Air Quality Index (AQI)", 
       y = "Log of House Price") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Panel Data Model Regression:

# Load the plm library for panel data models
library(plm)
## Warning: package 'plm' was built under R version 4.3.3
## 
## Attaching package: 'plm'
## The following objects are masked from 'package:dplyr':
## 
##     between, lag, lead
# Convert your data into panel data format (assuming 'county_id' and 'year' are your panel identifiers)
annual_epa_panel <- pdata.frame(annual_epa, index = c("county_id", "year"))

# Fixed Effects model
fe_model <- plm(logPrice ~ AQI, data = annual_epa_panel, model = "within")

# Summary of the Fixed Effects model
summary(fe_model)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = logPrice ~ AQI, data = annual_epa_panel, model = "within")
## 
## Unbalanced Panel: n = 623, T = 1-22, N = 9693
## 
## Residuals:
##      Min.   1st Qu.    Median   3rd Qu.      Max. 
## -0.856095 -0.129507 -0.028973  0.109137  1.008673 
## 
## Coefficients:
##       Estimate Std. Error t-value  Pr(>|t|)    
## AQI -0.0442041  0.0011412 -38.734 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    524.96
## Residual Sum of Squares: 450.44
## R-Squared:      0.14195
## Adj. R-Squared: 0.083008
## F-statistic: 1500.34 on 1 and 9069 DF, p-value: < 2.22e-16
# Random Effects model
re_model <- plm(logPrice ~ AQI, data = annual_epa_panel, model = "random")

# Summary of the Random Effects model
summary(re_model)
## Oneway (individual) effect Random Effect Model 
##    (Swamy-Arora's transformation)
## 
## Call:
## plm(formula = logPrice ~ AQI, data = annual_epa_panel, model = "random")
## 
## Unbalanced Panel: n = 623, T = 1-22, N = 9693
## 
## Effects:
##                   var std.dev share
## idiosyncratic 0.04967 0.22286 0.173
## individual    0.23704 0.48687 0.827
## theta:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.5838  0.8826  0.9029  0.8885  0.9029  0.9029 
## 
## Residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.87005 -0.14043 -0.03321  0.00349  0.11532  1.05209 
## 
## Coefficients:
##               Estimate Std. Error z-value  Pr(>|z|)    
## (Intercept) 12.4687472  0.0221948 561.788 < 2.2e-16 ***
## AQI         -0.0445894  0.0011295 -39.477 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    1547.4
## Residual Sum of Squares: 480.41
## R-Squared:      0.69076
## Adj. R-Squared: 0.69073
## Chisq: 1558.44 on 1 DF, p-value: < 2.22e-16
# Perform a Hausman test to decide between Fixed Effects and Random Effects
hausman_test <- phtest(fe_model, re_model)
print(hausman_test)
## 
##  Hausman Test
## 
## data:  logPrice ~ AQI
## chisq = 5.5801, df = 1, p-value = 0.01817
## alternative hypothesis: one model is inconsistent

Multicollinearity test

# Fit the model using lm for VIF calculation (note: this is not a panel model)
lm_model <- lm(logPrice ~ AQI + factor(year), data = annual_epa)

# Load the 'car' package
library(car)
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Calculate VIF
vif(lm_model)
##                  GVIF Df GVIF^(1/(2*Df))
## AQI          1.521626  1        1.233542
## factor(year) 1.521626 21        1.010045

#AQI:GVIF = 1.52: This means the variance of the AQI coefficient is inflated by a factor of 1.52 due to multicollinearity.GVIF^(1/(2*Df)) = 1.23: The adjusted GVIF (which accounts for the degrees of freedom) for AQI is 1.23, well below the commonly used threshold of 5 or 10, indicating that multicollinearity is not an issue for AQI.

#factor(year):GVIF = 1.52: This is the GVIF for the whole set of year dummy variables.GVIF^(1/(2*Df)) = 1.01: The adjusted GVIF is 1.01, which is also very low. This suggests that the year dummies do not have a multicollinearity problem.

Fixed effect regression panel data

# Fixed Effects model with time fixed effects (year as a factor)
fe_model_time <- plm(logPrice ~ AQI + factor(year), data = annual_epa_panel, model = "within")

# Summary of the Fixed Effects model with time fixed effects
summary(fe_model_time)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = logPrice ~ AQI + factor(year), data = annual_epa_panel, 
##     model = "within")
## 
## Unbalanced Panel: n = 623, T = 1-22, N = 9693
## 
## Residuals:
##        Min.     1st Qu.      Median     3rd Qu.        Max. 
## -0.60034736 -0.06144546  0.00053231  0.06416185  0.53252710 
## 
## Coefficients:
##                    Estimate Std. Error t-value  Pr(>|t|)    
## AQI              0.01192716 0.00089154  13.378 < 2.2e-16 ***
## factor(year)2001 0.07303782 0.00888972   8.216 2.394e-16 ***
## factor(year)2002 0.14023106 0.00880895  15.919 < 2.2e-16 ***
## factor(year)2003 0.21372149 0.00881826  24.236 < 2.2e-16 ***
## factor(year)2004 0.30230618 0.00891358  33.915 < 2.2e-16 ***
## factor(year)2005 0.38603078 0.00875606  44.087 < 2.2e-16 ***
## factor(year)2006 0.47678842 0.00893824  53.343 < 2.2e-16 ***
## factor(year)2007 0.48497565 0.00883097  54.918 < 2.2e-16 ***
## factor(year)2008 0.45270969 0.00900126  50.294 < 2.2e-16 ***
## factor(year)2009 0.38056192 0.00905017  42.050 < 2.2e-16 ***
## factor(year)2010 0.34617479 0.00885431  39.097 < 2.2e-16 ***
## factor(year)2011 0.29876108 0.00890198  33.561 < 2.2e-16 ***
## factor(year)2012 0.29791759 0.00907274  32.837 < 2.2e-16 ***
## factor(year)2013 0.35035695 0.00913515  38.353 < 2.2e-16 ***
## factor(year)2014 0.40639827 0.00919717  44.187 < 2.2e-16 ***
## factor(year)2015 0.45370402 0.00927381  48.923 < 2.2e-16 ***
## factor(year)2016 0.51454088 0.00957220  53.754 < 2.2e-16 ***
## factor(year)2017 0.55774514 0.00938407  59.435 < 2.2e-16 ***
## factor(year)2018 0.61422650 0.00935056  65.689 < 2.2e-16 ***
## factor(year)2020 0.73139783 0.00940756  77.746 < 2.2e-16 ***
## factor(year)2021 0.86376751 0.00924302  93.451 < 2.2e-16 ***
## factor(year)2022 0.99281637 0.00958472 103.583 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    524.96
## Residual Sum of Squares: 114.27
## R-Squared:      0.78233
## Adj. R-Squared: 0.76684
## F-statistic: 1478.16 on 22 and 9048 DF, p-value: < 2.22e-16

Cluster standard errors by county

library(lmtest)
## Warning: package 'lmtest' was built under R version 4.3.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(sandwich)
## Warning: package 'sandwich' was built under R version 4.3.3
fe_model_clustered <- coeftest(fe_model_time, vcov = vcovHC(fe_model_time, type = "HC1", cluster = "group"))
print(fe_model_clustered)
## 
## t test of coefficients:
## 
##                   Estimate Std. Error t value  Pr(>|t|)    
## AQI              0.0119272  0.0018536  6.4346 1.301e-10 ***
## factor(year)2001 0.0730378  0.0032807 22.2632 < 2.2e-16 ***
## factor(year)2002 0.1402311  0.0049754 28.1848 < 2.2e-16 ***
## factor(year)2003 0.2137215  0.0069901 30.5748 < 2.2e-16 ***
## factor(year)2004 0.3023062  0.0091865 32.9078 < 2.2e-16 ***
## factor(year)2005 0.3860308  0.0123244 31.3225 < 2.2e-16 ***
## factor(year)2006 0.4767884  0.0139679 34.1345 < 2.2e-16 ***
## factor(year)2007 0.4849756  0.0131304 36.9354 < 2.2e-16 ***
## factor(year)2008 0.4527097  0.0113394 39.9235 < 2.2e-16 ***
## factor(year)2009 0.3805619  0.0113334 33.5788 < 2.2e-16 ***
## factor(year)2010 0.3461748  0.0113209 30.5784 < 2.2e-16 ***
## factor(year)2011 0.2987611  0.0115728 25.8157 < 2.2e-16 ***
## factor(year)2012 0.2979176  0.0118800 25.0773 < 2.2e-16 ***
## factor(year)2013 0.3503570  0.0117853 29.7284 < 2.2e-16 ***
## factor(year)2014 0.4063983  0.0121166 33.5406 < 2.2e-16 ***
## factor(year)2015 0.4537040  0.0125941 36.0252 < 2.2e-16 ***
## factor(year)2016 0.5145409  0.0134428 38.2763 < 2.2e-16 ***
## factor(year)2017 0.5577451  0.0132287 42.1616 < 2.2e-16 ***
## factor(year)2018 0.6142265  0.0134043 45.8230 < 2.2e-16 ***
## factor(year)2020 0.7313978  0.0133166 54.9238 < 2.2e-16 ***
## factor(year)2021 0.8637675  0.0138036 62.5755 < 2.2e-16 ***
## factor(year)2022 0.9928164  0.0150030 66.1747 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual Diagnostics

# Plot residuals
residuals_data <- data.frame(residuals = residuals(fe_model_time))
ggplot(residuals_data, aes(x = residuals)) +
  geom_histogram(binwidth = 0.05, fill = "blue", color = "black") +
  labs(title = "Residuals Distribution", x = "Residuals", y = "Frequency")

# Fixed Effect regression county plm:

# Fixed Effects model
fe_model_2 <- plm(logPrice ~ AQI, data = annual_epa_panel, model = "within")

# Summary of the Fixed Effects model
summary(fe_model_2)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = logPrice ~ AQI, data = annual_epa_panel, model = "within")
## 
## Unbalanced Panel: n = 623, T = 1-22, N = 9693
## 
## Residuals:
##      Min.   1st Qu.    Median   3rd Qu.      Max. 
## -0.856095 -0.129507 -0.028973  0.109137  1.008673 
## 
## Coefficients:
##       Estimate Std. Error t-value  Pr(>|t|)    
## AQI -0.0442041  0.0011412 -38.734 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    524.96
## Residual Sum of Squares: 450.44
## R-Squared:      0.14195
## Adj. R-Squared: 0.083008
## F-statistic: 1500.34 on 1 and 9069 DF, p-value: < 2.22e-16

Fixed Effect regression county, year fixed

library(lfe)
## Warning: package 'lfe' was built under R version 4.3.3
## Loading required package: Matrix
## 
## Attaching package: 'lfe'
## The following object is masked from 'package:lmtest':
## 
##     waldtest
## The following object is masked from 'package:plm':
## 
##     sargan
# Run a fixed-effects with AQI as the control variable and county and year fixed effects
model_fe_3 <- felm(logPrice ~ AQI | factor(county_id) + factor(year), data = annual_epa)

# Display the summary of the fixed-effects model
summary(model_fe_3)
## 
## Call:
##    felm(formula = logPrice ~ AQI | factor(county_id) + factor(year),      data = annual_epa) 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.60035 -0.06145  0.00053  0.06416  0.53253 
## 
## Coefficients:
##      Estimate Std. Error t value Pr(>|t|)    
## AQI 0.0119272  0.0008915   13.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1124 on 9048 degrees of freedom
## Multiple R-squared(full model): 0.9629   Adjusted R-squared: 0.9602 
## Multiple R-squared(proj model): 0.0194   Adjusted R-squared: -0.0504 
## F-statistic(full model):364.2 on 644 and 9048 DF, p-value: < 2.2e-16 
## F-statistic(proj model):   179 on 1 and 9048 DF, p-value: < 2.2e-16

Wooldridge Test for Serial Correlation in Panel Data

library(plm)
library(lmtest)

# Load the dataset and prepare as panel data (same as above)
panel_data <- pdata.frame(annual_epa, index = c("county_id", "year"))

# Wooldridge test for serial correlation in panel models
serial_test <- pbgtest(fe_model)

# Display the results of the Wooldridge test
print(serial_test)
## 
##  Breusch-Godfrey/Wooldridge test for serial correlation in panel models
## 
## data:  logPrice ~ AQI
## chisq = 3925.3, df = 1, p-value < 2.2e-16
## alternative hypothesis: serial correlation in idiosyncratic errors

Key Issues Identified:Serial correlation: Strong serial correlation within counties over time.Endogeneity: Evidence from the Hausman test that suggests the need to handle endogeneity.

Reverse causality : lagged variables

# Include lagged AQI in the model
lagged_model <- plm(logPrice ~ lag(AQI, 1) + factor(year), data = panel_data, model = "within")

# Summary of the lagged model
summary(lagged_model)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = logPrice ~ lag(AQI, 1) + factor(year), data = panel_data, 
##     model = "within")
## 
## Unbalanced Panel: n = 611, T = 1-20, N = 8508
## 
## Residuals:
##        Min.     1st Qu.      Median     3rd Qu.        Max. 
## -0.58686326 -0.05914333  0.00085594  0.06259766  0.55263071 
## 
## Coefficients:
##                    Estimate Std. Error t-value  Pr(>|t|)    
## lag(AQI, 1)      0.01216633 0.00095778 12.7027 < 2.2e-16 ***
## factor(year)2002 0.06445925 0.00885273  7.2813 3.626e-13 ***
## factor(year)2003 0.13780348 0.00881639 15.6304 < 2.2e-16 ***
## factor(year)2004 0.22938043 0.00890471 25.7594 < 2.2e-16 ***
## factor(year)2005 0.33417947 0.00896046 37.2949 < 2.2e-16 ***
## factor(year)2006 0.39641365 0.00882497 44.9195 < 2.2e-16 ***
## factor(year)2007 0.42120011 0.00893621 47.1341 < 2.2e-16 ***
## factor(year)2008 0.36974432 0.00886861 41.6914 < 2.2e-16 ***
## factor(year)2009 0.29904612 0.00903672 33.0923 < 2.2e-16 ***
## factor(year)2010 0.27831911 0.00913005 30.4838 < 2.2e-16 ***
## factor(year)2011 0.22726860 0.00896587 25.3482 < 2.2e-16 ***
## factor(year)2012 0.22143769 0.00900226 24.5980 < 2.2e-16 ***
## factor(year)2013 0.27807496 0.00918424 30.2774 < 2.2e-16 ***
## factor(year)2014 0.33345963 0.00925758 36.0202 < 2.2e-16 ***
## factor(year)2015 0.38228562 0.00932490 40.9962 < 2.2e-16 ***
## factor(year)2016 0.43620600 0.00942727 46.2707 < 2.2e-16 ***
## factor(year)2017 0.49336356 0.00976336 50.5321 < 2.2e-16 ***
## factor(year)2018 0.54572099 0.00955010 57.1430 < 2.2e-16 ***
## factor(year)2021 0.79974858 0.00956999 83.5684 < 2.2e-16 ***
## factor(year)2022 0.91501341 0.00939406 97.4034 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    424.18
## Residual Sum of Squares: 97.673
## R-Squared:      0.76974
## Adj. R-Squared: 0.75132
## F-statistic: 1316.6 on 20 and 7877 DF, p-value: < 2.22e-16

Final test to address serial correlation and endogeneity (IV required: future plan)

library(plm)
library(lmtest)

# Step 1: Convert the data into a panel format
panel_data <- pdata.frame(annual_epa, index = c("county_id", "year"))

# Step 2: Run the fixed-effects model (within model)
fe_model <- plm(logPrice ~ AQI + factor(year), data = panel_data, model = "within")

# Step 3: Use clustered standard errors at the county level to account for serial correlation and heteroskedasticity
clustered_se <- coeftest(fe_model, vcov = vcovHC(fe_model, type = "HC1", cluster = "group"))

# Step 4: Driscoll-Kraay standard errors to account for serial correlation and cross-sectional dependence
driscoll_kraay_se <- coeftest(fe_model, vcov = vcovSCC(fe_model, type = "HC0"))

# Step 5: Print the results
print(clustered_se)  # Clustered standard errors
## 
## t test of coefficients:
## 
##                   Estimate Std. Error t value  Pr(>|t|)    
## AQI              0.0119272  0.0018536  6.4346 1.301e-10 ***
## factor(year)2001 0.0730378  0.0032807 22.2632 < 2.2e-16 ***
## factor(year)2002 0.1402311  0.0049754 28.1848 < 2.2e-16 ***
## factor(year)2003 0.2137215  0.0069901 30.5748 < 2.2e-16 ***
## factor(year)2004 0.3023062  0.0091865 32.9078 < 2.2e-16 ***
## factor(year)2005 0.3860308  0.0123244 31.3225 < 2.2e-16 ***
## factor(year)2006 0.4767884  0.0139679 34.1345 < 2.2e-16 ***
## factor(year)2007 0.4849756  0.0131304 36.9354 < 2.2e-16 ***
## factor(year)2008 0.4527097  0.0113394 39.9235 < 2.2e-16 ***
## factor(year)2009 0.3805619  0.0113334 33.5788 < 2.2e-16 ***
## factor(year)2010 0.3461748  0.0113209 30.5784 < 2.2e-16 ***
## factor(year)2011 0.2987611  0.0115728 25.8157 < 2.2e-16 ***
## factor(year)2012 0.2979176  0.0118800 25.0773 < 2.2e-16 ***
## factor(year)2013 0.3503570  0.0117853 29.7284 < 2.2e-16 ***
## factor(year)2014 0.4063983  0.0121166 33.5406 < 2.2e-16 ***
## factor(year)2015 0.4537040  0.0125941 36.0252 < 2.2e-16 ***
## factor(year)2016 0.5145409  0.0134428 38.2763 < 2.2e-16 ***
## factor(year)2017 0.5577451  0.0132287 42.1616 < 2.2e-16 ***
## factor(year)2018 0.6142265  0.0134043 45.8230 < 2.2e-16 ***
## factor(year)2020 0.7313978  0.0133166 54.9238 < 2.2e-16 ***
## factor(year)2021 0.8637675  0.0138036 62.5755 < 2.2e-16 ***
## factor(year)2022 0.9928164  0.0150030 66.1747 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print(driscoll_kraay_se)  # Driscoll-Kraay standard errors
## 
## t test of coefficients:
## 
##                   Estimate Std. Error  t value  Pr(>|t|)    
## AQI              0.0119272  0.0035935   3.3191 0.0009065 ***
## factor(year)2001 0.0730378  0.0022468  32.5070 < 2.2e-16 ***
## factor(year)2002 0.1402311  0.0034102  41.1205 < 2.2e-16 ***
## factor(year)2003 0.2137215  0.0051605  41.4152 < 2.2e-16 ***
## factor(year)2004 0.3023062  0.0068867  43.8972 < 2.2e-16 ***
## factor(year)2005 0.3860308  0.0033190 116.3096 < 2.2e-16 ***
## factor(year)2006 0.4767884  0.0074093  64.3503 < 2.2e-16 ***
## factor(year)2007 0.4849756  0.0067614  71.7268 < 2.2e-16 ***
## factor(year)2008 0.4527097  0.0106269  42.6003 < 2.2e-16 ***
## factor(year)2009 0.3805619  0.0148055  25.7041 < 2.2e-16 ***
## factor(year)2010 0.3461748  0.0143345  24.1498 < 2.2e-16 ***
## factor(year)2011 0.2987611  0.0145814  20.4892 < 2.2e-16 ***
## factor(year)2012 0.2979176  0.0168998  17.6284 < 2.2e-16 ***
## factor(year)2013 0.3503570  0.0176089  19.8966 < 2.2e-16 ***
## factor(year)2014 0.4063983  0.0186783  21.7578 < 2.2e-16 ***
## factor(year)2015 0.4537040  0.0194300  23.3507 < 2.2e-16 ***
## factor(year)2016 0.5145409  0.0220888  23.2942 < 2.2e-16 ***
## factor(year)2017 0.5577451  0.0205097  27.1942 < 2.2e-16 ***
## factor(year)2018 0.6142265  0.0201591  30.4689 < 2.2e-16 ***
## factor(year)2020 0.7313978  0.0204342  35.7928 < 2.2e-16 ***
## factor(year)2021 0.8637675  0.0189896  45.4864 < 2.2e-16 ***
## factor(year)2022 0.9928164  0.0218290  45.4815 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

`

Load data with uneployment and GDP 2017 to 2022

library(readr)  # For reading CSV files

# Updated file path to the data
file_path <- "C:/Users/mferdo2/OneDrive - EQS/Higher study USA/Finance_PhD/First_year Paper/GDP/epa_gdp_ue_2017_2022_cleaned.csv"

# Load the dataset
epa_gdp_ue_2017_2022 <- read_csv(file_path)
## Rows: 2531 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): state_name, county_name
## dbl (12): county_id, year, parameter_code, AQI, Price, Civilian_labor_force,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the first few rows of the data to understand its structure
head(epa_gdp_ue_2017_2022)
## # A tibble: 6 × 14
##   county_id  year state_name county_name parameter_code   AQI   Price
##       <dbl> <dbl> <chr>      <chr>                <dbl> <dbl>   <dbl>
## 1      1003  2017 Alabama    Baldwin              88101  7.42 195815.
## 2      1003  2018 Alabama    Baldwin              88101  7.13 207072.
## 3      1003  2020 Alabama    Baldwin              88101  7.93 235690.
## 4      1003  2021 Alabama    Baldwin              88101  7.28 276350.
## 5      1003  2022 Alabama    Baldwin              88101  7.33 319093.
## 6      1033  2017 Alabama    Colbert              88101  7.43 116845.
## # ℹ 7 more variables: Civilian_labor_force <dbl>, Employed <dbl>,
## #   Unemployed <dbl>, Unemployment_rate <dbl>, indexes_real_GDP <dbl>,
## #   Current_GDP <dbl>, Real_GDP <dbl>

Run the fixed effect regression Model

# Convert Price to logPrice by taking the natural log
epa_gdp_ue_2017_2022$logPrice <- log(epa_gdp_ue_2017_2022$Price)

# Create the em_rate variable from Unemployment_rate
epa_gdp_ue_2017_2022$em_rate <- (100 - epa_gdp_ue_2017_2022$Unemployment_rate)

# Load the necessary library for fixed effect model
library(plm)

# Convert the data to panel format
panel_data <- pdata.frame(epa_gdp_ue_2017_2022, index = c("county_id", "year"))

# Run the fixed-effects regression model
fe_model <- plm(logPrice ~ AQI + Current_GDP + em_rate, 
                data = panel_data, 
                model = "within")

# Display the summary of the fixed effect regression model
summary(fe_model)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = logPrice ~ AQI + Current_GDP + em_rate, data = panel_data, 
##     model = "within")
## 
## Unbalanced Panel: n = 532, T = 1-5, N = 2531
## 
## Residuals:
##      Min.   1st Qu.    Median   3rd Qu.      Max. 
## -0.663760 -0.132164  0.003953  0.119551  0.588217 
## 
## Coefficients:
##                Estimate  Std. Error t-value  Pr(>|t|)    
## AQI         -1.3939e-02  3.7531e-03 -3.7140 0.0002096 ***
## Current_GDP  7.2676e-09  4.5658e-10 15.9175 < 2.2e-16 ***
## em_rate      9.3703e-03  2.1684e-03  4.3213 1.628e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    68.645
## Residual Sum of Squares: 59.457
## R-Squared:      0.13384
## Adj. R-Squared: -0.097888
## F-statistic: 102.808 on 3 and 1996 DF, p-value: < 2.22e-16

Fixed effect using lfe

# Load the necessary library for fixed effect model
library(lfe)

# Run the fixed-effects regression model with county and year fixed effects
fe_model_2 <- felm(logPrice ~ AQI + Current_GDP + em_rate | factor(county_id) + factor(year), 
                 data = epa_gdp_ue_2017_2022)

# Display the summary of the fixed effect regression model
summary(fe_model_2)
## 
## Call:
##    felm(formula = logPrice ~ AQI + Current_GDP + em_rate | factor(county_id) +      factor(year), data = epa_gdp_ue_2017_2022) 
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.279526 -0.026393 -0.000168  0.025327  0.263303 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## AQI         -4.080e-03  1.344e-03  -3.036  0.00243 **
## Current_GDP -2.903e-10  1.702e-10  -1.706  0.08811 . 
## em_rate      3.661e-03  1.687e-03   2.170  0.03014 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05966 on 1992 degrees of freedom
## Multiple R-squared(full model): 0.9914   Adjusted R-squared: 0.989 
## Multiple R-squared(proj model): 0.008239   Adjusted R-squared: -0.2596 
## F-statistic(full model):425.1 on 538 and 1992 DF, p-value: < 2.2e-16 
## F-statistic(proj model): 5.516 on 3 and 1992 DF, p-value: 0.0009021

Fixed effect 2017-2022 AQI only

# Load the necessary library for fixed effect model
library(lfe)

# Run the fixed-effects regression model with only AQI as control and county and year fixed effects
fe_model_aqi <- felm(logPrice ~ AQI | factor(county_id) + factor(year), 
                 data = epa_gdp_ue_2017_2022)

# Display the summary of the fixed effect regression model
summary(fe_model_aqi)
## 
## Call:
##    felm(formula = logPrice ~ AQI | factor(county_id) + factor(year),      data = epa_gdp_ue_2017_2022) 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.27449 -0.02616  0.00000  0.02514  0.26181 
## 
## Coefficients:
##      Estimate Std. Error t value Pr(>|t|)   
## AQI -0.003920   0.001345  -2.915  0.00359 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05975 on 1994 degrees of freedom
## Multiple R-squared(full model): 0.9913   Adjusted R-squared: 0.989 
## Multiple R-squared(proj model): 0.004244   Adjusted R-squared: -0.2634 
## F-statistic(full model):425.4 on 536 and 1994 DF, p-value: < 2.2e-16 
## F-statistic(proj model):   8.5 on 1 and 1994 DF, p-value: 0.003592

Simple regression AQI only

# Run a linear regression where the dependent variable is logPrice
# Control variables are AQI and year
simple_model <- lm(logPrice ~ AQI + year, data = epa_gdp_ue_2017_2022)

# Display the summary of the regression model
summary(simple_model)
## 
## Call:
## lm(formula = logPrice ~ AQI + year, data = epa_gdp_ue_2017_2022)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.32482 -0.36515 -0.05931  0.32941  2.58361 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.542e+02  1.180e+01 -13.067  < 2e-16 ***
## AQI         -3.102e-02  5.390e-03  -5.755 9.68e-09 ***
## year         8.259e-02  5.842e-03  14.138  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5448 on 2528 degrees of freedom
## Multiple R-squared:  0.08615,    Adjusted R-squared:  0.08543 
## F-statistic: 119.2 on 2 and 2528 DF,  p-value: < 2.2e-16
# Optional: Plot the regression results
plot(simple_model)

Panel data regression using plm AQI only:

# Run the fixed effects model again using plm for the Hausman test
fe_model_plm <- plm(logPrice ~ AQI, data = panel_data, model = "within")

# Display the summary of the regression model
summary(fe_model_plm)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = logPrice ~ AQI, data = panel_data, model = "within")
## 
## Unbalanced Panel: n = 532, T = 1-5, N = 2531
## 
## Residuals:
##      Min.   1st Qu.    Median   3rd Qu.      Max. 
## -0.423932 -0.136412 -0.020688  0.133603  0.455898 
## 
## Coefficients:
##       Estimate Std. Error t-value  Pr(>|t|)    
## AQI -0.0195378  0.0039897 -4.8971 1.051e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    68.645
## Residual Sum of Squares: 67.831
## R-Squared:      0.01186
## Adj. R-Squared: -0.25125
## F-statistic: 23.9812 on 1 and 1998 DF, p-value: 1.0509e-06

Random effect AQI only

library(plm)

# Convert the data to panel format for the random effects model
panel_data <- pdata.frame(epa_gdp_ue_2017_2022, index = c("county_id", "year"))

# Run the random effects model using plm
re_model_plm <- plm(logPrice ~ AQI, data = panel_data, model = "random")

summary(re_model_plm)
## Oneway (individual) effect Random Effect Model 
##    (Swamy-Arora's transformation)
## 
## Call:
## plm(formula = logPrice ~ AQI, data = panel_data, model = "random")
## 
## Unbalanced Panel: n = 532, T = 1-5, N = 2531
## 
## Effects:
##                   var std.dev share
## idiosyncratic 0.03395 0.18425 0.106
## individual    0.28695 0.53568 0.894
## theta:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.6747  0.8480  0.8480  0.8452  0.8480  0.8480 
## 
## Residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.44877 -0.14736 -0.01785  0.00115  0.13889  0.78209 
## 
## Coefficients:
##               Estimate Std. Error z-value  Pr(>|z|)    
## (Intercept) 12.5216013  0.0373006 335.695 < 2.2e-16 ***
## AQI         -0.0210374  0.0038201  -5.507  3.65e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Total Sum of Squares:    169.94
## Residual Sum of Squares: 85.855
## R-Squared:      0.49672
## Adj. R-Squared: 0.49652
## Chisq: 30.327 on 1 DF, p-value: 3.6501e-08

Test for endogeneity

# Run the fixed effects model again using plm for the Hausman test
fe_model_plm <- plm(logPrice ~ AQI, data = panel_data, model = "within")

# Perform Hausman test to check for endogeneity
hausman_test <- phtest(fe_model_plm, re_model_plm)

# Print the Hausman test result
print(hausman_test)
## 
##  Hausman Test
## 
## data:  logPrice ~ AQI
## chisq = 1.6981, df = 1, p-value = 0.1925
## alternative hypothesis: one model is inconsistent

Results:

Chi-square (chisq) = 1.6981

Degrees of freedom (df) = 1

p-value = 0.1925

Interpretation:

#The p-value of 0.1925 is greater than the typical significance level (e.g., 0.05). # This means we fail to reject the null hypothesis, which states that the random effects model is consistent and there is no endogeneity issue.

Robustness Check

install.packages("lmtest")
## Warning: package 'lmtest' is in use and will not be installed
library(plm)
library(lmtest)



# Run your fixed-effects model
fe_model <- plm(logPrice ~ AQI + Current_GDP + em_rate, 
                data = panel_data, 
                model = "within")

# Apply robust standard errors
robust_se <- coeftest(fe_model, vcov = vcovHC(fe_model, method = "arellano"))

# Print the results
print(robust_se)
## 
## t test of coefficients:
## 
##                Estimate  Std. Error t value  Pr(>|t|)    
## AQI         -1.3939e-02  3.7809e-03 -3.6867 0.0002332 ***
## Current_GDP  7.2676e-09  1.4966e-09  4.8561 1.290e-06 ***
## em_rate      9.3703e-03  1.3353e-03  7.0174 3.085e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1