This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
rm(list = ls())
# Load necessary libraries
library(readr) # For reading CSV files
## Warning: package 'readr' was built under R version 4.3.3
# File path to the data (update this if necessary)
# File path to the data
file_path <- "C:/Users/mferdo2/OneDrive - EQS/Higher study USA/Finance_PhD/First_year Paper/GDP/annual_epa.csv"
# Load the dataset
annual_epa <- read_csv(file_path)
## Rows: 9693 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): state_name, county_name
## dbl (5): county_id, year, parameter_code, AQI, Price
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the first few rows of the data to understand its structure
head(annual_epa)
## # A tibble: 6 × 7
## county_id year state_name county_name parameter_code AQI Price
## <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1003 2000 Alabama Baldwin 88101 14.5 118891.
## 2 1003 2001 Alabama Baldwin 88101 10.6 125572.
## 3 1003 2002 Alabama Baldwin 88101 10.4 129665.
## 4 1003 2003 Alabama Baldwin 88101 12.2 132879.
## 5 1003 2004 Alabama Baldwin 88101 11.3 140003.
## 6 1003 2005 Alabama Baldwin 88101 11.7 158390.
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Creating decile groups for housing prices
annual_epa <- annual_epa %>%
mutate(Price_Decile = ntile(Price, 10)) # ntile creates decile groups
# Group by decile and summarize to calculate mean Price and AQI
result_decile <- annual_epa %>%
group_by(Price_Decile) %>%
summarise(
Mean_Housing_Price = mean(Price, na.rm = TRUE),
Mean_AQI = mean(AQI, na.rm = TRUE),
.groups = 'drop'
)
# Print the result
print(result_decile)
## # A tibble: 10 × 3
## Price_Decile Mean_Housing_Price Mean_AQI
## <int> <dbl> <dbl>
## 1 1 78888. 10.9
## 2 2 104566. 10.4
## 3 3 122469. 10.3
## 4 4 140695. 9.61
## 5 5 160367. 9.11
## 6 6 183272. 8.88
## 7 7 211495. 8.54
## 8 8 252544. 8.41
## 9 9 324135. 8.23
## 10 10 607500. 7.94
descriptive_stats <- annual_epa %>%
summarise(
N_Obs = n(),
Mean_Price = mean(Price, na.rm = TRUE),
Median_Price = median(Price, na.rm = TRUE),
SD_Price = sd(Price, na.rm = TRUE),
Min_Price = min(Price, na.rm = TRUE),
Max_Price = max(Price, na.rm = TRUE),
Mean_AQI = mean(AQI, na.rm = TRUE),
Median_AQI = median(AQI, na.rm = TRUE),
SD_AQI = sd(AQI, na.rm = TRUE),
Min_AQI = min(AQI, na.rm = TRUE),
Max_AQI = max(AQI, na.rm = TRUE)
)
# Print the result
print(descriptive_stats)
## # A tibble: 1 × 11
## N_Obs Mean_Price Median_Price SD_Price Min_Price Max_Price Mean_AQI Median_AQI
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 9693 218557. 171140. 180530. 38622. 4536232. 9.23 8.87
## # ℹ 3 more variables: SD_AQI <dbl>, Min_AQI <dbl>, Max_AQI <dbl>
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
ggplot(annual_epa, aes(x = Price)) +
geom_histogram(binwidth = 10000, fill = "blue", color = "black") +
labs(title = "Distribution of House Prices", x = "House Price", y = "Frequency") +
theme_minimal()
ggplot(annual_epa, aes(x = AQI)) +
geom_histogram(binwidth = 1, fill = "green", color = "black") +
labs(title = "Distribution of Air Quality Index (AQI)", x = "AQI", y = "Frequency") +
theme_minimal()
# Convert Price to logPrice by taking the natural log
annual_epa$logPrice <- log(annual_epa$Price)
# Run a linear regression model (Price ~ AQI)
linear_model <- lm(logPrice ~ AQI, data = annual_epa)
# Print summary of the regression model
summary(linear_model)
##
## Call:
## lm(formula = logPrice ~ AQI, data = annual_epa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.52271 -0.36251 -0.06753 0.29824 2.80130
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.644627 0.017564 719.90 <2e-16 ***
## AQI -0.057842 0.001809 -31.97 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5359 on 9691 degrees of freedom
## Multiple R-squared: 0.09539, Adjusted R-squared: 0.0953
## F-statistic: 1022 on 1 and 9691 DF, p-value: < 2.2e-16
# Plot the relationship between log(Price) and AQI
ggplot(annual_epa, aes(x = AQI, y = logPrice)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Scatter Plot of AQI vs log(House Price)",
x = "Air Quality Index (AQI)",
y = "Log of House Price") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Load the plm library for panel data models
library(plm)
## Warning: package 'plm' was built under R version 4.3.3
##
## Attaching package: 'plm'
## The following objects are masked from 'package:dplyr':
##
## between, lag, lead
# Convert your data into panel data format (assuming 'county_id' and 'year' are your panel identifiers)
annual_epa_panel <- pdata.frame(annual_epa, index = c("county_id", "year"))
# Fixed Effects model
fe_model <- plm(logPrice ~ AQI, data = annual_epa_panel, model = "within")
# Summary of the Fixed Effects model
summary(fe_model)
## Oneway (individual) effect Within Model
##
## Call:
## plm(formula = logPrice ~ AQI, data = annual_epa_panel, model = "within")
##
## Unbalanced Panel: n = 623, T = 1-22, N = 9693
##
## Residuals:
## Min. 1st Qu. Median 3rd Qu. Max.
## -0.856095 -0.129507 -0.028973 0.109137 1.008673
##
## Coefficients:
## Estimate Std. Error t-value Pr(>|t|)
## AQI -0.0442041 0.0011412 -38.734 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 524.96
## Residual Sum of Squares: 450.44
## R-Squared: 0.14195
## Adj. R-Squared: 0.083008
## F-statistic: 1500.34 on 1 and 9069 DF, p-value: < 2.22e-16
# Random Effects model
re_model <- plm(logPrice ~ AQI, data = annual_epa_panel, model = "random")
# Summary of the Random Effects model
summary(re_model)
## Oneway (individual) effect Random Effect Model
## (Swamy-Arora's transformation)
##
## Call:
## plm(formula = logPrice ~ AQI, data = annual_epa_panel, model = "random")
##
## Unbalanced Panel: n = 623, T = 1-22, N = 9693
##
## Effects:
## var std.dev share
## idiosyncratic 0.04967 0.22286 0.173
## individual 0.23704 0.48687 0.827
## theta:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.5838 0.8826 0.9029 0.8885 0.9029 0.9029
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.87005 -0.14043 -0.03321 0.00349 0.11532 1.05209
##
## Coefficients:
## Estimate Std. Error z-value Pr(>|z|)
## (Intercept) 12.4687472 0.0221948 561.788 < 2.2e-16 ***
## AQI -0.0445894 0.0011295 -39.477 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 1547.4
## Residual Sum of Squares: 480.41
## R-Squared: 0.69076
## Adj. R-Squared: 0.69073
## Chisq: 1558.44 on 1 DF, p-value: < 2.22e-16
# Perform a Hausman test to decide between Fixed Effects and Random Effects
hausman_test <- phtest(fe_model, re_model)
print(hausman_test)
##
## Hausman Test
##
## data: logPrice ~ AQI
## chisq = 5.5801, df = 1, p-value = 0.01817
## alternative hypothesis: one model is inconsistent
# Fit the model using lm for VIF calculation (note: this is not a panel model)
lm_model <- lm(logPrice ~ AQI + factor(year), data = annual_epa)
# Load the 'car' package
library(car)
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Calculate VIF
vif(lm_model)
## GVIF Df GVIF^(1/(2*Df))
## AQI 1.521626 1 1.233542
## factor(year) 1.521626 21 1.010045
#AQI:GVIF = 1.52: This means the variance of the AQI coefficient is inflated by a factor of 1.52 due to multicollinearity.GVIF^(1/(2*Df)) = 1.23: The adjusted GVIF (which accounts for the degrees of freedom) for AQI is 1.23, well below the commonly used threshold of 5 or 10, indicating that multicollinearity is not an issue for AQI.
#factor(year):GVIF = 1.52: This is the GVIF for the whole set of year dummy variables.GVIF^(1/(2*Df)) = 1.01: The adjusted GVIF is 1.01, which is also very low. This suggests that the year dummies do not have a multicollinearity problem.
# Fixed Effects model with time fixed effects (year as a factor)
fe_model_time <- plm(logPrice ~ AQI + factor(year), data = annual_epa_panel, model = "within")
# Summary of the Fixed Effects model with time fixed effects
summary(fe_model_time)
## Oneway (individual) effect Within Model
##
## Call:
## plm(formula = logPrice ~ AQI + factor(year), data = annual_epa_panel,
## model = "within")
##
## Unbalanced Panel: n = 623, T = 1-22, N = 9693
##
## Residuals:
## Min. 1st Qu. Median 3rd Qu. Max.
## -0.60034736 -0.06144546 0.00053231 0.06416185 0.53252710
##
## Coefficients:
## Estimate Std. Error t-value Pr(>|t|)
## AQI 0.01192716 0.00089154 13.378 < 2.2e-16 ***
## factor(year)2001 0.07303782 0.00888972 8.216 2.394e-16 ***
## factor(year)2002 0.14023106 0.00880895 15.919 < 2.2e-16 ***
## factor(year)2003 0.21372149 0.00881826 24.236 < 2.2e-16 ***
## factor(year)2004 0.30230618 0.00891358 33.915 < 2.2e-16 ***
## factor(year)2005 0.38603078 0.00875606 44.087 < 2.2e-16 ***
## factor(year)2006 0.47678842 0.00893824 53.343 < 2.2e-16 ***
## factor(year)2007 0.48497565 0.00883097 54.918 < 2.2e-16 ***
## factor(year)2008 0.45270969 0.00900126 50.294 < 2.2e-16 ***
## factor(year)2009 0.38056192 0.00905017 42.050 < 2.2e-16 ***
## factor(year)2010 0.34617479 0.00885431 39.097 < 2.2e-16 ***
## factor(year)2011 0.29876108 0.00890198 33.561 < 2.2e-16 ***
## factor(year)2012 0.29791759 0.00907274 32.837 < 2.2e-16 ***
## factor(year)2013 0.35035695 0.00913515 38.353 < 2.2e-16 ***
## factor(year)2014 0.40639827 0.00919717 44.187 < 2.2e-16 ***
## factor(year)2015 0.45370402 0.00927381 48.923 < 2.2e-16 ***
## factor(year)2016 0.51454088 0.00957220 53.754 < 2.2e-16 ***
## factor(year)2017 0.55774514 0.00938407 59.435 < 2.2e-16 ***
## factor(year)2018 0.61422650 0.00935056 65.689 < 2.2e-16 ***
## factor(year)2020 0.73139783 0.00940756 77.746 < 2.2e-16 ***
## factor(year)2021 0.86376751 0.00924302 93.451 < 2.2e-16 ***
## factor(year)2022 0.99281637 0.00958472 103.583 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 524.96
## Residual Sum of Squares: 114.27
## R-Squared: 0.78233
## Adj. R-Squared: 0.76684
## F-statistic: 1478.16 on 22 and 9048 DF, p-value: < 2.22e-16
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.3.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(sandwich)
## Warning: package 'sandwich' was built under R version 4.3.3
fe_model_clustered <- coeftest(fe_model_time, vcov = vcovHC(fe_model_time, type = "HC1", cluster = "group"))
print(fe_model_clustered)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## AQI 0.0119272 0.0018536 6.4346 1.301e-10 ***
## factor(year)2001 0.0730378 0.0032807 22.2632 < 2.2e-16 ***
## factor(year)2002 0.1402311 0.0049754 28.1848 < 2.2e-16 ***
## factor(year)2003 0.2137215 0.0069901 30.5748 < 2.2e-16 ***
## factor(year)2004 0.3023062 0.0091865 32.9078 < 2.2e-16 ***
## factor(year)2005 0.3860308 0.0123244 31.3225 < 2.2e-16 ***
## factor(year)2006 0.4767884 0.0139679 34.1345 < 2.2e-16 ***
## factor(year)2007 0.4849756 0.0131304 36.9354 < 2.2e-16 ***
## factor(year)2008 0.4527097 0.0113394 39.9235 < 2.2e-16 ***
## factor(year)2009 0.3805619 0.0113334 33.5788 < 2.2e-16 ***
## factor(year)2010 0.3461748 0.0113209 30.5784 < 2.2e-16 ***
## factor(year)2011 0.2987611 0.0115728 25.8157 < 2.2e-16 ***
## factor(year)2012 0.2979176 0.0118800 25.0773 < 2.2e-16 ***
## factor(year)2013 0.3503570 0.0117853 29.7284 < 2.2e-16 ***
## factor(year)2014 0.4063983 0.0121166 33.5406 < 2.2e-16 ***
## factor(year)2015 0.4537040 0.0125941 36.0252 < 2.2e-16 ***
## factor(year)2016 0.5145409 0.0134428 38.2763 < 2.2e-16 ***
## factor(year)2017 0.5577451 0.0132287 42.1616 < 2.2e-16 ***
## factor(year)2018 0.6142265 0.0134043 45.8230 < 2.2e-16 ***
## factor(year)2020 0.7313978 0.0133166 54.9238 < 2.2e-16 ***
## factor(year)2021 0.8637675 0.0138036 62.5755 < 2.2e-16 ***
## factor(year)2022 0.9928164 0.0150030 66.1747 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Plot residuals
residuals_data <- data.frame(residuals = residuals(fe_model_time))
ggplot(residuals_data, aes(x = residuals)) +
geom_histogram(binwidth = 0.05, fill = "blue", color = "black") +
labs(title = "Residuals Distribution", x = "Residuals", y = "Frequency")
# Fixed Effect regression county plm:
# Fixed Effects model
fe_model_2 <- plm(logPrice ~ AQI, data = annual_epa_panel, model = "within")
# Summary of the Fixed Effects model
summary(fe_model_2)
## Oneway (individual) effect Within Model
##
## Call:
## plm(formula = logPrice ~ AQI, data = annual_epa_panel, model = "within")
##
## Unbalanced Panel: n = 623, T = 1-22, N = 9693
##
## Residuals:
## Min. 1st Qu. Median 3rd Qu. Max.
## -0.856095 -0.129507 -0.028973 0.109137 1.008673
##
## Coefficients:
## Estimate Std. Error t-value Pr(>|t|)
## AQI -0.0442041 0.0011412 -38.734 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 524.96
## Residual Sum of Squares: 450.44
## R-Squared: 0.14195
## Adj. R-Squared: 0.083008
## F-statistic: 1500.34 on 1 and 9069 DF, p-value: < 2.22e-16
library(lfe)
## Warning: package 'lfe' was built under R version 4.3.3
## Loading required package: Matrix
##
## Attaching package: 'lfe'
## The following object is masked from 'package:lmtest':
##
## waldtest
## The following object is masked from 'package:plm':
##
## sargan
# Run a fixed-effects with AQI as the control variable and county and year fixed effects
model_fe_3 <- felm(logPrice ~ AQI | factor(county_id) + factor(year), data = annual_epa)
# Display the summary of the fixed-effects model
summary(model_fe_3)
##
## Call:
## felm(formula = logPrice ~ AQI | factor(county_id) + factor(year), data = annual_epa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.60035 -0.06145 0.00053 0.06416 0.53253
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## AQI 0.0119272 0.0008915 13.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1124 on 9048 degrees of freedom
## Multiple R-squared(full model): 0.9629 Adjusted R-squared: 0.9602
## Multiple R-squared(proj model): 0.0194 Adjusted R-squared: -0.0504
## F-statistic(full model):364.2 on 644 and 9048 DF, p-value: < 2.2e-16
## F-statistic(proj model): 179 on 1 and 9048 DF, p-value: < 2.2e-16
library(plm)
library(lmtest)
# Load the dataset and prepare as panel data (same as above)
panel_data <- pdata.frame(annual_epa, index = c("county_id", "year"))
# Wooldridge test for serial correlation in panel models
serial_test <- pbgtest(fe_model)
# Display the results of the Wooldridge test
print(serial_test)
##
## Breusch-Godfrey/Wooldridge test for serial correlation in panel models
##
## data: logPrice ~ AQI
## chisq = 3925.3, df = 1, p-value < 2.2e-16
## alternative hypothesis: serial correlation in idiosyncratic errors
# Include lagged AQI in the model
lagged_model <- plm(logPrice ~ lag(AQI, 1) + factor(year), data = panel_data, model = "within")
# Summary of the lagged model
summary(lagged_model)
## Oneway (individual) effect Within Model
##
## Call:
## plm(formula = logPrice ~ lag(AQI, 1) + factor(year), data = panel_data,
## model = "within")
##
## Unbalanced Panel: n = 611, T = 1-20, N = 8508
##
## Residuals:
## Min. 1st Qu. Median 3rd Qu. Max.
## -0.58686326 -0.05914333 0.00085594 0.06259766 0.55263071
##
## Coefficients:
## Estimate Std. Error t-value Pr(>|t|)
## lag(AQI, 1) 0.01216633 0.00095778 12.7027 < 2.2e-16 ***
## factor(year)2002 0.06445925 0.00885273 7.2813 3.626e-13 ***
## factor(year)2003 0.13780348 0.00881639 15.6304 < 2.2e-16 ***
## factor(year)2004 0.22938043 0.00890471 25.7594 < 2.2e-16 ***
## factor(year)2005 0.33417947 0.00896046 37.2949 < 2.2e-16 ***
## factor(year)2006 0.39641365 0.00882497 44.9195 < 2.2e-16 ***
## factor(year)2007 0.42120011 0.00893621 47.1341 < 2.2e-16 ***
## factor(year)2008 0.36974432 0.00886861 41.6914 < 2.2e-16 ***
## factor(year)2009 0.29904612 0.00903672 33.0923 < 2.2e-16 ***
## factor(year)2010 0.27831911 0.00913005 30.4838 < 2.2e-16 ***
## factor(year)2011 0.22726860 0.00896587 25.3482 < 2.2e-16 ***
## factor(year)2012 0.22143769 0.00900226 24.5980 < 2.2e-16 ***
## factor(year)2013 0.27807496 0.00918424 30.2774 < 2.2e-16 ***
## factor(year)2014 0.33345963 0.00925758 36.0202 < 2.2e-16 ***
## factor(year)2015 0.38228562 0.00932490 40.9962 < 2.2e-16 ***
## factor(year)2016 0.43620600 0.00942727 46.2707 < 2.2e-16 ***
## factor(year)2017 0.49336356 0.00976336 50.5321 < 2.2e-16 ***
## factor(year)2018 0.54572099 0.00955010 57.1430 < 2.2e-16 ***
## factor(year)2021 0.79974858 0.00956999 83.5684 < 2.2e-16 ***
## factor(year)2022 0.91501341 0.00939406 97.4034 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 424.18
## Residual Sum of Squares: 97.673
## R-Squared: 0.76974
## Adj. R-Squared: 0.75132
## F-statistic: 1316.6 on 20 and 7877 DF, p-value: < 2.22e-16
library(plm)
library(lmtest)
# Step 1: Convert the data into a panel format
panel_data <- pdata.frame(annual_epa, index = c("county_id", "year"))
# Step 2: Run the fixed-effects model (within model)
fe_model <- plm(logPrice ~ AQI + factor(year), data = panel_data, model = "within")
# Step 3: Use clustered standard errors at the county level to account for serial correlation and heteroskedasticity
clustered_se <- coeftest(fe_model, vcov = vcovHC(fe_model, type = "HC1", cluster = "group"))
# Step 4: Driscoll-Kraay standard errors to account for serial correlation and cross-sectional dependence
driscoll_kraay_se <- coeftest(fe_model, vcov = vcovSCC(fe_model, type = "HC0"))
# Step 5: Print the results
print(clustered_se) # Clustered standard errors
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## AQI 0.0119272 0.0018536 6.4346 1.301e-10 ***
## factor(year)2001 0.0730378 0.0032807 22.2632 < 2.2e-16 ***
## factor(year)2002 0.1402311 0.0049754 28.1848 < 2.2e-16 ***
## factor(year)2003 0.2137215 0.0069901 30.5748 < 2.2e-16 ***
## factor(year)2004 0.3023062 0.0091865 32.9078 < 2.2e-16 ***
## factor(year)2005 0.3860308 0.0123244 31.3225 < 2.2e-16 ***
## factor(year)2006 0.4767884 0.0139679 34.1345 < 2.2e-16 ***
## factor(year)2007 0.4849756 0.0131304 36.9354 < 2.2e-16 ***
## factor(year)2008 0.4527097 0.0113394 39.9235 < 2.2e-16 ***
## factor(year)2009 0.3805619 0.0113334 33.5788 < 2.2e-16 ***
## factor(year)2010 0.3461748 0.0113209 30.5784 < 2.2e-16 ***
## factor(year)2011 0.2987611 0.0115728 25.8157 < 2.2e-16 ***
## factor(year)2012 0.2979176 0.0118800 25.0773 < 2.2e-16 ***
## factor(year)2013 0.3503570 0.0117853 29.7284 < 2.2e-16 ***
## factor(year)2014 0.4063983 0.0121166 33.5406 < 2.2e-16 ***
## factor(year)2015 0.4537040 0.0125941 36.0252 < 2.2e-16 ***
## factor(year)2016 0.5145409 0.0134428 38.2763 < 2.2e-16 ***
## factor(year)2017 0.5577451 0.0132287 42.1616 < 2.2e-16 ***
## factor(year)2018 0.6142265 0.0134043 45.8230 < 2.2e-16 ***
## factor(year)2020 0.7313978 0.0133166 54.9238 < 2.2e-16 ***
## factor(year)2021 0.8637675 0.0138036 62.5755 < 2.2e-16 ***
## factor(year)2022 0.9928164 0.0150030 66.1747 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print(driscoll_kraay_se) # Driscoll-Kraay standard errors
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## AQI 0.0119272 0.0035935 3.3191 0.0009065 ***
## factor(year)2001 0.0730378 0.0022468 32.5070 < 2.2e-16 ***
## factor(year)2002 0.1402311 0.0034102 41.1205 < 2.2e-16 ***
## factor(year)2003 0.2137215 0.0051605 41.4152 < 2.2e-16 ***
## factor(year)2004 0.3023062 0.0068867 43.8972 < 2.2e-16 ***
## factor(year)2005 0.3860308 0.0033190 116.3096 < 2.2e-16 ***
## factor(year)2006 0.4767884 0.0074093 64.3503 < 2.2e-16 ***
## factor(year)2007 0.4849756 0.0067614 71.7268 < 2.2e-16 ***
## factor(year)2008 0.4527097 0.0106269 42.6003 < 2.2e-16 ***
## factor(year)2009 0.3805619 0.0148055 25.7041 < 2.2e-16 ***
## factor(year)2010 0.3461748 0.0143345 24.1498 < 2.2e-16 ***
## factor(year)2011 0.2987611 0.0145814 20.4892 < 2.2e-16 ***
## factor(year)2012 0.2979176 0.0168998 17.6284 < 2.2e-16 ***
## factor(year)2013 0.3503570 0.0176089 19.8966 < 2.2e-16 ***
## factor(year)2014 0.4063983 0.0186783 21.7578 < 2.2e-16 ***
## factor(year)2015 0.4537040 0.0194300 23.3507 < 2.2e-16 ***
## factor(year)2016 0.5145409 0.0220888 23.2942 < 2.2e-16 ***
## factor(year)2017 0.5577451 0.0205097 27.1942 < 2.2e-16 ***
## factor(year)2018 0.6142265 0.0201591 30.4689 < 2.2e-16 ***
## factor(year)2020 0.7313978 0.0204342 35.7928 < 2.2e-16 ***
## factor(year)2021 0.8637675 0.0189896 45.4864 < 2.2e-16 ***
## factor(year)2022 0.9928164 0.0218290 45.4815 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
`
library(readr) # For reading CSV files
# Updated file path to the data
file_path <- "C:/Users/mferdo2/OneDrive - EQS/Higher study USA/Finance_PhD/First_year Paper/GDP/epa_gdp_ue_2017_2022_cleaned.csv"
# Load the dataset
epa_gdp_ue_2017_2022 <- read_csv(file_path)
## Rows: 2531 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): state_name, county_name
## dbl (12): county_id, year, parameter_code, AQI, Price, Civilian_labor_force,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the first few rows of the data to understand its structure
head(epa_gdp_ue_2017_2022)
## # A tibble: 6 × 14
## county_id year state_name county_name parameter_code AQI Price
## <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1003 2017 Alabama Baldwin 88101 7.42 195815.
## 2 1003 2018 Alabama Baldwin 88101 7.13 207072.
## 3 1003 2020 Alabama Baldwin 88101 7.93 235690.
## 4 1003 2021 Alabama Baldwin 88101 7.28 276350.
## 5 1003 2022 Alabama Baldwin 88101 7.33 319093.
## 6 1033 2017 Alabama Colbert 88101 7.43 116845.
## # ℹ 7 more variables: Civilian_labor_force <dbl>, Employed <dbl>,
## # Unemployed <dbl>, Unemployment_rate <dbl>, indexes_real_GDP <dbl>,
## # Current_GDP <dbl>, Real_GDP <dbl>
# Convert Price to logPrice by taking the natural log
epa_gdp_ue_2017_2022$logPrice <- log(epa_gdp_ue_2017_2022$Price)
# Create the em_rate variable from Unemployment_rate
epa_gdp_ue_2017_2022$em_rate <- (100 - epa_gdp_ue_2017_2022$Unemployment_rate)
# Load the necessary library for fixed effect model
library(plm)
# Convert the data to panel format
panel_data <- pdata.frame(epa_gdp_ue_2017_2022, index = c("county_id", "year"))
# Run the fixed-effects regression model
fe_model <- plm(logPrice ~ AQI + Current_GDP + em_rate,
data = panel_data,
model = "within")
# Display the summary of the fixed effect regression model
summary(fe_model)
## Oneway (individual) effect Within Model
##
## Call:
## plm(formula = logPrice ~ AQI + Current_GDP + em_rate, data = panel_data,
## model = "within")
##
## Unbalanced Panel: n = 532, T = 1-5, N = 2531
##
## Residuals:
## Min. 1st Qu. Median 3rd Qu. Max.
## -0.663760 -0.132164 0.003953 0.119551 0.588217
##
## Coefficients:
## Estimate Std. Error t-value Pr(>|t|)
## AQI -1.3939e-02 3.7531e-03 -3.7140 0.0002096 ***
## Current_GDP 7.2676e-09 4.5658e-10 15.9175 < 2.2e-16 ***
## em_rate 9.3703e-03 2.1684e-03 4.3213 1.628e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 68.645
## Residual Sum of Squares: 59.457
## R-Squared: 0.13384
## Adj. R-Squared: -0.097888
## F-statistic: 102.808 on 3 and 1996 DF, p-value: < 2.22e-16
# Load the necessary library for fixed effect model
library(lfe)
# Run the fixed-effects regression model with county and year fixed effects
fe_model_2 <- felm(logPrice ~ AQI + Current_GDP + em_rate | factor(county_id) + factor(year),
data = epa_gdp_ue_2017_2022)
# Display the summary of the fixed effect regression model
summary(fe_model_2)
##
## Call:
## felm(formula = logPrice ~ AQI + Current_GDP + em_rate | factor(county_id) + factor(year), data = epa_gdp_ue_2017_2022)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.279526 -0.026393 -0.000168 0.025327 0.263303
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## AQI -4.080e-03 1.344e-03 -3.036 0.00243 **
## Current_GDP -2.903e-10 1.702e-10 -1.706 0.08811 .
## em_rate 3.661e-03 1.687e-03 2.170 0.03014 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05966 on 1992 degrees of freedom
## Multiple R-squared(full model): 0.9914 Adjusted R-squared: 0.989
## Multiple R-squared(proj model): 0.008239 Adjusted R-squared: -0.2596
## F-statistic(full model):425.1 on 538 and 1992 DF, p-value: < 2.2e-16
## F-statistic(proj model): 5.516 on 3 and 1992 DF, p-value: 0.0009021
# Load the necessary library for fixed effect model
library(lfe)
# Run the fixed-effects regression model with only AQI as control and county and year fixed effects
fe_model_aqi <- felm(logPrice ~ AQI | factor(county_id) + factor(year),
data = epa_gdp_ue_2017_2022)
# Display the summary of the fixed effect regression model
summary(fe_model_aqi)
##
## Call:
## felm(formula = logPrice ~ AQI | factor(county_id) + factor(year), data = epa_gdp_ue_2017_2022)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.27449 -0.02616 0.00000 0.02514 0.26181
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## AQI -0.003920 0.001345 -2.915 0.00359 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05975 on 1994 degrees of freedom
## Multiple R-squared(full model): 0.9913 Adjusted R-squared: 0.989
## Multiple R-squared(proj model): 0.004244 Adjusted R-squared: -0.2634
## F-statistic(full model):425.4 on 536 and 1994 DF, p-value: < 2.2e-16
## F-statistic(proj model): 8.5 on 1 and 1994 DF, p-value: 0.003592
# Run a linear regression where the dependent variable is logPrice
# Control variables are AQI and year
simple_model <- lm(logPrice ~ AQI + year, data = epa_gdp_ue_2017_2022)
# Display the summary of the regression model
summary(simple_model)
##
## Call:
## lm(formula = logPrice ~ AQI + year, data = epa_gdp_ue_2017_2022)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.32482 -0.36515 -0.05931 0.32941 2.58361
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.542e+02 1.180e+01 -13.067 < 2e-16 ***
## AQI -3.102e-02 5.390e-03 -5.755 9.68e-09 ***
## year 8.259e-02 5.842e-03 14.138 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5448 on 2528 degrees of freedom
## Multiple R-squared: 0.08615, Adjusted R-squared: 0.08543
## F-statistic: 119.2 on 2 and 2528 DF, p-value: < 2.2e-16
# Optional: Plot the regression results
plot(simple_model)
# Run the fixed effects model again using plm for the Hausman test
fe_model_plm <- plm(logPrice ~ AQI, data = panel_data, model = "within")
# Display the summary of the regression model
summary(fe_model_plm)
## Oneway (individual) effect Within Model
##
## Call:
## plm(formula = logPrice ~ AQI, data = panel_data, model = "within")
##
## Unbalanced Panel: n = 532, T = 1-5, N = 2531
##
## Residuals:
## Min. 1st Qu. Median 3rd Qu. Max.
## -0.423932 -0.136412 -0.020688 0.133603 0.455898
##
## Coefficients:
## Estimate Std. Error t-value Pr(>|t|)
## AQI -0.0195378 0.0039897 -4.8971 1.051e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 68.645
## Residual Sum of Squares: 67.831
## R-Squared: 0.01186
## Adj. R-Squared: -0.25125
## F-statistic: 23.9812 on 1 and 1998 DF, p-value: 1.0509e-06
library(plm)
# Convert the data to panel format for the random effects model
panel_data <- pdata.frame(epa_gdp_ue_2017_2022, index = c("county_id", "year"))
# Run the random effects model using plm
re_model_plm <- plm(logPrice ~ AQI, data = panel_data, model = "random")
summary(re_model_plm)
## Oneway (individual) effect Random Effect Model
## (Swamy-Arora's transformation)
##
## Call:
## plm(formula = logPrice ~ AQI, data = panel_data, model = "random")
##
## Unbalanced Panel: n = 532, T = 1-5, N = 2531
##
## Effects:
## var std.dev share
## idiosyncratic 0.03395 0.18425 0.106
## individual 0.28695 0.53568 0.894
## theta:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.6747 0.8480 0.8480 0.8452 0.8480 0.8480
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.44877 -0.14736 -0.01785 0.00115 0.13889 0.78209
##
## Coefficients:
## Estimate Std. Error z-value Pr(>|z|)
## (Intercept) 12.5216013 0.0373006 335.695 < 2.2e-16 ***
## AQI -0.0210374 0.0038201 -5.507 3.65e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Total Sum of Squares: 169.94
## Residual Sum of Squares: 85.855
## R-Squared: 0.49672
## Adj. R-Squared: 0.49652
## Chisq: 30.327 on 1 DF, p-value: 3.6501e-08
# Run the fixed effects model again using plm for the Hausman test
fe_model_plm <- plm(logPrice ~ AQI, data = panel_data, model = "within")
# Perform Hausman test to check for endogeneity
hausman_test <- phtest(fe_model_plm, re_model_plm)
# Print the Hausman test result
print(hausman_test)
##
## Hausman Test
##
## data: logPrice ~ AQI
## chisq = 1.6981, df = 1, p-value = 0.1925
## alternative hypothesis: one model is inconsistent
#The p-value of 0.1925 is greater than the typical significance level (e.g., 0.05). # This means we fail to reject the null hypothesis, which states that the random effects model is consistent and there is no endogeneity issue.
install.packages("lmtest")
## Warning: package 'lmtest' is in use and will not be installed
library(plm)
library(lmtest)
# Run your fixed-effects model
fe_model <- plm(logPrice ~ AQI + Current_GDP + em_rate,
data = panel_data,
model = "within")
# Apply robust standard errors
robust_se <- coeftest(fe_model, vcov = vcovHC(fe_model, method = "arellano"))
# Print the results
print(robust_se)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## AQI -1.3939e-02 3.7809e-03 -3.6867 0.0002332 ***
## Current_GDP 7.2676e-09 1.4966e-09 4.8561 1.290e-06 ***
## em_rate 9.3703e-03 1.3353e-03 7.0174 3.085e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1