library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)

# Read the district-level data set (path is relative to the working directory).
dist_data <- read.csv("district_data.csv")

# Count missing values in each of the three variables used by the models below.
sum(is.na(dist_data[["DA0GR21N"]]))
## [1] 126
sum(is.na(dist_data[["DPETALLC"]]))
## [1] 0
sum(is.na(dist_data[["DPFRAALLT"]]))
## [1] 5

# Complete-case subset: keep a row only when the response and both predictors
# are all observed, so every model is fitted on the same rows.
dist_clean <- filter(
  dist_data,
  !is.na(DA0GR21N),
  !is.na(DPETALLC),
  !is.na(DPFRAALLT)
)
nrow(dist_clean)
## [1] 1080
# Baseline linear model: DA0GR21N regressed on DPETALLC and DPFRAALLT, fitted
# on the complete-case rows in dist_clean.
model <- lm(DA0GR21N ~ DPETALLC + DPFRAALLT, data = dist_clean)
summary(model)
## 
## Call:
## lm(formula = DA0GR21N ~ DPETALLC + DPFRAALLT, data = dist_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3128.47   -15.09    -6.88     4.46  1093.88 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.512e+00  5.203e+00   1.828   0.0678 .  
## DPETALLC     7.974e-02  3.171e-03  25.151  < 2e-16 ***
## DPFRAALLT   -1.101e-06  2.403e-07  -4.581 5.16e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 159.7 on 1077 degrees of freedom
## Multiple R-squared:  0.9665, Adjusted R-squared:  0.9664 
## F-statistic: 1.553e+04 on 2 and 1077 DF,  p-value: < 2.2e-16

# Residuals-vs-fitted plot for checking linearity and constant variance.
# The summary above shows residuals ranging from about -3128 to 1094 while the
# middle 50% lie between roughly -15 and 4.5, so a few rows dominate the spread.
# Use the fitted()/residuals() accessors (as the Q-Q plot later in this script
# does) and label the axes explicitly instead of relying on deparsed `model$...`
# expressions as axis titles.
plot(
  fitted(model), residuals(model),
  xlab = "Fitted values",
  ylab = "Residuals",
  main = "Residuals vs fitted: DA0GR21N ~ DPETALLC + DPFRAALLT"
)
# Reference line at zero residual.
abline(h = 0, col = "red")

library(lmtest)
## Warning: package 'lmtest' was built under R version 4.4.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.4.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Durbin-Watson test for first-order autocorrelation in the residuals of
# `model` (one-sided: autocorrelation greater than 0). At the 5% level the
# result below rejects the null of no positive autocorrelation.
# NOTE(review): the DW statistic depends on the row order of dist_clean; it is
# only interpretable if the rows have a meaningful ordering -- confirm.
dwtest(model)
## 
##  Durbin-Watson test
## 
## data:  model
## DW = 1.8606, p-value = 0.0106
## alternative hypothesis: true autocorrelation is greater than 0

# Studentized Breusch-Pagan test for heteroskedasticity in `model`.
# lmtest is already attached earlier in the script, so it is not loaded again.
# The very small p-value below is strong evidence of non-constant residual
# variance, so the standard errors reported by summary(model) are suspect.
bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 123.02, df = 2, p-value < 2.2e-16
# Alternative specification: same predictors, but the response is the natural
# log of DA0GR21N (predictors are left on their original scale).
# Compared with the untransformed fit, the output below shows R-squared falling
# to 0.4273 and DPFRAALLT no longer significant at the 5% level (p = 0.102).
model_log <- lm(log(DA0GR21N) ~ DPETALLC + DPFRAALLT, data = dist_clean)
summary(model_log)
## 
## Call:
## lm(formula = log(DA0GR21N) ~ DPETALLC + DPFRAALLT, data = dist_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.3505 -0.7478  0.0662  0.8708  2.7207 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.051e+00  3.812e-02 106.265  < 2e-16 ***
## DPETALLC     1.146e-04  2.323e-05   4.934 9.32e-07 ***
## DPFRAALLT   -2.881e-09  1.761e-09  -1.637    0.102    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.17 on 1077 degrees of freedom
## Multiple R-squared:  0.4273, Adjusted R-squared:  0.4262 
## F-statistic: 401.7 on 2 and 1077 DF,  p-value: < 2.2e-16

# Normal Q-Q plot of the log-model residuals, with a blue reference line, to
# assess how far the residual distribution departs from normality.
model_log %>%
  resid() %>%
  qqnorm()
model_log %>%
  resid() %>%
  qqline(col = "blue")

library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
# Variance inflation factors for the two predictors. VIF depends only on the
# predictors, so the same values apply to `model` and `model_log`.
# With exactly two predictors VIF = 1 / (1 - r^2), so a value of about 73
# implies r^2 of roughly 0.986 between DPETALLC and DPFRAALLT -- far above the
# commonly cited warning thresholds of 5-10, i.e. severe multicollinearity.
car::vif(model_log)
##  DPETALLC DPFRAALLT 
##  73.14313  73.14313