This analysis examines the relationship between educational attainment and median household income across Texas counties using a linear regression model.

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
data <- read_csv("https://corgis-edu.github.io/corgis/datasets/csv/county_demographics/county_demographics.csv")
## Rows: 3139 Columns: 43
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): County, State
## dbl (41): Age.Percent 65 and Older, Age.Percent Under 18 Years, Age.Percent ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
texas <- data %>%
  filter(State == "TX" | State == "Texas")

names(texas) <- make.names(names(texas))

model_data <- texas %>%
  dplyr::select(
    Income.Median.Houseold.Income,
    Education.Bachelor.s.Degree.or.Higher,
    Age.Percent.65.and.Older,
    Population.2020.Population
  ) %>%
  na.omit()

summary(model_data)
##  Income.Median.Houseold.Income Education.Bachelor.s.Degree.or.Higher
##  Min.   : 25098                Min.   : 0.00                        
##  1st Qu.: 44590                1st Qu.:14.22                        
##  Median : 51371                Median :17.30                        
##  Mean   : 52848                Mean   :19.06                        
##  3rd Qu.: 58747                3rd Qu.:21.80                        
##  Max.   :100920                Max.   :52.30                        
##  Age.Percent.65.and.Older Population.2020.Population
##  Min.   : 8.90            Min.   :     64           
##  1st Qu.:14.62            1st Qu.:   6264           
##  Median :17.65            Median :  18484           
##  Mean   :18.55            Mean   : 114746           
##  3rd Qu.:22.40            3rd Qu.:  51848           
##  Max.   :37.30            Max.   :4731145
model <- lm(
  Income.Median.Houseold.Income ~ 
    Education.Bachelor.s.Degree.or.Higher +
    Age.Percent.65.and.Older +
    Population.2020.Population,
  data = model_data
)

summary(model)
## 
## Call:
## lm(formula = Income.Median.Houseold.Income ~ Education.Bachelor.s.Degree.or.Higher + 
##     Age.Percent.65.and.Older + Population.2020.Population, data = model_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33216  -4796     18   5564  45175 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                            4.998e+04  2.608e+03  19.167  < 2e-16
## Education.Bachelor.s.Degree.or.Higher  9.872e+02  8.474e+01  11.650  < 2e-16
## Age.Percent.65.and.Older              -8.386e+02  1.146e+02  -7.317 3.45e-12
## Population.2020.Population            -3.355e-03  1.681e-03  -1.996   0.0471
##                                          
## (Intercept)                           ***
## Education.Bachelor.s.Degree.or.Higher ***
## Age.Percent.65.and.Older              ***
## Population.2020.Population            *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9610 on 250 degrees of freedom
## Multiple R-squared:  0.4417, Adjusted R-squared:  0.4349 
## F-statistic: 65.92 on 3 and 250 DF,  p-value: < 2.2e-16
# Linearity
plot(model, which = 1)

raintest(model)
## 
##  Rainbow test
## 
## data:  model
## Rain = 1.2327, df1 = 127, df2 = 123, p-value = 0.1222
# Independence of errors
dwtest(model)
## 
##  Durbin-Watson test
## 
## data:  model
## DW = 1.7801, p-value = 0.03978
## alternative hypothesis: true autocorrelation is greater than 0
# Homoscedasticity
bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 14.263, df = 3, p-value = 0.002568
# Normality of residuals
plot(model, which = 2)

shapiro.test(residuals(model))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model)
## W = 0.97838, p-value = 0.0006533
# Multicollinearity
vif(model)
## Education.Bachelor.s.Degree.or.Higher              Age.Percent.65.and.Older 
##                              1.197454                              1.103365 
##            Population.2020.Population 
##                              1.305788
cor(
  model_data[, c(
    "Education.Bachelor.s.Degree.or.Higher",
    "Age.Percent.65.and.Older",
    "Population.2020.Population"
  )]
)
##                                       Education.Bachelor.s.Degree.or.Higher
## Education.Bachelor.s.Degree.or.Higher                             1.0000000
## Age.Percent.65.and.Older                                         -0.0203556
## Population.2020.Population                                        0.3941701
##                                       Age.Percent.65.and.Older
## Education.Bachelor.s.Degree.or.Higher               -0.0203556
## Age.Percent.65.and.Older                             1.0000000
## Population.2020.Population                          -0.2886943
##                                       Population.2020.Population
## Education.Bachelor.s.Degree.or.Higher                  0.3941701
## Age.Percent.65.and.Older                              -0.2886943
## Population.2020.Population                             1.0000000