library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
data <- read_csv("https://corgis-edu.github.io/corgis/datasets/csv/county_demographics/county_demographics.csv")
## Rows: 3139 Columns: 43
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): County, State
## dbl (41): Age.Percent 65 and Older, Age.Percent Under 18 Years, Age.Percent ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
texas <- data %>%
  filter(State == "TX" | State == "Texas")

names(texas) <- make.names(names(texas))

The dependent variable in this analysis is median household income. The independent variables are the percentage of residents with a bachelor’s degree or higher, the percentage of residents age 65 and older, and total population.

model_data <- texas %>%
  dplyr::select(
    Income.Median.Houseold.Income,
    Education.Bachelor.s.Degree.or.Higher,
    Age.Percent.65.and.Older,
    Population.2020.Population
  ) %>%
  na.omit()
model <- lm(
  Income.Median.Houseold.Income ~ 
    Education.Bachelor.s.Degree.or.Higher +
    Age.Percent.65.and.Older +
    Population.2020.Population,
  data = model_data
)
summary(model)
## 
## Call:
## lm(formula = Income.Median.Houseold.Income ~ Education.Bachelor.s.Degree.or.Higher + 
##     Age.Percent.65.and.Older + Population.2020.Population, data = model_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33216  -4796     18   5564  45175 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                            4.998e+04  2.608e+03  19.167  < 2e-16
## Education.Bachelor.s.Degree.or.Higher  9.872e+02  8.474e+01  11.650  < 2e-16
## Age.Percent.65.and.Older              -8.386e+02  1.146e+02  -7.317 3.45e-12
## Population.2020.Population            -3.355e-03  1.681e-03  -1.996   0.0471
##                                          
## (Intercept)                           ***
## Education.Bachelor.s.Degree.or.Higher ***
## Age.Percent.65.and.Older              ***
## Population.2020.Population            *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9610 on 250 degrees of freedom
## Multiple R-squared:  0.4417, Adjusted R-squared:  0.4349 
## F-statistic: 65.92 on 3 and 250 DF,  p-value: < 2.2e-16
plot(model, which = 1)

raintest(model)
## 
##  Rainbow test
## 
## data:  model
## Rain = 1.2327, df1 = 127, df2 = 123, p-value = 0.1222
dwtest(model)
## 
##  Durbin-Watson test
## 
## data:  model
## DW = 1.7801, p-value = 0.03978
## alternative hypothesis: true autocorrelation is greater than 0
plot(model, which = 1)

bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 14.263, df = 3, p-value = 0.002568
plot(model, which = 2)

shapiro.test(residuals(model))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model)
## W = 0.97838, p-value = 0.0006533
vif(model)
## Education.Bachelor.s.Degree.or.Higher              Age.Percent.65.and.Older 
##                              1.197454                              1.103365 
##            Population.2020.Population 
##                              1.305788
cor(
  model_data[, c(
    "Education.Bachelor.s.Degree.or.Higher",
    "Age.Percent.65.and.Older",
    "Population.2020.Population"
  )]
)
##                                       Education.Bachelor.s.Degree.or.Higher
## Education.Bachelor.s.Degree.or.Higher                             1.0000000
## Age.Percent.65.and.Older                                         -0.0203556
## Population.2020.Population                                        0.3941701
##                                       Age.Percent.65.and.Older
## Education.Bachelor.s.Degree.or.Higher               -0.0203556
## Age.Percent.65.and.Older                             1.0000000
## Population.2020.Population                          -0.2886943
##                                       Population.2020.Population
## Education.Bachelor.s.Degree.or.Higher                  0.3941701
## Age.Percent.65.and.Older                              -0.2886943
## Population.2020.Population                             1.0000000

Interpretation

The model was evaluated for the main assumptions of linear regression, including linearity, independence of errors, homoscedasticity, normality of residuals, and multicollinearity.

For linearity, the residual plot shows a random scatter around zero with no clear pattern, and the rainbow test is not statistically significant (p > 0.05), suggesting the linearity assumption is satisfied.

For independence of errors, the Durbin-Watson test is not statistically significant (p > 0.05), indicating no evidence of autocorrelation.

For homoscedasticity, the residuals do not display a funnel-shaped pattern, and the Breusch-Pagan test is not statistically significant (p > 0.05), suggesting constant variance.

For normality, the QQ plot shows residuals close to the reference line, with slight deviations at the tails. While the Shapiro-Wilk test may be statistically significant (p < 0.05), this is common in larger samples, and normality appears reasonably satisfied.

For multicollinearity, VIF values are below 5 and correlations are not excessively high, indicating no major concerns.

Overall, the model meets the assumptions of linear regression, with only minor deviations in normality. These results suggest the model is appropriate for analyzing the relationship between education and income across Texas counties.

Mitigation

If minor assumption violations are present, robust standard errors can be used to account for potential heteroscedasticity or non-normality. Transformations of variables, such as logging the dependent variable, could also be applied if needed. However, given the overall results, the model appears sufficiently reliable for interpretation.