library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Download the CORGIS county demographics table straight from its public URL.
# readr guesses the column types and prints the column specification below.
data <- read_csv(
  file = "https://corgis-edu.github.io/corgis/datasets/csv/county_demographics/county_demographics.csv"
)
## Rows: 3139 Columns: 43
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): County, State
## dbl (41): Age.Percent 65 and Older, Age.Percent Under 18 Years, Age.Percent ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the Texas rows. The State column is matched against both the
# postal abbreviation and the full state name, so either spelling is accepted.
texas <- filter(data, State %in% c("TX", "Texas"))

# Rewrite the column names as syntactically valid R names (spaces and
# apostrophes become dots) so they can be used unquoted in select() and in
# model formulas.
texas <- setNames(texas, make.names(names(texas)))

Variable Selection

The dependent variable in this analysis is median household income. The independent variables are percent of residents with a bachelor’s degree or higher, percent of residents age 65 and older, and total population.

# Build the modelling table: the response (median household income) followed
# by the three predictors, then drop every row with a missing value in any of
# those four columns so lm() is fit on complete cases only.
# NOTE: "Houseold" is the spelling used by the column name itself; do not
# "correct" it here or select() will fail to find the column.
model_data <- na.omit(
  select(
    texas,
    Income.Median.Houseold.Income,
    Education.Bachelor.s.Degree.or.Higher,
    Age.Percent.65.and.Older,
    Population.2020.Population
  )
)

Linear Model

# Ordinary least squares fit of median household income on educational
# attainment, share of residents age 65 and older, and total population.
model <- lm(
  formula = Income.Median.Houseold.Income ~
    Education.Bachelor.s.Degree.or.Higher +
    Age.Percent.65.and.Older +
    Population.2020.Population,
  data = model_data
)

Model Summary

# Print the coefficient table (estimates, standard errors, t statistics and
# p-values) together with the residual standard error, R-squared and overall
# F-test for the fitted model.
summary(model)
## 
## Call:
## lm(formula = Income.Median.Houseold.Income ~ Education.Bachelor.s.Degree.or.Higher + 
##     Age.Percent.65.and.Older + Population.2020.Population, data = model_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33216  -4796     18   5564  45175 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                            4.998e+04  2.608e+03  19.167  < 2e-16
## Education.Bachelor.s.Degree.or.Higher  9.872e+02  8.474e+01  11.650  < 2e-16
## Age.Percent.65.and.Older              -8.386e+02  1.146e+02  -7.317 3.45e-12
## Population.2020.Population            -3.355e-03  1.681e-03  -1.996   0.0471
##                                          
## (Intercept)                           ***
## Education.Bachelor.s.Degree.or.Higher ***
## Age.Percent.65.and.Older              ***
## Population.2020.Population            *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9610 on 250 degrees of freedom
## Multiple R-squared:  0.4417, Adjusted R-squared:  0.4349 
## F-statistic: 65.92 on 3 and 250 DF,  p-value: < 2.2e-16

Linearity Check

# which = 1 selects the first lm diagnostic plot, residuals versus fitted
# values, which is the one used to judge the linearity assumption.
plot(model, which = 1)

Interpretation

The dependent variable in this analysis is median household income, and the independent variables are educational attainment, percent of the population age 65 and older, and total population. A linear regression model was estimated to examine how these variables relate to income across Texas counties.

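The model’s R-squared indicates how much of the variation in median household income is explained by the independent variables together. A higher R-squared means the model explains more of the variation in the dependent variable. Here the R-squared is 0.4417 (adjusted R-squared 0.4349), so the three independent variables together explain about 44% of the variation in median household income across Texas counties.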

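The p-values in the model summary indicate which independent variables are statistically significant predictors of median household income. Variables with p-values less than 0.05 are considered statistically significant, while variables with p-values greater than 0.05 are not statistically significant. All three predictors in this model fall below that threshold: educational attainment (p < 2e-16) and percent of the population age 65 and older (p = 3.45e-12) are highly significant, while total population (p = 0.0471) is only marginally significant.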

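The coefficient estimates show how median household income is associated with each independent variable, holding the other variables constant. A positive coefficient means the variable is associated with higher income, while a negative coefficient means it is associated with lower income. In this model, each additional percentage point of residents with a bachelor’s degree or higher is associated with about $987 more in median household income, each additional percentage point of residents age 65 and older is associated with about $839 less, and each additional 1,000 residents is associated with about $3.4 less.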

The residuals-versus-fitted plot is used to assess the assumption of linearity. If the residuals are randomly scattered around zero with no clear pattern, the linearity assumption is reasonably satisfied. If the residuals instead follow a curve or another systematic pattern across the fitted values, this suggests the model may violate the assumption of linearity.