library(tidyverse)
library(readxl)
library(lmtest)
library(car)
library(janitor)
data_path <- "C:/Users/miche/OneDrive/Desktop/My Class Stuff/Wednesday Class/Data Diabetes/Diabetes_Data_Clean.xlsx"
diab_raw <- read_excel(data_path)
diab <- diab_raw %>%
rename(Diagnosed = Diagnosed) %>%
drop_na(Diagnosed, SNAP)
glimpse(diab)
## Rows: 371
## Columns: 3
## $ Tract <chr> "48029171929", "48029171928", "48029171927", "48029191819", …
## $ Diagnosed <dbl> 19.4, 13.0, 12.7, 11.4, 13.2, 19.9, 11.7, 9.8, 14.2, 10.3, 1…
## $ SNAP <dbl> 28.3, 6.7, 20.7, 7.9, 5.9, 27.0, 13.6, 8.5, 21.2, 19.5, 9.1,…
summary(diab)
## Tract Diagnosed SNAP
## Length:371 Min. : 2.50 Min. : 2.00
## Class :character 1st Qu.:12.70 1st Qu.: 9.90
## Mode :character Median :15.30 Median :16.70
## Mean :16.46 Mean :19.71
## 3rd Qu.:20.15 3rd Qu.:27.45
## Max. :29.50 Max. :64.70
diab_raw <- readxl::read_excel(data_path)
names(diab_raw)
## [1] "Tract" "Diagnosed" "SNAP"
names(diab)
## [1] "Tract" "Diagnosed" "SNAP"
glimpse(diab)
## Rows: 371
## Columns: 3
## $ Tract <chr> "48029171929", "48029171928", "48029171927", "48029191819", …
## $ Diagnosed <dbl> 19.4, 13.0, 12.7, 11.4, 13.2, 19.9, 11.7, 9.8, 14.2, 10.3, 1…
## $ SNAP <dbl> 28.3, 6.7, 20.7, 7.9, 5.9, 27.0, 13.6, 8.5, 21.2, 19.5, 9.1,…
summary(diab)
## Tract Diagnosed SNAP
## Length:371 Min. : 2.50 Min. : 2.00
## Class :character 1st Qu.:12.70 1st Qu.: 9.90
## Mode :character Median :15.30 Median :16.70
## Mean :16.46 Mean :19.71
## 3rd Qu.:20.15 3rd Qu.:27.45
## Max. :29.50 Max. :64.70
Linear Model: SNAP as Outcome, Diagnosed as Predictor
lm_snap_diag <- lm(SNAP ~ Diagnosed, data = diab)
summary(lm_snap_diag)
##
## Call:
## lm(formula = SNAP ~ Diagnosed, data = diab)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.559 -4.971 -1.291 3.817 26.134
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -15.47762 1.23531 -12.53 <2e-16 ***
## Diagnosed 2.13765 0.07201 29.69 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.702 on 369 degrees of freedom
## Multiple R-squared: 0.7049, Adjusted R-squared: 0.7041
## F-statistic: 881.3 on 1 and 369 DF, p-value: < 2.2e-16
This regression predicts SNAP from diabetes prevalence
(Diagnosed).
A positive, significant coefficient suggests areas with higher diabetes
rates also have a possible higher SNAP usage.
ggplot(diab, aes(x = Diagnosed, y = SNAP)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "SNAP vs. Diagnosed Diabetes",
x = "% Adults Diagnosed with Diabetes",
y = "% Adults Using SNAP")
raintest(lm_snap_diag)
##
## Rainbow test
##
## data: lm_snap_diag
## Rain = 0.94571, df1 = 186, df2 = 183, p-value = 0.6477
durbinWatsonTest(lm_snap_diag)
## lag Autocorrelation D-W Statistic p-value
## 1 0.2530956 1.476619 0
## Alternative hypothesis: rho != 0
plot(lm_snap_diag, which = 1) # Residuals vs Fitted
bptest(lm_snap_diag)
##
## studentized Breusch-Pagan test
##
## data: lm_snap_diag
## BP = 0.067772, df = 1, p-value = 0.7946
plot(lm_snap_diag, which = 2)
shapiro.test(residuals(lm_snap_diag))
##
## Shapiro-Wilk normality test
##
## data: residuals(lm_snap_diag)
## W = 0.95432, p-value = 2.557e-09
No Multicollinearity (VIF and Correlation) – Only two predictors in this case so not able to show.
length(coef(lm_snap_diag))
## [1] 2
if (length(coef(lm_snap_diag)) > 2) {
car::vif(lm_snap_diag)
} else {
cat
}
## function (..., file = "", sep = " ", fill = FALSE, labels = NULL,
## append = FALSE)
## {
## if (is.character(file))
## if (file == "")
## file <- stdout()
## else if (startsWith(file, "|")) {
## file <- pipe(substring(file, 2L), "w")
## on.exit(close(file))
## }
## else {
## file <- file(file, ifelse(append, "a", "w"))
## on.exit(close(file))
## }
## .Internal(cat(list(...), file, sep, fill, labels, append))
## }
## <bytecode: 0x0000019bbe9ba318>
## <environment: namespace:base>
cor(diab$SNAP, diab$Diagnosed, use = "complete.obs")
## [1] 0.8395643
Model meets all except Normality
The normality of residuals assumption is violated (non-normal error distribution).
diab <- diab %>%
mutate(log_snap = log(SNAP))
lm_log_snap_diag <- lm(log_snap ~ Diagnosed, data = diab)
summary(lm_log_snap_diag)
##
## Call:
## lm(formula = log_snap ~ Diagnosed, data = diab)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.36173 -0.19743 -0.00496 0.25595 1.53934
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.967807 0.074467 13.0 <2e-16 ***
## Diagnosed 0.109805 0.004341 25.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.404 on 369 degrees of freedom
## Multiple R-squared: 0.6343, Adjusted R-squared: 0.6333
## F-statistic: 639.9 on 1 and 369 DF, p-value: < 2.2e-16
plot(lm_log_snap_diag, which = 2)
shapiro.test(residuals(lm_log_snap_diag))
##
## Shapiro-Wilk normality test
##
## data: residuals(lm_log_snap_diag)
## W = 0.98223, p-value = 0.0001558
The log transformation made the data more normally distributed, which increases the reliability of the results. In both versions, there is a strong positive relationship between diabetes prevalence and SNAP participation areas with higher diabetes rates tend to have more SNAP users. After applying the log transformation, the model meets statistical assumptions, helping confirm the results are more accurate and trustworthy. Even though linear regression can handle some non-normality, transforming the data strengthens the overall model fit.