library(tidyverse)
library(readxl)
library(lmtest)
library(car)      
library(janitor) 
data_path <- "C:/Users/miche/OneDrive/Desktop/My Class Stuff/Wednesday Class/Data Diabetes/Diabetes_Data_Clean.xlsx"

diab_raw <- read_excel(data_path)

diab <- diab_raw %>%
  rename(Diagnosed = Diagnosed) %>%
  drop_na(Diagnosed, SNAP)


glimpse(diab)
## Rows: 371
## Columns: 3
## $ Tract     <chr> "48029171929", "48029171928", "48029171927", "48029191819", …
## $ Diagnosed <dbl> 19.4, 13.0, 12.7, 11.4, 13.2, 19.9, 11.7, 9.8, 14.2, 10.3, 1…
## $ SNAP      <dbl> 28.3, 6.7, 20.7, 7.9, 5.9, 27.0, 13.6, 8.5, 21.2, 19.5, 9.1,…
summary(diab)
##     Tract             Diagnosed          SNAP      
##  Length:371         Min.   : 2.50   Min.   : 2.00  
##  Class :character   1st Qu.:12.70   1st Qu.: 9.90  
##  Mode  :character   Median :15.30   Median :16.70  
##                     Mean   :16.46   Mean   :19.71  
##                     3rd Qu.:20.15   3rd Qu.:27.45  
##                     Max.   :29.50   Max.   :64.70
diab_raw <- readxl::read_excel(data_path)


names(diab_raw)
## [1] "Tract"     "Diagnosed" "SNAP"
names(diab)
## [1] "Tract"     "Diagnosed" "SNAP"
glimpse(diab)
## Rows: 371
## Columns: 3
## $ Tract     <chr> "48029171929", "48029171928", "48029171927", "48029191819", …
## $ Diagnosed <dbl> 19.4, 13.0, 12.7, 11.4, 13.2, 19.9, 11.7, 9.8, 14.2, 10.3, 1…
## $ SNAP      <dbl> 28.3, 6.7, 20.7, 7.9, 5.9, 27.0, 13.6, 8.5, 21.2, 19.5, 9.1,…
summary(diab)
##     Tract             Diagnosed          SNAP      
##  Length:371         Min.   : 2.50   Min.   : 2.00  
##  Class :character   1st Qu.:12.70   1st Qu.: 9.90  
##  Mode  :character   Median :15.30   Median :16.70  
##                     Mean   :16.46   Mean   :19.71  
##                     3rd Qu.:20.15   3rd Qu.:27.45  
##                     Max.   :29.50   Max.   :64.70

Linear Model: SNAP as Outcome, Diagnosed as Predictor

lm_snap_diag <- lm(SNAP ~ Diagnosed, data = diab)
summary(lm_snap_diag)
## 
## Call:
## lm(formula = SNAP ~ Diagnosed, data = diab)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.559  -4.971  -1.291   3.817  26.134 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -15.47762    1.23531  -12.53   <2e-16 ***
## Diagnosed     2.13765    0.07201   29.69   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.702 on 369 degrees of freedom
## Multiple R-squared:  0.7049, Adjusted R-squared:  0.7041 
## F-statistic: 881.3 on 1 and 369 DF,  p-value: < 2.2e-16

This regression predicts SNAP from diabetes prevalence (Diagnosed).
A positive, significant coefficient suggests areas with higher diabetes rates also have a possible higher SNAP usage.

ggplot(diab, aes(x = Diagnosed, y = SNAP)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "SNAP vs. Diagnosed Diabetes",
       x = "% Adults Diagnosed with Diabetes",
       y = "% Adults Using SNAP")

raintest(lm_snap_diag)
## 
##  Rainbow test
## 
## data:  lm_snap_diag
## Rain = 0.94571, df1 = 186, df2 = 183, p-value = 0.6477
durbinWatsonTest(lm_snap_diag)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.2530956      1.476619       0
##  Alternative hypothesis: rho != 0
plot(lm_snap_diag, which = 1)  # Residuals vs Fitted

bptest(lm_snap_diag)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm_snap_diag
## BP = 0.067772, df = 1, p-value = 0.7946
plot(lm_snap_diag, which = 2)

shapiro.test(residuals(lm_snap_diag))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(lm_snap_diag)
## W = 0.95432, p-value = 2.557e-09

No Multicollinearity (VIF and Correlation) – Only two predictors in this case so not able to show.

length(coef(lm_snap_diag))   
## [1] 2
if (length(coef(lm_snap_diag)) > 2) {
  car::vif(lm_snap_diag)
} else {
  cat 
}
## function (..., file = "", sep = " ", fill = FALSE, labels = NULL, 
##     append = FALSE) 
## {
##     if (is.character(file)) 
##         if (file == "") 
##             file <- stdout()
##         else if (startsWith(file, "|")) {
##             file <- pipe(substring(file, 2L), "w")
##             on.exit(close(file))
##         }
##         else {
##             file <- file(file, ifelse(append, "a", "w"))
##             on.exit(close(file))
##         }
##     .Internal(cat(list(...), file, sep, fill, labels, append))
## }
## <bytecode: 0x0000019bbe9ba318>
## <environment: namespace:base>
cor(diab$SNAP, diab$Diagnosed, use = "complete.obs")
## [1] 0.8395643

Model meets all except Normality

The normality of residuals assumption is violated (non-normal error distribution).

diab <- diab %>%
  mutate(log_snap = log(SNAP))

lm_log_snap_diag <- lm(log_snap ~ Diagnosed, data = diab)
summary(lm_log_snap_diag)
## 
## Call:
## lm(formula = log_snap ~ Diagnosed, data = diab)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.36173 -0.19743 -0.00496  0.25595  1.53934 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.967807   0.074467    13.0   <2e-16 ***
## Diagnosed   0.109805   0.004341    25.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.404 on 369 degrees of freedom
## Multiple R-squared:  0.6343, Adjusted R-squared:  0.6333 
## F-statistic: 639.9 on 1 and 369 DF,  p-value: < 2.2e-16
plot(lm_log_snap_diag, which = 2)

shapiro.test(residuals(lm_log_snap_diag))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(lm_log_snap_diag)
## W = 0.98223, p-value = 0.0001558

The log transformation made the data more normally distributed, which increases the reliability of the results. In both versions, there is a strong positive relationship between diabetes prevalence and SNAP participation areas with higher diabetes rates tend to have more SNAP users. After applying the log transformation, the model meets statistical assumptions, helping confirm the results are more accurate and trustworthy. Even though linear regression can handle some non-normality, transforming the data strengthens the overall model fit.