Ferona Bustani
# Load packages and data
library(tidyverse)
dat = read.csv("health_data.csv", header = TRUE)
dat = dat %>%
select(County, Average.Number.of.Mentally.Unhealthy.Days,
Percent.Adults.with.Obesity,
Percent.Long.Commute...Drives.Alone) %>%
rename(all_of(c(MentallyUnhealthyDays = "Average.Number.of.Mentally.Unhealthy.Days",
Obesity = "Percent.Adults.with.Obesity",
Commute = "Percent.Long.Commute...Drives.Alone")))
dat$Obesity[dat$Obesity >= 40] = "Over 40%"
dat$Obesity[dat$Obesity < 40] = "Under 40%"
# distribution of response variable
hist(dat$MentallyUnhealthyDays, xlab = "Average Number of Mentally Unhealthy Days", main = "Marginal Distribution of Mentally Unhealthy Days in Texas Counties")

mean(dat$MentallyUnhealthyDays)
## [1] 4.608627
sd(dat$MentallyUnhealthyDays)
## [1] 0.332143
# Commute distribution, log transformation, and bivariate graph
hist(dat$Commute, xlab = "Percent Adults with Long Commutes Alone", main = "Marginal Distribution of Percent Adults with Long Commutes Alone in Texas Counties")

mean(dat$Commute)
## [1] 31.40392
sd(dat$Commute)
## [1] 13.21735
hist(log(dat$Commute), xlab = "Log Percent Adults with Long Commutes Alone", main = "Marginal Distribution of Percent Adults with Long Commutes Alone in Texas Counties")

mean(log(dat$Commute))
## [1] 3.345327
sd(log(dat$Commute))
## [1] 0.4793359
dat %>%
ggplot(aes(Commute, MentallyUnhealthyDays)) +
geom_point() +
labs(x = "Log Percent of Adults with a Long Commute", y = "Number of Mentally Unhealthy Days", title = "Number of Mentally Unhealthy Days by Commute")

# Obesity distribution and bivariate graph
table(dat$Obesity)
##
## Over 40% Under 40%
## 61 194
barplot(table(dat$Obesity), xlab = "Obesity", ylab = "Counts", main = "Distribution of Obesity Categories")

dat %>%
ggplot(aes(x = Obesity, y = MentallyUnhealthyDays)) +
geom_boxplot() +
labs(x = "Obesity", y = "Number of Mentally Unhealthy Days", title = "Number of Mentally Unhealthy Days by Obesity")

my_glm <- lm(MentallyUnhealthyDays ~ Obesity + Commute, data=dat)
dat$commute_c = dat$Commute - mean(dat$Commute)
my_glm_int <- lm(MentallyUnhealthyDays ~ Obesity * commute_c, data=dat)
summary(my_glm_int)
##
## Call:
## lm(formula = MentallyUnhealthyDays ~ Obesity * commute_c, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9214 -0.2082 0.0149 0.2167 0.8852
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.728682 0.044799 105.553 < 2e-16 ***
## ObesityUnder 40% -0.148868 0.050643 -2.940 0.00359 **
## commute_c 0.004328 0.003640 1.189 0.23566
## ObesityUnder 40%:commute_c -0.006525 0.004041 -1.615 0.10765
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3273 on 251 degrees of freedom
## Multiple R-squared: 0.0407, Adjusted R-squared: 0.02924
## F-statistic: 3.55 on 3 and 251 DF, p-value: 0.01509
summary(my_glm_int)$adj.r.squared
## [1] 0.02923928
# create interaction plot
ggplot(dat,aes(x=commute_c,y=MentallyUnhealthyDays,color=Obesity)) +
geom_smooth(method='lm') +
geom_point() +
xlab('Percent Adults with Long Commutes') +
ylab('Average Number of Mentally Unhealthy Days') +
labs(col='Obesity') +
ggtitle('Average Mentally Unhealthy days by Percent long Commuters and Obesity Status') +
theme_classic()

#Asssmptions
#Confirm linearity of numeric predictors
plot(dat$Commute, dat$MentallyUnhealthyDays, xlab='Percent Long Commuters', ylab= 'Average Number of Mentally Unhealthy Days', main='Average Number of Mentally Unhealthy Days versus Percent ', pch=20)

#Confirm normality of residuals
hist(my_glm$residuals, main='Model Residuals', xlab='Residual', col='light grey', right=F)

#Confirm equal variance
plot(my_glm$fitted.values, my_glm$residuals, xlab= 'Fitted Values', ylab='Residuals', main='Residual Plot', pch=20)
abline(h=0, col='red')
