Ferona Bustani

# Load packages and data
library(tidyverse)

dat = read.csv("health_data.csv", header = TRUE)

dat = dat %>% 
  select(County, Average.Number.of.Mentally.Unhealthy.Days, 
         Percent.Adults.with.Obesity, 
         Percent.Long.Commute...Drives.Alone) %>% 
  rename(all_of(c(MentallyUnhealthyDays = "Average.Number.of.Mentally.Unhealthy.Days", 
                  Obesity = "Percent.Adults.with.Obesity", 
                  Commute = "Percent.Long.Commute...Drives.Alone")))
dat$Obesity[dat$Obesity >= 40] = "Over 40%"
dat$Obesity[dat$Obesity < 40] = "Under 40%"
# distribution of response variable
hist(dat$MentallyUnhealthyDays, xlab = "Average Number of Mentally Unhealthy Days", main = "Marginal Distribution of Mentally Unhealthy Days in Texas Counties")

mean(dat$MentallyUnhealthyDays)
## [1] 4.608627
sd(dat$MentallyUnhealthyDays)
## [1] 0.332143
# Commute distribution, log transformation, and bivariate graph
hist(dat$Commute, xlab = "Percent Adults with Long Commutes Alone", main = "Marginal Distribution of Percent Adults with Long Commutes Alone in Texas Counties")

mean(dat$Commute)
## [1] 31.40392
sd(dat$Commute)
## [1] 13.21735
hist(log(dat$Commute), xlab = "Log Percent Adults with Long Commutes Alone", main = "Marginal Distribution of Percent Adults with Long Commutes Alone in Texas Counties")

mean(log(dat$Commute))
## [1] 3.345327
sd(log(dat$Commute))
## [1] 0.4793359
dat %>% 
  ggplot(aes(Commute, MentallyUnhealthyDays)) +
  geom_point() +
  labs(x = "Log Percent of Adults with a Long Commute", y = "Number of Mentally Unhealthy Days", title = "Number of Mentally Unhealthy Days by Commute")

# Obesity distribution and bivariate graph
table(dat$Obesity)
## 
##  Over 40% Under 40% 
##        61       194
barplot(table(dat$Obesity), xlab = "Obesity", ylab = "Counts", main = "Distribution of Obesity Categories")

dat %>% 
  ggplot(aes(x = Obesity, y = MentallyUnhealthyDays)) +
  geom_boxplot() +
  labs(x = "Obesity", y = "Number of Mentally Unhealthy Days", title = "Number of Mentally Unhealthy Days by Obesity")

my_glm <- lm(MentallyUnhealthyDays ~ Obesity + Commute, data=dat)
dat$commute_c = dat$Commute - mean(dat$Commute)
my_glm_int <- lm(MentallyUnhealthyDays ~ Obesity * commute_c, data=dat)
summary(my_glm_int)
## 
## Call:
## lm(formula = MentallyUnhealthyDays ~ Obesity * commute_c, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9214 -0.2082  0.0149  0.2167  0.8852 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 4.728682   0.044799 105.553  < 2e-16 ***
## ObesityUnder 40%           -0.148868   0.050643  -2.940  0.00359 ** 
## commute_c                   0.004328   0.003640   1.189  0.23566    
## ObesityUnder 40%:commute_c -0.006525   0.004041  -1.615  0.10765    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3273 on 251 degrees of freedom
## Multiple R-squared:  0.0407, Adjusted R-squared:  0.02924 
## F-statistic:  3.55 on 3 and 251 DF,  p-value: 0.01509
summary(my_glm_int)$adj.r.squared
## [1] 0.02923928
# create interaction plot
ggplot(dat,aes(x=commute_c,y=MentallyUnhealthyDays,color=Obesity)) + 
  geom_smooth(method='lm') + 
  geom_point() + 
  xlab('Percent Adults with Long Commutes') + 
  ylab('Average Number of Mentally Unhealthy Days') + 
  labs(col='Obesity') + 
  ggtitle('Average Mentally Unhealthy days by Percent long Commuters and Obesity Status') +  
  theme_classic()

#Asssmptions
#Confirm linearity of numeric predictors
plot(dat$Commute, dat$MentallyUnhealthyDays, xlab='Percent Long Commuters', ylab= 'Average Number of Mentally Unhealthy Days', main='Average Number of Mentally Unhealthy Days versus Percent ', pch=20)

#Confirm normality of residuals
hist(my_glm$residuals, main='Model Residuals', xlab='Residual', col='light grey', right=F)

#Confirm equal variance
plot(my_glm$fitted.values, my_glm$residuals, xlab= 'Fitted Values', ylab='Residuals', main='Residual Plot', pch=20)
abline(h=0, col='red')