library(readxl)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(MASS)
library(car)
## Loading required package: carData
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ dplyr::select() masks MASS::select()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
duh_data <- load("NSDUH 2022.Rdata")
nsduh <- puf2022_110424 
nsduh_small <- nsduh %>% dplyr::select(DSTNRV30, IRALCBNG30D, MJDAY30A, CIG30USE, COCUS30A, METHAM30N, PNRNM30FQ)
nsduh_small_clean<-nsduh_small |> mutate(meth_clean=ifelse(METHAM30N>32,0,METHAM30N))
nsduh_small_clean<-nsduh_small_clean |> mutate(mj_clean=ifelse(MJDAY30A>32,0,MJDAY30A))
nsduh_small_clean<-nsduh_small_clean |> mutate(cig_clean=ifelse(CIG30USE>32,0,CIG30USE))
nsduh_small_clean<-nsduh_small_clean |> mutate(pnr_clean=ifelse(PNRNM30FQ>32,0,PNRNM30FQ))
nsduh_small_clean<-nsduh_small_clean |> mutate(nervous_clean=ifelse(DSTNRV30>6,0,DSTNRV30))
nsduh_small_clean<-nsduh_small_clean |> mutate(binge_clean=ifelse(IRALCBNG30D>32,0,IRALCBNG30D))
nsduh_small_clean<-nsduh_small_clean |> mutate(coke_clean=ifelse(COCUS30A>32,0,COCUS30A))
nsduh_clean_only <- nsduh_small_clean %>% dplyr::select(nervous_clean, pnr_clean, cig_clean, mj_clean, meth_clean, binge_clean, coke_clean)

summary(nsduh_clean_only)
##  nervous_clean     pnr_clean          cig_clean         mj_clean     
##  Min.   :0.000   Min.   : 0.00000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:1.000   1st Qu.: 0.00000   1st Qu.: 0.000   1st Qu.: 0.000  
##  Median :3.000   Median : 0.00000   Median : 0.000   Median : 0.000  
##  Mean   :2.964   Mean   : 0.05885   Mean   : 2.446   Mean   : 2.581  
##  3rd Qu.:5.000   3rd Qu.: 0.00000   3rd Qu.: 0.000   3rd Qu.: 0.000  
##  Max.   :5.000   Max.   :30.00000   Max.   :30.000   Max.   :30.000  
##    meth_clean        binge_clean        coke_clean      
##  Min.   : 0.00000   Min.   : 0.0000   Min.   : 0.00000  
##  1st Qu.: 0.00000   1st Qu.: 0.0000   1st Qu.: 0.00000  
##  Median : 0.00000   Median : 0.0000   Median : 0.00000  
##  Mean   : 0.09062   Mean   : 0.9088   Mean   : 0.04039  
##  3rd Qu.: 0.00000   3rd Qu.: 0.0000   3rd Qu.: 0.00000  
##  Max.   :30.00000   Max.   :30.0000   Max.   :30.00000
nsduh_linear<-lm(nervous_clean ~ cig_clean + meth_clean + mj_clean + pnr_clean + binge_clean + coke_clean, data=nsduh_clean_only)

plot(nsduh_linear,which=1)

raintest(nsduh_linear)
## 
##  Rainbow test
## 
## data:  nsduh_linear
## Rain = 1.0124, df1 = 29535, df2 = 29527, p-value = 0.1451

because the pvalue is greater than .05, there is no evidence of nonlinearity and the model meets the linearity assumption.

durbinWatsonTest(nsduh_linear)
##  lag Autocorrelation D-W Statistic p-value
##    1     -0.00522978      2.010415   0.196
##  Alternative hypothesis: rho != 0

because the pvalue is greater than .05, this means the errors are independent and the model meets the independence of errors assumption

plot(nsduh_linear,which=3)

bptest(nsduh_linear)
## 
##  studentized Breusch-Pagan test
## 
## data:  nsduh_linear
## BP = 1983.4, df = 6, p-value < 2.2e-16

the plot shows a noticable curved pattern which indicates that the variance is not constant across the model; additionaly, the BP test has a pvalue well below .05 meaning homoscedasticity is rejected and violates the assumption

plot(nsduh_linear,which=2)

ks.test(nsduh_linear$residuals,"pnorm")
## Warning in ks.test.default(nsduh_linear$residuals, "pnorm"): ties should not be
## present for the one-sample Kolmogorov-Smirnov test
## 
##  Asymptotic one-sample Kolmogorov-Smirnov test
## 
## data:  nsduh_linear$residuals
## D = 0.29786, p-value < 2.2e-16
## alternative hypothesis: two-sided

my plot shows a strong S shape which means they deviate from the line. This indicates that the residuals are not normally distributed and the normality assumption is violated. Because the dataset is so large, the sample size exceeds the shapiro test limit and could not be run, KS was ran instead.

vif(nsduh_linear)
##   cig_clean  meth_clean    mj_clean   pnr_clean binge_clean  coke_clean 
##    1.093706    1.061024    1.075335    1.051313    1.054290    1.020880

every VIF value is close to 1, this indicates that the variables are not strongly correlated and the model meets the assumption of no multicolinearity

summary(nsduh_linear)
## 
## Call:
## lm(formula = nervous_clean ~ cig_clean + meth_clean + mj_clean + 
##     pnr_clean + binge_clean + coke_clean, data = nsduh_clean_only)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.7013 -1.8789  0.1211  1.3043  3.0436 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.8789218  0.0083782 343.619  < 2e-16 ***
## cig_clean    0.0220190  0.0010162  21.668  < 2e-16 ***
## meth_clean  -0.0264028  0.0052432  -5.036 4.78e-07 ***
## mj_clean     0.0005049  0.0010400   0.485  0.62735    
## pnr_clean   -0.0201214  0.0076882  -2.617  0.00887 ** 
## binge_clean  0.0386262  0.0025500  15.148  < 2e-16 ***
## coke_clean  -0.0290252  0.0095064  -3.053  0.00226 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.842 on 59062 degrees of freedom
## Multiple R-squared:  0.01432,    Adjusted R-squared:  0.01422 
## F-statistic:   143 on 6 and 59062 DF,  p-value: < 2.2e-16
nsduh_gaussian<-glm(nervous_clean ~ cig_clean + meth_clean + mj_clean + pnr_clean + binge_clean + coke_clean, data=nsduh_clean_only,family = "gaussian")

summary(nsduh_gaussian)
## 
## Call:
## glm(formula = nervous_clean ~ cig_clean + meth_clean + mj_clean + 
##     pnr_clean + binge_clean + coke_clean, family = "gaussian", 
##     data = nsduh_clean_only)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.8789218  0.0083782 343.619  < 2e-16 ***
## cig_clean    0.0220190  0.0010162  21.668  < 2e-16 ***
## meth_clean  -0.0264028  0.0052432  -5.036 4.78e-07 ***
## mj_clean     0.0005049  0.0010400   0.485  0.62735    
## pnr_clean   -0.0201214  0.0076882  -2.617  0.00887 ** 
## binge_clean  0.0386262  0.0025500  15.148  < 2e-16 ***
## coke_clean  -0.0290252  0.0095064  -3.053  0.00226 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 3.393055)
## 
##     Null deviance: 203311  on 59068  degrees of freedom
## Residual deviance: 200401  on 59062  degrees of freedom
## AIC: 239806
## 
## Number of Fisher Scoring iterations: 2
AIC(nsduh_gaussian)
## [1] 239806