library(readxl)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(MASS)
library(car)
## Loading required package: carData
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ dplyr::select() masks MASS::select()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
duh_data <- load("NSDUH 2022.Rdata")
nsduh <- puf2022_110424
nsduh_small <- nsduh %>% dplyr::select(DSTNRV30, IRALCBNG30D, MJDAY30A, CIG30USE, COCUS30A, METHAM30N, PNRNM30FQ)
nsduh_small_clean<-nsduh_small |> mutate(meth_clean=ifelse(METHAM30N>32,0,METHAM30N))
nsduh_small_clean<-nsduh_small_clean |> mutate(mj_clean=ifelse(MJDAY30A>32,0,MJDAY30A))
nsduh_small_clean<-nsduh_small_clean |> mutate(cig_clean=ifelse(CIG30USE>32,0,CIG30USE))
nsduh_small_clean<-nsduh_small_clean |> mutate(pnr_clean=ifelse(PNRNM30FQ>32,0,PNRNM30FQ))
nsduh_small_clean<-nsduh_small_clean |> mutate(nervous_clean=ifelse(DSTNRV30>6,0,DSTNRV30))
nsduh_small_clean<-nsduh_small_clean |> mutate(binge_clean=ifelse(IRALCBNG30D>32,0,IRALCBNG30D))
nsduh_small_clean<-nsduh_small_clean |> mutate(coke_clean=ifelse(COCUS30A>32,0,COCUS30A))
nsduh_clean_only <- nsduh_small_clean %>% dplyr::select(nervous_clean, pnr_clean, cig_clean, mj_clean, meth_clean, binge_clean, coke_clean)
summary(nsduh_clean_only)
## nervous_clean pnr_clean cig_clean mj_clean
## Min. :0.000 Min. : 0.00000 Min. : 0.000 Min. : 0.000
## 1st Qu.:1.000 1st Qu.: 0.00000 1st Qu.: 0.000 1st Qu.: 0.000
## Median :3.000 Median : 0.00000 Median : 0.000 Median : 0.000
## Mean :2.964 Mean : 0.05885 Mean : 2.446 Mean : 2.581
## 3rd Qu.:5.000 3rd Qu.: 0.00000 3rd Qu.: 0.000 3rd Qu.: 0.000
## Max. :5.000 Max. :30.00000 Max. :30.000 Max. :30.000
## meth_clean binge_clean coke_clean
## Min. : 0.00000 Min. : 0.0000 Min. : 0.00000
## 1st Qu.: 0.00000 1st Qu.: 0.0000 1st Qu.: 0.00000
## Median : 0.00000 Median : 0.0000 Median : 0.00000
## Mean : 0.09062 Mean : 0.9088 Mean : 0.04039
## 3rd Qu.: 0.00000 3rd Qu.: 0.0000 3rd Qu.: 0.00000
## Max. :30.00000 Max. :30.0000 Max. :30.00000
nsduh_linear<-lm(nervous_clean ~ cig_clean + meth_clean + mj_clean + pnr_clean + binge_clean + coke_clean, data=nsduh_clean_only)
plot(nsduh_linear,which=1)

raintest(nsduh_linear)
##
## Rainbow test
##
## data: nsduh_linear
## Rain = 1.0124, df1 = 29535, df2 = 29527, p-value = 0.1451
because the pvalue is greater than .05, there is no evidence of
nonlinearity and the model meets the linearity assumption.
durbinWatsonTest(nsduh_linear)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.00522978 2.010415 0.196
## Alternative hypothesis: rho != 0
because the pvalue is greater than .05, this means the errors are
independent and the model meets the independence of errors
assumption
plot(nsduh_linear,which=3)

bptest(nsduh_linear)
##
## studentized Breusch-Pagan test
##
## data: nsduh_linear
## BP = 1983.4, df = 6, p-value < 2.2e-16
the plot shows a noticable curved pattern which indicates that the
variance is not constant across the model; additionaly, the BP test has
a pvalue well below .05 meaning homoscedasticity is rejected and
violates the assumption
plot(nsduh_linear,which=2)

ks.test(nsduh_linear$residuals,"pnorm")
## Warning in ks.test.default(nsduh_linear$residuals, "pnorm"): ties should not be
## present for the one-sample Kolmogorov-Smirnov test
##
## Asymptotic one-sample Kolmogorov-Smirnov test
##
## data: nsduh_linear$residuals
## D = 0.29786, p-value < 2.2e-16
## alternative hypothesis: two-sided
my plot shows a strong S shape which means they deviate from the
line. This indicates that the residuals are not normally distributed and
the normality assumption is violated. Because the dataset is so large,
the sample size exceeds the shapiro test limit and could not be run, KS
was ran instead.
vif(nsduh_linear)
## cig_clean meth_clean mj_clean pnr_clean binge_clean coke_clean
## 1.093706 1.061024 1.075335 1.051313 1.054290 1.020880