library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
suicide_data <- read_csv("C:/Users/Campo/Downloads/Death_rates_for_suicide__by_sex__race__Hispanic_origin__and_age__United_States.csv")
## Rows: 6390 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): INDICATOR, UNIT, STUB_NAME, STUB_LABEL, AGE, FLAG
## dbl (7): UNIT_NUM, STUB_NAME_NUM, STUB_LABEL_NUM, YEAR, YEAR_NUM, AGE_NUM, E...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(suicide_data)
## # A tibble: 6 × 13
## INDICATOR UNIT UNIT_NUM STUB_NAME STUB_NAME_NUM STUB_LABEL STUB_LABEL_NUM
## <chr> <chr> <dbl> <chr> <dbl> <chr> <dbl>
## 1 Death rates … Deat… 1 Total 0 All perso… 0
## 2 Death rates … Deat… 1 Total 0 All perso… 0
## 3 Death rates … Deat… 1 Total 0 All perso… 0
## 4 Death rates … Deat… 1 Total 0 All perso… 0
## 5 Death rates … Deat… 1 Total 0 All perso… 0
## 6 Death rates … Deat… 1 Total 0 All perso… 0
## # ℹ 6 more variables: YEAR <dbl>, YEAR_NUM <dbl>, AGE <chr>, AGE_NUM <dbl>,
## # ESTIMATE <dbl>, FLAG <chr>
colnames(suicide_data)
## [1] "INDICATOR" "UNIT" "UNIT_NUM" "STUB_NAME"
## [5] "STUB_NAME_NUM" "STUB_LABEL" "STUB_LABEL_NUM" "YEAR"
## [9] "YEAR_NUM" "AGE" "AGE_NUM" "ESTIMATE"
## [13] "FLAG"
install.packages("car")
## Installing package into 'C:/Users/Campo/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'car' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Campo\AppData\Local\Temp\RtmpAlAZlU\downloaded_packages
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
install.packages("lmtest")
## Installing package into 'C:/Users/Campo/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'lmtest' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'lmtest'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\Campo\AppData\Local\R\win-library\4.4\00LOCK\lmtest\libs\x64\lmtest.dll
## to C:\Users\Campo\AppData\Local\R\win-library\4.4\lmtest\libs\x64\lmtest.dll:
## Permission denied
## Warning: restored 'lmtest'
##
## The downloaded binary packages are in
## C:\Users\Campo\AppData\Local\Temp\RtmpAlAZlU\downloaded_packages
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
install.packages("nortest")
## Installing package into 'C:/Users/Campo/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'nortest' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Campo\AppData\Local\Temp\RtmpAlAZlU\downloaded_packages
library(nortest)
suicide_model <- lm(ESTIMATE ~ AGE_NUM + YEAR_NUM, data = suicide_data)
summary(suicide_model)
##
## Call:
## lm(formula = ESTIMATE ~ AGE_NUM + YEAR_NUM, data = suicide_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.327 -7.956 -2.431 5.710 54.158
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.70495 0.40441 24.00 < 2e-16 ***
## AGE_NUM 1.94131 0.07664 25.33 < 2e-16 ***
## YEAR_NUM -0.04736 0.01260 -3.76 0.000172 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.89 on 5481 degrees of freedom
## (906 observations deleted due to missingness)
## Multiple R-squared: 0.1084, Adjusted R-squared: 0.1081
## F-statistic: 333.1 on 2 and 5481 DF, p-value: < 2.2e-16
plot(suicide_model, which = 1)

raintest(suicide_model)
##
## Rainbow test
##
## data: suicide_model
## Rain = 0.56241, df1 = 2742, df2 = 2739, p-value = 1
#yes it meets assumptions, since the p value is above the .05 threshold it doesn't show to be against the linearity assumption based on the rainbow test results.
durbinWatsonTest(suicide_model)
## lag Autocorrelation D-W Statistic p-value
## 1 0.9572922 0.08501115 0
## Alternative hypothesis: rho != 0
#this one violates the assumption since it is close to 0 it means the errors are not independent since the D-W was .085 and pretty close to 0. Autocorrelation is nearly spot on to 1, showing a .95 that is a indication of autocorrelation. ways to mitigate this for autocorrelation could be adding a lagged variable, like for each row corresponding to a different year or time period/age.
suicide_data <- suicide_data %>%
mutate(ESTIMATE_LAG = lag(ESTIMATE, 1))
suicide_model_lagged <- lm(ESTIMATE ~ AGE_NUM + YEAR_NUM + ESTIMATE_LAG, data = suicide_data)
summary(suicide_model_lagged)
##
## Call:
## lm(formula = ESTIMATE ~ AGE_NUM + YEAR_NUM + ESTIMATE_LAG, data = suicide_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.512 -0.493 -0.083 0.485 34.743
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.100405 0.108071 0.929 0.35289
## AGE_NUM 0.060292 0.020381 2.958 0.00311 **
## YEAR_NUM 0.007147 0.003189 2.241 0.02506 *
## ESTIMATE_LAG 0.968810 0.003398 285.133 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.699 on 5345 degrees of freedom
## (1041 observations deleted due to missingness)
## Multiple R-squared: 0.9454, Adjusted R-squared: 0.9454
## F-statistic: 3.085e+04 on 3 and 5345 DF, p-value: < 2.2e-16
durbinWatsonTest(suicide_model_lagged)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.1423439 2.284559 0
## Alternative hypothesis: rho != 0
#after adding this the DW stat is now at 2.28 which is very close to 2, which displays meeting the independence of errors assumption.
bptest(suicide_model)
##
## studentized Breusch-Pagan test
##
## data: suicide_model
## BP = 446.45, df = 2, p-value < 2.2e-16
#the BP test shows a stron violation of the homoscedasticity assumption, a way to migitate this could be to use a log transformation to the dependent variable
suicide_data <- suicide_data %>%
mutate(log_ESTIMATE = log(ESTIMATE))
suicide_model_log <- lm(log_ESTIMATE ~ AGE_NUM + YEAR_NUM, data = suicide_data)
summary(suicide_model_log)
##
## Call:
## lm(formula = log_ESTIMATE ~ AGE_NUM + YEAR_NUM, data = suicide_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2904 -0.6307 0.1297 0.6832 1.9716
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.954994 0.032151 60.807 <2e-16 ***
## AGE_NUM 0.133562 0.006093 21.921 <2e-16 ***
## YEAR_NUM -0.002089 0.001001 -2.086 0.037 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8658 on 5481 degrees of freedom
## (906 observations deleted due to missingness)
## Multiple R-squared: 0.08211, Adjusted R-squared: 0.08177
## F-statistic: 245.1 on 2 and 5481 DF, p-value: < 2.2e-16
plot(suicide_model_log, which = 1)

bptest(suicide_model_log)
##
## studentized Breusch-Pagan test
##
## data: suicide_model_log
## BP = 0.13024, df = 2, p-value = 0.937
#now that the numbers for BP stat=.13, and pvaule is .93 it is above the threshold that means the homoscedasticity assumption is now met.
plot(suicide_model, which = 2)

#yes this meets the normaility assumption, since the pattern is traced along with the residuals.
vif(suicide_model)
## AGE_NUM YEAR_NUM
## 1.002221 1.002221
cor(suicide_data %>% select(AGE_NUM, YEAR_NUM))
## AGE_NUM YEAR_NUM
## AGE_NUM 1.00000000 -0.01651277
## YEAR_NUM -0.01651277 1.00000000
#since the numbers are close to 1 it shows low correlation between varaibles and that multicollinearity is not a concern. Assumptions is met that tells me that corefficents can be viewed without worries of inflated standard errors due to multicollinearity.