library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
suicide_data <- read_csv("C:/Users/Campo/Downloads/Death_rates_for_suicide__by_sex__race__Hispanic_origin__and_age__United_States.csv")
## Rows: 6390 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): INDICATOR, UNIT, STUB_NAME, STUB_LABEL, AGE, FLAG
## dbl (7): UNIT_NUM, STUB_NAME_NUM, STUB_LABEL_NUM, YEAR, YEAR_NUM, AGE_NUM, E...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(suicide_data)
## # A tibble: 6 × 13
##   INDICATOR     UNIT  UNIT_NUM STUB_NAME STUB_NAME_NUM STUB_LABEL STUB_LABEL_NUM
##   <chr>         <chr>    <dbl> <chr>             <dbl> <chr>               <dbl>
## 1 Death rates … Deat…        1 Total                 0 All perso…              0
## 2 Death rates … Deat…        1 Total                 0 All perso…              0
## 3 Death rates … Deat…        1 Total                 0 All perso…              0
## 4 Death rates … Deat…        1 Total                 0 All perso…              0
## 5 Death rates … Deat…        1 Total                 0 All perso…              0
## 6 Death rates … Deat…        1 Total                 0 All perso…              0
## # ℹ 6 more variables: YEAR <dbl>, YEAR_NUM <dbl>, AGE <chr>, AGE_NUM <dbl>,
## #   ESTIMATE <dbl>, FLAG <chr>
colnames(suicide_data)
##  [1] "INDICATOR"      "UNIT"           "UNIT_NUM"       "STUB_NAME"     
##  [5] "STUB_NAME_NUM"  "STUB_LABEL"     "STUB_LABEL_NUM" "YEAR"          
##  [9] "YEAR_NUM"       "AGE"            "AGE_NUM"        "ESTIMATE"      
## [13] "FLAG"
install.packages("car")
## Installing package into 'C:/Users/Campo/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'car' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Campo\AppData\Local\Temp\RtmpAlAZlU\downloaded_packages
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
install.packages("lmtest") 
## Installing package into 'C:/Users/Campo/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'lmtest' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'lmtest'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\Campo\AppData\Local\R\win-library\4.4\00LOCK\lmtest\libs\x64\lmtest.dll
## to C:\Users\Campo\AppData\Local\R\win-library\4.4\lmtest\libs\x64\lmtest.dll:
## Permission denied
## Warning: restored 'lmtest'
## 
## The downloaded binary packages are in
##  C:\Users\Campo\AppData\Local\Temp\RtmpAlAZlU\downloaded_packages
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
install.packages("nortest")
## Installing package into 'C:/Users/Campo/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'nortest' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Campo\AppData\Local\Temp\RtmpAlAZlU\downloaded_packages
library(nortest)

suicide_model <- lm(ESTIMATE ~ AGE_NUM + YEAR_NUM, data = suicide_data)

summary(suicide_model)
## 
## Call:
## lm(formula = ESTIMATE ~ AGE_NUM + YEAR_NUM, data = suicide_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.327  -7.956  -2.431   5.710  54.158 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.70495    0.40441   24.00  < 2e-16 ***
## AGE_NUM      1.94131    0.07664   25.33  < 2e-16 ***
## YEAR_NUM    -0.04736    0.01260   -3.76 0.000172 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.89 on 5481 degrees of freedom
##   (906 observations deleted due to missingness)
## Multiple R-squared:  0.1084, Adjusted R-squared:  0.1081 
## F-statistic: 333.1 on 2 and 5481 DF,  p-value: < 2.2e-16
plot(suicide_model, which = 1)

raintest(suicide_model)
## 
##  Rainbow test
## 
## data:  suicide_model
## Rain = 0.56241, df1 = 2742, df2 = 2739, p-value = 1
#yes it meets assumptions, since the p value is above the .05 threshold it doesn't show to be against the linearity assumption based on the rainbow test results.


durbinWatsonTest(suicide_model)
##  lag Autocorrelation D-W Statistic p-value
##    1       0.9572922    0.08501115       0
##  Alternative hypothesis: rho != 0
#this one violates the assumption since it is close to 0 it means the errors are not independent since the D-W was .085 and pretty close to 0. Autocorrelation is nearly spot on to 1, showing a .95 that is a indication of autocorrelation. ways to mitigate this for autocorrelation could be adding a lagged variable, like for each row corresponding to a different year or time period/age. 

suicide_data <- suicide_data %>%
  mutate(ESTIMATE_LAG = lag(ESTIMATE, 1))

suicide_model_lagged <- lm(ESTIMATE ~ AGE_NUM + YEAR_NUM + ESTIMATE_LAG, data = suicide_data)

summary(suicide_model_lagged)
## 
## Call:
## lm(formula = ESTIMATE ~ AGE_NUM + YEAR_NUM + ESTIMATE_LAG, data = suicide_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.512  -0.493  -0.083   0.485  34.743 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.100405   0.108071   0.929  0.35289    
## AGE_NUM      0.060292   0.020381   2.958  0.00311 ** 
## YEAR_NUM     0.007147   0.003189   2.241  0.02506 *  
## ESTIMATE_LAG 0.968810   0.003398 285.133  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.699 on 5345 degrees of freedom
##   (1041 observations deleted due to missingness)
## Multiple R-squared:  0.9454, Adjusted R-squared:  0.9454 
## F-statistic: 3.085e+04 on 3 and 5345 DF,  p-value: < 2.2e-16
durbinWatsonTest(suicide_model_lagged)
##  lag Autocorrelation D-W Statistic p-value
##    1      -0.1423439      2.284559       0
##  Alternative hypothesis: rho != 0
#after adding this the DW stat is now at 2.28 which is very close to 2, which displays meeting the independence of errors assumption. 


bptest(suicide_model)
## 
##  studentized Breusch-Pagan test
## 
## data:  suicide_model
## BP = 446.45, df = 2, p-value < 2.2e-16
#the BP test shows a stron violation of the homoscedasticity assumption, a way to migitate this could be to use a log transformation to the dependent variable

suicide_data <- suicide_data %>%
  mutate(log_ESTIMATE = log(ESTIMATE))
suicide_model_log <- lm(log_ESTIMATE ~ AGE_NUM + YEAR_NUM, data = suicide_data)

summary(suicide_model_log)
## 
## Call:
## lm(formula = log_ESTIMATE ~ AGE_NUM + YEAR_NUM, data = suicide_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2904 -0.6307  0.1297  0.6832  1.9716 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.954994   0.032151  60.807   <2e-16 ***
## AGE_NUM      0.133562   0.006093  21.921   <2e-16 ***
## YEAR_NUM    -0.002089   0.001001  -2.086    0.037 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8658 on 5481 degrees of freedom
##   (906 observations deleted due to missingness)
## Multiple R-squared:  0.08211,    Adjusted R-squared:  0.08177 
## F-statistic: 245.1 on 2 and 5481 DF,  p-value: < 2.2e-16
plot(suicide_model_log, which = 1)

bptest(suicide_model_log)
## 
##  studentized Breusch-Pagan test
## 
## data:  suicide_model_log
## BP = 0.13024, df = 2, p-value = 0.937
#now that the numbers for BP stat=.13, and pvaule is .93 it is above the threshold that means the homoscedasticity assumption is now met. 


plot(suicide_model, which = 2)

#yes this meets the normaility assumption, since the pattern is traced along with the residuals.

vif(suicide_model)
##  AGE_NUM YEAR_NUM 
## 1.002221 1.002221
cor(suicide_data %>% select(AGE_NUM, YEAR_NUM))
##              AGE_NUM    YEAR_NUM
## AGE_NUM   1.00000000 -0.01651277
## YEAR_NUM -0.01651277  1.00000000
#since the numbers are close to 1 it shows low correlation between varaibles and that multicollinearity is not a concern. Assumptions is met that tells me that corefficents can be viewed without worries of inflated standard errors due to multicollinearity.