#Load in the NC Stroke Deaths Data

library(readxl)
# Read the "NCStrokeDeaths" sheet of the workbook into NCStrokeDeaths.
# NOTE(review): this is an absolute path on the original author's machine;
# anyone else running the script must point it at their own copy of the file.
NCStrokeDeaths <- read_excel(
  "C:/Users/lizet/Downloads/NCStrokeDeaths.xlsx",
  sheet = "NCStrokeDeaths"
)

# View() opens a data viewer window, which only makes sense in an interactive
# session; skip it when the script is run in batch mode or knitted.
if (interactive()) {
  View(NCStrokeDeaths)
}

#summary statistics for Stroke Death Rate

# Five-number summary plus mean of the stroke death rate column.
summary(NCStrokeDeaths[["Stroke_Death_Rate_Per_100000"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   61.10   75.72   82.75   85.64   92.72  180.70
# Sample standard deviation of the same column.
sd(NCStrokeDeaths[["Stroke_Death_Rate_Per_100000"]])
## [1] 16.29968

#Summary statistics for Diagnosed Diabetes Percentage

# Five-number summary plus mean of the diagnosed diabetes percentage column.
summary(NCStrokeDeaths[["dm_prev_adj"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    6.90    9.40   10.45   10.49   11.43   15.00
# Sample standard deviation of the same column.
sd(NCStrokeDeaths[["dm_prev_adj"]])
## [1] 1.533564

#Create scatter plot for both variables from above

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
# Scatter plot of diagnosed diabetes percentage (x axis) against stroke death
# rate per 100,000 (y axis), one point per row of NCStrokeDeaths.
ggplot(
  data = NCStrokeDeaths,
  mapping = aes(x = dm_prev_adj, y = Stroke_Death_Rate_Per_100000)
) +
  geom_point()

#It looks like there might be a correlation, and we do see some outliers!

#Create correlation matrix to determine 3 variables that are most correlated to stroke death per 100,000 individuals in NC (numeric variables only)

# Correlate the stroke death rate with every numeric column of the data set.
# The result is a one-row matrix; it includes the response itself (r = 1).
# vapply(..., logical(1)) is used instead of sapply() so the column selector is
# always a logical vector with one entry per column, whatever the input looks
# like.
cor(
  NCStrokeDeaths$Stroke_Death_Rate_Per_100000,
  NCStrokeDeaths[, vapply(NCStrokeDeaths, is.numeric, logical(1))]
)
##       cnty_fips Stroke_Death_Rate_Per_100000 dm_prev_adj ob_prev_adj  no_hsdip
## [1,] 0.09672448                            1   0.4065097   0.3812871 0.3064511
##      no_college female_hd  foodstmp     income   home_val    povpct
## [1,]  0.3317368 0.4638931 0.4336871 -0.2809179 -0.4407266 0.3620404
##      GINI - Income Inequality   perc_api  perc_aian perc_black perc_white
## [1,]               0.06864517 -0.1561695 -0.0356443  0.3726983 -0.3396383
##       perc_hisp  perc_65up total_cost
## [1,] 0.08628502 -0.1507679   0.270866

#The 3 variables most strongly correlated with the stroke death rate (by absolute value) are female_hd (0.464), home_val (-0.441), and foodstmp (0.434).
#Of these 3, I would probably drop foodstmp first: it has the lowest absolute correlation with the stroke death rate, and it may also be strongly correlated with the other predictors (this should be checked with a correlation matrix of the predictors themselves).

#Regression Models #Regression Model 1 (SLR)

# Simple linear regression: stroke death rate explained by the diagnosed
# diabetes percentage alone.
model <- lm(
  formula = Stroke_Death_Rate_Per_100000 ~ dm_prev_adj,
  data = NCStrokeDeaths
)
# Print coefficients, residual summary, R-squared and the overall F-test.
summary(model)
## 
## Call:
## lm(formula = Stroke_Death_Rate_Per_100000 ~ dm_prev_adj, data = NCStrokeDeaths)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.950  -9.321  -2.708   6.015  80.754 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  40.3215    10.3984   3.878 0.000191 ***
## dm_prev_adj   4.3206     0.9809   4.405 2.71e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.97 on 98 degrees of freedom
## Multiple R-squared:  0.1653, Adjusted R-squared:  0.1567 
## F-statistic:  19.4 on 1 and 98 DF,  p-value: 2.708e-05

# Analyze the results using \(\alpha = 0.05\). The adjusted R-squared is 0.1567, i.e. about 15.67% of the variation in stroke death rate is explained by the model (14.97 is the residual standard error, not the R-squared). Hypotheses: \[H_0: \beta_1 = 0\] \[H_a: \beta_1 \neq 0\] From the model output, the p-value is 2.708e-05. Since this is below \(\alpha = 0.05\), we reject the null hypothesis and conclude that there is a significant linear relationship between diagnosed diabetes % and stroke death rate in NC.

# Explanation: The null hypothesis (\(H_0\)) says that there is no significant linear relationship between the percentage of diagnosed diabetes (dm_prev_adj) and stroke death rates (Stroke_Death_Rate_Per_100000), while the alternative hypothesis (\(H_a\)) says that there is a significant linear relationship between them. Since our p-value (2.708e-05) is less than \(\alpha = 0.05\), it falls in the rejection region, and thus we reject \(H_0\) in favor of \(H_a\). The fitted equation is \(\hat{Y} = 40.32 + 4.32x\), meaning that the predicted stroke death rate is 40.32 per 100,000 when the diagnosed diabetes percentage is 0 (an extrapolation, since the observed percentages range from 6.9 to 15.0). For every one-percentage-point increase in diagnosed diabetes, the predicted stroke death rate goes up by about 4.32 deaths per 100,000.

#Regression Model 2: Multiple Linear Regression (MLR)

# Multiple linear regression: stroke death rate explained by diabetes
# percentage together with the other predictor columns listed in the formula.
model2 <- lm(
  formula = Stroke_Death_Rate_Per_100000 ~
    dm_prev_adj + foodstmp + income + home_val +
    `GINI - Income Inequality` + povpct +
    perc_api + perc_aian + perc_black + perc_white + perc_hisp +
    perc_65up + total_cost,
  data = NCStrokeDeaths
)
# Print coefficients, residual summary, R-squared and the overall F-test.
summary(model2)
## 
## Call:
## lm(formula = Stroke_Death_Rate_Per_100000 ~ dm_prev_adj + foodstmp + 
##     income + home_val + `GINI - Income Inequality` + povpct + 
##     perc_api + perc_aian + perc_black + perc_white + perc_hisp + 
##     perc_65up + total_cost, data = NCStrokeDeaths)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.138  -8.206  -1.551   5.709  79.123 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                 2.623e+02  2.180e+02   1.203    0.232  
## dm_prev_adj                -1.136e-01  1.733e+00  -0.066    0.948  
## foodstmp                    9.444e-01  6.185e-01   1.527    0.130  
## income                      8.073e-01  5.407e-01   1.493    0.139  
## home_val                   -1.952e-01  9.592e-02  -2.035    0.045 *
## `GINI - Income Inequality`  8.436e+01  6.844e+01   1.233    0.221  
## povpct                      5.167e-01  8.098e-01   0.638    0.525  
## perc_api                   -4.468e+00  2.928e+00  -1.526    0.131  
## perc_aian                  -2.924e+00  2.361e+00  -1.239    0.219  
## perc_black                 -2.127e+00  2.244e+00  -0.948    0.346  
## perc_white                 -2.083e+00  2.192e+00  -0.951    0.345  
## perc_hisp                  -1.810e+00  2.277e+00  -0.795    0.429  
## perc_65up                  -6.087e-01  5.687e-01  -1.070    0.287  
## total_cost                 -1.679e-03  9.911e-04  -1.694    0.094 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.37 on 86 degrees of freedom
## Multiple R-squared:  0.3253, Adjusted R-squared:  0.2233 
## F-statistic: 3.189 on 13 and 86 DF,  p-value: 0.0006155