library(readxl)
library(dplyr)
library(DT)
setwd("C:/Users/ajgan/downloads")
Data_Health_Ass4 <- read_excel("Data_Health_Ass4.xlsx")
# Remove incomplete rows
Data_Health_Ass4_clean <- Data_Health_Ass4 %>%
na.omit()
# Interactive table
datatable(Data_Health_Ass4_clean,
options = list(pagelength = 10,
scrollX = TRUE))DS7310 Assignment 4
Assignment Guideline (20 Points)
Create a folder for this project and call it as DS7310_Assignment4_YourFullName.
Create a project file in the folder.
Change the name of author in YAML.
Save this file as you lastname_firstname_Assignment4.qmd
Add a table of content
Add your code to the R code chunkes.
Compress (zip) the DS7310_Assignment4_YourFullName folder.
After upload the compress folder, download it, and make sure it is the correct one.
Question 1 (50 points)
- Import the Data_Health_Ass4, remove incomplete rows, and create an interactive table for that.
- Create an interactive histogram for Stroke variable.
#create an interactive histogram
library(ggplot2)
ggplot(
data = Data_Health_Ass4_clean,
aes(x = Stroke))+
geom_histogram(fill = "red", color = "white", bins = 30) +
labs(title = "Histogram of Stroke Rates",
x = "Stroke Rate",
y = "Count")Get a subset of data for “Stroke”,“BloodPressure”,“Depression”, and “HighCholesterol,”
Make stroke prediction models based on different combinations of selecting 2 or 3 variables from BloodPressure, Depression, and HighCholesterol. Call each model from 1 to 4, like Model1, Model3, Model3, and Model4.
Make sure to use as.numeric for the variables, like as.numeric(Stroke), to convert the four variables to a numeric data, such as
Data_Health_Ass4_subset$Stroke <- as.numeric(Data_Health_Ass4_subset$Stroke)
#create subset: "Stroke, blood pressure, depression and high cholesterol
Data_Health_Ass4_subset <- Data_Health_Ass4_clean %>%
select(Stroke, BloodPressure, Depression, HighCholesterol)
#as.numeric for variables
Data_Health_Ass4_subset <- Data_Health_Ass4_subset %>%
mutate(
Stroke = as.numeric(Stroke),
BloodPressure = as.numeric(BloodPressure),
Depression = as.numeric(Depression),
HighCholesterol = as.numeric(HighCholesterol))
#model 1:
model1 <- lm(Stroke ~ BloodPressure + Depression, data = Data_Health_Ass4_subset)
summary(model1)
Call:
lm(formula = Stroke ~ BloodPressure + Depression, data = Data_Health_Ass4_subset)
Residuals:
Min 1Q Median 3Q Max
-1.4076 -0.3004 -0.0246 0.2566 3.2568
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.097974 0.083547 -13.142 <2e-16 ***
BloodPressure 0.143544 0.001607 89.342 <2e-16 ***
Depression -0.003436 0.002960 -1.161 0.246
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.4498 on 2476 degrees of freedom
Multiple R-squared: 0.7696, Adjusted R-squared: 0.7694
F-statistic: 4135 on 2 and 2476 DF, p-value: < 2.2e-16
#model 2:
model2 <- lm(Stroke ~ BloodPressure + HighCholesterol, data = Data_Health_Ass4_subset)
summary(model2)
Call:
lm(formula = Stroke ~ BloodPressure + HighCholesterol, data = Data_Health_Ass4_subset)
Residuals:
Min 1Q Median 3Q Max
-1.3969 -0.2976 -0.0283 0.2517 3.2461
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.103410 0.107445 -10.269 <2e-16 ***
BloodPressure 0.144481 0.002450 58.982 <2e-16 ***
HighCholesterol -0.003030 0.004347 -0.697 0.486
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.4499 on 2476 degrees of freedom
Multiple R-squared: 0.7695, Adjusted R-squared: 0.7693
F-statistic: 4133 on 2 and 2476 DF, p-value: < 2.2e-16
#model 3:
model3 <- lm(Stroke ~ Depression + HighCholesterol, data = Data_Health_Ass4_subset)
summary(model3)
Call:
lm(formula = Stroke ~ Depression + HighCholesterol, data = Data_Health_Ass4_subset)
Residuals:
Min 1Q Median 3Q Max
-2.0490 -0.4289 -0.0697 0.3168 3.7800
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.089804 0.177777 -17.380 < 2e-16 ***
Depression 0.011876 0.004575 2.596 0.00949 **
HighCholesterol 0.191201 0.004407 43.390 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.6968 on 2476 degrees of freedom
Multiple R-squared: 0.4471, Adjusted R-squared: 0.4467
F-statistic: 1001 on 2 and 2476 DF, p-value: < 2.2e-16
#model 4:
model4 <- lm(Stroke ~ BloodPressure + Depression + HighCholesterol, data = Data_Health_Ass4_subset)
summary(model4)
Call:
lm(formula = Stroke ~ BloodPressure + Depression + HighCholesterol,
data = Data_Health_Ass4_subset)
Residuals:
Min 1Q Median 3Q Max
-1.4133 -0.2995 -0.0251 0.2525 3.2495
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.043555 0.119934 -8.701 <2e-16 ***
BloodPressure 0.144722 0.002459 58.858 <2e-16 ***
Depression -0.003330 0.002965 -1.123 0.262
HighCholesterol -0.002754 0.004354 -0.633 0.527
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.4499 on 2475 degrees of freedom
Multiple R-squared: 0.7696, Adjusted R-squared: 0.7693
F-statistic: 2756 on 3 and 2475 DF, p-value: < 2.2e-16
Report the evaluation results of the four combinations in a table, with three columns of of Model variables (e.g., BloodPressure + Depression), model number from 1 to 4, and adjusted RSquared.
Show the report_lm_results with knitr::kable.
First create the empty table using the following code. Add your code to the following code.
report_lm_results <- data.frame(
Variables = character(),
Model = integer(),
Adj_R_squared = numeric(),
stringsAsFactors = FALSE
)
#Example: report_lm_results[1,1] <- "BloodPressure + Depression"
# report_lm_results[1,2] <- 1
# report_lm_results[1,3] <- summary(Model1)$adj.r.squared
#fill in the results for each model
#model_1
report_lm_results[1,1] <- "BloodPressure + Pressure"
report_lm_results[1,2] <- 1
report_lm_results[1,3] <- summary(model1)$adj.r.squared
#model_2
report_lm_results[2,1] <- "BloodPressure + HighCholesterol"
report_lm_results[2,2] <- 2
report_lm_results[2,3] <- summary(model2)$adj.r.squared
#model_3
report_lm_results[3,1] <- "Depression + HighCholesterol"
report_lm_results[3,2] <- 3
report_lm_results[3,3] <- summary(model3)$adj.r.squared
#model_4
report_lm_results[4,1] <- "BloodPressure + Depression + HighCholesterol"
report_lm_results[4,2] <- 4
report_lm_results[4,3] <- summary(model4)$adj.r.squared
#results
knitr:: kable(report_lm_results)| Variables | Model | Adj_R_squared |
|---|---|---|
| BloodPressure + Pressure | 1 | 0.7693899 |
| BloodPressure + HighCholesterol | 2 | 0.7693097 |
| Depression + HighCholesterol | 3 | 0.4466909 |
| BloodPressure + Depression + HighCholesterol | 4 | 0.7693340 |
- Find the summary of the best model.
models <- list(
model1 = model1,
model2 = model2,
model3 = model3,
model4 = model4
)
adj_r2_values <- sapply(models, function(m) summary(m)$adj.r.squared)
adj_r2_values model1 model2 model3 model4
0.7693899 0.7693097 0.4466909 0.7693340
best_model <- names(which.max(adj_r2_values))
best_model[1] "model1"
summary(models[[best_model]])
Call:
lm(formula = Stroke ~ BloodPressure + Depression, data = Data_Health_Ass4_subset)
Residuals:
Min 1Q Median 3Q Max
-1.4076 -0.3004 -0.0246 0.2566 3.2568
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.097974 0.083547 -13.142 <2e-16 ***
BloodPressure 0.143544 0.001607 89.342 <2e-16 ***
Depression -0.003436 0.002960 -1.161 0.246
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.4498 on 2476 degrees of freedom
Multiple R-squared: 0.7696, Adjusted R-squared: 0.7694
F-statistic: 4135 on 2 and 2476 DF, p-value: < 2.2e-16
Write the regression formula for the best model in bold format.
Type the formula here:
Stroke ~ Blood Pressure + Depression + High Cholesterol
Stroke = \(\beta_0\) + \(\beta_1\)(BloodPressure) + \(\beta_2\)(Depression) + \(\beta_3\)(highCholesterol) + \(\epsilon\)
Question 2 (30 points)
Cluster the data using k-means.
Select the best estimate number of clusters other than min.nc and max.nc.
set.seed(123)
#cluster the data
data_kmeans <- Data_Health_Ass4_subset
#scale the data
scaled_data <- scale(data_kmeans)
#Try k = 2 to 5
set.seed(123)
wss <- sapply(2:5, function(k) {
kmeans(scaled_data, centers = k, nstart = 25)$tot.withinss})
best_k <- which.min(wss) + 1
best_k[1] 5
set.seed(123)
kmeans_results <- kmeans(scaled_data, centers = best_k, nstart = 25)
kmeans_resultsK-means clustering with 5 clusters of sizes 854, 417, 349, 437, 422
Cluster means:
Stroke BloodPressure Depression HighCholesterol
1 -0.07891134 -0.0120943 0.07622062 0.1274650
2 -1.02793486 -1.2096735 0.18460734 -1.4544704
3 1.50488683 1.4175571 -0.58091886 0.8902141
4 -0.76757364 -0.7793328 -1.18769930 -0.4591696
5 0.72574243 0.8545104 1.37367678 0.9185577
Clustering vector:
[1] 1 1 3 5 5 3 3 1 3 5 1 3 3 5 5 1 1 3 3 5 5 5 5 3 5 1 3 5 5 5 5 3 3 3 1 5 1
[38] 5 1 5 2 1 3 3 1 3 5 1 1 3 1 5 3 3 1 5 3 1 4 3 3 3 1 5 3 3 5 3 1 2 1 2 2 3
[75] 4 3 1 2 1 4 1 4 5 3 5 2 5 3 1 5 3 2 5 5 5 1 5 1 5 5 5 3 5 2 5 5 5 1 5 5 5
[112] 1 5 5 5 3 5 3 5 3 1 5 5 1 5 5 5 5 3 3 5 3 5 3 5 5 5 5 5 1 5 3 1 5 5 1 5 5
[149] 1 5 2 5 3 5 4 1 2 1 4 4 1 4 2 1 1 4 4 4 2 1 2 4 2 4 3 1 2 2 4 4 1 4 4 1 4
[186] 2 2 2 2 4 4 2 4 2 4 2 1 1 4 4 2 4 1 3 4 1 4 2 2 2 2 2 4 2 2 4 2 4 3 2 1 2
[223] 2 2 2 2 4 2 2 2 2 1 2 4 2 2 2 1 2 2 2 2 1 4 2 1 4 2 2 4 4 2 1 2 2 4 2 2 4
[260] 4 4 2 2 4 2 2 4 4 4 3 2 3 3 5 3 1 1 4 4 3 3 3 1 5 3 2 2 3 1 3 1 3 4 1 3 4
[297] 2 3 1 2 3 4 3 4 3 1 4 3 4 3 3 1 1 3 4 3 3 3 4 3 1 4 3 3 3 3 4 1 4 1 4 3 1
[334] 1 1 3 3 4 1 1 3 1 1 3 1 4 4 3 1 1 5 3 3 3 1 1 3 3 4 2 3 4 4 1 3 3 3 1 3 3
[371] 3 3 1 3 1 1 1 1 4 1 4 1 1 1 1 1 3 3 3 3 3 1 1 1 3 3 1 1 4 3 3 1 3 3 3 3 1
[408] 1 3 3 1 3 3 3 3 1 1 3 3 3 3 3 1 1 3 3 3 3 4 4 4 2 2 2 1 2 4 2 1 2 4 2 2 2
[445] 4 3 1 4 4 1 4 4 4 2 4 4 4 2 4 4 1 2 4 4 1 4 4 1 1 4 4 1 2 1 4 1 4 4 4 2 1
[482] 4 4 1 1 4 4 2 4 2 4 1 4 1 1 4 1 4 1 1 1 4 1 4 3 1 4 4 1 4 1 4 1 1 1 4 4 4
[519] 4 1 2 2 1 2 1 1 1 2 1 1 5 1 1 1 2 1 1 2 5 1 1 1 1 1 1 1 2 2 1 2 1 1 1 1 1
[556] 1 1 1 2 1 2 1 4 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 4 1 1 1 1 1 2 1 1 1 1 1
[593] 1 2 1 1 1 1 1 4 1 1 1 1 1 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
[630] 4 4 4 4 4 4 4 4 4 4 4 4 2 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 2 4 4
[667] 4 4 4 4 4 4 4 1 1 1 4 1 1 4 3 1 1 4 4 1 2 4 3 4 4 4 2 4 1 4 4 1 4 1 4 4 1
[704] 1 4 3 1 4 4 1 4 2 1 4 4 1 4 4 5 5 1 5 5 5 2 5 5 1 5 5 5 1 5 5 1 2 5 5 5 2
[741] 1 5 5 5 1 5 5 2 5 5 1 5 1 5 5 5 5 1 1 5 1 5 1 5 5 5 1 1 5 1 5 5 5 5 5 5 5
[778] 5 1 5 5 5 2 5 5 5 5 5 1 5 1 5 5 5 1 5 2 5 5 5 5 5 5 5 2 5 2 1 1 1 1 5 5 5
[815] 2 5 5 5 5 1 5 5 1 5 5 5 3 1 3 5 5 5 3 5 5 1 3 1 5 5 5 5 5 5 5 5 1 5 1 2 1
[852] 3 3 5 1 1 1 5 5 5 5 5 5 1 3 5 3 5 5 5 5 5 5 5 5 2 5 5 1 5 1 5 1 5 1 1 1 1
[889] 1 1 1 1 5 1 5 1 5 1 1 4 4 4 1 4 1 4 3 4 1 4 4 1 4 4 4 4 1 1 1 1 3 2 1 1 1
[926] 4 4 2 1 2 2 2 4 4 2 2 3 1 1 1 1 5 1 1 1 1 1 1 1 1 1 5 1 5 2 5 1 1 1 1 1 5
[963] 5 2 2 1 2 1 2 2 5 3 2 1 2 5 2 3 1 1 1 1 3 3 1 1 2 1 1 1 4 1 1 1 3 1 1 4 1
[1000] 5 5 3 1 2 3 1 1 1 1 1 1 1 2 1 1 1 2 1 2 2 2 2 2 2 1 2 2 1 1 2 2 2 2 1 1 1
[1037] 2 1 2 2 1 2 1 2 1 2 1 2 1 2 2 1 2 2 1 2 2 2 2 2 1 2 1 1 1 1 2 2 1 2 2 2 2
[1074] 2 2 2 1 1 2 1 2 2 2 2 2 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 3 1 1 3 3 1 3
[1111] 3 3 1 1 3 3 3 3 3 2 4 3 3 3 1 3 3 3 4 3 3 3 3 3 3 3 2 3 1 3 3 1 1 3 4 3 3
[1148] 3 1 3 3 1 1 1 3 1 3 3 3 3 1 3 3 3 3 2 1 1 1 5 1 3 5 2 1 1 1 1 1 2 1 5 1 3
[1185] 1 1 5 4 1 1 1 5 3 5 5 5 3 5 1 1 1 2 1 3 1 3 1 5 1 4 1 1 2 1 1 1 1 1 1 1 1
[1222] 1 1 1 1 3 1 3 1 3 3 1 2 3 3 3 1 1 1 4 1 2 1 1 2 3 1 1 4 1 1 1 3 1 1 3 1 1
[1259] 1 3 1 5 2 2 1 1 1 4 2 2 2 1 4 2 2 2 1 2 1 4 2 2 2 2 3 2 4 2 4 4 4 4 4 4 4
[1296] 4 4 4 4 4 1 4 4 4 4 1 1 1 3 4 4 4 4 2 1 2 1 2 2 2 2 2 2 4 4 4 4 4 4 4 4 4
[1333] 4 4 4 4 4 4 4 4 4 4 4 4 4 1 1 1 2 4 2 1 4 2 3 3 1 3 2 3 4 2 2 1 4 4 3 1 4
[1370] 1 4 4 1 4 4 1 1 1 1 1 2 4 2 1 4 4 1 1 1 4 4 4 2 4 1 2 1 2 1 4 4 1 4 2 2 4
[1407] 1 2 2 4 4 4 4 4 4 2 4 1 1 1 4 4 4 2 4 1 1 1 4 4 1 1 1 3 3 1 1 3 3 3 3 4 1
[1444] 4 1 1 1 3 1 1 3 3 3 1 3 1 2 1 1 1 1 3 2 3 1 1 1 3 1 1 2 3 2 1 1 3 2 3 1 1
[1481] 2 3 1 3 1 1 3 1 3 2 1 3 4 1 1 3 2 2 3 1 1 3 3 2 3 1 3 3 1 1 5 3 3 1 1 5 1
[1518] 1 4 3 4 3 3 2 1 5 3 1 3 4 4 4 2 4 2 2 2 4 4 4 2 4 1 2 2 5 1 1 5 2 2 1 1 2
[1555] 5 1 1 2 1 1 5 5 1 1 1 4 1 2 5 2 1 5 1 2 5 2 2 1 1 1 5 5 1 1 5 1 2 1 5 1 1
[1592] 1 1 1 1 1 2 5 1 1 5 1 5 1 1 3 1 1 1 1 5 2 1 1 1 1 1 5 1 1 1 1 5 1 2 1 5 2
[1629] 5 1 1 2 1 5 1 5 5 5 5 5 2 5 5 5 2 1 5 5 2 5 1 5 1 1 5 5 1 5 5 1 5 5 5 1 1
[1666] 1 5 5 5 5 5 5 5 5 5 1 5 5 5 5 2 5 5 5 5 2 5 5 5 1 1 1 5 5 2 1 1 2 2 2 2 1
[1703] 1 3 2 1 1 2 2 2 1 1 1 2 1 2 2 2 2 2 2 1 2 2 2 2 2 1 2 1 1 1 2 1 1 2 2 1 1
[1740] 2 4 1 1 2 2 1 2 2 4 1 1 1 1 1 1 1 1 1 1 2 2 1 1 2 2 1 1 1 1 4 4 1 4 1 1 2
[1777] 1 1 1 1 1 1 1 2 1 1 2 1 4 1 2 4 4 4 2 4 3 1 3 1 3 3 4 4 3 4 1 1 3 3 3 3 3
[1814] 4 3 3 1 3 4 1 3 1 3 1 1 1 3 4 3 3 3 1 3 3 2 4 3 1 1 3 3 4 4 4 2 4 4 2 4 3
[1851] 4 4 4 3 4 4 4 4 3 4 3 4 4 5 5 5 5 5 5 5 5 5 5 1 5 5 5 5 5 5 2 5 5 5 5 5 5
[1888] 5 5 5 5 5 5 5 1 5 5 5 5 5 5 5 5 5 5 5 1 5 5 5 5 5 5 5 5 5 5 5 5 1 5 5 2 5
[1925] 5 5 5 5 5 5 5 1 2 5 5 5 1 5 5 1 5 2 5 5 5 5 5 5 5 5 2 1 1 2 1 3 4 1 4 3 1
[1962] 4 2 2 1 4 3 1 4 2 1 1 1 1 1 1 4 1 4 1 5 4 2 1 4 1 3 4 1 1 3 3 1 2 1 1 4 1
[1999] 1 1 4 1 3 1 1 1 2 4 4 2 1 1 1 1 4 1 1 4 2 2 4 1 1 1 1 1 1 1 4 1 5 1 4 1 1
[2036] 2 5 1 1 1 1 1 3 4 4 1 1 1 1 5 1 3 1 4 1 4 2 4 3 1 4 1 1 1 4 1 1 3 1 3 1 3
[2073] 2 1 1 4 3 1 4 4 2 1 4 1 2 1 3 1 1 3 1 1 4 1 1 1 4 1 4 1 1 3 5 2 3 4 1 4 1
[2110] 1 3 3 3 1 1 1 1 5 1 1 1 1 1 4 2 1 1 1 2 3 1 1 4 4 5 1 1 2 1 1 4 1 1 1 1 4
[2147] 4 4 1 3 4 1 1 1 2 2 1 2 2 1 2 1 2 2 2 1 2 2 2 2 2 4 2 2 2 2 2 2 2 2 2 2 2
[2184] 2 3 4 5 3 1 5 2 1 1 1 1 3 5 3 1 1 5 3 3 4 1 1 3 5 1 3 4 4 1 1 3 1 1 1 1 5
[2221] 1 3 3 1 1 3 1 4 3 4 1 3 5 4 1 3 1 3 3 3 2 5 4 3 3 3 1 5 3 3 1 2 2 4 5 1 3
[2258] 1 1 1 5 5 1 5 3 1 4 3 3 5 1 5 3 5 5 4 2 5 2 2 1 5 3 3 4 4 3 2 5 1 2 5 2 4
[2295] 4 3 1 2 3 4 1 2 2 1 1 1 1 4 1 2 1 2 1 2 2 1 2 2 2 5 2 2 5 1 1 2 2 2 1 1 1
[2332] 1 5 5 1 2 1 2 1 2 2 1 2 2 2 2 2 5 1 5 5 5 5 5 5 5 1 5 5 5 5 5 5 5 1 5 5 5
[2369] 5 5 5 5 5 5 5 2 5 5 5 1 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 3 1 1 1 2 1 4 4
[2406] 1 2 1 2 1 1 2 2 2 1 1 2 1 1 4 3 1 2 1 2 1 2 1 1 1 1 1 1 1 1 2 4 1 1 2 4 2
[2443] 1 2 4 1 2 2 1 1 1 4 1 4 1 3 4 1 4 4 1 1 2 1 2 4 2 4 4 4 4 2 4 2 4 4 2 2 2
Within cluster sum of squares by cluster:
[1] 867.4192 572.0308 649.0868 518.2114 533.5076
(between_SS / total_SS = 68.3 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss"
[6] "betweenss" "size" "iter" "ifault"
Data_Health_Ass4_subset$cluster <- kmeans_results$cluster
table(Data_Health_Ass4_subset$cluster)
1 2 3 4 5
854 417 349 437 422
kmeans_results$centers Stroke BloodPressure Depression HighCholesterol
1 -0.07891134 -0.0120943 0.07622062 0.1274650
2 -1.02793486 -1.2096735 0.18460734 -1.4544704
3 1.50488683 1.4175571 -0.58091886 0.8902141
4 -0.76757364 -0.7793328 -1.18769930 -0.4591696
5 0.72574243 0.8545104 1.37367678 0.9185577
- Run the model with the best estimate number number of clusters.
#Type your code here- Show the average summary of variables in each cluster.
#Type your code here- Show the states in the healthiest cluster.
#Type your code here