DS7310 Assignment 4

Author

Desyne Martinez

Assignment Guideline (20 Points)

  1. Create a folder for this project and call it as DS7310_Assignment4_YourFullName.

  2. Create a project file in the folder.

  3. Change the name of author in YAML.

  4. Save this file as you lastname_firstname_Assignment4.qmd

  5. Add a table of content

  6. Add your code to the R code chunkes.

  7. Compress (zip) the DS7310_Assignment4_YourFullName folder.

  8. After upload the compress folder, download it, and make sure it is the correct one.

Question 1 (50 points)

  • Import the Data_Health_Ass4, remove incomplete rows, and create an interactive table for that.
#Type your code here
library(psych)
library(readxl)
library(ggplot2)
library(MASS)
library(rstatix)
library(tidyverse)
library(tidymodels)
library(lm.beta)
library(olsrr)
library(car)
library(DT)
library(dplyr)
library(rsample)
library(glmtoolbox)
library(lmtest)
library(DescTools)
library(plotly)
library(knitr)
library(kableExtra)
library(broom)

setwd("C:/Users/Owner/Downloads/DS7130_Assignment4_Martinez_Desyne/")
Healthdata <- read_xlsx("Data_Health_Ass4.xlsx")

ls(Healthdata)
 [1] "Arthritis"       "Asthma"          "BloodPressure"   "Chlamydia"      
 [5] "COPD"            "County"          "Depression"      "Gonorrhea"      
 [9] "Heart"           "HighCholesterol" "HIV"             "Stroke"         
[13] "Syphilis"       
# 1. Select rows where syphillis is not 0 , and return all columns.
Healthdata <- Healthdata[Healthdata$Syphilis > 0, ]


# INTERACTIVE TABLE (Requirement: At least one)
datatable(head(Healthdata, 100), options = list(pageLength = 5), 
          caption = 'Table 1: Interactive view of Health Data')
  • Create an interactive histogram for Stroke variable.
#Type your code here
plot_ly(
  data = Healthdata,
  x = ~Stroke,
  type = "histogram",
  marker = list(color = "gold")
) %>%
  layout(
    title = "Interactive Histogram of Stroke frequency",
    xaxis = list(title = "Stroke count"),
    yaxis = list(title = "Count")
  )
  • Get a subset of data for “Stroke”,“BloodPressure”,“Depression”, and “HighCholesterol,”

  • Make stroke prediction models based on different combinations of selecting 2 or 3 variables from BloodPressure, Depression, and HighCholesterol. Call each model from 1 to 4, like Model1, Model3, Model3, and Model4.

  • Make sure to use as.numeric for the variables, like as.numeric(Stroke), to convert the four variables to a numeric data, such as Data_Health_Ass4_subset$Stroke <- as.numeric(Data_Health_Ass4_subset$Stroke)

#Type your code here
Healthdatasub <-Healthdata %>%
  dplyr::select(Stroke,BloodPressure,Depression,HighCholesterol)
ls(Healthdatasub)
[1] "BloodPressure"   "Depression"      "HighCholesterol" "Stroke"         
Healthdatasub$Stroke <- as.numeric(Healthdatasub$Stroke)
Healthdatasub$BloodPressure <- as.numeric(Healthdatasub$BloodPressure)
Healthdatasub$Depression <- as.numeric(Healthdatasub$Depression)
Healthdatasub$HighCholesterol <- as.numeric(Healthdatasub$HighCholesterol)

model_health1 <- lm(Stroke ~ BloodPressure, 
                  data = Healthdatasub)
model_health2 <- lm(Stroke ~ BloodPressure + Depression, 
                  data = Healthdatasub)
model_health3 <-  lm(Stroke ~ BloodPressure + HighCholesterol, 
                     data = Healthdatasub)
model_health4 <- lm(Stroke ~ BloodPressure + Depression + HighCholesterol, 
                  data = Healthdatasub)
summary(model_health1)

Call:
lm(formula = Stroke ~ BloodPressure, data = Healthdatasub)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.3772 -0.2917 -0.0217  0.2431  3.2682 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)   -1.192938   0.066313  -17.99   <2e-16 ***
BloodPressure  0.143536   0.001753   81.88   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.455 on 1982 degrees of freedom
Multiple R-squared:  0.7718,    Adjusted R-squared:  0.7717 
F-statistic:  6704 on 1 and 1982 DF,  p-value: < 2.2e-16
summary(model_health2)

Call:
lm(formula = Stroke ~ BloodPressure + Depression, data = Healthdatasub)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.3772 -0.2917 -0.0217  0.2431  3.2682 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)   -1.193e+00  9.260e-02 -12.881   <2e-16 ***
BloodPressure  1.435e-01  1.792e-03  80.104   <2e-16 ***
Depression    -3.863e-06  3.293e-03  -0.001    0.999    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.4551 on 1981 degrees of freedom
Multiple R-squared:  0.7718,    Adjusted R-squared:  0.7716 
F-statistic:  3350 on 2 and 1981 DF,  p-value: < 2.2e-16
summary(model_health3)

Call:
lm(formula = Stroke ~ BloodPressure + HighCholesterol, data = Healthdatasub)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.3905 -0.2905 -0.0249  0.2456  3.2522 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     -1.057440   0.123872  -8.537   <2e-16 ***
BloodPressure    0.146194   0.002699  54.159   <2e-16 ***
HighCholesterol -0.006436   0.004970  -1.295    0.195    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.4549 on 1981 degrees of freedom
Multiple R-squared:  0.772, Adjusted R-squared:  0.7718 
F-statistic:  3354 on 2 and 1981 DF,  p-value: < 2.2e-16
summary(model_health4)

Call:
lm(formula = Stroke ~ BloodPressure + Depression + HighCholesterol, 
    data = Healthdatasub)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.3888 -0.2909 -0.0250  0.2449  3.2519 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)     -1.0630752  0.1362474  -7.803 9.74e-15 ***
BloodPressure    0.1461734  0.0027082  53.974  < 2e-16 ***
Depression       0.0003284  0.0033025   0.099    0.921    
HighCholesterol -0.0064744  0.0049862  -1.298    0.194    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.455 on 1980 degrees of freedom
Multiple R-squared:  0.772, Adjusted R-squared:  0.7717 
F-statistic:  2235 on 3 and 1980 DF,  p-value: < 2.2e-16
  • Report the evaluation results of the four combinations in a table, with three columns of of Model variables (e.g., BloodPressure + Depression), model number from 1 to 4, and adjusted RSquared.

  • Show the report_lm_results with knitr::kable.

  • First create the empty table using the following code. Add your code to the following code.

report_lm_results <- data.frame(
  Variables = character(),
  Model = integer(),
  Adj_R_squared = numeric(),
  stringsAsFactors = FALSE
)

#Example: report_lm_results[1,1] <- "BloodPressure + Depression"
# report_lm_results[1,2] <- 1
# report_lm_results[1,3] <- summary(Model1)$adj.r.squared
library(knitr)
# Model 1
report_lm_results[1,1] <- "BloodPressure"
report_lm_results[1,2] <- 1
report_lm_results[1,3] <- summary(model_health1)$adj.r.squared

# Model 2
report_lm_results[2,1] <- "Depression + BloodPressure"
report_lm_results[2,2] <- 2
report_lm_results[2,3] <- summary(model_health2)$adj.r.squared

# Model 3
report_lm_results[3,1] <- "BloodPressure + HighCholesterol"
report_lm_results[3,2] <- 3
report_lm_results[3,3] <- summary(model_health3)$adj.r.squared

# Model 4
report_lm_results[4,1] <- "BloodPressure + Depression + HighCholesterol"
report_lm_results[4,2] <- 4
report_lm_results[4,3] <- summary(model_health4)$adj.r.squared

# Display table
kable(report_lm_results, caption = "Stroke Prediction Model Comparison Table")
Stroke Prediction Model Comparison Table
Variables Model Adj_R_squared
BloodPressure 1 0.7717082
Depression + BloodPressure 2 0.7715930
BloodPressure + HighCholesterol 3 0.7717861
BloodPressure + Depression + HighCholesterol 4 0.7716720
  • Find the summary of the best model.
#Type your code here
summary(model_health3)

Call:
lm(formula = Stroke ~ BloodPressure + HighCholesterol, data = Healthdatasub)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.3905 -0.2905 -0.0249  0.2456  3.2522 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     -1.057440   0.123872  -8.537   <2e-16 ***
BloodPressure    0.146194   0.002699  54.159   <2e-16 ***
HighCholesterol -0.006436   0.004970  -1.295    0.195    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.4549 on 1981 degrees of freedom
Multiple R-squared:  0.772, Adjusted R-squared:  0.7718 
F-statistic:  3354 on 2 and 1981 DF,  p-value: < 2.2e-16
library(broom)
coeffs <- tidy(model_health3)
coeffs
# A tibble: 3 × 5
  term            estimate std.error statistic  p.value
  <chr>              <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)     -1.06      0.124       -8.54 2.71e-17
2 BloodPressure    0.146     0.00270     54.2  0       
3 HighCholesterol -0.00644   0.00497     -1.29 1.95e- 1
  • Write the regression formula for the best model in bold format.

Stroke = -1.0574+ (0.1462 * BloodPressure) + (-0.0064 * HighCholesterol)

Question 2 (30 points)

  • Cluster the Data_Health_Ass4_subsetdata using k-means.

  • Select the best estimate number of clusters other than min.nc and max.nc.

set.seed(123)

#Type your code here
# Load clustering package and sample dataset package
library(cluster)
library(GDAdata)

# Standardize the data so variables with larger scales do not dominate
Health_data_scale <- scale(Healthdatasub)


# Load package for estimating the number of clusters
library(NbClust)

# Estimate the best number of clusters from 2 to 10 using k-means criteria
number_cluster_estimate <- NbClust(
  Health_data_scale,
  distance = "euclidean",
  min.nc = 2,
  max.nc = 10,
  method = "kmeans"
)

*** : The Hubert index is a graphical method of determining the number of clusters.
                In the plot of Hubert index, we seek a significant knee that corresponds to a 
                significant increase of the value of the measure i.e the significant peak in Hubert
                index second differences plot. 
 

*** : The D index is a graphical method of determining the number of clusters. 
                In the plot of D index, we seek a significant knee (the significant peak in Dindex
                second differences plot) that corresponds to a significant increase of the value of
                the measure. 
 
******************************************************************* 
* Among all indices:                                                
* 10 proposed 2 as the best number of clusters 
* 3 proposed 3 as the best number of clusters 
* 5 proposed 4 as the best number of clusters 
* 2 proposed 6 as the best number of clusters 
* 3 proposed 10 as the best number of clusters 

                   ***** Conclusion *****                            
 
* According to the majority rule, the best number of clusters is  2 
 
 
******************************************************************* 
  • Run the model with the best estimate number number of clusters.
#Type your code here
# Show the voting results for the best number of clusters
number_cluster_estimate$Best.nc
                    KL       CH Hartigan     CCC    Scott      Marriot TrCovW
Number_clusters  2.000    2.000   4.0000  2.0000    3.000            6      4
Value_Index     13.869 1545.355 200.7471 -7.0904 1446.913 375903812050 692580
                  TraceW Friedman   Rubin  Cindex     DB Silhouette   Duda
Number_clusters   4.0000   3.0000  6.0000 10.0000 2.0000     2.0000 2.0000
Value_Index     337.2885   2.8911 -0.2037  0.2195 1.1324     0.3635 1.1381
                 PseudoT2   Beale Ratkowsky     Ball PtBiserial Frey McClain
Number_clusters    2.0000  2.0000    2.0000    3.000     4.0000    1  2.0000
Value_Index     -141.4059 -0.2928    0.4497 1038.049     0.5022   NA  0.6063
                   Dunn Hubert SDindex Dindex    SDbw
Number_clusters 10.0000      0   4.000      0 10.0000
Value_Index      0.0269      0   2.013      0  0.4835
  • Show the average summary of variables in each cluster.
#Type your code here

# Run PAM clustering with 5 clusters
# Note: the slides call this k-means, but this function is PAM
kmeans_Health_data_scale_cluster <- pam(Health_data_scale, k = 4)

# Show medoids for the clusters
kmeans_Health_data_scale_cluster$medoids
         Stroke BloodPressure   Depression HighCholesterol
[1,] -0.1806096   -0.06465508 -0.003448993       0.1920344
[2,]  1.3946959    1.35948638 -0.507987401       0.6659103
[3,]  0.6595533    0.77610313  1.131762425       0.7922772
[4,] -0.8107318   -1.09415494 -0.413386450      -0.9136761
# Show the cluster assignment for each row
kmeans_Health_data_scale_cluster$clustering
   [1] 1 1 2 3 3 2 2 1 2 3 1 2 2 3 3 1 3 2 2 3 3 3 3 2 3 1 2 3 3 3 2 2 2 3 3 1 3
  [38] 1 3 4 1 2 2 1 2 3 1 3 2 2 3 2 1 3 2 1 4 2 2 2 1 3 2 2 2 1 4 2 4 2 4 2 2 4
  [75] 1 4 1 4 3 2 3 4 3 2 3 2 1 3 3 3 1 3 1 3 3 3 2 3 4 3 3 3 3 3 3 3 3 3 2 3 2
 [112] 3 2 1 3 3 1 3 3 3 3 2 2 3 2 3 3 3 3 1 3 2 1 3 3 1 3 2 4 3 3 4 1 4 1 4 4 1
 [149] 4 4 4 1 4 1 4 4 1 4 4 4 2 1 4 4 4 4 1 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 1 4 4
 [186] 4 4 1 4 1 4 4 4 4 4 4 4 4 1 4 1 4 4 4 4 4 4 4 1 4 4 4 4 1 4 4 1 4 4 1 4 4
 [223] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 1 4 2 4 2 2 3 1 4 1 2 2 2 1 3 2 4 4 2 1 2 1 1
 [260] 1 2 1 2 1 4 1 4 2 2 1 2 1 2 2 1 1 2 4 2 2 2 1 4 2 2 2 1 1 4 1 4 1 1 2 2 4
 [297] 1 1 2 1 2 1 4 1 2 1 1 3 2 2 1 1 2 2 4 4 2 4 1 2 2 1 2 2 1 2 1 1 1 1 4 1 4
 [334] 1 1 1 1 2 1 2 2 2 2 3 2 2 2 2 2 1 2 1 2 2 2 2 1 1 2 2 2 1 1 2 2 4 4 4 4 4
 [371] 4 1 4 4 4 4 4 4 1 4 1 4 4 4 4 4 4 1 1 1 4 1 4 1 4 4 4 4 1 4 1 1 4 4 4 4 1
 [408] 1 4 1 1 1 4 1 2 1 4 4 1 4 1 4 1 1 4 1 4 4 1 4 4 4 1 4 1 3 1 1 1 4 3 1 4 3
 [445] 1 1 1 1 1 1 4 4 1 4 1 1 1 1 1 4 1 1 1 1 4 1 1 4 1 1 1 1 4 1 1 4 3 1 1 4 1
 [482] 1 1 1 1 4 1 1 1 1 4 4 4 4 1 1 4 1 4 4 4 4 4 1 4 4 4 4 1 4 1 4 4 4 4 4 4 1
 [519] 4 4 4 4 4 1 4 4 4 4 4 4 4 1 1 1 2 1 1 1 4 4 4 4 1 4 1 1 1 4 4 1 4 1 1 1 2
 [556] 4 1 4 1 1 1 1 1 1 3 3 4 3 3 1 3 3 1 3 1 1 3 3 1 3 3 3 3 3 4 3 3 1 3 3 3 3
 [593] 3 1 1 3 3 1 3 3 1 1 1 3 3 3 3 3 3 3 3 3 4 3 3 3 1 3 3 3 1 3 4 3 3 3 3 3 1
 [630] 3 4 1 1 1 3 4 3 3 3 3 1 3 3 1 3 3 3 2 1 2 3 3 3 3 3 1 2 3 3 3 3 3 3 3 3 3
 [667] 1 3 3 1 1 2 2 3 1 1 1 3 3 3 3 3 3 1 2 3 2 3 3 3 3 3 3 3 3 4 3 3 1 3 1 3 1
 [704] 3 1 1 1 1 1 1 1 3 1 4 3 4 4 4 1 4 1 1 2 4 4 4 4 4 1 1 2 1 1 2 4 1 1 1 1 4
 [741] 4 1 4 4 4 4 4 4 1 1 1 1 1 1 1 1 1 1 4 1 1 1 1 3 1 4 4 4 1 4 4 2 1 1 1 4 1
 [778] 4 1 1 4 1 1 1 1 4 1 4 1 1 1 1 1 4 1 4 4 4 4 4 4 4 4 1 4 4 1 4 4 4 4 4 4 4
 [815] 1 1 4 1 1 4 1 4 4 4 4 1 4 4 4 4 4 4 4 4 1 4 4 4 4 2 1 2 2 2 2 2 2 2 2 2 2
 [852] 2 2 1 4 2 1 1 2 2 1 2 2 1 2 2 2 2 4 1 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 4 2 1
 [889] 2 2 1 1 2 4 2 2 2 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 4 3 1 3 1 2 3 4 1 3 1 1 2
 [926] 4 1 3 1 2 1 4 1 1 3 2 3 3 3 2 3 1 4 2 3 2 1 3 3 4 1 1 4 1 1 1 1 1 3 1 1 1
 [963] 2 2 1 2 2 1 2 2 1 1 1 1 1 4 1 1 4 1 4 1 1 2 1 2 1 1 3 2 1 3 4 1 1 4 1 4 4
[1000] 4 1 4 4 4 4 4 4 2 4 4 4 4 4 1 4 4 4 4 4 4 1 4 1 4 4 1 2 4 1 1 4 1 4 4 4 4
[1037] 4 4 4 1 4 4 4 1 1 4 4 4 4 4 4 4 4 4 4 1 4 4 4 4 4 1 1 2 4 4 4 1 4 2 2 1 4
[1074] 2 1 4 4 4 1 4 2 1 1 1 1 4 4 4 1 1 1 1 1 1 1 1 4 4 1 1 1 4 1 4 4 4 1 4 1 4
[1111] 4 1 1 4 4 4 1 4 4 4 4 4 4 4 4 4 4 1 1 4 4 4 4 1 1 4 1 1 1 2 3 1 2 2 2 2 4
[1148] 1 4 1 1 1 2 1 1 2 2 2 1 2 1 4 1 1 1 1 2 4 2 1 1 1 1 2 4 2 4 1 1 2 4 1 4 2
[1185] 1 2 1 3 2 2 4 1 2 1 1 1 2 4 4 2 1 1 2 2 4 2 1 2 2 3 1 3 2 2 1 1 3 3 4 2 4
[1222] 2 2 4 1 3 2 1 2 1 4 4 4 4 4 4 1 4 4 4 4 4 3 1 1 3 4 1 1 1 3 1 1 1 1 1 3 3
[1259] 1 1 4 1 1 3 4 3 1 4 3 1 4 1 3 1 3 3 1 1 3 1 4 1 3 1 1 1 1 1 1 1 4 3 1 3 1
[1296] 3 3 1 2 3 1 3 4 1 1 3 1 3 1 1 1 1 3 1 4 1 3 4 3 1 1 4 1 3 3 3 3 3 3 4 3 3
[1333] 3 4 1 3 3 4 3 1 3 3 3 3 3 1 3 3 3 3 3 1 3 1 3 3 3 3 3 3 3 1 3 3 3 3 4 3 3
[1370] 3 3 1 3 3 3 1 1 1 3 3 1 1 4 4 4 4 1 1 2 4 1 1 4 4 4 1 1 1 4 3 4 4 4 4 4 4
[1407] 1 4 4 4 4 4 1 4 1 1 1 4 1 1 4 4 1 1 4 4 1 1 4 4 4 4 1 1 1 1 1 1 4 4 1 1 4
[1444] 4 4 1 1 4 4 4 1 4 1 1 1 1 1 1 4 1 4 1 4 1 4 4 4 4 4 4 2 1 2 1 2 2 4 4 2 4
[1481] 1 2 2 2 2 2 2 1 2 2 2 2 4 1 2 1 2 1 1 2 2 4 2 2 2 1 2 2 4 4 2 1 1 2 2 4 4
[1518] 4 4 4 2 1 1 4 2 4 4 4 4 2 4 2 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 3 3 3 3
[1555] 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 1 3 3 4 3 3 3 3 3
[1592] 3 3 4 3 3 3 1 3 3 1 3 3 3 3 3 3 4 1 1 4 1 2 1 1 4 1 4 4 4 1 4 4 1 1 1 1 1
[1629] 1 1 1 1 3 4 1 1 2 4 1 2 1 4 4 1 1 1 4 1 2 1 4 4 4 4 1 1 1 4 1 1 4 4 4 1 1
[1666] 1 1 1 1 4 1 1 4 1 4 3 1 3 1 1 1 1 1 1 1 1 3 1 1 4 1 4 4 1 2 4 3 1 1 1 2 1
[1703] 2 1 2 4 1 1 4 4 1 4 4 2 1 1 1 1 1 1 1 1 1 1 4 2 4 1 1 2 1 1 1 1 1 1 4 4 1
[1740] 1 4 4 2 3 4 3 1 4 4 1 4 1 1 3 1 4 1 4 1 2 4 1 1 4 4 1 4 1 4 4 4 4 4 4 4 4
[1777] 4 4 4 4 4 3 4 1 1 1 2 2 1 1 1 1 4 4 1 2 1 1 1 1 2 1 1 2 1 1 4 1 3 4 1 2 3
[1814] 2 4 1 2 2 2 1 2 4 4 4 2 1 3 1 3 3 3 3 1 4 2 3 3 4 4 3 4 4 1 3 2 4 2 4 3 1
[1851] 4 3 4 4 4 2 1 4 2 1 4 1 1 1 1 4 1 4 1 4 4 4 3 4 1 4 4 4 3 1 1 4 4 4 1 1 1
[1888] 1 3 3 4 1 4 1 4 4 4 4 4 4 4 3 1 3 3 3 3 1 3 3 3 3 3 1 3 3 3 3 3 3 3 4 3 3
[1925] 1 3 3 3 3 3 3 3 3 1 4 1 4 1 4 4 1 1 4 4 4 1 4 1 4 1 4 1 4 4 1 1 1 1 4 1 4
[1962] 4 4 1 4 1 4 4 1 4 1 4 4 4 4 1 4 4 4 4 4 4 4 4
# Plot the clustering result in two reduced dimensions
plot(kmeans_Health_data_scale_cluster)

# Add the assigned cluster to the original Health
Health_data_cluster <- Healthdatasub %>%
  mutate(cluster = kmeans_Health_data_scale_cluster$clustering)

# Show the dataset with assigned clusters
Health_data_cluster
# A tibble: 1,984 × 5
   Stroke BloodPressure Depression HighCholesterol cluster
    <dbl>         <dbl>      <dbl>           <dbl>   <int>
 1    3.7          40.6       23.8            35.9       1
 2    3.7          38.7       23.5            38.9       1
 3    6.1          48.9       22.8            39.4       2
 4    4.6          42.3       25.9            38.6       3
 5    4.3          40.5       26.8            40         3
 6    7.2          52.4       21.6            37.9       2
 7    5.7          47.7       22.8            39.8       2
 8    4.2          41.3       24.6            37.4       1
 9    5.6          46.8       24.2            39.5       2
10    4.6          44.3       25              40.9       3
# ℹ 1,974 more rows
# Compute the average of each variable by cluster
Health_cluster_summary <- Health_data_cluster %>%
  group_by(cluster) %>%
  summarise(across(everything(), ~ mean(.x, na.rm = TRUE)))

# Show cluster summaries to compare 
Health_cluster_summary
# A tibble: 4 × 5
  cluster Stroke BloodPressure Depression HighCholesterol
    <int>  <dbl>         <dbl>      <dbl>           <dbl>
1       1   4.04          37.1       23.6            36.9
2       2   5.58          45.5       21.9            39.1
3       3   4.85          42.1       28.0            39.3
4       4   3.26          31.2       22.4            33.2
  • Show the counties in the healthiest cluster.
#Type your code here

Health_data_cluster[Health_data_cluster$cluster == 4, ]
# A tibble: 631 × 5
   Stroke BloodPressure Depression HighCholesterol cluster
    <dbl>         <dbl>      <dbl>           <dbl>   <int>
 1    3.3          34.8       25.2            32.4       4
 2    2.8          36.8       23.2            35.5       4
 3    3.4          25.2       22.5            27.3       4
 4    3.7          29.7       24.1            32.8       4
 5    3.2          29.3       20.6            33.3       4
 6    3.6          31.4       24.1            34.1       4
 7    4            31.1       18.8            35.3       4
 8    4.4          33.3       19.5            35         4
 9    2.9          32.6       23.8            33.3       4
10    3.2          33.6       27.5            32.9       4
# ℹ 621 more rows
Healthdata$cluster <- Health_data_cluster$cluster

Healthdataclustersub <-Healthdata %>%
  dplyr::select(County,Stroke,BloodPressure,Depression,HighCholesterol, cluster)
Healthdataclustersub[Healthdataclustersub$cluster == 4, ]
# A tibble: 631 × 6
   County       Stroke BloodPressure      Depression HighCholesterol    cluster
   <chr>         <dbl> <chr>                   <dbl> <chr>                <int>
 1 Lee,AL          3.3 34.799999999999997       25.2 32.4                     4
 2 Shelby,AL       2.8 36.799999999999997       23.2 35.5                     4
 3 Coconino,AZ     3.4 25.2                     22.5 27.3                     4
 4 Graham,AZ       3.7 29.7                     24.1 32.799999999999997       4
 5 Maricopa,AZ     3.2 29.3                     20.6 33.299999999999997       4
 6 Pima,AZ         3.6 31.4                     24.1 34.1                     4
 7 SantaCruz,AZ    4   31.1                     18.8 35.299999999999997       4
 8 Yuma,AZ         4.4 33.299999999999997       19.5 35                       4
 9 Benton,AR       2.9 32.6                     23.8 33.299999999999997       4
10 Faulkner,AR     3.2 33.6                     27.5 32.9                     4
# ℹ 621 more rows