Module 8 Content Review Group Work

Review - Cleaning and Recoding Survey Variables

Load in a few packages

Code
library(corrplot)      # easy correlation matrices
library(tidyverse)     # data import/manipulation (read_csv, mutate, select) and ggplot2 plots
library(tidymodels)    # tidy(), glance() and augment() summaries of the fitted k-means models
library(NbClust)       # determine optimal no. of clusters
library(psych)         # descriptive statistics via describe()
library(standardize)   # easy standardization
Code
library(haven)

# Import the engagement survey export (a CSV despite the "xlsx" in its name).
engage <- readr::read_csv(file = "engage.xlsx - Sheet1.csv")
Code
# Descriptive statistics for the raw behavioral engagement scores.
psych::describe(engage[["behavioral_eng"]])
   vars    n mean   sd median trimmed  mad  min  max range skew kurtosis   se
X1    1 1000 2.75 0.97   2.61    2.71 1.12 0.55 5.29  4.75 0.31    -0.78 0.03
Code
# Descriptive statistics for the raw affective engagement scores.
psych::describe(engage[["affective_eng"]])
   vars    n mean   sd median trimmed  mad  min max range skew kurtosis   se
X1    1 1000 3.27 0.96   3.42     3.3 1.08 0.57 5.5  4.94 -0.3    -0.79 0.03
Code
# Descriptive statistics for the raw cognitive engagement scores.
psych::describe(engage[["cognitive_eng"]])
   vars    n mean   sd median trimmed  mad  min  max range skew kurtosis   se
X1    1 1000 3.71 1.52   3.32    3.62 1.48 0.66 7.53  6.87  0.5    -0.94 0.05
Code
# Descriptive statistics for the raw social engagement scores.
psych::describe(engage[["social_eng"]])
   vars    n mean   sd median trimmed  mad min max range skew kurtosis   se
X1    1 1000 4.39 1.39    4.5    4.42 1.67 1.2 7.2     6 -0.2    -1.01 0.04
Code
# Compact overview of the data: one line per column with its type and
# the first few values.
engage %>%
  glimpse()
Rows: 1,000
Columns: 4
$ behavioral_eng <dbl> 2.885871, 1.707288, 4.653985, 2.786255, 1.753118, 4.340…
$ affective_eng  <dbl> 5.137745, 1.956721, 3.843508, 4.226747, 4.006949, 4.640…
$ cognitive_eng  <dbl> 2.203054, 2.648275, 6.171584, 3.334334, 2.324768, 4.659…
$ social_eng     <dbl> 4.711175, 2.346699, 5.468555, 3.660912, 5.777080, 6.823…
Code
# Density curve of the raw behavioral engagement scores.
engage %>%
  ggplot(aes(x = behavioral_eng)) +
  geom_density(fill = "purple", color = "black") +
  labs(
    x = "Behavioral Engagement Score",
    title = "Density Plot of Behavioral Engagement"
  )

Code
# Density curve of the raw affective engagement scores.
engage %>%
  ggplot(aes(x = affective_eng)) +
  geom_density(fill = "forest green", color = "black") +
  labs(
    x = "Affective Engagement Score",
    title = "Density Plot of Affective Engagement"
  )

Code
# Density curve of the raw cognitive engagement scores.
# The title now names the variable that is actually plotted (it previously
# read "Behavioral Engagement", copied over from the first plot).
ggplot(data = engage, mapping = aes(x = cognitive_eng)) +
  geom_density(color = "black", fill = "red") +
  labs(
    title = "Density Plot of Cognitive Engagement",
    x = "Cognitive Engagement Score"
  )

Code
# Density curve of the raw social engagement scores.
engage %>%
  ggplot(aes(x = social_eng)) +
  geom_density(fill = "blue", color = "black") +
  labs(
    x = "Social Engagement Score",
    title = "Density Plot of Social Engagement"
  )

Code
# Add a z-scored (mean 0, sd 1) copy of each engagement measure, named
# `<column>_std`.
#
# scale() returns a one-column matrix carrying "scaled:center"/"scaled:scale"
# attributes; as.numeric() strips it back to a plain numeric vector so the new
# columns behave like ordinary tibble columns in ggplot2, cor(), kmeans(), etc.
# Columns are referenced through across() rather than `engage$...`, so the
# computation always uses the data flowing through the pipe.
engage <- engage %>%
  mutate(
    across(
      c(behavioral_eng, affective_eng, cognitive_eng, social_eng),
      function(score) as.numeric(scale(score)),
      .names = "{.col}_std"
    )
  )

# Sanity check: the standardized behavioral scores should have mean 0 and sd 1.
describe(engage$behavioral_eng_std)
   vars    n mean sd median trimmed  mad   min  max range skew kurtosis   se
X1    1 1000    0  1  -0.14   -0.04 1.16 -2.27 2.63   4.9 0.31    -0.78 0.03
Code
# Descriptive statistics for the standardized affective engagement scores.
psych::describe(engage[["affective_eng_std"]])
   vars    n mean sd median trimmed  mad  min  max range skew kurtosis   se
X1    1 1000    0  1   0.16    0.04 1.12 -2.8 2.32  5.12 -0.3    -0.79 0.03
Code
# Descriptive statistics for the standardized cognitive engagement scores.
psych::describe(engage[["cognitive_eng_std"]])
   vars    n mean sd median trimmed  mad min  max range skew kurtosis   se
X1    1 1000    0  1  -0.25   -0.06 0.97  -2 2.51  4.51  0.5    -0.94 0.03
Code
# Descriptive statistics for the standardized social engagement scores.
psych::describe(engage[["social_eng_std"]])
   vars    n mean sd median trimmed mad   min  max range skew kurtosis   se
X1    1 1000    0  1   0.07    0.02 1.2 -2.29 2.01   4.3 -0.2    -1.01 0.03
Code
library(ggcorrplot)

# Keep only the four standardized engagement scores and drop any row with a
# missing value; this table is the input for every clustering step below.
corr <- na.omit(
  select(
    engage,
    all_of(c(
      "behavioral_eng_std",
      "affective_eng_std",
      "cognitive_eng_std",
      "social_eng_std"
    ))
  )
)

# Pairwise Pearson correlations between the standardized measures.
corr_engage <- cor(corr)

print(corr_engage)
                   behavioral_eng_std affective_eng_std cognitive_eng_std
behavioral_eng_std          1.0000000         0.3587287         0.8130752
affective_eng_std           0.3587287         1.0000000         0.4136730
cognitive_eng_std           0.8130752         0.4136730         1.0000000
social_eng_std              0.5369821         0.7768180         0.6134703
                   social_eng_std
behavioral_eng_std      0.5369821
affective_eng_std       0.7768180
cognitive_eng_std       0.6134703
social_eng_std          1.0000000
Code
# Agglomerative hierarchical clustering of respondents: Euclidean distances
# between rows of `corr`, merged with Ward's minimum-variance criterion.
# The distance call is kept inline so the stored/printed call is unchanged.
hclusts <- hclust(
  d = dist(corr, method = "euclidean"),
  method = "ward.D2"
)
print(hclusts)

Call:
hclust(d = dist(corr, method = "euclidean"), method = "ward.D2")

Cluster method   : ward.D2 
Distance         : euclidean 
Number of objects: 1000 
Code
# Draw the dendrogram of the Ward clustering.
plot(x = hclusts)

Code
# Evaluate 2-15 cluster solutions under Ward linkage and let NbClust's
# indices vote on the best number of clusters (package defaults spelled out).
wardclust <- NbClust(
  data = corr,
  distance = "euclidean",
  min.nc = 2,
  max.nc = 15,
  method = "ward.D2"
)

*** : The Hubert index is a graphical method of determining the number of clusters.
                In the plot of Hubert index, we seek a significant knee that corresponds to a 
                significant increase of the value of the measure i.e the significant peak in Hubert
                index second differences plot. 
 

*** : The D index is a graphical method of determining the number of clusters. 
                In the plot of D index, we seek a significant knee (the significant peak in Dindex
                second differences plot) that corresponds to a significant increase of the value of
                the measure. 
 
******************************************************************* 
* Among all indices:                                                
* 5 proposed 2 as the best number of clusters 
* 5 proposed 3 as the best number of clusters 
* 10 proposed 4 as the best number of clusters 
* 1 proposed 9 as the best number of clusters 
* 1 proposed 10 as the best number of clusters 
* 1 proposed 15 as the best number of clusters 

                   ***** Conclusion *****                            
 
* According to the majority rule, the best number of clusters is  4 
 
 
******************************************************************* 
Code
# Redraw the dendrogram and outline the four-cluster cut in purple.
plot(x = hclusts)
rect.hclust(tree = hclusts, k = 4, border = "purple")

Code
# Repeat the cluster-count vote using average linkage.
# Stored under its own name: assigning this to `wardclust` would overwrite the
# Ward-linkage result with an object whose name no longer matches its method.
averageclust <- NbClust(data = corr, method = "average")

*** : The Hubert index is a graphical method of determining the number of clusters.
                In the plot of Hubert index, we seek a significant knee that corresponds to a 
                significant increase of the value of the measure i.e the significant peak in Hubert
                index second differences plot. 
 

*** : The D index is a graphical method of determining the number of clusters. 
                In the plot of D index, we seek a significant knee (the significant peak in Dindex
                second differences plot) that corresponds to a significant increase of the value of
                the measure. 
 
******************************************************************* 
* Among all indices:                                                
* 4 proposed 2 as the best number of clusters 
* 10 proposed 3 as the best number of clusters 
* 2 proposed 4 as the best number of clusters 
* 4 proposed 5 as the best number of clusters 
* 1 proposed 6 as the best number of clusters 
* 1 proposed 9 as the best number of clusters 
* 1 proposed 15 as the best number of clusters 

                   ***** Conclusion *****                            
 
* According to the majority rule, the best number of clusters is  3 
 
 
******************************************************************* 

Approach #2: K-means clustering

Code
library(tidymodels)

# kmeans() chooses its starting centers at random, so fix the RNG seed first;
# otherwise cluster labels, centers and the plots built from them can change
# on every run.
set.seed(123)

# Fit k-means for k = 2, 3 and 4 and keep three summaries of each fit:
#   tidied    - one row per cluster (centers, size, within-cluster SS)
#   glanced   - one row per model (overall fit statistics)
#   augmented - the input rows with their assigned `.cluster`
kclusts <-
  tibble(k = 2:4) %>%
  mutate(
    kclust = map(k, function(n_centers) kmeans(corr, centers = n_centers)),
    tidied = map(kclust, tidy),
    glanced = map(kclust, glance),
    augmented = map(kclust, augment, corr)
  )
Code
# Flatten each list-column of `kclusts` into its own long table.

# One row per cluster center, for every k.
clusters <- unnest(kclusts, cols = tidied)

# One row per respondent with its cluster assignment, for every k.
assignments <- unnest(kclusts, cols = augmented)

# One row of model-level fit statistics per k.
clusterings <- unnest(kclusts, cols = glanced)
Code
# Scatter of affective vs. cognitive engagement (standardized), coloured by
# assigned cluster, with one panel per value of k.
p1 <- assignments %>%
  ggplot(aes(x = affective_eng_std, y = cognitive_eng_std)) +
  geom_jitter(mapping = aes(color = .cluster), alpha = 0.8) +
  facet_wrap(vars(k))
print(p1)

Code
# Overlay each cluster's center on the scatter as a large "x".
p2 <- p1 +
  geom_point(
    data = clusters,
    shape = "x",
    size = 10
  )
print(p2)

Code
# Let NbClust's indices vote on the best number of clusters for k-means.
# k-means starts from random centers, so seed the RNG to make the vote
# repeatable from run to run.
set.seed(123)
kmeansclust <- NbClust(data = corr, method = "kmeans")

*** : The Hubert index is a graphical method of determining the number of clusters.
                In the plot of Hubert index, we seek a significant knee that corresponds to a 
                significant increase of the value of the measure i.e the significant peak in Hubert
                index second differences plot. 
 

*** : The D index is a graphical method of determining the number of clusters. 
                In the plot of D index, we seek a significant knee (the significant peak in Dindex
                second differences plot) that corresponds to a significant increase of the value of
                the measure. 
 
******************************************************************* 
* Among all indices:                                                
* 1 proposed 2 as the best number of clusters 
* 8 proposed 3 as the best number of clusters 
* 11 proposed 4 as the best number of clusters 
* 1 proposed 7 as the best number of clusters 
* 2 proposed 15 as the best number of clusters 

                   ***** Conclusion *****                            
 
* According to the majority rule, the best number of clusters is  4 
 
 
*******************************************************************