Review - Cleaning and Recoding Survey Variables

Load in a few packages

library(corrplot)      #easy correlation matrices

corrplot 0.84 loaded

library(tidyverse)     #data manipulation

Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
── Attaching packages ───────────────────────────────────────────── tidyverse 1.3.0 ──
✓ ggplot2 3.3.3     ✓ purrr   0.3.4
✓ tibble  3.0.6     ✓ dplyr   1.0.4
✓ tidyr   1.1.2     ✓ stringr 1.4.0
✓ readr   1.4.0     ✓ forcats 0.5.1
── Conflicts ──────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()

library(tidymodels)    #easy visualizations of clusters

── Attaching packages ──────────────────────────────────────────── tidymodels 0.1.2 ──
✓ broom     0.7.5      ✓ recipes   0.1.15
✓ dials     0.0.9      ✓ rsample   0.0.9 
✓ infer     0.5.4      ✓ tune      0.1.2 
✓ modeldata 0.1.0      ✓ workflows 0.2.1 
✓ parsnip   0.1.5      ✓ yardstick 0.0.7 
── Conflicts ─────────────────────────────────────────────── tidymodels_conflicts() ──
x scales::discard() masks purrr::discard()
x dplyr::filter()   masks stats::filter()
x recipes::fixed()  masks stringr::fixed()
x dplyr::lag()      masks stats::lag()
x yardstick::spec() masks readr::spec()
x recipes::step()   masks stats::step()

library(NbClust)       #determine optimal no. of clusters
library(psych)         #descriptive statistics


Attaching package: ‘psych’

The following objects are masked from ‘package:scales’:

    alpha, rescale

The following objects are masked from ‘package:ggplot2’:

    %+%, alpha

library(standardize)   #easy standardization


 *********************************************************** 
          Loading standardize package version 0.2.2          
     Call standardize.news() to see new features/changes     
 ***********************************************************

library(haven)
cluster <- read_dta("ELS_Cluster_500.dta")

SES Composite Score

Let’s check the variable labels and get a breakdown of values:

describe(cluster$byses1)

ABCDEFGHIJ0123456789

	vars <dbl>	n <dbl>	mean <dbl>	sd <dbl>	median <dbl>	trimmed <dbl>	mad <dbl>	min <dbl>	max <dbl>
X1	1	500	-0.37	1.73	-0.02	-0.37	0.93	-8	1.8

str(cluster$byses1)

 dbl+lbl [1:500] -0.95,  0.55, -8.00,  0.95,  1.37,  0.12,  1.06,  0.65,  0.18,  ...
 @ label       : chr "Socio-economic status composite, v.1"
 @ format.stata: chr "%12.2g"
 @ labels      : Named num [1:2] -8 -4
  ..- attr(*, "names")= chr [1:2] "{Survey component legitimate skip/NA}" "{Nonrespondent}"

Recode missing, legitimate skips, partial interviews, and nonrespondents as NA in the whole dataset

cluster.clean <- cluster %>%
  na_if(., -9) %>%
  na_if(., -8) %>%
  na_if(., -7) %>%
  na_if(., -4)

glimpse(cluster.clean)

Rows: 500
Columns: 55
$ stu_id   <dbl> 278108, 429101, 195207, 408224, 352115, 276213, 227207, 455121, 117…
$ sch_id   <dbl> 2781, 4291, 1952, 4082, 3521, 2762, 2272, 4551, 1172, 3561, 4341, 1…
$ strat_id <dbl> 278, 429, 195, 408, 352, 276, 227, 455, 117, 356, 434, 160, 417, 14…
$ psu      <dbl+lbl> 1, 1, 2, 2, 1, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, …
$ f1sch_id <dbl+lbl> 2781, 4291, 1952, 4082, 3521, 2762, 2272, 4551, 1172,   NA, 434…
$ f1univ1  <dbl+lbl> 101, 101, 120, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101…
$ f1univ2a <dbl+lbl> 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ f1univ2b <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 4, 5, 7, 1, 1, …
$ f2univ_p <dbl+lbl> 102, 102, 119, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101…
$ bystuwt  <dbl+lbl> 338.0031,  58.1918,   0.0000,  65.7478,  74.7694, 274.6407, 292…
$ bysex    <dbl+lbl>  2,  2,  1,  1,  2,  1,  1,  1,  1,  2,  1,  1,  1,  1,  2,  1,…
$ byrace   <dbl+lbl>  4,  5, NA,  7,  2,  7,  7,  7,  4,  7,  7,  2,  7,  3,  7,  2,…
$ bystlang <dbl+lbl>  0,  1, NA,  1,  0,  1,  1,  1,  0,  1,  1,  0,  1,  1,  1,  1,…
$ byses1   <dbl+lbl> -0.95,  0.55,    NA,  0.95,  1.37,  0.12,  1.06,  0.65,  0.18, …
$ byses1qu <dbl+lbl>  1,  4, NA,  4,  4,  3,  4,  4,  3,  4,  4,  1,  4,  2,  3,  2,…
$ bygrdrpt <dbl+lbl>  0,  0,  1,  0,  0, 98,  0,  0,  0,  0,  0, 98,  0,  0,  0,  0,…
$ byhomlit <dbl+lbl>  0, NA, NA,  3,  3,  1,  3,  3,  2,  3,  1,  0,  3,  3,  3,  3,…
$ byparasp <dbl+lbl> 5, 7, 5, 6, 7, 5, 3, 7, 3, 5, 6, 5, 5, 5, 5, 6, 2, 5, 2, 5, 6, …
$ byiepflg <dbl+lbl>  1, NA, NA,  0,  0, NA,  0, NA,  0,  0,  0,  1, NA,  0,  0,  0,…
$ bytxcstd <dbl+lbl> 35.61, 57.20,    NA, 53.53, 71.03, 58.91, 54.09, 54.44, 25.92, …
$ bytxcqu  <dbl+lbl>  1,  3, NA,  3,  4,  4,  3,  3,  1,  3,  4,  1,  4,  2,  2,  2,…
$ bywrtnga <dbl+lbl>     NA,  1.951,  0.052,     NA,  1.951,  1.001, -1.847,  1.654,…
$ byxtracu <dbl+lbl>  0,  2, NA,  3,  5,  0,  0,  1,  0,  0,  0,  1,  2,  0,  0,  0,…
$ byhmwrk  <dbl+lbl>  8, 10, NA, 15, 11, 28,  4,  5, 10, 11,  9, 10, 13,  5,  3,  5,…
$ bytvvigm <dbl+lbl> 99,  0, NA,  6,  1,  5, NA,  2,  6,  1,  2,  0,  8,  5,  1,  2,…
$ f1qwt    <dbl+lbl>  398.7574,   62.9523,  139.3723,   62.9402,   71.8370,  294.149…
$ f1pnlwt  <dbl+lbl> 362.2636,  65.4358,   0.0000,  62.9030,  72.8376, 294.3836, 286…
$ f1trscwt <dbl+lbl> 306.8801,  66.4096, 150.3712,  61.8688,  98.8436, 305.3489, 282…
$ f2qtscwt <dbl+lbl>   0.0000,   0.0000, 191.7053,  62.0716,  75.9791, 300.6391, 299…
$ f2qwt    <dbl+lbl>   0.0000,   0.0000, 169.1791,  61.9468,  74.4226, 300.8338, 291…
$ f2f1wt   <dbl+lbl>   0.0000,   0.0000, 180.9959,  63.4727,  77.2253, 337.8765, 295…
$ f2bywt   <dbl+lbl>   0.0000,   0.0000, 167.0404,  60.7443,  73.6182, 297.8983, 286…
$ bys14    <dbl+lbl>  2,  2, NA,  1,  2,  1,  1,  1,  1,  2,  1,  1,  1,  1,  2,  1,…
$ bys15    <dbl+lbl>  1,  1, NA,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,…
$ bys24d   <dbl+lbl>  1,  1, NA,  2,  1,  1, NA,  2,  1,  1,  1,  1,  2,  2,  2,  2,…
$ bys24e   <dbl+lbl>  1,  1, NA,  1,  1,  1, NA,  1,  1,  1,  1,  1,  1,  1,  2,  1,…
$ bys24f   <dbl+lbl>  1,  1, NA,  1,  1,  1, NA,  1,  1,  1,  1,  1,  1,  1,  2,  1,…
$ bys33a   <dbl+lbl>  0,  0, NA,  0,  0,  0, NA,  0, NA,  1,  1,  0,  1,  1,  0,  0,…
$ bys33b   <dbl+lbl>  0,  0, NA,  0,  0,  0, NA,  0, NA,  1,  0,  0,  0,  0,  0,  0,…
$ bys43    <dbl+lbl>  1,  0, NA,  0,  2, 10,  2,  5,  0,  3,  9,  2,  2,  3,  0,  0,…
$ bys62a   <dbl+lbl> NA, -3, NA, -3, -3,  0, -3, -3, NA, -3, -3,  0, -3, -3, -3, -3,…
$ bys62g   <dbl+lbl> NA, -3, NA, -3, -3,  1, -3, -3, NA, -3, -3,  0, -3, -3, -3, -3,…
$ bys66a   <dbl+lbl> NA,  1, NA,  1,  1,  6, NA,  1,  1,  1,  1,  6,  1,  1,  1,  1,…
$ bys66b   <dbl+lbl> NA,  1, NA, -3,  1,  6, NA,  1,  1,  1,  1, NA,  1,  1,  1,  1,…
$ bys66c   <dbl+lbl> NA, -1, NA,  6,  1,  6, NA, -1,  1,  1, -1,  6,  1,  1,  1,  6,…
$ bys66f   <dbl+lbl> NA,  1, NA,  1,  1,  1, NA,  1, -1,  1, -1,  6,  1,  1,  1, -1,…
$ bys66g   <dbl+lbl> NA,  1, NA, -3, -3, -1, NA,  1, -1,  1, -1,  6,  1,  1,  1, -1,…
$ bys67    <dbl+lbl>  0, NA, NA,  1,  0,  1,  1,  1,  0,  1,  1,  0,  1,  1,  1,  1,…
$ bys71d   <dbl+lbl>  0, NA, NA,  0,  0,  0, NA,  0,  0, NA,  0,  0,  0,  0,  0,  0,…
$ bys71e   <dbl+lbl>  0, NA, NA,  1,  1,  0, NA,  0,  0, NA,  0,  0,  0,  0,  0,  0,…
$ bys83a   <dbl+lbl>  6, NA, NA,  6,  5, -1,  6,  7,  6,  7,  7, -3,  6,  7,  6,  4,…
$ bys83b   <dbl+lbl>  6, NA, NA, -1,  8, -1,  8,  6,  1,  7,  6, NA,  6,  7,  6,  5,…
$ bys87a   <dbl+lbl>  2, NA, NA,  2,  3,  1, NA,  2,  2,  4,  3,  3,  2,  3,  2,  3,…
$ bys87e   <dbl+lbl>  3, NA, NA,  3,  2,  1, NA,  2,  3,  1,  1,  3,  4,  3,  3,  3,…
$ bys87f   <dbl+lbl>  1, NA, NA,  2,  2,  4, NA,  3,  3,  1,  3,  1,  3,  2,  2,  4,…

ggplot(data = cluster.clean, mapping = aes(x = byses1)) +
  geom_histogram(binwidth = .25, color="black", fill="steel blue") + 
  labs(title = "Histogram of SES Composite Score",
                      x = "SES Composite Score")

Hours Spent on Homework Per Week

describe(cluster.clean$byhmwrk)

ABCDEFGHIJ0123456789

	vars <dbl>	n <dbl>	mean <dbl>	sd <dbl>	median <dbl>	trimmed <dbl>	mad <dbl>	min <dbl>	max <dbl>
X1	1	439	12.21	17.14	8	12.21	5.93	0	98

str(cluster.clean$byhmwrk)

 dbl+lbl [1:500]  8, 10, NA, 15, 11, 28,  4,  5, 10, 11,  9, 10, 13,  5,  3,  5, ...
 @ label       : chr "BY hours per week spent on homework (in and out of school)"
 @ format.stata: chr "%12.0f"
 @ labels      : Named num [1:6] -9 -8 -4 97 98 99
  ..- attr(*, "names")= chr [1:6] "{Missing}" "{Survey component legitimate skip/NA}" "{Nonrespondent}" "Out-of-school hmwork hrs top-coded at 26" ...

ggplot(data = cluster.clean, mapping = aes(x = byhmwrk)) +
  geom_bar(color="black", fill="forest green") + 
  labs(title = "Histogram of Hours Spent on Homework Per Week",
                      x = "Hours Spent on Homework Per Week")

Ideally, we want these scores to be relatively normal…so we may want to filter out those scores that are near 100 hours per week.

Math Engagement (Totally Absorbed in Math)

describe(cluster.clean$bys87a)

ABCDEFGHIJ0123456789

	vars <dbl>	n <dbl>	mean <dbl>	sd <dbl>	median <dbl>	trimmed <dbl>	mad <dbl>	min <dbl>	max <dbl>
X1	1	361	2.48	0.8	2	2.48	1.48	1	4

table(cluster.clean$bys87a)


  1   2   3   4 
 31 166 125  39

str(cluster.clean$bys87a)

 dbl+lbl [1:500]  2, NA, NA,  2,  3,  1, NA,  2,  2,  4,  3,  3,  2,  3,  2,  3, ...
 @ label       : chr "Gets totally absorbed in mathematics"
 @ format.stata: chr "%12.0f"
 @ labels      : Named num [1:9] -9 -8 -7 -6 -4 1 2 3 4
  ..- attr(*, "names")= chr [1:9] "{Missing}" "{Survey component legitimate skip/NA}" "{Partial interview-breakoff}" "{Multiple response}" ...

ggplot(data = cluster.clean, mapping = aes(x = bys87a)) +
  geom_bar(color="black", fill="dark red") + 
  labs(title = "Histogram of Math Engagement",
                      x = "Math Engagement")

Math Importance

describe(cluster.clean$bys87f)

ABCDEFGHIJ0123456789

	vars <dbl>	n <dbl>	mean <dbl>	sd <dbl>	median <dbl>	trimmed <dbl>	mad <dbl>	min <dbl>	max <dbl>
X1	1	364	2.52	0.9	2	2.52	1.48	1	4

table(cluster.clean$bys87f)


  1   2   3   4 
 40 156 108  60

str(cluster.clean$bys87f)

 dbl+lbl [1:500]  1, NA, NA,  2,  2,  4, NA,  3,  3,  1,  3,  1,  3,  2,  2,  4, ...
 @ label       : chr "Mathematics is important"
 @ format.stata: chr "%12.0f"
 @ labels      : Named num [1:9] -9 -8 -7 -6 -4 1 2 3 4
  ..- attr(*, "names")= chr [1:9] "{Missing}" "{Survey component legitimate skip/NA}" "{Partial interview-breakoff}" "{Multiple response}" ...

ggplot(data = cluster.clean, mapping = aes(x = bys87f)) +
  geom_bar(color="black", fill="purple") + 
  labs(title = "Histogram of Math Importance",
                      x = "Math Importance")

Quick Data Management Lesson - Create Standardized Versions of Your Variables

The package standardize is a great way to standardize your variables and control for continuous variable scaling and factor contrasts:

cluster.clean <- cluster.clean %>% 
  mutate(.,
        hwk_std = scale(cluster.clean$byhmwrk),
        import_std = scale(cluster.clean$bys87f),
        engage_std =  scale(cluster.clean$bys87a)) %>%
  filter(byhmwrk < 50) #Filter out those really high HW hours

describe(cluster.clean$hwk_std)

ABCDEFGHIJ0123456789

	vars <dbl>	n <dbl>	mean <dbl>	sd <dbl>	median <dbl>	trimmed <dbl>	mad <dbl>	min <dbl>	max <dbl>
X1	1	425	-0.16	0.43	-0.3	-0.23	0.35	-0.71	1.45

describe(cluster.clean$import_std)

ABCDEFGHIJ0123456789

	vars <dbl>	n <dbl>	mean <dbl>	sd <dbl>	median <dbl>	trimmed <dbl>	mad <dbl>	min <dbl>	max <dbl>
X1	1	342	-0.01	1.01	-0.58	0	1.66	-1.69	1.66

describe(cluster.clean$engage_std)

ABCDEFGHIJ0123456789

	vars <dbl>	n <dbl>	mean <dbl>	sd <dbl>	median <dbl>	trimmed <dbl>	mad <dbl>	min <dbl>	max <dbl>
X1	1	339	-0.02	1	-0.6	-0.05	1.85	-1.85	1.91

Now, summarize/ visualize and analyze correlations:

corr_cluster

               hwk_std  import_std engage_std
hwk_std     1.00000000 -0.08078977 -0.1596826
import_std -0.08078977  1.00000000  0.5166737
engage_std -0.15968257  0.51667365  1.0000000

Step 1: Choose a Clustering Approach and Find a Solution

Approach #1: Hierarchical Clustering

hclusts <- hclust(dist(corr, method = "euclidean"), method = "ward.D2")
hclusts


Call:
hclust(d = dist(corr, method = "euclidean"), method = "ward.D2")

Cluster method   : ward.D2 
Distance         : euclidean 
Number of objects: 333

Create a dendrogram to visualize the solution:

plot(hclusts)

wardclust <- NbClust(data = corr, method = "ward.D2")

*** : The Hubert index is a graphical method of determining the number of clusters.
                In the plot of Hubert index, we seek a significant knee that corresponds to a 
                significant increase of the value of the measure i.e the significant peak in Hubert
                index second differences plot.

*** : The D index is a graphical method of determining the number of clusters. 
                In the plot of D index, we seek a significant knee (the significant peak in Dindex
                second differences plot) that corresponds to a significant increase of the value of
                the measure. 
 
******************************************************************* 
* Among all indices:                                                
* 5 proposed 2 as the best number of clusters 
* 2 proposed 3 as the best number of clusters 
* 4 proposed 4 as the best number of clusters 
* 5 proposed 6 as the best number of clusters 
* 1 proposed 9 as the best number of clusters 
* 1 proposed 13 as the best number of clusters 
* 1 proposed 14 as the best number of clusters 
* 4 proposed 15 as the best number of clusters 

                   ***** Conclusion *****                            
 
* According to the majority rule, the best number of clusters is  2 
 
 
*******************************************************************

plot(hclusts)
rect.hclust(hclusts,k=8, border="red")

Approach #2: K-Means Clustering

library(tidymodels)
kclusts <- 
  tibble(k = 1:9) %>%
  mutate(
    kclust = map(k, ~kmeans(corr, .x)),
    tidied = map(kclust, tidy),
    glanced = map(kclust, glance),
    augmented = map(kclust, augment, corr)
  )

Visualize it

clusters <- 
  kclusts %>%
  unnest(cols = c(tidied))

assignments <- 
  kclusts %>% 
  unnest(cols = c(augmented))

clusterings <- 
  kclusts %>%
  unnest(cols = c(glanced))

Scatter matrix with group colors

p1 <- 
  ggplot(assignments, aes(x = import_std, y = engage_std)) +
  geom_jitter(aes(color = .cluster), alpha = 0.8) + 
  facet_wrap(~ k)
p1

Same thing, with X’es denoting the centers of each cluster…

p2 <- p1 + geom_point(data = clusters, size = 10, shape = "x")
p2

Use `NbClust` to get fit statistics for each solution

kmeansclust <- NbClust(data = corr, method = "kmeans")

*** : The Hubert index is a graphical method of determining the number of clusters.
                In the plot of Hubert index, we seek a significant knee that corresponds to a 
                significant increase of the value of the measure i.e the significant peak in Hubert
                index second differences plot.

*** : The D index is a graphical method of determining the number of clusters. 
                In the plot of D index, we seek a significant knee (the significant peak in Dindex
                second differences plot) that corresponds to a significant increase of the value of
                the measure. 
 
******************************************************************* 
* Among all indices:                                                
* 5 proposed 2 as the best number of clusters 
* 3 proposed 3 as the best number of clusters 
* 1 proposed 4 as the best number of clusters 
* 6 proposed 5 as the best number of clusters 
* 1 proposed 10 as the best number of clusters 
* 2 proposed 14 as the best number of clusters 
* 5 proposed 15 as the best number of clusters 

                   ***** Conclusion *****                            
 
* According to the majority rule, the best number of clusters is  5 
 
 
*******************************************************************

LS0tCnRpdGxlOiAiTXVsdGl2YXJpYXRlIFN0YXRpc3RpY3M6IE1vZHVsZSA4LSBJbnRyb2R1Y3Rpb24gdG8gQ2x1c3RlciBBbmFseXNpcyIKYXV0aG9yOiAiRHIuIEJyb2RhIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCiMgUmV2aWV3IC0gQ2xlYW5pbmcgYW5kIFJlY29kaW5nIFN1cnZleSBWYXJpYWJsZXMKIyMgTG9hZCBpbiBhIGZldyBwYWNrYWdlcwpgYGB7cn0KbGlicmFyeShjb3JycGxvdCkgICAgICAjZWFzeSBjb3JyZWxhdGlvbiBtYXRyaWNlcwpsaWJyYXJ5KHRpZHl2ZXJzZSkgICAgICNkYXRhIG1hbmlwdWxhdGlvbgpsaWJyYXJ5KHRpZHltb2RlbHMpICAgICNlYXN5IHZpc3VhbGl6YXRpb25zIG9mIGNsdXN0ZXJzCmxpYnJhcnkoTmJDbHVzdCkgICAgICAgI2RldGVybWluZSBvcHRpbWFsIG5vLiBvZiBjbHVzdGVycwpsaWJyYXJ5KHBzeWNoKSAgICAgICAgICNkZXNjcmlwdGl2ZSBzdGF0aXN0aWNzCmxpYnJhcnkoc3RhbmRhcmRpemUpICAgI2Vhc3kgc3RhbmRhcmRpemF0aW9uCmBgYAoKYGBge3J9CmxpYnJhcnkoaGF2ZW4pCmNsdXN0ZXIgPC0gcmVhZF9kdGEoIkVMU19DbHVzdGVyXzUwMC5kdGEiKQpgYGAKCiMjIFNFUyBDb21wb3NpdGUgU2NvcmUKTGV0J3MgY2hlY2sgdGhlIHZhcmlhYmxlIGxhYmVscyBhbmQgZ2V0IGEgYnJlYWtkb3duIG9mIHZhbHVlczoKYGBge3J9CmRlc2NyaWJlKGNsdXN0ZXIkYnlzZXMxKQpzdHIoY2x1c3RlciRieXNlczEpCmBgYAoKUmVjb2RlIG1pc3NpbmcsIGxlZ2l0aW1hdGUgc2tpcHMsIHBhcnRpYWwgaW50ZXJ2aWV3cywgYW5kIG5vbnJlc3BvbmRlbnRzIGFzIGBOQWAgaW4gdGhlIHdob2xlIGRhdGFzZXQKYGBge3J9CmNsdXN0ZXIuY2xlYW4gPC0gY2x1c3RlciAlPiUKICBuYV9pZiguLCAtOSkgJT4lCiAgbmFfaWYoLiwgLTgpICU+JQogIG5hX2lmKC4sIC03KSAlPiUKICBuYV9pZiguLCAtNCkKYGBgCgpgYGB7cn0KZ2xpbXBzZShjbHVzdGVyLmNsZWFuKQpgYGAKCmBgYHtyfQpnZ3Bsb3QoZGF0YSA9IGNsdXN0ZXIuY2xlYW4sIG1hcHBpbmcgPSBhZXMoeCA9IGJ5c2VzMSkpICsKICBnZW9tX2hpc3RvZ3JhbShiaW53aWR0aCA9IC4yNSwgY29sb3I9ImJsYWNrIiwgZmlsbD0ic3RlZWwgYmx1ZSIpICsgCiAgbGFicyh0aXRsZSA9ICJIaXN0b2dyYW0gb2YgU0VTIENvbXBvc2l0ZSBTY29yZSIsCiAgICAgICAgICAgICAgICAgICAgICB4ID0gIlNFUyBDb21wb3NpdGUgU2NvcmUiKQpgYGAKCiMjIEhvdXJzIFNwZW50IG9uIEhvbWV3b3JrIFBlciBXZWVrCmBgYHtyfQpkZXNjcmliZShjbHVzdGVyLmNsZWFuJGJ5aG13cmspCnN0cihjbHVzdGVyLmNsZWFuJGJ5aG13cmspCmBgYAoKYGBge3J9CmdncGxvdChkYXRhID0gY2x1c3Rlci5jbGVhbiwgbWFwcGluZyA9IGFlcyh4ID0gYnlobXdyaykpICsKICBnZW9tX2Jhcihjb2xvcj0iYmxhY2siLCBmaWxsPSJmb3Jlc3QgZ3JlZW4iKSArIAogIGxhYnModGl0bGUgPSAiSGlzdG9ncmFtIG9mIEhvdXJzIFNwZW50IG9uIEhvbWV3b3JrIFBlciBXZWVrIiwKICAgICAgICAgICAgICAgICAgICAgIHggPSAiSG91cnMgU3BlbnQgb24gSG9tZXdvcmsgUGVyIFdlZWsiKQpgYGAKCklkZWFsbHksIHdlIHdhbnQgdGhlc2Ugc2NvcmVzIHRvIGJlIHJlbGF0aXZlbHkgbm9ybWFsLi4uc28gd2UgbWF5IHdhbnQgdG8gZmlsdGVyIG91dCB0aG9zZSBzY29yZXMgdGhhdCBhcmUgbmVhciAxMDAgaG91cnMgcGVyIHdlZWsuCgojIyBNYXRoIEVuZ2FnZW1lbnQgKFRvdGFsbHkgQWJzb3JiZWQgaW4gTWF0aCkKYGBge3J9CmRlc2NyaWJlKGNsdXN0ZXIuY2xlYW4kYnlzODdhKQp0YWJsZShjbHVzdGVyLmNsZWFuJGJ5czg3YSkKc3RyKGNsdXN0ZXIuY2xlYW4kYnlzODdhKQpgYGAKCmBgYHtyfQpnZ3Bsb3QoZGF0YSA9IGNsdXN0ZXIuY2xlYW4sIG1hcHBpbmcgPSBhZXMoeCA9IGJ5czg3YSkpICsKICBnZW9tX2Jhcihjb2xvcj0iYmxhY2siLCBmaWxsPSJkYXJrIHJlZCIpICsgCiAgbGFicyh0aXRsZSA9ICJIaXN0b2dyYW0gb2YgTWF0aCBFbmdhZ2VtZW50IiwKICAgICAgICAgICAgICAgICAgICAgIHggPSAiTWF0aCBFbmdhZ2VtZW50IikKYGBgCgojIyBNYXRoIEltcG9ydGFuY2UKYGBge3J9CmRlc2NyaWJlKGNsdXN0ZXIuY2xlYW4kYnlzODdmKQp0YWJsZShjbHVzdGVyLmNsZWFuJGJ5czg3ZikKc3RyKGNsdXN0ZXIuY2xlYW4kYnlzODdmKQpgYGAKCmBgYHtyfQpnZ3Bsb3QoZGF0YSA9IGNsdXN0ZXIuY2xlYW4sIG1hcHBpbmcgPSBhZXMoeCA9IGJ5czg3ZikpICsKICBnZW9tX2Jhcihjb2xvcj0iYmxhY2siLCBmaWxsPSJwdXJwbGUiKSArIAogIGxhYnModGl0bGUgPSAiSGlzdG9ncmFtIG9mIE1hdGggSW1wb3J0YW5jZSIsCiAgICAgICAgICAgICAgICAgICAgICB4ID0gIk1hdGggSW1wb3J0YW5jZSIpCmBgYAoKIyBRdWljayBEYXRhIE1hbmFnZW1lbnQgTGVzc29uIC0gQ3JlYXRlIFN0YW5kYXJkaXplZCBWZXJzaW9ucyBvZiBZb3VyIFZhcmlhYmxlcwoKVGhlIHBhY2thZ2UgYHN0YW5kYXJkaXplYCBpcyBhIGdyZWF0IHdheSB0byBzdGFuZGFyZGl6ZSB5b3VyIHZhcmlhYmxlcyBhbmQgY29udHJvbCBmb3IgY29udGludW91cyB2YXJpYWJsZSBzY2FsaW5nIGFuZCBmYWN0b3IgY29udHJhc3RzOgpgYGB7cn0KY2x1c3Rlci5jbGVhbiA8LSBjbHVzdGVyLmNsZWFuICU+JSAKICBtdXRhdGUoLiwKICAgICAgICBod2tfc3RkID0gc2NhbGUoY2x1c3Rlci5jbGVhbiRieWhtd3JrKSwKICAgICAgICBpbXBvcnRfc3RkID0gc2NhbGUoY2x1c3Rlci5jbGVhbiRieXM4N2YpLAogICAgICAgIGVuZ2FnZV9zdGQgPSAgc2NhbGUoY2x1c3Rlci5jbGVhbiRieXM4N2EpKSAlPiUKICBmaWx0ZXIoYnlobXdyayA8IDUwKSAjRmlsdGVyIG91dCB0aG9zZSByZWFsbHkgaGlnaCBIVyBob3VycwogIApgYGAKCmBgYHtyfQpkZXNjcmliZShjbHVzdGVyLmNsZWFuJGh3a19zdGQpCmRlc2NyaWJlKGNsdXN0ZXIuY2xlYW4kaW1wb3J0X3N0ZCkKZGVzY3JpYmUoY2x1c3Rlci5jbGVhbiRlbmdhZ2Vfc3RkKQpgYGAKTm93LCBzdW1tYXJpemUvIHZpc3VhbGl6ZSBhbmQgYW5hbHl6ZSBjb3JyZWxhdGlvbnM6CmBgYHtyfQpsaWJyYXJ5KGdnY29ycnBsb3QpCmNvcnIgPC0gY2x1c3Rlci5jbGVhbiAlPiUKICBzZWxlY3QoLiwKICAgICAgICAgaHdrX3N0ZCwKICAgICAgICAgaW1wb3J0X3N0ZCwKICAgICAgICAgZW5nYWdlX3N0ZCkgJT4lCiAgbmEub21pdCgpCgpjb3JyX2NsdXN0ZXIgPC0gY29yKGNvcnIpCgpjb3JyX2NsdXN0ZXIKYGBgCgojIFN0ZXAgMTogQ2hvb3NlIGEgQ2x1c3RlcmluZyBBcHByb2FjaCBhbmQgRmluZCBhIFNvbHV0aW9uCiMjIEFwcHJvYWNoICMxOiBIaWVyYXJjaGljYWwgQ2x1c3RlcmluZwpgYGB7cn0KaGNsdXN0cyA8LSBoY2x1c3QoZGlzdChjb3JyLCBtZXRob2QgPSAiZXVjbGlkZWFuIiksIG1ldGhvZCA9ICJ3YXJkLkQyIikKaGNsdXN0cwpgYGAKCiMjIENyZWF0ZSBhIGRlbmRyb2dyYW0gdG8gdmlzdWFsaXplIHRoZSBzb2x1dGlvbjoKYGBge3J9CnBsb3QoaGNsdXN0cykKYGBgCgpgYGB7cn0Kd2FyZGNsdXN0IDwtIE5iQ2x1c3QoZGF0YSA9IGNvcnIsIG1ldGhvZCA9ICJ3YXJkLkQyIikgCmBgYAoKYGBge3J9CnBsb3QoaGNsdXN0cykKcmVjdC5oY2x1c3QoaGNsdXN0cyxrPTgsIGJvcmRlcj0icmVkIikKYGBgCgojIEFwcHJvYWNoICMyOiBLLU1lYW5zIENsdXN0ZXJpbmcKYGBge3J9CmxpYnJhcnkodGlkeW1vZGVscykKa2NsdXN0cyA8LSAKICB0aWJibGUoayA9IDE6OSkgJT4lCiAgbXV0YXRlKAogICAga2NsdXN0ID0gbWFwKGssIH5rbWVhbnMoY29yciwgLngpKSwKICAgIHRpZGllZCA9IG1hcChrY2x1c3QsIHRpZHkpLAogICAgZ2xhbmNlZCA9IG1hcChrY2x1c3QsIGdsYW5jZSksCiAgICBhdWdtZW50ZWQgPSBtYXAoa2NsdXN0LCBhdWdtZW50LCBjb3JyKQogICkKYGBgCiMjIFZpc3VhbGl6ZSBpdApgYGB7cn0KY2x1c3RlcnMgPC0gCiAga2NsdXN0cyAlPiUKICB1bm5lc3QoY29scyA9IGModGlkaWVkKSkKCmFzc2lnbm1lbnRzIDwtIAogIGtjbHVzdHMgJT4lIAogIHVubmVzdChjb2xzID0gYyhhdWdtZW50ZWQpKQoKY2x1c3RlcmluZ3MgPC0gCiAga2NsdXN0cyAlPiUKICB1bm5lc3QoY29scyA9IGMoZ2xhbmNlZCkpCmBgYAoKIyMgU2NhdHRlciBtYXRyaXggd2l0aCBncm91cCBjb2xvcnMKYGBge3J9CnAxIDwtIAogIGdncGxvdChhc3NpZ25tZW50cywgYWVzKHggPSBpbXBvcnRfc3RkLCB5ID0gZW5nYWdlX3N0ZCkpICsKICBnZW9tX2ppdHRlcihhZXMoY29sb3IgPSAuY2x1c3RlciksIGFscGhhID0gMC44KSArIAogIGZhY2V0X3dyYXAofiBrKQpwMQpgYGAKCiMjIFNhbWUgdGhpbmcsIHdpdGggWCdlcyBkZW5vdGluZyB0aGUgY2VudGVycyBvZiBlYWNoIGNsdXN0ZXIuLi4KYGBge3J9CnAyIDwtIHAxICsgZ2VvbV9wb2ludChkYXRhID0gY2x1c3RlcnMsIHNpemUgPSAxMCwgc2hhcGUgPSAieCIpCnAyCmBgYAojIyBVc2UgYE5iQ2x1c3RgIHRvIGdldCBmaXQgc3RhdGlzdGljcyBmb3IgZWFjaCBzb2x1dGlvbgpgYGB7cn0Ka21lYW5zY2x1c3QgPC0gTmJDbHVzdChkYXRhID0gY29yciwgbWV0aG9kID0gImttZWFucyIpCmBgYAoK

Multivariate Statistics: Module 8- Introduction to Cluster Analysis

Dr. Broda

Review - Cleaning and Recoding Survey Variables

Load in a few packages

SES Composite Score

Hours Spent on Homework Per Week

Math Engagement (Totally Absorbed in Math)

Math Importance

Quick Data Management Lesson - Create Standardized Versions of Your Variables

Step 1: Choose a Clustering Approach and Find a Solution

Approach #1: Hierarchical Clustering

Create a dendrogram to visualize the solution:

Approach #2: K-Means Clustering

Visualize it

Scatter matrix with group colors

Same thing, with X’es denoting the centers of each cluster…

Use `NbClust` to get fit statistics for each solution

Multivariate Statistics: Module 8- Introduction to Cluster Analysis

Dr. Broda

Review - Cleaning and Recoding Survey Variables

Load in a few packages

SES Composite Score

Hours Spent on Homework Per Week

Math Engagement (Totally Absorbed in Math)

Math Importance

Quick Data Management Lesson - Create Standardized Versions of Your Variables

Step 1: Choose a Clustering Approach and Find a Solution

Approach #1: Hierarchical Clustering

Create a dendrogram to visualize the solution:

Approach #2: K-Means Clustering

Visualize it

Scatter matrix with group colors

Same thing, with X’es denoting the centers of each cluster…

Use NbClust to get fit statistics for each solution

Use `NbClust` to get fit statistics for each solution