library(corrplot) #easy correlation matrices
corrplot 0.84 loaded
library(tidyverse) #data manipulation
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ───────────────────────────────────────────── tidyverse 1.3.0 ──
✓ ggplot2 3.3.3 ✓ purrr 0.3.4
✓ tibble 3.0.6 ✓ dplyr 1.0.4
✓ tidyr 1.1.2 ✓ stringr 1.4.0
✓ readr 1.4.0 ✓ forcats 0.5.1
── Conflicts ──────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
library(tidymodels) #easy visualizations of clusters
── Attaching packages ──────────────────────────────────────────── tidymodels 0.1.2 ──
✓ broom 0.7.5 ✓ recipes 0.1.15
✓ dials 0.0.9 ✓ rsample 0.0.9
✓ infer 0.5.4 ✓ tune 0.1.2
✓ modeldata 0.1.0 ✓ workflows 0.2.1
✓ parsnip 0.1.5 ✓ yardstick 0.0.7
── Conflicts ─────────────────────────────────────────────── tidymodels_conflicts() ──
x scales::discard() masks purrr::discard()
x dplyr::filter() masks stats::filter()
x recipes::fixed() masks stringr::fixed()
x dplyr::lag() masks stats::lag()
x yardstick::spec() masks readr::spec()
x recipes::step() masks stats::step()
library(NbClust) #determine optimal no. of clusters
library(psych) #descriptive statistics
Attaching package: ‘psych’
The following objects are masked from ‘package:scales’:
alpha, rescale
The following objects are masked from ‘package:ggplot2’:
%+%, alpha
library(standardize) #easy standardization
***********************************************************
Loading standardize package version 0.2.2
Call standardize.news() to see new features/changes
***********************************************************
library(haven)
cluster <- read_dta("ELS_Cluster_500.dta")
Let’s check the variable labels and get a breakdown of values:
describe(cluster$byses1)
vars <dbl> | n <dbl> | mean <dbl> | sd <dbl> | median <dbl> | trimmed <dbl> | mad <dbl> | min <dbl> | max <dbl> | ||
---|---|---|---|---|---|---|---|---|---|---|
X1 | 1 | 500 | -0.37 | 1.73 | -0.02 | -0.37 | 0.93 | -8 | 1.8 |
str(cluster$byses1)
dbl+lbl [1:500] -0.95, 0.55, -8.00, 0.95, 1.37, 0.12, 1.06, 0.65, 0.18, ...
@ label : chr "Socio-economic status composite, v.1"
@ format.stata: chr "%12.2g"
@ labels : Named num [1:2] -8 -4
..- attr(*, "names")= chr [1:2] "{Survey component legitimate skip/NA}" "{Nonrespondent}"
Recode missing, legitimate skips, partial interviews, and nonrespondents as NA
in the whole dataset
cluster.clean <- cluster %>%
na_if(., -9) %>%
na_if(., -8) %>%
na_if(., -7) %>%
na_if(., -4)
glimpse(cluster.clean)
Rows: 500
Columns: 55
$ stu_id <dbl> 278108, 429101, 195207, 408224, 352115, 276213, 227207, 455121, 117…
$ sch_id <dbl> 2781, 4291, 1952, 4082, 3521, 2762, 2272, 4551, 1172, 3561, 4341, 1…
$ strat_id <dbl> 278, 429, 195, 408, 352, 276, 227, 455, 117, 356, 434, 160, 417, 14…
$ psu <dbl+lbl> 1, 1, 2, 2, 1, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, …
$ f1sch_id <dbl+lbl> 2781, 4291, 1952, 4082, 3521, 2762, 2272, 4551, 1172, NA, 434…
$ f1univ1 <dbl+lbl> 101, 101, 120, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101…
$ f1univ2a <dbl+lbl> 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ f1univ2b <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 4, 5, 7, 1, 1, …
$ f2univ_p <dbl+lbl> 102, 102, 119, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101…
$ bystuwt <dbl+lbl> 338.0031, 58.1918, 0.0000, 65.7478, 74.7694, 274.6407, 292…
$ bysex <dbl+lbl> 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1,…
$ byrace <dbl+lbl> 4, 5, NA, 7, 2, 7, 7, 7, 4, 7, 7, 2, 7, 3, 7, 2,…
$ bystlang <dbl+lbl> 0, 1, NA, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,…
$ byses1 <dbl+lbl> -0.95, 0.55, NA, 0.95, 1.37, 0.12, 1.06, 0.65, 0.18, …
$ byses1qu <dbl+lbl> 1, 4, NA, 4, 4, 3, 4, 4, 3, 4, 4, 1, 4, 2, 3, 2,…
$ bygrdrpt <dbl+lbl> 0, 0, 1, 0, 0, 98, 0, 0, 0, 0, 0, 98, 0, 0, 0, 0,…
$ byhomlit <dbl+lbl> 0, NA, NA, 3, 3, 1, 3, 3, 2, 3, 1, 0, 3, 3, 3, 3,…
$ byparasp <dbl+lbl> 5, 7, 5, 6, 7, 5, 3, 7, 3, 5, 6, 5, 5, 5, 5, 6, 2, 5, 2, 5, 6, …
$ byiepflg <dbl+lbl> 1, NA, NA, 0, 0, NA, 0, NA, 0, 0, 0, 1, NA, 0, 0, 0,…
$ bytxcstd <dbl+lbl> 35.61, 57.20, NA, 53.53, 71.03, 58.91, 54.09, 54.44, 25.92, …
$ bytxcqu <dbl+lbl> 1, 3, NA, 3, 4, 4, 3, 3, 1, 3, 4, 1, 4, 2, 2, 2,…
$ bywrtnga <dbl+lbl> NA, 1.951, 0.052, NA, 1.951, 1.001, -1.847, 1.654,…
$ byxtracu <dbl+lbl> 0, 2, NA, 3, 5, 0, 0, 1, 0, 0, 0, 1, 2, 0, 0, 0,…
$ byhmwrk <dbl+lbl> 8, 10, NA, 15, 11, 28, 4, 5, 10, 11, 9, 10, 13, 5, 3, 5,…
$ bytvvigm <dbl+lbl> 99, 0, NA, 6, 1, 5, NA, 2, 6, 1, 2, 0, 8, 5, 1, 2,…
$ f1qwt <dbl+lbl> 398.7574, 62.9523, 139.3723, 62.9402, 71.8370, 294.149…
$ f1pnlwt <dbl+lbl> 362.2636, 65.4358, 0.0000, 62.9030, 72.8376, 294.3836, 286…
$ f1trscwt <dbl+lbl> 306.8801, 66.4096, 150.3712, 61.8688, 98.8436, 305.3489, 282…
$ f2qtscwt <dbl+lbl> 0.0000, 0.0000, 191.7053, 62.0716, 75.9791, 300.6391, 299…
$ f2qwt <dbl+lbl> 0.0000, 0.0000, 169.1791, 61.9468, 74.4226, 300.8338, 291…
$ f2f1wt <dbl+lbl> 0.0000, 0.0000, 180.9959, 63.4727, 77.2253, 337.8765, 295…
$ f2bywt <dbl+lbl> 0.0000, 0.0000, 167.0404, 60.7443, 73.6182, 297.8983, 286…
$ bys14 <dbl+lbl> 2, 2, NA, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1,…
$ bys15 <dbl+lbl> 1, 1, NA, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
$ bys24d <dbl+lbl> 1, 1, NA, 2, 1, 1, NA, 2, 1, 1, 1, 1, 2, 2, 2, 2,…
$ bys24e <dbl+lbl> 1, 1, NA, 1, 1, 1, NA, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
$ bys24f <dbl+lbl> 1, 1, NA, 1, 1, 1, NA, 1, 1, 1, 1, 1, 1, 1, 2, 1,…
$ bys33a <dbl+lbl> 0, 0, NA, 0, 0, 0, NA, 0, NA, 1, 1, 0, 1, 1, 0, 0,…
$ bys33b <dbl+lbl> 0, 0, NA, 0, 0, 0, NA, 0, NA, 1, 0, 0, 0, 0, 0, 0,…
$ bys43 <dbl+lbl> 1, 0, NA, 0, 2, 10, 2, 5, 0, 3, 9, 2, 2, 3, 0, 0,…
$ bys62a <dbl+lbl> NA, -3, NA, -3, -3, 0, -3, -3, NA, -3, -3, 0, -3, -3, -3, -3,…
$ bys62g <dbl+lbl> NA, -3, NA, -3, -3, 1, -3, -3, NA, -3, -3, 0, -3, -3, -3, -3,…
$ bys66a <dbl+lbl> NA, 1, NA, 1, 1, 6, NA, 1, 1, 1, 1, 6, 1, 1, 1, 1,…
$ bys66b <dbl+lbl> NA, 1, NA, -3, 1, 6, NA, 1, 1, 1, 1, NA, 1, 1, 1, 1,…
$ bys66c <dbl+lbl> NA, -1, NA, 6, 1, 6, NA, -1, 1, 1, -1, 6, 1, 1, 1, 6,…
$ bys66f <dbl+lbl> NA, 1, NA, 1, 1, 1, NA, 1, -1, 1, -1, 6, 1, 1, 1, -1,…
$ bys66g <dbl+lbl> NA, 1, NA, -3, -3, -1, NA, 1, -1, 1, -1, 6, 1, 1, 1, -1,…
$ bys67 <dbl+lbl> 0, NA, NA, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,…
$ bys71d <dbl+lbl> 0, NA, NA, 0, 0, 0, NA, 0, 0, NA, 0, 0, 0, 0, 0, 0,…
$ bys71e <dbl+lbl> 0, NA, NA, 1, 1, 0, NA, 0, 0, NA, 0, 0, 0, 0, 0, 0,…
$ bys83a <dbl+lbl> 6, NA, NA, 6, 5, -1, 6, 7, 6, 7, 7, -3, 6, 7, 6, 4,…
$ bys83b <dbl+lbl> 6, NA, NA, -1, 8, -1, 8, 6, 1, 7, 6, NA, 6, 7, 6, 5,…
$ bys87a <dbl+lbl> 2, NA, NA, 2, 3, 1, NA, 2, 2, 4, 3, 3, 2, 3, 2, 3,…
$ bys87e <dbl+lbl> 3, NA, NA, 3, 2, 1, NA, 2, 3, 1, 1, 3, 4, 3, 3, 3,…
$ bys87f <dbl+lbl> 1, NA, NA, 2, 2, 4, NA, 3, 3, 1, 3, 1, 3, 2, 2, 4,…
ggplot(data = cluster.clean, mapping = aes(x = byses1)) +
geom_histogram(binwidth = .25, color="black", fill="steel blue") +
labs(title = "Histogram of SES Composite Score",
x = "SES Composite Score")
describe(cluster.clean$byhmwrk)
vars <dbl> | n <dbl> | mean <dbl> | sd <dbl> | median <dbl> | trimmed <dbl> | mad <dbl> | min <dbl> | max <dbl> | ||
---|---|---|---|---|---|---|---|---|---|---|
X1 | 1 | 439 | 12.21 | 17.14 | 8 | 12.21 | 5.93 | 0 | 98 |
str(cluster.clean$byhmwrk)
dbl+lbl [1:500] 8, 10, NA, 15, 11, 28, 4, 5, 10, 11, 9, 10, 13, 5, 3, 5, ...
@ label : chr "BY hours per week spent on homework (in and out of school)"
@ format.stata: chr "%12.0f"
@ labels : Named num [1:6] -9 -8 -4 97 98 99
..- attr(*, "names")= chr [1:6] "{Missing}" "{Survey component legitimate skip/NA}" "{Nonrespondent}" "Out-of-school hmwork hrs top-coded at 26" ...
ggplot(data = cluster.clean, mapping = aes(x = byhmwrk)) +
geom_bar(color="black", fill="forest green") +
labs(title = "Histogram of Hours Spent on Homework Per Week",
x = "Hours Spent on Homework Per Week")
Ideally, we want these scores to be relatively normal…so we may want to filter out those scores that are near 100 hours per week.
describe(cluster.clean$bys87a)
vars <dbl> | n <dbl> | mean <dbl> | sd <dbl> | median <dbl> | trimmed <dbl> | mad <dbl> | min <dbl> | max <dbl> | ||
---|---|---|---|---|---|---|---|---|---|---|
X1 | 1 | 361 | 2.48 | 0.8 | 2 | 2.48 | 1.48 | 1 | 4 |
table(cluster.clean$bys87a)
1 2 3 4
31 166 125 39
str(cluster.clean$bys87a)
dbl+lbl [1:500] 2, NA, NA, 2, 3, 1, NA, 2, 2, 4, 3, 3, 2, 3, 2, 3, ...
@ label : chr "Gets totally absorbed in mathematics"
@ format.stata: chr "%12.0f"
@ labels : Named num [1:9] -9 -8 -7 -6 -4 1 2 3 4
..- attr(*, "names")= chr [1:9] "{Missing}" "{Survey component legitimate skip/NA}" "{Partial interview-breakoff}" "{Multiple response}" ...
ggplot(data = cluster.clean, mapping = aes(x = bys87a)) +
geom_bar(color="black", fill="dark red") +
labs(title = "Histogram of Math Engagement",
x = "Math Engagement")
describe(cluster.clean$bys87f)
vars <dbl> | n <dbl> | mean <dbl> | sd <dbl> | median <dbl> | trimmed <dbl> | mad <dbl> | min <dbl> | max <dbl> | ||
---|---|---|---|---|---|---|---|---|---|---|
X1 | 1 | 364 | 2.52 | 0.9 | 2 | 2.52 | 1.48 | 1 | 4 |
table(cluster.clean$bys87f)
1 2 3 4
40 156 108 60
str(cluster.clean$bys87f)
dbl+lbl [1:500] 1, NA, NA, 2, 2, 4, NA, 3, 3, 1, 3, 1, 3, 2, 2, 4, ...
@ label : chr "Mathematics is important"
@ format.stata: chr "%12.0f"
@ labels : Named num [1:9] -9 -8 -7 -6 -4 1 2 3 4
..- attr(*, "names")= chr [1:9] "{Missing}" "{Survey component legitimate skip/NA}" "{Partial interview-breakoff}" "{Multiple response}" ...
ggplot(data = cluster.clean, mapping = aes(x = bys87f)) +
geom_bar(color="black", fill="purple") +
labs(title = "Histogram of Math Importance",
x = "Math Importance")
The package standardize
is a great way to standardize your variables and control for continuous variable scaling and factor contrasts:
cluster.clean <- cluster.clean %>%
mutate(.,
hwk_std = scale(cluster.clean$byhmwrk),
import_std = scale(cluster.clean$bys87f),
engage_std = scale(cluster.clean$bys87a)) %>%
filter(byhmwrk < 50) #Filter out those really high HW hours
describe(cluster.clean$hwk_std)
vars <dbl> | n <dbl> | mean <dbl> | sd <dbl> | median <dbl> | trimmed <dbl> | mad <dbl> | min <dbl> | max <dbl> | ||
---|---|---|---|---|---|---|---|---|---|---|
X1 | 1 | 425 | -0.16 | 0.43 | -0.3 | -0.23 | 0.35 | -0.71 | 1.45 |
describe(cluster.clean$import_std)
vars <dbl> | n <dbl> | mean <dbl> | sd <dbl> | median <dbl> | trimmed <dbl> | mad <dbl> | min <dbl> | max <dbl> | ||
---|---|---|---|---|---|---|---|---|---|---|
X1 | 1 | 342 | -0.01 | 1.01 | -0.58 | 0 | 1.66 | -1.69 | 1.66 |
describe(cluster.clean$engage_std)
vars <dbl> | n <dbl> | mean <dbl> | sd <dbl> | median <dbl> | trimmed <dbl> | mad <dbl> | min <dbl> | max <dbl> | ||
---|---|---|---|---|---|---|---|---|---|---|
X1 | 1 | 339 | -0.02 | 1 | -0.6 | -0.05 | 1.85 | -1.85 | 1.91 |
Now, summarize/ visualize and analyze correlations:
corr_cluster
hwk_std import_std engage_std
hwk_std 1.00000000 -0.08078977 -0.1596826
import_std -0.08078977 1.00000000 0.5166737
engage_std -0.15968257 0.51667365 1.0000000
hclusts <- hclust(dist(corr, method = "euclidean"), method = "ward.D2")
hclusts
Call:
hclust(d = dist(corr, method = "euclidean"), method = "ward.D2")
Cluster method : ward.D2
Distance : euclidean
Number of objects: 333
plot(hclusts)
wardclust <- NbClust(data = corr, method = "ward.D2")
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 5 proposed 2 as the best number of clusters
* 2 proposed 3 as the best number of clusters
* 4 proposed 4 as the best number of clusters
* 5 proposed 6 as the best number of clusters
* 1 proposed 9 as the best number of clusters
* 1 proposed 13 as the best number of clusters
* 1 proposed 14 as the best number of clusters
* 4 proposed 15 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 2
*******************************************************************
plot(hclusts)
rect.hclust(hclusts,k=8, border="red")
library(tidymodels)
kclusts <-
tibble(k = 1:9) %>%
mutate(
kclust = map(k, ~kmeans(corr, .x)),
tidied = map(kclust, tidy),
glanced = map(kclust, glance),
augmented = map(kclust, augment, corr)
)
clusters <-
kclusts %>%
unnest(cols = c(tidied))
assignments <-
kclusts %>%
unnest(cols = c(augmented))
clusterings <-
kclusts %>%
unnest(cols = c(glanced))
p1 <-
ggplot(assignments, aes(x = import_std, y = engage_std)) +
geom_jitter(aes(color = .cluster), alpha = 0.8) +
facet_wrap(~ k)
p1
p2 <- p1 + geom_point(data = clusters, size = 10, shape = "x")
p2
NbClust
to get fit statistics for each solutionkmeansclust <- NbClust(data = corr, method = "kmeans")
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 5 proposed 2 as the best number of clusters
* 3 proposed 3 as the best number of clusters
* 1 proposed 4 as the best number of clusters
* 6 proposed 5 as the best number of clusters
* 1 proposed 10 as the best number of clusters
* 2 proposed 14 as the best number of clusters
* 5 proposed 15 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 5
*******************************************************************