# Document setup: echo each chunk's source code in the knitted output.
knitr::opts_chunk$set(echo = TRUE)
# Attach the packages whose functions are called unqualified below
# (presumably cluster()/compare()/extract_terms() and the
# inaugural_addresses data from clustRcompaR; mutate()/select() from dplyr).
library(clustRcompaR)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Inaugural addresses: tag every address with the century its year falls in,
# fit a three-cluster solution, and list the terms reported for each cluster.
d <- inaugural_addresses
# Century labels: years before 1800 are the 18th century, 1800-1899 the 19th,
# 1900-1999 the 20th, and 2000 onwards the 21st. Earlier versions of this
# script used labels shifted down by one ("17th" ... "20th"); output recorded
# with those labels (e.g. the compare_test() table header) keeps the old names
# until the document is re-knitted. Only the labels changed, not the cut-offs,
# and the alphabetical order of the labels is the same as before.
# Nested ifelse() is kept so that a missing Year still yields NA.
d <- mutate(d, century = ifelse(Year < 1800, "18th",
                         ifelse(Year < 1900, "19th",
                         ifelse(Year < 2000, "20th", "21st"))))
# NOTE(review): cluster() presumably treats the first column as the text and
# the remaining columns (including `century`) as metadata -- confirm against
# the clustRcompaR documentation.
three_clusters <- cluster(d, n_clusters = 3)
## Document-feature matrix of: 58 documents, 2,820 features (79.6% sparse).
extract_terms(three_clusters)
## Cluster.1.Terms Cluster.1.Term.Frequencies Cluster.2.Terms
## 1 in 34.200000 in
## 2 my 13.866667 their
## 3 their 12.333333 govern
## 4 will 11.200000 will
## 5 govern 9.533333 has
## 6 peopl 7.200000 it
## 7 it 7.133333 state
## 8 nation 7.000000 been
## 9 has 6.733333 peopl
## 10 countri 6.533333 nation
## Cluster.2.Term.Frequencies Cluster.3.Terms Cluster.3.Term.Frequencies
## 1 77.52941 in 36.692308
## 2 22.88235 will 16.076923
## 3 21.41176 nation 12.500000
## 4 20.29412 us 12.038462
## 5 20.00000 world 9.807692
## 6 19.41176 peopl 9.307692
## 7 18.23529 can 7.769231
## 8 17.82353 must 7.730769
## 9 16.05882 america 7.423077
## 10 14.41176 no 7.192308
# Compare the three-cluster solution across the values of the `century`
# column; the result is reused by both the plot and the test below.
three_clusters_comparison <- compare(three_clusters, "century")
# Plot the comparison (the figure itself is not captured in this script).
compare_plot(three_clusters_comparison)

# Print the test results: per the recorded output below, a Pearson chi-squared
# test on the comparison table followed by a clusters-by-century table of
# "Sig. Greater" / "Sig. Lesser" / "Not Sig." flags.
compare_test(three_clusters_comparison)
## Warning in stats::chisq.test(comparison_table): Chi-squared approximation
## may be incorrect
## [[1]]
##
## Pearson's Chi-squared test
##
## data: comparison_table
## X-squared = 41.292, df = 6, p-value = 2.537e-07
##
##
## [[2]]
## "century"
## clusters 17th 18th 19th 20th
## 1 Not Sig. Sig. Greater Not Sig. Sig. Lesser
## 2 Not Sig. Sig. Greater Sig. Lesser Sig. Lesser
## 3 Sig. Lesser Not Sig. Sig. Greater Sig. Greater
# Second data set: load the SCIP responses from the parent directory
# (column types are guessed by readr; see the parsed specification below).
d <- readr::read_csv(file = "../scip_data.csv")
## Parsed with column specification:
## cols(
## ID = col_integer(),
## grade = col_integer(),
## teacher = col_integer(),
## time = col_integer(),
## T1 = col_integer(),
## T2 = col_integer(),
## T3 = col_integer(),
## T4 = col_integer(),
## S1 = col_integer(),
## S2 = col_integer(),
## S3 = col_integer(),
## S4 = col_integer(),
## S5 = col_integer(),
## purpose = col_character(),
## criteria = col_character(),
## generality = col_character(),
## evidence = col_character(),
## audience1 = col_character(),
## audience2 = col_character()
## )
# Move the free-text `audience2` column to the front and keep every other
# column after it (presumably because cluster() reads the text from the first
# column -- confirm), then fit a four-cluster solution.
d <- d %>%
  select(audience2, everything())
four_cluster_solution <- cluster(d, n_clusters = 4)
## Document-feature matrix of: 193 documents, 188 features (94.6% sparse).
# Compare the four-cluster solution across the values of the `teacher`
# column; the result is reused by both the plot and the test below.
four_cluster_comparison <- compare(four_cluster_solution, "teacher")
# Plot the comparison (the figure itself is not captured in this script).
compare_plot(four_cluster_comparison)

# Print the test results: per the recorded output below, a Pearson chi-squared
# test on the comparison table followed by a clusters-by-teacher table of
# "Sig. Greater" / "Sig. Lesser" / "Not Sig." flags.
compare_test(four_cluster_comparison)
## [[1]]
##
## Pearson's Chi-squared test
##
## data: comparison_table
## X-squared = 80.675, df = 9, p-value = 1.187e-13
##
##
## [[2]]
## "teacher"
## clusters 1 2 3 4
## 1 Sig. Lesser Sig. Greater Not Sig. Not Sig.
## 2 Sig. Greater Not Sig. Not Sig. Sig. Lesser
## 3 Not Sig. Not Sig. Sig. Greater Sig. Greater
## 4 Sig. Greater Not Sig. Sig. Lesser Not Sig.
# Record the R version, platform and package versions used for this run.
utils::sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2 dplyr_0.7.4.9000 clustRcompaR_0.2.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.14 quanteda_0.99.22 RColorBrewer_1.1-2
## [4] pillar_1.0.1 compiler_3.4.3 plyr_1.8.4
## [7] bindr_0.1 ppls_1.6-1 tools_3.4.3
## [10] digest_0.6.13 lubridate_1.7.1 evaluate_0.10.1
## [13] tibble_1.4.1 gtable_0.2.0 lattice_0.20-35
## [16] pkgconfig_2.0.1 rlang_0.1.6.9002 Matrix_1.2-12
## [19] fastmatch_1.1-0 yaml_2.1.16 stringr_1.2.0
## [22] knitr_1.18 hms_0.4.0 rprojroot_1.3-2
## [25] grid_3.4.3 tidyselect_0.2.3 glue_1.2.0
## [28] data.table_1.10.4-3 R6_2.2.2 rmarkdown_1.8
## [31] readr_1.1.1 purrr_0.2.4 ggplot2_2.2.1.9000
## [34] spacyr_0.9.3 magrittr_1.5 MASS_7.3-47
## [37] splines_3.4.3 SnowballC_0.5.1 backports_1.1.2
## [40] scales_0.5.0.9000 htmltools_0.3.6 assertthat_0.2.0
## [43] colorspace_1.3-2 labeling_0.3 stringi_1.1.6
## [46] RcppParallel_4.3.20 lazyeval_0.2.1 munsell_0.4.3
# Persist every object in the workspace to a date-stamped file in the
# current working directory.
save.image(file = "1-11-2018.Rdata")