# Chunk default for the knitted document: show each chunk's source code
# alongside its output.
knitr::opts_chunk$set(echo = TRUE)
library(clustRcompaR)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Inaugural addresses: tag each address with the century its year falls in.
# Centuries are labelled ordinally, so 17xx is the 18th century, 18xx the 19th,
# and so on (the previous labels were each one century too low).
# NOTE: output recorded later in this file was produced with the old labels
# ("17th".."20th"); it will show "18th".."21st" once the document is re-knit.
d <- inaugural_addresses
d <- mutate(
  d,
  century = case_when(
    is.na(Year) ~ NA_character_, # keep missing years missing, as ifelse() did
    Year < 1800 ~ "18th",
    Year < 1900 ~ "19th",
    Year < 2000 ~ "20th",
    TRUE ~ "21st"
  )
)

# Fit a three-cluster solution to `d`. The call reports the size of the
# document-feature matrix it works on (recorded on the following line).
three_clusters <- cluster(d, n_clusters = 3)
## Document-feature matrix of: 58 documents, 2,820 features (79.6% sparse).
# Print the terms and term frequencies listed for each cluster (the table is
# recorded below, wrapped across two column groups).
extract_terms(three_clusters)
##    Cluster.1.Terms Cluster.1.Term.Frequencies Cluster.2.Terms
## 1               in                  34.200000              in
## 2               my                  13.866667           their
## 3            their                  12.333333          govern
## 4             will                  11.200000            will
## 5           govern                   9.533333             has
## 6            peopl                   7.200000              it
## 7               it                   7.133333           state
## 8           nation                   7.000000            been
## 9              has                   6.733333           peopl
## 10         countri                   6.533333          nation
##    Cluster.2.Term.Frequencies Cluster.3.Terms Cluster.3.Term.Frequencies
## 1                    77.52941              in                  36.692308
## 2                    22.88235            will                  16.076923
## 3                    21.41176          nation                  12.500000
## 4                    20.29412              us                  12.038462
## 5                    20.00000           world                   9.807692
## 6                    19.41176           peopl                   9.307692
## 7                    18.23529             can                   7.769231
## 8                    17.82353            must                   7.730769
## 9                    16.05882         america                   7.423077
## 10                   14.41176              no                   7.192308
# Compare cluster membership against the `century` column of the data; the
# column is identified by name, passed as a string.
three_clusters_comparison <- compare(three_clusters, "century")
# Draw the comparison plot (the figure itself is not part of this transcript).
compare_plot(three_clusters_comparison)

# Print the test results: element [[1]] is a Pearson chi-squared test on the
# comparison table, element [[2]] flags each cluster-by-century cell as
# "Sig. Greater", "Sig. Lesser" or "Not Sig.".
# NOTE(review): the recorded warning is raised by stats::chisq.test(); it
# usually indicates small expected cell counts, so read the p-value with care.
compare_test(three_clusters_comparison)
## Warning in stats::chisq.test(comparison_table): Chi-squared approximation
## may be incorrect
## [[1]]
## 
##  Pearson's Chi-squared test
## 
## data:  comparison_table
## X-squared = 41.292, df = 6, p-value = 2.537e-07
## 
## 
## [[2]]
##         "century"
## clusters 17th        18th         19th         20th        
##        1 Not Sig.    Sig. Greater Not Sig.     Sig. Lesser 
##        2 Not Sig.    Sig. Greater Sig. Lesser  Sig. Lesser 
##        3 Sig. Lesser Not Sig.     Sig. Greater Sig. Greater
# Load the second data set from a CSV located one directory above the working
# directory. This reuses the name `d`, replacing the inaugural-address data.
# readr prints the column types it parsed (recorded below).
d <- readr::read_csv("../scip_data.csv")
## Parsed with column specification:
## cols(
##   ID = col_integer(),
##   grade = col_integer(),
##   teacher = col_integer(),
##   time = col_integer(),
##   T1 = col_integer(),
##   T2 = col_integer(),
##   T3 = col_integer(),
##   T4 = col_integer(),
##   S1 = col_integer(),
##   S2 = col_integer(),
##   S3 = col_integer(),
##   S4 = col_integer(),
##   S5 = col_integer(),
##   purpose = col_character(),
##   criteria = col_character(),
##   generality = col_character(),
##   evidence = col_character(),
##   audience1 = col_character(),
##   audience2 = col_character()
## )
# Move `audience2` to the first column and keep every other column after it.
# NOTE(review): presumably cluster() takes the text to cluster from the first
# column -- confirm against the clustRcompaR documentation.
d <- select(d, audience2, everything())
# Fit a four-cluster solution to the reordered data.
four_cluster_solution <- cluster(d, n_clusters = 4)
## Document-feature matrix of: 193 documents, 188 features (94.6% sparse).
# Compare cluster membership against the `teacher` column, identified by name.
four_cluster_comparison <- compare(four_cluster_solution, "teacher")
# Draw the comparison plot (the figure itself is not part of this transcript).
compare_plot(four_cluster_comparison)

# Print the test results: element [[1]] is a Pearson chi-squared test on the
# comparison table, element [[2]] flags each cluster-by-teacher cell as
# "Sig. Greater", "Sig. Lesser" or "Not Sig.".
compare_test(four_cluster_comparison)
## [[1]]
## 
##  Pearson's Chi-squared test
## 
## data:  comparison_table
## X-squared = 80.675, df = 9, p-value = 1.187e-13
## 
## 
## [[2]]
##         "teacher"
## clusters 1            2            3            4           
##        1 Sig. Lesser  Sig. Greater Not Sig.     Not Sig.    
##        2 Sig. Greater Not Sig.     Not Sig.     Sig. Lesser 
##        3 Not Sig.     Not Sig.     Sig. Greater Sig. Greater
##        4 Sig. Greater Not Sig.     Sig. Lesser  Not Sig.
# Record the R version, platform, locale and package versions used for this
# run, for reproducibility.
sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.2
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] bindrcpp_0.2       dplyr_0.7.4.9000   clustRcompaR_0.2.0
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.14        quanteda_0.99.22    RColorBrewer_1.1-2 
##  [4] pillar_1.0.1        compiler_3.4.3      plyr_1.8.4         
##  [7] bindr_0.1           ppls_1.6-1          tools_3.4.3        
## [10] digest_0.6.13       lubridate_1.7.1     evaluate_0.10.1    
## [13] tibble_1.4.1        gtable_0.2.0        lattice_0.20-35    
## [16] pkgconfig_2.0.1     rlang_0.1.6.9002    Matrix_1.2-12      
## [19] fastmatch_1.1-0     yaml_2.1.16         stringr_1.2.0      
## [22] knitr_1.18          hms_0.4.0           rprojroot_1.3-2    
## [25] grid_3.4.3          tidyselect_0.2.3    glue_1.2.0         
## [28] data.table_1.10.4-3 R6_2.2.2            rmarkdown_1.8      
## [31] readr_1.1.1         purrr_0.2.4         ggplot2_2.2.1.9000 
## [34] spacyr_0.9.3        magrittr_1.5        MASS_7.3-47        
## [37] splines_3.4.3       SnowballC_0.5.1     backports_1.1.2    
## [40] scales_0.5.0.9000   htmltools_0.3.6     assertthat_0.2.0   
## [43] colorspace_1.3-2    labeling_0.3        stringi_1.1.6      
## [46] RcppParallel_4.3.20 lazyeval_0.2.1      munsell_0.4.3
# Save every object currently in the workspace to a dated .Rdata file in the
# working directory, so the fitted cluster solutions can be reloaded later.
save.image("1-11-2018.Rdata")