Let’s work on imputing missing values in the College Scorecard data so we can do clustering. This time, let’s try using random forest.

Loading the Data and Counting the Missing Values

Let’s open up the filtered, processed version of the College Scorecard data that we have worked with and settled on at this point.

library(readr)
colleges <- read_csv("data/scorecard_reduced_features.csv", na = c("NULL", "PrivacySuppressed"))
dim(colleges)
## [1] 2473   59

How many NA values do we have in each column?

sapply(colleges, function(x) sum(is.na(x)))
##                    UNITID                    INSTNM                       ZIP 
##                         0                         0                         0 
##                      HCM2                   CONTROL                    LOCALE 
##                         0                         0                       115 
##                   CCBASIC                  CCUGPROF                  CCSIZSET 
##                       200                       817                       803 
##                      HBCU                       PBI                     ANNHI 
##                       115                       115                       115 
##                    TRIBAL                   AANAPII                       HSI 
##                       115                       115                       115 
##                     NANTI                   MENONLY                 WOMENONLY 
##                       115                       115                       115 
##              ADM_RATE_ALL                   SATVR25                   SATVR75 
##                       519                      1036                      1036 
##               SAT_AVG_ALL              DISTANCEONLY                UGDS_WHITE 
##                       831                       115                       338 
##                UGDS_BLACK                 UGDS_HISP                UGDS_ASIAN 
##                       338                       338                       338 
##                  INEXPFTE                 AVGFACSAL                    PFTFAC 
##                       115                       130                       142 
##                    C150_4              C150_4_BLACK               C150_4_HISP 
##                       519                       679                       664 
##              C150_4_ASIAN               C150_4_AIAN                C150_4_NRA 
##                       767                       904                       713 
##                  PCTFLOAN                   UG25abv   LO_INC_COMP_ORIG_YR6_RT 
##                       349                       335                       545 
##   MD_INC_COMP_ORIG_YR6_RT   HI_INC_COMP_ORIG_YR6_RT FIRSTGEN_COMP_ORIG_YR6_RT 
##                       516                      1457                       585 
##         PAR_ED_PCT_1STGEN                  DEBT_MDN            mn_earn_wne_p8 
##                       281                       305                       368 
##         pct25_earn_wne_p8         pct75_earn_wne_p8         pct90_earn_wne_p8 
##                       450                       450                       450 
##           RPY_3YR_RT_SUPP    LO_INC_RPY_3YR_RT_SUPP    MD_INC_RPY_3YR_RT_SUPP 
##                       351                       390                       398 
##    HI_INC_RPY_3YR_RT_SUPP    NONCOM_RPY_3YR_RT_SUPP  FIRSTGEN_RPY_3YR_RT_SUPP 
##                       492                       388                       426 
##          PCT_LIBERAL_ARTS          PCT_PROFESSIONAL             PCT_RELIGIOUS 
##                       306                       306                       306 
##                      COST            PCT_VOCATIONAL 
##                       458                       306
sapply(colleges, function(x) mean(is.na(x)))
##                    UNITID                    INSTNM                       ZIP 
##                0.00000000                0.00000000                0.00000000 
##                      HCM2                   CONTROL                    LOCALE 
##                0.00000000                0.00000000                0.04650222 
##                   CCBASIC                  CCUGPROF                  CCSIZSET 
##                0.08087343                0.33036797                0.32470683 
##                      HBCU                       PBI                     ANNHI 
##                0.04650222                0.04650222                0.04650222 
##                    TRIBAL                   AANAPII                       HSI 
##                0.04650222                0.04650222                0.04650222 
##                     NANTI                   MENONLY                 WOMENONLY 
##                0.04650222                0.04650222                0.04650222 
##              ADM_RATE_ALL                   SATVR25                   SATVR75 
##                0.20986656                0.41892438                0.41892438 
##               SAT_AVG_ALL              DISTANCEONLY                UGDS_WHITE 
##                0.33602911                0.04650222                0.13667610 
##                UGDS_BLACK                 UGDS_HISP                UGDS_ASIAN 
##                0.13667610                0.13667610                0.13667610 
##                  INEXPFTE                 AVGFACSAL                    PFTFAC 
##                0.04650222                0.05256773                0.05742014 
##                    C150_4              C150_4_BLACK               C150_4_HISP 
##                0.20986656                0.27456531                0.26849980 
##              C150_4_ASIAN               C150_4_AIAN                C150_4_NRA 
##                0.31014962                0.36554792                0.28831379 
##                  PCTFLOAN                   UG25abv   LO_INC_COMP_ORIG_YR6_RT 
##                0.14112414                0.13546300                0.22038011 
##   MD_INC_COMP_ORIG_YR6_RT   HI_INC_COMP_ORIG_YR6_RT FIRSTGEN_COMP_ORIG_YR6_RT 
##                0.20865346                0.58916296                0.23655479 
##         PAR_ED_PCT_1STGEN                  DEBT_MDN            mn_earn_wne_p8 
##                0.11362717                0.12333199                0.14880712 
##         pct25_earn_wne_p8         pct75_earn_wne_p8         pct90_earn_wne_p8 
##                0.18196522                0.18196522                0.18196522 
##           RPY_3YR_RT_SUPP    LO_INC_RPY_3YR_RT_SUPP    MD_INC_RPY_3YR_RT_SUPP 
##                0.14193288                0.15770319                0.16093813 
##    HI_INC_RPY_3YR_RT_SUPP    NONCOM_RPY_3YR_RT_SUPP  FIRSTGEN_RPY_3YR_RT_SUPP 
##                0.19894865                0.15689446                0.17226041 
##          PCT_LIBERAL_ARTS          PCT_PROFESSIONAL             PCT_RELIGIOUS 
##                0.12373635                0.12373635                0.12373635 
##                      COST            PCT_VOCATIONAL 
##                0.18520016                0.12373635

Most of these columns are not missing too much data; many are missing around 15% of their values or less.
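To see at a glance which columns are the most problematic, we can sort those proportions. This is just a quick check, not something the imputation itself needs; missing_prop is a helper name introduced here.

missing_prop <- sapply(colleges, function(x) mean(is.na(x)))
head(sort(missing_prop, decreasing = TRUE))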

Substituting Missing Values Using Random Forest

Let’s try imputing the missing values using the missForest package, an implementation of random forest for missing value imputation.

library(missForest)
# Treat the categorical/flag columns (HCM2 through WOMENONLY, plus DISTANCEONLY) as factors
colleges[,c(4:18,23)] <- lapply(colleges[,c(4:18,23)], as.factor)
# Drop the identifier columns (UNITID, INSTNM, ZIP) before imputing
colleges_subset <- colleges[4:59]
impute_results <- missForest(as.data.frame(colleges_subset), verbose = TRUE)
##   missForest iteration 1 in progress...done!
##     estimated error(s): 0.1271946 0.1299746 
##     difference(s): 0.009159569 0.04384856 
##     time: 138.047 seconds
## 
##   missForest iteration 2 in progress...done!
##     estimated error(s): 0.1257042 0.1271617 
##     difference(s): 0.001501774 0.01511322 
##     time: 146.269 seconds
## 
##   missForest iteration 3 in progress...done!
##     estimated error(s): 0.1255491 0.1279206 
##     difference(s): 0.0003400747 0.01122119 
##     time: 147.159 seconds
## 
##   missForest iteration 4 in progress...done!
##     estimated error(s): 0.124779 0.1285108 
##     difference(s): 0.0002245109 0.0123332 
##     time: 140.091 seconds
## 
##   missForest iteration 5 in progress...done!
##     estimated error(s): 0.1242404 0.1296151 
##     difference(s): 0.0002325653 0.01106955 
##     time: 137.552 seconds
## 
##   missForest iteration 6 in progress...done!
##     estimated error(s): 0.1239226 0.1271489 
##     difference(s): 0.0001957441 0.009300445 
##     time: 141.707 seconds
## 
##   missForest iteration 7 in progress...done!
##     estimated error(s): 0.1246732 0.1291785 
##     difference(s): 0.0002683856 0.01033664 
##     time: 136.865 seconds

Did this work?

sapply(as.data.frame(impute_results$ximp), function(x) mean(is.na(x)))
##                      HCM2                   CONTROL                    LOCALE 
##                         0                         0                         0 
##                   CCBASIC                  CCUGPROF                  CCSIZSET 
##                         0                         0                         0 
##                      HBCU                       PBI                     ANNHI 
##                         0                         0                         0 
##                    TRIBAL                   AANAPII                       HSI 
##                         0                         0                         0 
##                     NANTI                   MENONLY                 WOMENONLY 
##                         0                         0                         0 
##              ADM_RATE_ALL                   SATVR25                   SATVR75 
##                         0                         0                         0 
##               SAT_AVG_ALL              DISTANCEONLY                UGDS_WHITE 
##                         0                         0                         0 
##                UGDS_BLACK                 UGDS_HISP                UGDS_ASIAN 
##                         0                         0                         0 
##                  INEXPFTE                 AVGFACSAL                    PFTFAC 
##                         0                         0                         0 
##                    C150_4              C150_4_BLACK               C150_4_HISP 
##                         0                         0                         0 
##              C150_4_ASIAN               C150_4_AIAN                C150_4_NRA 
##                         0                         0                         0 
##                  PCTFLOAN                   UG25abv   LO_INC_COMP_ORIG_YR6_RT 
##                         0                         0                         0 
##   MD_INC_COMP_ORIG_YR6_RT   HI_INC_COMP_ORIG_YR6_RT FIRSTGEN_COMP_ORIG_YR6_RT 
##                         0                         0                         0 
##         PAR_ED_PCT_1STGEN                  DEBT_MDN            mn_earn_wne_p8 
##                         0                         0                         0 
##         pct25_earn_wne_p8         pct75_earn_wne_p8         pct90_earn_wne_p8 
##                         0                         0                         0 
##           RPY_3YR_RT_SUPP    LO_INC_RPY_3YR_RT_SUPP    MD_INC_RPY_3YR_RT_SUPP 
##                         0                         0                         0 
##    HI_INC_RPY_3YR_RT_SUPP    NONCOM_RPY_3YR_RT_SUPP  FIRSTGEN_RPY_3YR_RT_SUPP 
##                         0                         0                         0 
##          PCT_LIBERAL_ARTS          PCT_PROFESSIONAL             PCT_RELIGIOUS 
##                         0                         0                         0 
##                      COST            PCT_VOCATIONAL 
##                         0                         0

What is the estimate of the OOB (out-of-bag) imputation error? The NRMSE is the normalized root mean squared error for the continuous variables, and the PFC is the proportion of falsely classified entries for the categorical ones.

impute_results$OOBerror
##     NRMSE       PFC 
## 0.1239226 0.1271489
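If we wanted a per-variable picture of imputation quality, missForest can also return an out-of-bag error estimate for each column separately (mean squared error for continuous variables, proportion of falsely classified entries for factors) via its variablewise argument. A sketch, keeping in mind that this reruns the whole imputation and takes about as long as the run above:

# Per-variable OOB error estimates; reruns the imputation from scratch
impute_results_vw <- missForest(as.data.frame(colleges_subset), variablewise = TRUE)
head(impute_results_vw$OOBerror)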

Saving Imputed Results to File

Let’s save these results, with the imputed values filled in for the missing values, to a CSV file for later use.

colleges_imputed <- as.data.frame(impute_results$ximp)
write_csv(colleges_imputed, "./scorecard_imputed.csv")
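Since colleges_subset dropped the identifier columns, it may also be handy to save a version with the identifiers attached. This assumes (as is the case here) that the row order of impute_results$ximp still matches colleges; the file name is just a suggestion.

# Keep the identifier columns (UNITID, INSTNM, ZIP) alongside the imputed values
colleges_imputed_ids <- cbind(colleges[, 1:3], colleges_imputed)
write_csv(colleges_imputed_ids, "./scorecard_imputed_with_ids.csv")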

Clustering Results with Random Forest Imputation

Let’s do k-means clustering now and see what kind of results we get. Let’s use the broom package to explore how many clusters may be appropriate for this data set. For starters, let’s try out 4 to 20 clusters.

library(dplyr)
library(broom)
library(tidyr)

set.seed(1234)
kclusts <- data.frame(k=4:20) %>% 
    group_by(k) %>% 
    do(kclust = kmeans(colleges_imputed, .$k))

Now let’s use tidy, augment, and glance to organize these various sets of k-means clusterings.

clusters <- kclusts %>% group_by(k) %>% do(tidy(.$kclust[[1]]))
assignments <- kclusts %>% group_by(k) %>% do(augment(.$kclust[[1]], colleges_imputed))
clusterings <- kclusts %>% group_by(k) %>% do(glance(.$kclust[[1]]))
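If these broom verbs are unfamiliar: tidy summarizes each cluster, augment attaches cluster assignments to the original rows, and glance returns one row of model-level summaries per clustering, which is what we plot next. For example:

# One row per value of k, with model-level summaries (totss, tot.withinss, betweenss, iter)
head(clusterings)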

Specifically, let’s look at the total within-cluster sum of squares, saved in the tot.withinss column of clusterings. This shows us the variance within the clusters.

library(ggfortify)
ggplot(clusterings, aes(k, tot.withinss)) +
    geom_line(color = "blue", alpha = 0.5, size = 2) +
    geom_point(size = 0.8)

Notice that the variance decreases with more clusters, but there is a “bend” or “elbow” around 12 clusters. This is around the same place we saw an elbow with the earlier subset of colleges, and it tells us we don’t gain much by going beyond 12 or so clusters.
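One rough way to quantify that elbow is to look at how much tot.withinss drops with each additional value of k; the percentage improvement should get small past 12 or so. A minimal sketch, assuming the clusterings data frame from above:

# Percent reduction in tot.withinss gained at each successive value of k
clusterings %>%
    ungroup() %>%
    arrange(k) %>%
    mutate(pct_drop = 100 * (lag(tot.withinss) - tot.withinss) / lag(tot.withinss))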

Plotting Clustering Results

Let’s look at how the colleges are distributed in the space of the first two principal components, clustered with \(k = 12\).

# Convert the factor columns to numeric codes so everything is numeric for kmeans and autoplot
colleges_imputed[] <- lapply(colleges_imputed[], as.numeric)
set.seed(1234)
autoplot(kmeans(colleges_imputed, 12), data = colleges_imputed, size = 3, alpha = 0.5) + 
    ggtitle("K-Means Clustering of College Scorecard Data") +
    theme(legend.position="none")

That looks REMARKABLY like the results from clustering the median-imputed data. Let’s label a few of these institutions.

library(ggrepel)
set.seed(1234)
my_kmeans <- kmeans(colleges_imputed, 12)
autoplot(my_kmeans, data = colleges_imputed, size = 3, alpha = 0.5) + 
    ggtitle("K-Means Clustering of College Scorecard Data") +
    theme(legend.position="none") +
    geom_text_repel(aes(PC1, PC2, 
                        color = factor(my_kmeans$cluster), 
                        label = ifelse(SAT_AVG_ALL > 1475, colleges$INSTNM,'')),
                    segment.color = NA)

Looking at the labels, though, it turns out the clustering is a little bit different now than it was before.

Summary

It’s important to remember that this fancier imputation method still isn’t adding any new information to the data set; it’s just a question of which approach fills in the NA values more appropriately. Let’s look, just for kicks, at a few of these clusters.

colleges <- colleges %>% 
    mutate(cluster = my_kmeans$cluster) 

colleges %>%
    filter(cluster == 4) %>%
    select(INSTNM)
## # A tibble: 27 × 1
##                                                  INSTNM
##                                                   <chr>
## 1      University of California-Hastings College of Law
## 2                University of California-San Francisco
## 3                 Western University of Health Sciences
## 4                                 Georgetown University
## 5                   Midwestern University-Downers Grove
## 6  Rosalind Franklin University of Medicine and Science
## 7                                    Harvard University
## 8                 Massachusetts Institute of Technology
## 9  University of Massachusetts Medical School Worcester
## 10              A T Still University of Health Sciences
## # ... with 17 more rows
colleges %>%
    filter(cluster == 7) %>%
    select(INSTNM)
## # A tibble: 6 × 1
##                                                         INSTNM
##                                                          <chr>
## 1                                              Yale University
## 2                                 Morehouse School of Medicine
## 3                                Weill Cornell Medical College
## 4              University of Texas Southwestern Medical Center
## 5 The University of Texas Health Science Center at San Antonio
## 6     The University of Texas Health Science Center at Houston
colleges %>%
    filter(cluster == 9) %>%
    select(INSTNM)
## # A tibble: 66 × 1
##                                         INSTNM
##                                          <chr>
## 1  University of Arkansas for Medical Sciences
## 2           California Institute of Technology
## 3                  California Maritime Academy
## 4             California Western School of Law
## 5                    Claremont McKenna College
## 6                          Harvey Mudd College
## 7                        Loma Linda University
## 8                    University of the Pacific
## 9                    Samuel Merritt University
## 10               Marshall B Ketchum University
## # ... with 56 more rows
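To see how all of the colleges are spread across the 12 clusters, not just the three shown above, we can count cluster membership; the exact sizes depend on the random seed.

# How many colleges ended up in each cluster?
colleges %>%
    count(cluster)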

Let’s look at which variables are most important in the principal component analysis, since PCA is so closely related to k-means clustering.

library(reshape2)

colleges_imputed <- scale(colleges_imputed)
college_pca <- prcomp(colleges_imputed)
melted <- melt(college_pca$rotation[,1:9])

ggplot(data = melted[melted$Var2 == "PC1",]) +
    theme(legend.position = "none", 
          axis.ticks.x = element_blank()) + 
    labs(x = "Measurements in College Scorecard data",
         y = "Relative importance in principle component",
         title = "Variables in PC1") +
    geom_bar(aes(x=Var1, y=value, fill=Var1), stat="identity") +
    coord_flip()

A lot of these are very important: SAT scores, the repayment data, the completion data.
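To make that more concrete, we can rank the PC1 loadings by absolute magnitude; the exact ordering depends on the imputed values, so treat this as a quick check rather than a definitive ranking.

# Top contributors to PC1, ranked by the absolute size of their loadings
melted %>%
    filter(Var2 == "PC1") %>%
    arrange(desc(abs(value))) %>%
    head(10)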