Don’t be K-Mean, join a gRoup

Download Data

# Load the four CSV datasets straight from their Google Drive download links.
workshopQueryData = read.csv(
  file = url('https://drive.google.com/uc?id=1ff4xFh4fl0-SvpYNeYQoNvDbzdiZfn-t')
)
bootcampTurnoutData = read.csv(
  file = url('https://drive.google.com/uc?id=1sovKLesEqPbkUte_ysRP9mGQ1gFCO2ME')
)
workshopsData = read.csv(
  file = url('https://drive.google.com/uc?id=10MngpIZoAGgwAk_sxoORj7WPYs74nz5Y')
)
applicantData = read.csv(
  file = url('https://drive.google.com/uc?id=1mPK8_AasPMdqy3D9D0kxjjKcIXmhhcQo')
)

Construct scoring matrix to cluster

# Build the numeric, standardised feature matrix that k-means is run on.
# Keep the applicant id plus the six features used for clustering.
cltrApplicantData = select(applicantData, c(userid, age_bin, classification, first_generation, 
                                            datascience_experience, technology_experience, num_hackathons_attended))
# Encode the categorical features as ordered factors so data.matrix() below
# maps them to increasing integer codes.
cltrApplicantData$age_bin = as.ordered(cltrApplicantData$age_bin)
cltrApplicantData$classification = ordered(cltrApplicantData$classification, levels = c('Fr', 'So', 'Jr', 'Sr', 'O', 'Ma', 'PhD'))
cltrApplicantData$first_generation = as.numeric(cltrApplicantData$first_generation)
# Replace the comma-separated technology list with the number of entries.
# The count is computed as one integer vector: writing counts back one element
# at a time into the character column stored them as text, which data.matrix()
# then converted to alphabetically ordered factor codes ('10' before '2')
# instead of the real counts.
# NOTE(review): a value with no commas (possibly an empty list such as '[]')
# still counts as 1 -- confirm against the raw data.
cltrApplicantData$technology_experience = lengths(strsplit(cltrApplicantData$technology_experience, ','))
cltrApplicantData$num_hackathons_attended=ordered(cltrApplicantData$num_hackathons_attended, levels = c('0', '1-3', '4-7', '8-10', '10+'))
# Convert every feature to numbers, keeping userid aside so it can become the
# row names of the matrix instead of a clustering feature.
cltrApplicantData = data.frame(userid = cltrApplicantData$userid,data.matrix(select(cltrApplicantData, !userid)))
rownames(cltrApplicantData) = cltrApplicantData[,1]
cltrApplicantData = select(cltrApplicantData, !userid)
# Centre and scale each column so every feature has the same weight in the
# distance computation.
cltrApplicantData = scale(cltrApplicantData)

Perform naive clustering with 4 clusters

# Pairwise distances between applicants on the scaled feature matrix,
# drawn as a red-yellow-green heatmap without row/column labels.
distance = get_dist(x = cltrApplicantData)
fviz_dist(
  dist.obj = distance,
  gradient = list(low = 'red', mid = 'yellow', high = 'green'),
  show_labels = FALSE
)

# First attempt: k-means with 4 centres and 25 random starts, then print the fit.
k4 = kmeans(x = cltrApplicantData, centers = 4, nstart = 25)
k4

## K-means clustering with 4 clusters of sizes 240, 251, 411, 348
## 
## Cluster means:
##      age_bin classification first_generation datascience_experience
## 1  0.1206025     0.01069171        1.9745479             -0.2379226
## 2 -0.3268342    -0.18224314       -0.4170945              0.4916803
## 3 -0.7655678    -0.85568913       -0.5060399             -0.6257270
....

## Within cluster sum of squares by cluster:
## [1] 1040.0609  829.9047 1010.2029 1105.1418
##  (between_SS / total_SS =  46.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

# Plot the 4-cluster solution, drawing applicants as points only.
fviz_cluster(
  object = k4,
  data = cltrApplicantData,
  geom = 'point'
)

Calculate the gap statistic to find the optimal number of clusters

# Fix the RNG seed, then compute the gap statistic for k = 1..30 with
# 50 bootstrap samples; nstart = 25 is forwarded to kmeans().
set.seed(123456789)
gapStat = clusGap(
  x = cltrApplicantData,
  FUNcluster = kmeans,
  K.max = 30,
  B = 50,
  nstart = 25
)
fviz_gap_stat(gap_stat = gapStat)

# Re-cluster with 21 centres and plot the result.
# NOTE(review): 21 is hard-coded; presumably it was read off the gap-statistic
# plot above -- confirm if the data or seed changes.
kOptimalGap = kmeans(x = cltrApplicantData, centers = 21, nstart = 25)
fviz_cluster(
  object = kOptimalGap,
  data = cltrApplicantData,
  geom = 'point'
)

Sanity check: are people in the same cluster ‘similar’?

# Attach each applicant's cluster label to the raw applicant table and group
# by it. Labels are matched to applicants by row position.
clusteredApplicants = group_by(
  mutate(applicantData, cluster = kOptimalGap$cluster),
  cluster
)
# Optional inspection helpers, left disabled:
#head(clusteredApplicants)
#table(clusteredApplicants$cluster)

# Show five random members of cluster 14, hiding the school and minors columns.
clusteredApplicants %>%
  filter(cluster == 14) %>%
  select(!c(school, other_school, minors)) %>%
  sample_n(size = 5)

## # A tibble: 5 x 11
## # Groups:   cluster [1]
##   userid majors age_bin classification first_generation datascience_exp~
##   <chr>  <chr>  <chr>   <chr>          <lgl>                       <int>
## 1 a8ec4~ ['Dat~ (20, 2~ Sr             TRUE                            1
## 2 790c0~ ['Com~ (20, 2~ Sr             FALSE                           3
## 3 1041a~ ['Ana~ (22, 2~ Ma             TRUE                            2
## 4 ab2e0~ ['Com~ (22, 2~ Ma             FALSE                           3
## 5 8a932~ ['Com~ (20, 2~ Jr             FALSE                           3
## # ... with 5 more variables: technology_experience <chr>,
## #   num_hackathons_attended <chr>, workshop_suggestions <chr>,
## #   relavent_industries <chr>, cluster <int>

# Show five random members of cluster 1, hiding the school and minors columns.
clusteredApplicants %>%
  filter(cluster == 1) %>%
  select(!c(school, other_school, minors)) %>%
  sample_n(size = 5)

## # A tibble: 5 x 11
## # Groups:   cluster [1]
##   userid majors age_bin classification first_generation datascience_exp~
##   <chr>  <chr>  <chr>   <chr>          <lgl>                       <int>
## 1 71039~ ['Com~ (15, 1~ Fr             TRUE                            1
## 2 04c27~ ['Com~ (18, 2~ So             TRUE                            0
## 3 530f8~ ['Com~ (18, 2~ Jr             TRUE                            0
## 4 51ef8~ ['Ind~ (18, 2~ So             TRUE                            0
## 5 01bb0~ ['Lan~ (15, 1~ Fr             TRUE                            0
## # ... with 5 more variables: technology_experience <chr>,
## #   num_hackathons_attended <chr>, workshop_suggestions <chr>,
## #   relavent_industries <chr>, cluster <int>

Cluster 14 appears to consist mostly of upperclassmen and graduate-level students with a lot of experience and many technical skills. In comparison, cluster 1 appears to consist mostly of first-generation underclassmen with very little experience and few technical skills.

Given a userid, recommend 5 group mates of similar ability

# Recommend teammates for one applicant by sampling other members of the
# applicant's cluster.
#
# Args:
#   user:   a single userid present in clusteredApplicants$userid.
#   nMates: number of teammates to suggest (default 5).
#
# Returns:
#   A tibble grouped by cluster with the columns cluster and userid. It holds
#   nMates randomly chosen rows, or every other member of the cluster when
#   fewer than nMates are available.
#
# Reads the global clusteredApplicants table and stops with an error when
# user is not found in it.
groupMake = function(user, nMates = 5){
  stopifnot(length(user) == 1)
  # Extract the cluster label as a plain value. Comparing the cluster column
  # against the one-cell tibble returned by select() only worked through
  # recycling of a 1x1 comparison result.
  userCluster = clusteredApplicants$cluster[which(clusteredApplicants$userid == user)]
  if (length(userCluster) == 0) {
    stop('userid ', user, ' was not found in clusteredApplicants', call. = FALSE)
  }
  userCluster = userCluster[1]
  # Everyone else in the same cluster; cluster is selected explicitly so the
  # grouping column is not re-added implicitly.
  candidates = clusteredApplicants %>%
    filter(cluster == userCluster, userid != user) %>%
    select(cluster, userid)
  # sample_n() fails when asked for more rows than exist, so small clusters
  # return all remaining members instead.
  if (nrow(candidates) < nMates) {
    return(candidates)
  }
  sample_n(candidates, nMates)
}
# Try it on one randomly chosen applicant.
randomUserID = sample(applicantData$userid, 1)
groupMake(randomUserID)

## # A tibble: 5 x 2
## # Groups:   cluster [1]
##   cluster userid                          
##     <int> <chr>                           
## 1      21 a71eed608a35168b5d4df396cff9360a
## 2      21 823786777a39b69f4f1a338b716ae7e8
## 3      21 eb1080b0953d56a49a5c7bc84a57899e
## 4      21 0efea25e2b91303de9efee883ddeca49
## 5      21 08a9f8ba39b3f9f6369a760bb1fbed8a