# Load the four source tables from their hosted CSV exports.
workshopQueryData <- read.csv(
  url("https://drive.google.com/uc?id=1ff4xFh4fl0-SvpYNeYQoNvDbzdiZfn-t")
)
bootcampTurnoutData <- read.csv(
  url("https://drive.google.com/uc?id=1sovKLesEqPbkUte_ysRP9mGQ1gFCO2ME")
)
workshopsData <- read.csv(
  url("https://drive.google.com/uc?id=10MngpIZoAGgwAk_sxoORj7WPYs74nz5Y")
)
applicantData <- read.csv(
  url("https://drive.google.com/uc?id=1mPK8_AasPMdqy3D9D0kxjjKcIXmhhcQo")
)
# Build the scaled numeric feature matrix used for clustering applicants.
# Input:  applicantData (one row per applicant).
# Output: cltrApplicantData, a centred and scaled numeric matrix whose row
#         names are the applicants' userids.
cltrApplicantData <- select(
  applicantData,
  c(userid, age_bin, classification, first_generation,
    datascience_experience, technology_experience, num_hackathons_attended)
)

# Encode the categorical answers as ordered factors so that data.matrix()
# maps them to rank-like integer codes.
cltrApplicantData$age_bin <- as.ordered(cltrApplicantData$age_bin)
cltrApplicantData$classification <- ordered(
  cltrApplicantData$classification,
  levels = c("Fr", "So", "Jr", "Sr", "O", "Ma", "PhD")
)
cltrApplicantData$first_generation <- as.numeric(
  cltrApplicantData$first_generation
)

# Replace the comma-separated technology list with the number of entries.
# The count must be stored as an integer vector: writing it back element by
# element into the character column leaves strings such as "10" and "2",
# which data.matrix() converts to factor codes in lexicographic order
# ("10" < "2"), corrupting the feature.
cltrApplicantData$technology_experience <- lengths(
  strsplit(as.character(cltrApplicantData$technology_experience), ",")
)

cltrApplicantData$num_hackathons_attended <- ordered(
  cltrApplicantData$num_hackathons_attended,
  levels = c("0", "1-3", "4-7", "8-10", "10+")
)

# Move userid into the row names, convert the remaining columns to a numeric
# matrix, then centre and scale every column.
rownames(cltrApplicantData) <- cltrApplicantData$userid
cltrApplicantData <- scale(data.matrix(select(cltrApplicantData, !userid)))
# Pairwise distances between the scaled applicant rows (get_dist's default
# method), drawn as a heatmap without row/column labels.
distance <- get_dist(cltrApplicantData)
fviz_dist(
  distance,
  gradient = list(low = "red", mid = "yellow", high = "green"),
  show_labels = FALSE
)
# Fit a 4-cluster k-means model (25 random starts) and print its summary.
# k-means starts from random centres, so the seed is fixed here to make the
# printed summary reproducible between runs.
set.seed(123456789)
k4 <- kmeans(cltrApplicantData, centers = 4, nstart = 25)
k4
## K-means clustering with 4 clusters of sizes 240, 251, 411, 348
##
## Cluster means:
## age_bin classification first_generation datascience_experience
## 1 0.1206025 0.01069171 1.9745479 -0.2379226
## 2 -0.3268342 -0.18224314 -0.4170945 0.4916803
## 3 -0.7655678 -0.85568913 -0.5060399 -0.6257270
....
## Within cluster sum of squares by cluster:
## [1] 1040.0609 829.9047 1010.2029 1105.1418
## (between_SS / total_SS = 46.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Plot the 4-cluster solution as points.
fviz_cluster(k4, data = cltrApplicantData, geom = "point")

# Fix the seed so the gap-statistic bootstrap and the k-means fit that
# follows it are repeatable.
set.seed(123456789)
gapStat <- clusGap(
  cltrApplicantData,
  FUNcluster = kmeans,
  nstart = 25,
  K.max = 30,
  B = 50
)
fviz_gap_stat(gapStat)

# Refit with 21 clusters and plot the result.
# NOTE(review): 21 is hard-coded; presumably read off the gap-statistic
# plot above - confirm if the data changes.
kOptimalGap <- kmeans(cltrApplicantData, centers = 21, nstart = 25)
fviz_cluster(kOptimalGap, data = cltrApplicantData, geom = "point")
# Attach each applicant's cluster label from kOptimalGap to the raw applicant
# table and group by it. Labels are matched to rows by position.
clusteredApplicants <- applicantData %>%
  mutate(cluster = kOptimalGap$cluster) %>%
  group_by(cluster)
# head(clusteredApplicants)
# table(clusteredApplicants$cluster)

# Show five random members of cluster 14, hiding the school/minor columns.
clusteredApplicants %>%
  filter(cluster == 14) %>%
  select(!c(school, other_school, minors)) %>%
  sample_n(5)
## # A tibble: 5 x 11
## # Groups: cluster [1]
## userid majors age_bin classification first_generation datascience_exp~
## <chr> <chr> <chr> <chr> <lgl> <int>
## 1 a8ec4~ ['Dat~ (20, 2~ Sr TRUE 1
## 2 790c0~ ['Com~ (20, 2~ Sr FALSE 3
## 3 1041a~ ['Ana~ (22, 2~ Ma TRUE 2
## 4 ab2e0~ ['Com~ (22, 2~ Ma FALSE 3
## 5 8a932~ ['Com~ (20, 2~ Jr FALSE 3
## # ... with 5 more variables: technology_experience <chr>,
## # num_hackathons_attended <chr>, workshop_suggestions <chr>,
## # relavent_industries <chr>, cluster <int>
# Show five random members of cluster 1, hiding the school/minor columns.
clusteredApplicants %>%
  filter(cluster == 1) %>%
  select(!c(school, other_school, minors)) %>%
  sample_n(5)
## # A tibble: 5 x 11
## # Groups: cluster [1]
## userid majors age_bin classification first_generation datascience_exp~
## <chr> <chr> <chr> <chr> <lgl> <int>
## 1 71039~ ['Com~ (15, 1~ Fr TRUE 1
## 2 04c27~ ['Com~ (18, 2~ So TRUE 0
## 3 530f8~ ['Com~ (18, 2~ Jr TRUE 0
## 4 51ef8~ ['Ind~ (18, 2~ So TRUE 0
## 5 01bb0~ ['Lan~ (15, 1~ Fr TRUE 0
## # ... with 5 more variables: technology_experience <chr>,
## # num_hackathons_attended <chr>, workshop_suggestions <chr>,
## # relavent_industries <chr>, cluster <int>
Cluster 14 appears to be a group of mostly upperclassmen or graduate-level students with substantial experience and many technical skills. In comparison, cluster 1 appears to be a group of mostly underclassmen who are first-generation students with very little experience and few technical skills.
# Suggest teammates for an applicant by sampling from the applicant's cluster.
#
# user: a single userid to look up in clusteredApplicants.
# Returns: a tibble with columns cluster and userid (still grouped by
#   cluster) holding up to five randomly chosen applicants from the same
#   cluster as `user`, excluding `user`. Fewer than five rows are returned
#   when the cluster does not have five other members.
# Stops with an error when `user` is not a single value or is not present in
# clusteredApplicants.
groupMake = function(user) {
  stopifnot(length(user) == 1)

  # Extract the cluster id as a plain vector. Comparing the cluster column
  # with a one-cell tibble inside filter() relies on per-group recycling and
  # is not a dependable way to match rows.
  userCluster = unique(
    clusteredApplicants$cluster[which(clusteredApplicants$userid == user)]
  )
  if (length(userCluster) == 0) {
    stop("userid not found in clusteredApplicants: ", user, call. = FALSE)
  }

  clusteredApplicants %>%
    filter(cluster == userCluster[1], userid != user) %>%
    # cluster is the grouping column and would be kept anyway; naming it
    # makes the output shape explicit.
    select(cluster, userid) %>%
    # slice_sample() truncates to the rows available instead of erroring
    # when fewer than five candidates exist.
    slice_sample(n = 5)
}
# Pick one applicant at random and list suggested teammates for them.
randomUserID <- sample(applicantData$userid, size = 1)
groupMake(user = randomUserID)
## # A tibble: 5 x 2
## # Groups: cluster [1]
## cluster userid
## <int> <chr>
## 1 21 a71eed608a35168b5d4df396cff9360a
## 2 21 823786777a39b69f4f1a338b716ae7e8
## 3 21 eb1080b0953d56a49a5c7bc84a57899e
## 4 21 0efea25e2b91303de9efee883ddeca49
## 5 21 08a9f8ba39b3f9f6369a760bb1fbed8a