Thanks for taking the time to complete the Capstone Interest Survey. Scroll down to see your group assignments. Want to explore the data for yourself (sorry, class members only)? Grab the data and script here (gated), and save to a directory called interestSurvey. Change the path to this directory in the first code chunk. If you don't have R, download from here and install. Then download and install RStudio. Use RStudio to open the .rmd file in interestSurvey.
# set working directory
setwd("/Users/ericpgreen/Dropbox/Teaching/Duke/Capstone/S14/interestSurvey")
# libraries
require(ggplot2)
## Loading required package: ggplot2
# grab the anonymized data
dat <- read.csv("dat.csv", stringsAsFactors=FALSE)
# create dummy variables for select all questions
# topics
topics <- c("intimate partner violence / sexual assault",
"obesity",
"maternal health",
"malnutrition",
"evidence based practice",
"social protection",
"family planning",
"medical records",
"health insurance",
"community outreach workers",
"diagnostics",
"mental health",
"access to healthcare",
"funding for research/basic science",
"empowerment",
"poverty alleviation",
"other")
topics.abb <- c("ipv",
"obe",
"mh",
"mal",
"evp",
"sp",
"fp",
"rec",
"ins",
"com",
"dia",
"mh",
"a2h",
"fun",
"emp",
"pov",
"oth")
for (t in 1:length(topics)) {
dat[paste("top", topics.abb[t], sep=".")] <- ifelse(regexpr(topics[t],
dat$all.topics) > 0,
1, 0)
}
# deliverables
deliverables <- c("software, app",
"communications campaign",
"curriculum",
"training manual",
"evaluation framework, tools",
"white paper",
"website",
"infographic",
"quantitative analysis",
"policy paper",
"social media campaign",
"printed materials",
"video",
"crowd funding",
"other")
deliverables.abb <- c("app",
"cc",
"cur",
"tm",
"ef",
"wp",
"web",
"ifo",
"qa",
"pp",
"smc",
"pri",
"vid",
"cf",
"oth")
for (d in 1:length(deliverables)) {
dat[paste("del", deliverables.abb[d], sep=".")] <-
ifelse(regexpr(deliverables[d], dat$all.deliverables) > 0, 1, 0)
}
# populations
populations <- c("rural",
"public private partnerships",
"orphans and vulnerable children",
"policy makers",
"substance abusers",
"resettled refugees",
"schools",
"low-income workers",
"prisons",
"hispanic",
"homeless",
"other")
populations.abb <- c("rur",
"ppp",
"ovc",
"om",
"sa",
"ref",
"sch",
"liw",
"prs",
"his",
"hom",
"oth")
for (p in 1:length(populations)) {
dat[paste("pop", populations.abb[p], sep=".")] <-
ifelse(regexpr(populations[p], dat$all.pops) > 0, 1, 0)
}
# skills have
have <- c("writing",
"visual design / graphics",
"lit reviews",
"quantitative analysis",
"statistical programming",
"computer programming",
"web development",
"video",
"social media",
"program design",
"evaluation",
"project management",
"other")
have.abb <- c("write",
"design",
"litrev",
"quant",
"statpro",
"comppro",
"webdev",
"video",
"socmed",
"prodes",
"eval",
"projm",
"other")
for (h in 1:length(have)) {
dat[paste("have", have.abb[h], sep=".")] <-
ifelse(regexpr(have[h], dat$skills.have) > 0, 1, 0)
}
# skills want
want <- c("writing",
"visual design / graphics",
"lit reviews",
"quantitative analysis",
"statistical programming",
"computer programming",
"web development",
"video",
"social media",
"program design",
"evaluation",
"project management",
"other")
want.abb <- c("write",
"design",
"litrev",
"quant",
"statpro",
"comppro",
"webdev",
"video",
"socmed",
"prodes",
"eval",
"projm",
"other")
for (w in 1:length(want)) {
dat[paste("want", want.abb[w], sep=".")] <-
ifelse(regexpr(want[w], dat$skills.want) > 0, 1, 0)
}
# drop "all" variables
dat$all.topics <- dat$all.deliverables <- dat$all.pops <- NULL
dat$skills.have <- dat$skills.want <- NULL
In class we brainstormed possible topics for Capstone projects. The following plot shows how many students (N=16) expressed any interest in each topic via the Google Form.
# prepare data
top.var <- grep("top.", names(dat), fixed=TRUE) # index variables with top.
top.sum <- as.data.frame(colSums(dat[top.var])) # count
names(top.sum) <- "count" # rename count variable
top.sum$topic <- row.names(top.sum) # create topic name
for (i in 1:nrow(top.sum)) { # give topics labels
label <- grep(substr(top.sum$topic[i], 5, 7), topics.abb)
top.sum$topic.lab[i] <- topics[label]
}
top.sum = transform(top.sum, # reorder by count
topic.lab = reorder(topic.lab, count))
# plot
ggplot(top.sum) +
geom_point(aes(y=count, x=topic.lab)) +
coord_flip() +
theme_bw() +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = 'black'),
panel.background = element_blank()) +
xlab("Topics") +
ylab("Count")
Here's a plot of first choices for topics:
top.1 <- data.frame(table(dat$topic1))
top.1 = transform(top.1, Var1 = reorder(Var1, Freq))
ggplot(top.1) +
geom_point(aes(y=Freq, x=Var1)) +
coord_flip() +
theme_bw() +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = 'black'),
panel.background = element_blank()) +
xlab("First Choice Topics") +
ylab("Count")
This plot displays how many students (N=16) expressed any interest in each type of deliverable we listed.
# prepare data
del.var <- grep("del.", names(dat), fixed=TRUE) # index variables with del.
del.sum <- as.data.frame(colSums(dat[del.var])) # count
names(del.sum) <- "count" # rename count variable
del.sum$deliverable <- row.names(del.sum) # create name
for (i in 1:nrow(del.sum)) { # give labels
label <- grep(substr(del.sum$deliverable[i], 5, 7), deliverables.abb)
del.sum$deliverable.lab[i] <- deliverables[label]
}
del.sum = transform(del.sum, # reorder by count
deliverable.lab = reorder(deliverable.lab, count))
# plot
ggplot(del.sum) +
geom_point(aes(y=count, x=deliverable.lab)) +
coord_flip() +
theme_bw() +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = 'black'),
panel.background = element_blank()) +
xlab("Deliverables") +
ylab("Count")
Followed by a plot of first choices for deliverables:
del.1 <- data.frame(table(dat$deliverable1))
del.1 = transform(del.1, Var1 = reorder(Var1, Freq))
ggplot(del.1) +
geom_point(aes(y=Freq, x=Var1)) +
coord_flip() +
theme_bw() +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = 'black'),
panel.background = element_blank()) +
xlab("First Choice Deliverables") +
ylab("Count")
The next plot displays how many students (N=16) expressed any interest in each population we mentioned.
# prepare data
pop.var <- grep("pop.", names(dat), fixed=TRUE) # index variables with pop.
pop.sum <- as.data.frame(colSums(dat[pop.var])) # count
names(pop.sum) <- "count" # rename count variable
pop.sum$population <- row.names(pop.sum) # create name
for (i in 1:nrow(pop.sum)) { # give labels
label <- grep(substr(pop.sum$population[i], 5, 7), populations.abb)
pop.sum$population.lab[i] <- populations[label]
}
pop.sum = transform(pop.sum, # reorder by count
population.lab = reorder(population.lab, count))
# plot
ggplot(pop.sum) +
geom_point(aes(y=count, x=population.lab)) +
coord_flip() +
theme_bw() +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = 'black'),
panel.background = element_blank()) +
xlab("Populations") +
ylab("Count")
Here's a plot of first choices for populations:
pop.1 <- data.frame(table(dat$population1))
pop.1 = transform(pop.1, Var1 = reorder(Var1, Freq))
ggplot(pop.1) +
geom_point(aes(y=Freq, x=Var1)) +
coord_flip() +
theme_bw() +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = 'black'),
panel.background = element_blank()) +
xlab("First Choice Populations") +
ylab("Count")
While it might not be possible to have someone work on their top choices, we can use this information to group students. In the next code chunk, I weight each response by students' first, second, and third choices. For instance, if a student indicated interest in 5 topics, her top choice is multiplied by 4; her second choice is multiplied by 3; and her third choice is multipled by 2. Basic interests, coded as 1's remain a value of 1. Zero interest remains a value of 0.
# weight interests by top choices (1stx4, 2ndx3, 3rdx2)
for (i in 1:nrow(dat)) { # topics
# first choice
first <- dat$topic1[i]
index <- grep(first, topics)
abb <- topics.abb[index]
index2 <- grep(paste("top", abb, sep="."), names(dat))
dat[i, index2] <- dat[i, index2]*4
# second choice
second <- dat$topic2[i]
index <- grep(second, topics)
abb <- topics.abb[index]
index2 <- grep(paste("top", abb, sep="."), names(dat))
dat[i, index2] <- dat[i, index2]*3
# third choice
third <- dat$topic3[i]
index <- grep(third, topics)
abb <- topics.abb[index]
index2 <- grep(paste("top", abb, sep="."), names(dat))
dat[i, index2] <- dat[i, index2]*2
}
for (i in 1:nrow(dat)) { # deliverables
# first choice
first <- dat$deliverable1[i]
index <- grep(first, deliverables)
abb <- deliverables.abb[index]
index2 <- grep(paste("del", abb, sep="."), names(dat))
dat[i, index2] <- dat[i, index2]*4
# second choice
second <- dat$deliverable2[i]
index <- grep(second, deliverables)
abb <- deliverables.abb[index]
index2 <- grep(paste("del", abb, sep="."), names(dat))
dat[i, index2] <- dat[i, index2]*3
# third choice
third <- dat$deliverable3[i]
index <- grep(third, deliverables)
abb <- deliverables.abb[index]
index2 <- grep(paste("del", abb, sep="."), names(dat))
dat[i, index2] <- dat[i, index2]*2
}
for (i in 1:nrow(dat)) { # populations
# first choice
first <- dat$population1[i]
index <- grep(first, populations)
abb <- populations.abb[index]
index2 <- grep(paste("pop", abb, sep="."), names(dat))
dat[i, index2] <- dat[i, index2]*4
# second choice
second <- dat$population2[i]
index <- grep(second, populations)
abb <- populations.abb[index]
index2 <- grep(paste("pop", abb, sep="."), names(dat))
dat[i, index2] <- dat[i, index2]*3
# third choice
third <- dat$population3[i]
index <- grep(third, populations)
abb <- populations.abb[index]
index2 <- grep(paste("pop", abb, sep="."), names(dat))
dat[i, index2] <- dat[i, index2]*2
}
I asked everyone to indicate the most important factor in their decision-making: topic, deliverable, or population. In the next code chunk, I add an additional weight (x2) to all variables in a student's most important domain. Variables coded 0 in these domains remain 0.
# weight interests by most important factor x2
for (i in top.var) {
dat[,i][dat$most.imp=="topic"] <- dat[,i][dat$most.imp=="topic"]*2
}
for (i in del.var) {
dat[,i][dat$most.imp=="deliverable"] <- dat[,i][dat$most.imp=="deliverable"]*2
}
for (i in pop.var) {
dat[,i][dat$most.imp=="population"] <- dat[,i][dat$most.imp=="population"]*2
}
With all of the interest variables propertly weighted, it's time to find potential groups (clusters). In the following chunk, I subset the data frame to include the weighted interest variables and student preferences for local vs global work and technology-oriented projects. Then I run a hierarchical cluster analysis and plot a dendrogram to show close relatives.
# hierarchical clustering
lc.var <- grep("local.global", names(dat))
ti.var <- grep("tech.interest", names(dat))
dat.cluster <- dat[,c(top.var, del.var, pop.var, lc.var, ti.var)]
distance <- dist(dat[,c(top.var, del.var, pop.var)], method="euclidean")
cluster <- hclust(distance, method="average")
plot(cluster, hang=-1, label=dat$id)
# assign clusters
clusters <- data.frame(cluster$order)
clusters$cluster <- c(1,1,1,2,2,2,2,3,3,3,3,1,4,4,4,4)
names(clusters) <- c("id", "cluster")
dat <- merge(dat, clusters, by="id")
Based on the dendrogram, there appear to be 3 clear clusters: 13, 14, 1, 10; 6, 5, 7, 9; and 8, 12, 4, 15. I checked the raw data and confirmed that these look like good groupings.
Students 2, 3, 16 are pretty close, so we can group them. Student 11 is closer to the last cluster, but we need even groups and the last cluster is full.
Now that we have some groups, let's look at how students' self-rated skills are distributed across the groups.
# prepare data
have.var <- grep("have.", names(dat), fixed=TRUE) # index variables with have.
have.dat <- dat[have.var]
have.dat <- cbind(have.dat, dat$cluster)
have.sum <- as.data.frame(sapply(split(have.dat, dat$cluster), colSums))
names(have.sum) <- c("grp1", "grp2", "grp3", "grp4") # rename count variable
have.sum$have <- row.names(have.sum) # create name
have.sum <- have.sum[1:(nrow(have.sum)-1),]
for (i in 1:nrow(have.sum)) { # give labels
label <- grep(substr(have.sum$have[i], 6,
nchar(have.sum$have[i])), have.abb)
have.sum$have.lab[i] <- have[label]
}
have.sum.long <- reshape(have.sum,
direction="long",
varying=list(names(have.sum)[1:4]))
names(have.sum.long)[names(have.sum.long)=="time"] <- "group"
names(have.sum.long)[names(have.sum.long)=="grp1"] <- "count"
have.sum.long = transform(have.sum.long, # reorder by count
have.lab = reorder(have.lab, count))
# plot
ggplot(have.sum.long) +
geom_point(aes(y=count, x=have.lab)) +
coord_flip() +
theme_bw() +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
axis.line = element_line(color = 'black'),
panel.background = element_blank()) +
xlab("Skills Possessed") +
ylab("Count") +
facet_wrap(~ group)
Finally, we can see if students who rated themselves highly on the “tend to lead” dimension are also distributed across groups. I labeled any student in the top 50th percentile as “tends to lead”.
[I removed this section of the analysis (and data) because it might be possible to infer where a few individual students fall on the self-reported leadership distribution. Not exact scores, but removed anyway.]
# merge in names
# it is not possible to determine student identities without the key
# set eric==0 below to stop the script from running; otherwise will throw error
# because key will be missing
eric <- 1 # set to 0 if not Eric so the script stops running
if (eric==1) {
name.key <- read.csv("key.csv", stringsAsFactors=FALSE)
dat <- merge(dat, name.key, by="id")
table(dat$name, dat$cluster)
}
##
## 1 2 3 4
## Aj 1 0 0 0
## Be 0 1 0 0
## Ch 1 0 0 0
## Cr 1 0 0 0
## Da 0 0 0 1
## El 0 1 0 0
## Ha 0 0 0 1
## Ke 0 0 0 1
## Li 0 1 0 0
## Lu 0 0 1 0
## Ma 1 0 0 0
## Mo 0 0 1 0
## Ne 0 0 1 0
## Ni 0 1 0 0
## Ov 0 0 0 1
## Re 0 0 1 0