Capstone Groups

Thanks for taking the time to complete the Capstone Interest Survey. Scroll down to see your group assignments. Want to explore the data for yourself (sorry, class members only)? Grab the data and script here (gated), and save to a directory called interestSurvey. Change the path to this directory in the first code chunk. If you don't have R, download from here and install. Then download and install RStudio. Use RStudio to open the .rmd file in interestSurvey.

# set working directory
  setwd("/Users/ericpgreen/Dropbox/Teaching/Duke/Capstone/S14/interestSurvey")
# libraries
  require(ggplot2)

## Loading required package: ggplot2

# grab the anonymized data
  dat <- read.csv("dat.csv", stringsAsFactors=FALSE)
# create dummy variables for select all questions
  # topics
    topics <- c("intimate partner violence / sexual assault",
                    "obesity",
                    "maternal health",
                    "malnutrition",
                    "evidence based practice",
                    "social protection",
                    "family planning",
                    "medical records",
                    "health insurance",
                    "community outreach workers",
                    "diagnostics",
                    "mental health",
                    "access to healthcare",
                    "funding for research/basic science",
                    "empowerment",
                    "poverty alleviation",
                    "other")
    topics.abb <- c("ipv",
                            "obe",
                            "mh",
                            "mal",
                            "evp",
                            "sp",
                            "fp",
                            "rec",
                            "ins",
                            "com",
                            "dia",
                            "mh",
                            "a2h",
                            "fun",
                            "emp",
                            "pov",
                            "oth")
    for (t in 1:length(topics)) {
        dat[paste("top", topics.abb[t], sep=".")] <- ifelse(regexpr(topics[t], 
                                                          dat$all.topics) > 0, 
                                                          1, 0)
    }
  # deliverables
    deliverables <- c("software, app",
                      "communications campaign",
                      "curriculum",
                      "training manual",
                      "evaluation framework, tools",
                      "white paper",
                      "website",
                      "infographic",
                      "quantitative analysis",
                      "policy paper",
                      "social media campaign",
                      "printed materials",
                      "video",
                      "crowd funding",
                      "other")
    deliverables.abb <- c("app",
                          "cc",
                          "cur",
                          "tm",
                          "ef",
                          "wp",
                          "web",
                          "ifo",
                          "qa",
                          "pp",
                          "smc",
                          "pri",
                          "vid",
                          "cf",
                          "oth")
    for (d in 1:length(deliverables)) {
      dat[paste("del", deliverables.abb[d], sep=".")] <- 
            ifelse(regexpr(deliverables[d], dat$all.deliverables) > 0, 1, 0)
    }
  # populations
    populations <- c("rural",
                      "public private partnerships",
                      "orphans and vulnerable children",
                      "policy makers",
                      "substance abusers",
                      "resettled refugees",
                      "schools",
                      "low-income workers",
                      "prisons",
                      "hispanic",
                      "homeless",
                      "other")
    populations.abb <- c("rur",
                          "ppp",
                          "ovc",
                          "om",
                          "sa",
                          "ref",
                          "sch",
                          "liw",
                          "prs",
                          "his",
                          "hom",
                          "oth")
    for (p in 1:length(populations)) {
        dat[paste("pop", populations.abb[p], sep=".")] <- 
              ifelse(regexpr(populations[p], dat$all.pops) > 0, 1, 0)
        }
  # skills have
    have <- c("writing",
               "visual design / graphics",
               "lit reviews",
               "quantitative analysis",
               "statistical programming",
               "computer programming",
               "web development",
               "video",
               "social media",
               "program design",
               "evaluation",
               "project management",
               "other")
    have.abb <- c("write",
                 "design",
                 "litrev",
                 "quant",
                 "statpro",
                 "comppro",
                 "webdev",
                 "video",
                 "socmed",
                 "prodes",
                 "eval",
                 "projm",
                 "other")
    for (h in 1:length(have)) {
        dat[paste("have", have.abb[h], sep=".")] <- 
              ifelse(regexpr(have[h], dat$skills.have) > 0, 1, 0)
    }
  # skills want
    want <- c("writing",
               "visual design / graphics",
               "lit reviews",
               "quantitative analysis",
               "statistical programming",
               "computer programming",
               "web development",
               "video",
               "social media",
               "program design",
               "evaluation",
               "project management",
               "other")
    want.abb <- c("write",
                 "design",
                 "litrev",
                 "quant",
                 "statpro",
                 "comppro",
                 "webdev",
                 "video",
                 "socmed",
                 "prodes",
                 "eval",
                 "projm",
                 "other")
    for (w in 1:length(want)) {
        dat[paste("want", want.abb[w], sep=".")] <- 
              ifelse(regexpr(want[w], dat$skills.want) > 0, 1, 0)
    }
# drop "all" variables
  dat$all.topics <- dat$all.deliverables <- dat$all.pops <- NULL
  dat$skills.have <- dat$skills.want <- NULL

Topics

In class we brainstormed possible topics for Capstone projects. The following plot shows how many students (N=16) expressed any interest in each topic via the Google Form.

# prepare data
  top.var <- grep("top.", names(dat), fixed=TRUE) # index variables with top.
  top.sum <- as.data.frame(colSums(dat[top.var])) # count
  names(top.sum) <- "count"                       # rename count variable
  top.sum$topic <- row.names(top.sum)             # create topic name
  for (i in 1:nrow(top.sum)) {                    # give topics labels
    label <- grep(substr(top.sum$topic[i], 5, 7), topics.abb)
    top.sum$topic.lab[i] <- topics[label] 
  }
  top.sum = transform(top.sum,                    # reorder by count
                      topic.lab = reorder(topic.lab, count))
# plot
  ggplot(top.sum) +
    geom_point(aes(y=count, x=topic.lab)) +
    coord_flip() +
    theme_bw() +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.border = element_blank(),
      axis.line = element_line(color = 'black'),
      panel.background = element_blank()) +
    xlab("Topics") +
    ylab("Count")

plot of chunk topic-freq

Here's a plot of first choices for topics:

  top.1 <- data.frame(table(dat$topic1))  
  top.1 = transform(top.1, Var1 = reorder(Var1, Freq))
  ggplot(top.1) +
    geom_point(aes(y=Freq, x=Var1)) +
    coord_flip() +
    theme_bw() +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.border = element_blank(),
      axis.line = element_line(color = 'black'),
      panel.background = element_blank()) +
    xlab("First Choice Topics") +
    ylab("Count")

plot of chunk topic-1

Deliverables

This plot displays how many students (N=16) expressed any interest in each type of deliverable we listed.

# prepare data
  del.var <- grep("del.", names(dat), fixed=TRUE) # index variables with del.
  del.sum <- as.data.frame(colSums(dat[del.var])) # count
  names(del.sum) <- "count"                       # rename count variable
  del.sum$deliverable <- row.names(del.sum)       # create name
  for (i in 1:nrow(del.sum)) {                    # give labels
    label <- grep(substr(del.sum$deliverable[i], 5, 7), deliverables.abb)
    del.sum$deliverable.lab[i] <- deliverables[label] 
  }
  del.sum = transform(del.sum,                    # reorder by count
                      deliverable.lab = reorder(deliverable.lab, count))
# plot
  ggplot(del.sum) +
    geom_point(aes(y=count, x=deliverable.lab)) +
    coord_flip() +
    theme_bw() +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.border = element_blank(),
      axis.line = element_line(color = 'black'),
      panel.background = element_blank()) +
    xlab("Deliverables") +
    ylab("Count")

plot of chunk deliverables-freq

Followed by a plot of first choices for deliverables:

  del.1 <- data.frame(table(dat$deliverable1))  
  del.1 = transform(del.1, Var1 = reorder(Var1, Freq))
  ggplot(del.1) +
    geom_point(aes(y=Freq, x=Var1)) +
    coord_flip() +
    theme_bw() +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.border = element_blank(),
      axis.line = element_line(color = 'black'),
      panel.background = element_blank()) +
    xlab("First Choice Deliverables") +
    ylab("Count")

plot of chunk deliverable-1

Populations

The next plot displays how many students (N=16) expressed any interest in each population we mentioned.

# prepare data
  pop.var <- grep("pop.", names(dat), fixed=TRUE) # index variables with pop.
  pop.sum <- as.data.frame(colSums(dat[pop.var])) # count
  names(pop.sum) <- "count"                       # rename count variable
  pop.sum$population <- row.names(pop.sum)        # create name
  for (i in 1:nrow(pop.sum)) {                    # give labels
    label <- grep(substr(pop.sum$population[i], 5, 7), populations.abb)
    pop.sum$population.lab[i] <- populations[label] 
  }
  pop.sum = transform(pop.sum,                    # reorder by count
                      population.lab = reorder(population.lab, count))
# plot
  ggplot(pop.sum) +
    geom_point(aes(y=count, x=population.lab)) +
    coord_flip() +
    theme_bw() +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.border = element_blank(),
      axis.line = element_line(color = 'black'),
      panel.background = element_blank()) +
    xlab("Populations") +
    ylab("Count")

plot of chunk populations-freq

Here's a plot of first choices for populations:

  pop.1 <- data.frame(table(dat$population1))  
  pop.1 = transform(pop.1, Var1 = reorder(Var1, Freq))
  ggplot(pop.1) +
    geom_point(aes(y=Freq, x=Var1)) +
    coord_flip() +
    theme_bw() +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.border = element_blank(),
      axis.line = element_line(color = 'black'),
      panel.background = element_blank()) +
    xlab("First Choice Populations") +
    ylab("Count")

plot of chunk population-1

Finding Common Interests

While it might not be possible to have someone work on their top choices, we can use this information to group students. In the next code chunk, I weight each response by students' first, second, and third choices. For instance, if a student indicated interest in 5 topics, her top choice is multiplied by 4; her second choice is multiplied by 3; and her third choice is multipled by 2. Basic interests, coded as 1's remain a value of 1. Zero interest remains a value of 0.

# weight interests by top choices (1stx4, 2ndx3, 3rdx2)
  for (i in 1:nrow(dat)) {   # topics
    # first choice
      first <- dat$topic1[i]
      index <- grep(first, topics)
      abb <- topics.abb[index]
      index2 <- grep(paste("top", abb, sep="."), names(dat))
      dat[i, index2] <- dat[i, index2]*4
    # second choice
      second <- dat$topic2[i]
      index <- grep(second, topics)
      abb <- topics.abb[index]
      index2 <- grep(paste("top", abb, sep="."), names(dat))
      dat[i, index2] <- dat[i, index2]*3
    # third choice
      third <- dat$topic3[i]
      index <- grep(third, topics)
      abb <- topics.abb[index]
      index2 <- grep(paste("top", abb, sep="."), names(dat))
      dat[i, index2] <- dat[i, index2]*2
  }
  for (i in 1:nrow(dat)) {   # deliverables
    # first choice
      first <- dat$deliverable1[i]
      index <- grep(first, deliverables)
      abb <- deliverables.abb[index]
      index2 <- grep(paste("del", abb, sep="."), names(dat))
      dat[i, index2] <- dat[i, index2]*4
    # second choice
      second <- dat$deliverable2[i]
      index <- grep(second, deliverables)
      abb <- deliverables.abb[index]
      index2 <- grep(paste("del", abb, sep="."), names(dat))
      dat[i, index2] <- dat[i, index2]*3
    # third choice
      third <- dat$deliverable3[i]
      index <- grep(third, deliverables)
      abb <- deliverables.abb[index]
      index2 <- grep(paste("del", abb, sep="."), names(dat))
      dat[i, index2] <- dat[i, index2]*2
  }
  for (i in 1:nrow(dat)) {   # populations
      # first choice
        first <- dat$population1[i]
        index <- grep(first, populations)
        abb <- populations.abb[index]
        index2 <- grep(paste("pop", abb, sep="."), names(dat))
        dat[i, index2] <- dat[i, index2]*4
      # second choice
        second <- dat$population2[i]
        index <- grep(second, populations)
        abb <- populations.abb[index]
        index2 <- grep(paste("pop", abb, sep="."), names(dat))
        dat[i, index2] <- dat[i, index2]*3
      # third choice
        third <- dat$population3[i]
        index <- grep(third, populations)
        abb <- populations.abb[index]
        index2 <- grep(paste("pop", abb, sep="."), names(dat))
        dat[i, index2] <- dat[i, index2]*2
    }

I asked everyone to indicate the most important factor in their decision-making: topic, deliverable, or population. In the next code chunk, I add an additional weight (x2) to all variables in a student's most important domain. Variables coded 0 in these domains remain 0.

# weight interests by most important factor x2
  for (i in top.var) {
  dat[,i][dat$most.imp=="topic"] <- dat[,i][dat$most.imp=="topic"]*2
  }
  for (i in del.var) {
  dat[,i][dat$most.imp=="deliverable"] <- dat[,i][dat$most.imp=="deliverable"]*2
  }
  for (i in pop.var) {
  dat[,i][dat$most.imp=="population"] <- dat[,i][dat$most.imp=="population"]*2
  }

With all of the interest variables propertly weighted, it's time to find potential groups (clusters). In the following chunk, I subset the data frame to include the weighted interest variables and student preferences for local vs global work and technology-oriented projects. Then I run a hierarchical cluster analysis and plot a dendrogram to show close relatives.

# hierarchical clustering
  lc.var <- grep("local.global", names(dat))
  ti.var <- grep("tech.interest", names(dat))
  dat.cluster <- dat[,c(top.var, del.var, pop.var, lc.var, ti.var)]
  distance <- dist(dat[,c(top.var, del.var, pop.var)], method="euclidean")
  cluster <- hclust(distance, method="average")
  plot(cluster, hang=-1, label=dat$id)

plot of chunk clustering

# assign clusters
  clusters <- data.frame(cluster$order)
  clusters$cluster <- c(1,1,1,2,2,2,2,3,3,3,3,1,4,4,4,4)
  names(clusters) <- c("id", "cluster")
  dat <- merge(dat, clusters, by="id")

Based on the dendrogram, there appear to be 3 clear clusters: 13, 14, 1, 10; 6, 5, 7, 9; and 8, 12, 4, 15. I checked the raw data and confirmed that these look like good groupings.

Students 2, 3, 16 are pretty close, so we can group them. Student 11 is closer to the last cluster, but we need even groups and the last cluster is full.

How are skills and leadership distributed?

Now that we have some groups, let's look at how students' self-rated skills are distributed across the groups.

# prepare data
  have.var <- grep("have.", names(dat), fixed=TRUE) # index variables with have.
  have.dat <- dat[have.var]
  have.dat <- cbind(have.dat, dat$cluster)
  have.sum <- as.data.frame(sapply(split(have.dat, dat$cluster), colSums))
  names(have.sum) <- c("grp1", "grp2", "grp3", "grp4")  # rename count variable
  have.sum$have <- row.names(have.sum)              # create name
  have.sum <- have.sum[1:(nrow(have.sum)-1),]
  for (i in 1:nrow(have.sum)) {                     # give labels
    label <- grep(substr(have.sum$have[i], 6, 
                         nchar(have.sum$have[i])), have.abb)
    have.sum$have.lab[i] <- have[label] 
  }
  have.sum.long <- reshape(have.sum, 
                           direction="long", 
                           varying=list(names(have.sum)[1:4]))
  names(have.sum.long)[names(have.sum.long)=="time"] <- "group"
  names(have.sum.long)[names(have.sum.long)=="grp1"] <- "count"
  have.sum.long = transform(have.sum.long,                    # reorder by count
                            have.lab = reorder(have.lab, count))
# plot
  ggplot(have.sum.long) +
    geom_point(aes(y=count, x=have.lab)) +
    coord_flip() +
    theme_bw() +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.border = element_blank(),
      axis.line = element_line(color = 'black'),
      panel.background = element_blank()) +
    xlab("Skills Possessed") +
    ylab("Count") +
    facet_wrap(~ group)

plot of chunk skills-freq

Finally, we can see if students who rated themselves highly on the “tend to lead” dimension are also distributed across groups. I labeled any student in the top 50th percentile as “tends to lead”.

[I removed this section of the analysis (and data) because it might be possible to infer where a few individual students fall on the self-reported leadership distribution. Not exact scores, but removed anyway.]

Here are the groups

# merge in names
# it is not possible to determine student identities without the key
# set eric==0 below to stop the script from running; otherwise will throw error
# because key will be missing
  eric <- 1  # set to 0 if not Eric so the script stops running
  if (eric==1) {
    name.key <- read.csv("key.csv", stringsAsFactors=FALSE)
    dat <- merge(dat, name.key, by="id")
    table(dat$name, dat$cluster)
  }

##     
##      1 2 3 4
##   Aj 1 0 0 0
##   Be 0 1 0 0
##   Ch 1 0 0 0
##   Cr 1 0 0 0
##   Da 0 0 0 1
##   El 0 1 0 0
##   Ha 0 0 0 1
##   Ke 0 0 0 1
##   Li 0 1 0 0
##   Lu 0 0 1 0
##   Ma 1 0 0 0
##   Mo 0 0 1 0
##   Ne 0 0 1 0
##   Ni 0 1 0 0
##   Ov 0 0 0 1
##   Re 0 0 1 0