Environment set-up

In addition to loading the needed libraries, I also loaded the saved data set from Part 1.

# Environment clearing intentionally omitted: `rm(list = ls())` deletes every
# object in the caller's workspace when this document is sourced or rendered
# from an existing session, yet it leaves attached packages and options in
# place, so it does not provide a clean slate. Run the analysis in a fresh R
# session instead.

# Load libraries
library(kableExtra)
library(dplyr)
library(class)
library(knitr)
library(RCurl)
library(XML)
library(jsonlite)
library(rvest)
library(stringr)
library(tidyr)
library(ggplot2)

# Load saved data set
# load() restores objects silently and returns their names; check them so a
# stale or wrong .Rdata file fails here rather than at first use further down.
loaded.objects <- load("long_school_set.Rdata")
if (!"long.school.set" %in% loaded.objects) {
  stop(
    "long_school_set.Rdata does not contain `long.school.set`",
    call. = FALSE
  )
}

Initial examination of the data set after the keyword review

To begin, we computed summary statistics, starting with the full long data set that we compared against the list of keywords.

## Row count of the full long data set
n.long <- dim(long.school.set)[1L]

## Distinct courses, identified by their Title.Desc value
n.courses <- length(unique(long.school.set[["Title.Desc"]]))

## Report both figures in one sentence
paste(
  "The long school data set contains", n.long,
  "observations for", n.courses, "unique courses."
)
## [1] "The long school data set contains 12012 observations for 273 unique courses."

We subset the entire data set to return only courses with matched keywords.

# Keep only the rows flagged as a keyword match, restricted to the columns
# used in the rest of the analysis.
school.work <- long.school.set %>%
  filter(Appears == TRUE) %>%
  select(
    School, Degree, Degree.Name, Title, Title.Desc,
    Soft.or.technical, Skill, Appears
  )

## Number of keyword matches (rows of the matched subset)
n.results <- dim(school.work)[1L]

## Number of distinct courses (by Title.Desc) with at least one match
n.courses.w.skills <- length(unique(school.work[["Title.Desc"]]))

## Report both figures in one sentence
paste(
  "The keyword search returns", n.results,
  "keyword matches, identified in", n.courses.w.skills, "courses."
)
## [1] "The keyword search returns 441 keyword matches, identified in 201 courses."
## Mean matches per course; dividing by n.courses (all unique courses) keeps
## courses with zero matches in the denominator
avg.n.results.per.course <- n.results / n.courses
paste(
  "On average, each course matches",
  round(avg.n.results.per.course, digits = 2),
  "keywords."
)
## [1] "On average, each course matches 1.62 keywords."

Examining overall results

We then looked at results for the overall data set, including a breakdown of matched skills by skill type (soft skill or technical) and matched skills by school.

We see that 93 out of the 441 matched skills are soft skills, or approximately 21% of the matched skills.

Each school is represented in the matched output as well, with a minimum of 11 matches for the University of Maryland and a maximum of 47 matches for the University of Cincinnati. CUNY has 36 matches!

## Soft v. Technical
# Cross-tabulate skill type against the match flag and render the counts as a
# striped HTML table.
kable_styling(
  kable(
    table(school.work$Soft.or.technical, school.work$Appears),
    format = "html"
  ),
  bootstrap_options = "striped"
)
TRUE
soft 93
technical 348
# Bar chart of match counts by skill type. as.data.frame() on the two-way
# table yields the columns Var1 (skill type), Var2 (match flag) and Freq.
as.data.frame(table(school.work$Soft.or.technical, school.work$Appears)) %>%
  ggplot(mapping = aes(x = Var1, y = Freq, fill = Var1)) +
  geom_col() +
  ggtitle("Education Results by Skill Type") +
  xlab("Skill Type") +
  ylab("Number of Results") +
  scale_fill_discrete(name = "Skill Type") +
  # Print each count 12 units above the top of its bar
  geom_text(
    mapping = aes(label = Freq, y = Freq + 12),
    size = 5,
    position = position_dodge(width = 0.9),
    vjust = 0
  )

## By School
# Cross-tabulate school against the match flag and render the counts as a
# striped HTML table.
kable_styling(
  kable(
    table(school.work$School, school.work$Appears),
    format = "html"
  ),
  bootstrap_options = "striped"
)
TRUE
Arizona State University 17
Berkeley University 18
Boston University Metropolitan College 45
CUNY SPS 36
Duke University 24
Michigan State University 30
New York University 44
North Carolina State University 20
Northwestern University SPS 41
Purdue University 45
Texas A&M University 44
University of Cincinnati 47
University of Maryland 11
University of Maryland University College 19
# Horizontal bar chart of match counts per school. as.data.frame() on the
# two-way table yields the columns Var1 (school), Var2 (match flag) and Freq.
as.data.frame(table(school.work$School, school.work$Appears)) %>%
  ggplot(mapping = aes(x = Var1, y = Freq)) +
  geom_col(fill = "steelblue2") +
  ggtitle("Education Results by School") +
  xlab("School") +
  ylab("Number of Results") +
  coord_flip() +
  # Print each count 2 units past the end of its bar
  geom_text(
    mapping = aes(label = Freq, y = Freq + 2),
    size = 3,
    position = position_dodge(width = 0.9),
    vjust = 0
  )

## Using frequency
# Distinct course titles per school, counted within the matched rows only.
# NOTE(review): courses without any match are not in school.work, so they are
# not part of this denominator -- confirm that is intended.
nCourseBySchool <- summarise(
  group_by(school.work, School),
  n.courses.schools = n_distinct(Title)
)

# Matches per school as a two-column data frame (School, Freq).
SchoolbyCourse <- setNames(
  as.data.frame(table(school.work$School)),
  c("School", "Freq")
)

# Matches per counted course for every school, drawn as horizontal bars with
# the value (one decimal) printed just past the end of each bar.
inner_join(SchoolbyCourse, nCourseBySchool, by = "School") %>%
  mutate(Freq.perCourse = Freq / n.courses.schools) %>%
  ggplot(mapping = aes(x = School, y = Freq.perCourse)) +
  geom_col(fill = "steelblue2") +
  ggtitle("Education Results by School") +
  xlab("School") +
  ylab("Skills per Course") +
  coord_flip() +
  geom_text(
    mapping = aes(
      label = round(Freq.perCourse, digits = 1),
      y = round(Freq.perCourse, digits = 1) + 0.2
    ),
    size = 3,
    position = position_dodge(width = 0.9),
    vjust = 0
  )

Matched Skills

To answer our main question of which skills are most valued in academia, we want to examine the results by skill.

## By Skill
# For each skill (within its skill type): matches divided by the number of
# all unique courses (n.courses), rounded to five decimals and then scaled
# to a percentage. Rendered as a striped HTML table.
summarise(
  group_by(school.work, Soft.or.technical, Skill),
  Skill.Percent = 100 * round(sum(Appears == TRUE) / n.courses, 5)
) %>%
  kable(format = "html") %>%
  kable_styling(bootstrap_options = "striped")
Soft.or.technical Skill Skill.Percent
soft collaboration 2.930
soft communication 7.326
soft creativity 0.366
soft critical thinking 1.099
soft customer service 0.366
soft decision making 6.593
soft leadership 5.128
soft presentation 9.158
soft problem solving 1.099
technical a/b testing 0.366
technical big data 5.128
technical c++ 2.930
technical d3.js 0.366
technical data manipulation 1.832
technical data mining 10.256
technical data visualization 5.495
technical data wrangling 0.366
technical decision tree 1.465
technical hadoop 3.663
technical java 2.930
technical machine learning 6.227
technical matlab 0.733
technical microsoft excel 0.366
technical modeling 35.531
technical powerpoint 0.366
technical prediction 9.158
technical python 4.396
technical r 7.326
technical sas 4.762
technical scripting languages 0.366
technical sql 4.396
technical statistics 17.949
technical tableau 1.099
# Horizontal bar chart of the unrounded per-skill percentage, with skills
# ordered by that percentage and bars coloured by skill type.
summarise(
  group_by(school.work, Soft.or.technical, Skill),
  Skill.Percent = 100 * sum(Appears == TRUE) / n.courses
) %>%
  ggplot(
    mapping = aes(
      x = reorder(Skill, Skill.Percent),
      y = Skill.Percent,
      fill = Soft.or.technical
    )
  ) +
  geom_col(position = "dodge") +
  coord_flip() +
  ggtitle("Data Science Masters Program Skills") +
  xlab("Skill") +
  ylab("Percentage of Courses with Skill") +
  scale_fill_discrete(name = "Skill Type")

# Same percentage, split by school: each bar stacks the schools' shares, and
# skills are ordered by the sum of those shares.
summarise(
  group_by(school.work, School, Skill),
  Skill.Percent = 100 * sum(Appears == TRUE) / n.courses
) %>%
  ggplot(
    mapping = aes(
      x = reorder(Skill, Skill.Percent, FUN = sum),
      y = Skill.Percent,
      fill = School
    )
  ) +
  geom_col() +
  coord_flip() +
  ggtitle("Data Science Masters Program Skills") +
  xlab("Skill") +
  ylab("Percentage of Courses with Skill") +
  scale_fill_discrete(name = "School")