In addition to loading the needed libraries, I also loaded the saved data set from Part 1.
# Setup: attach the packages used in this analysis and load the data set
# saved in Part 1.
# NOTE: the workspace is deliberately not wiped with `rm(list = ls())`.
# That call only removes objects from the global environment; it does not
# detach packages or reset options, and it deletes the caller's objects when
# this file is run inside an existing session. Run the analysis in a fresh R
# session instead.
# Load libraries
library(kableExtra)
library(dplyr)
library(class)
library(knitr)
library(RCurl)
library(XML)
library(jsonlite)
library(rvest)
library(stringr)
library(tidyr)
library(ggplot2)
# Load saved data set
load("long_school_set.Rdata")
# load() restores whatever objects the file holds without reporting their
# names, so stop here if the data frame used by the rest of the analysis is
# not present.
stopifnot(exists("long.school.set"), is.data.frame(long.school.set))
To begin, we found initial statistics about the data set, beginning with the full long data set that we compare against the list of keywords.
## Size of the full long data set
## n.long: number of rows (observations)
n.long <- dim(long.school.set)[1]
## n.courses: number of distinct Title.Desc values, used as the course count
n.courses <- length(unique(long.school.set[["Title.Desc"]]))
paste(
  "The long school data set contains", n.long,
  "observations for", n.courses, "unique courses."
)
## [1] "The long school data set contains 12012 observations for 273 unique courses."
We subset the entire data set to return only courses with matched keywords.
## Working set: only the rows where a keyword matched (Appears is TRUE),
## restricted to the columns used in the rest of the analysis
school.work <- long.school.set %>%
  filter(Appears == TRUE) %>%
  select(
    School, Degree, Degree.Name, Title, Title.Desc,
    Soft.or.technical, Skill, Appears
  )
## total number of skills found (one row per match)
n.results <- dim(school.work)[1]
## total number of courses with skills (distinct Title.Desc among the matches)
n.courses.w.skills <- length(unique(school.work[["Title.Desc"]]))
paste(
  "The keyword search returns", n.results,
  "keyword matches, identified in", n.courses.w.skills, "courses."
)
## [1] "The keyword search returns 441 keyword matches, identified in 201 courses."
## Mean number of keyword matches per course. The denominator is n.courses,
## the course count of the full data set, so courses without any match are
## included as zeroes.
avg.n.results.per.course <- n.results / n.courses
paste(
  "On average, each course matches",
  round(avg.n.results.per.course, digits = 2),
  "keywords."
)
## [1] "On average, each course matches 1.62 keywords."
We then looked at results for the overall data set, including a breakdown of matched skills by skill type (soft skill or technical) and matched skills by school.
We see that 93 out of the 441 matched skills are soft skills, or approximately 21% of the matched skills.
Each school is represented in the matched output as well, with a minimum of 11 matches for University of Maryland and a maximum of 47 matches for the University of Cincinnati. CUNY has 36 matches!
## Soft v. Technical
## Cross-tabulate skill type against Appears and render the counts as a
## striped HTML table.
kable_styling(
  kable(
    table(school.work$Soft.or.technical, school.work$Appears),
    format = "html"
  ),
  bootstrap_options = "striped"
)
| TRUE | |
|---|---|
| soft | 93 |
| technical | 348 |
## Bar chart of the same skill-type counts. as.data.frame() on the two-way
## table yields the columns Var1 (skill type) and Freq (count) used below.
table(school.work$Soft.or.technical, school.work$Appears) %>%
  as.data.frame() %>%
  ggplot(mapping = aes(x = Var1, y = Freq, fill = Var1)) +
  geom_col() +
  # print each count just above its bar (12 units above the bar top)
  geom_text(
    mapping = aes(label = Freq, y = Freq + 12),
    size = 5,
    position = position_dodge(width = 0.9),
    vjust = 0
  ) +
  scale_fill_discrete(name = "Skill Type") +
  labs(
    title = "Education Results by Skill Type",
    x = "Skill Type",
    y = "Number of Results"
  )
## By School
## Cross-tabulate school against Appears and render the counts as a striped
## HTML table.
kable_styling(
  kable(
    table(school.work$School, school.work$Appears),
    format = "html"
  ),
  bootstrap_options = "striped"
)
| TRUE | |
|---|---|
| Arizona State University | 17 |
| Berkeley University | 18 |
| Boston University Metropolitan College | 45 |
| CUNY SPS | 36 |
| Duke University | 24 |
| Michigan State University | 30 |
| New York University | 44 |
| North Carolina State University | 20 |
| Northwestern University SPS | 41 |
| Purdue University | 45 |
| Texas A&M University | 44 |
| University of Cincinnati | 47 |
| University of Maryland | 11 |
| University of Maryland University College | 19 |
## Horizontal bar chart of match counts per school. as.data.frame() on the
## two-way table yields the columns Var1 (school) and Freq (count).
table(school.work$School, school.work$Appears) %>%
  as.data.frame() %>%
  ggplot(mapping = aes(x = Var1, y = Freq)) +
  geom_col(fill = "steelblue2") +
  # print each count just past the end of its bar (2 units beyond it)
  geom_text(
    mapping = aes(label = Freq, y = Freq + 2),
    size = 3,
    position = position_dodge(width = 0.9),
    vjust = 0
  ) +
  # flip the axes so the school names run down the vertical axis
  coord_flip() +
  labs(
    title = "Education Results by School",
    x = "School",
    y = "Number of Results"
  )
## Using frequency
## Matches per course, by school.
## nCourseBySchool: number of distinct course titles per school in school.work.
## NOTE(review): school.work holds only rows with a keyword match, so courses
## without any match are not part of this denominator (unlike the overall
## per-course average, which divides by all courses) -- confirm this is intended.
nCourseBySchool <- summarise(
  group_by(school.work, School),
  n.courses.schools = n_distinct(Title)
)
## SchoolbyCourse: number of matched rows per school, as columns School / Freq
SchoolbyCourse <- setNames(
  as.data.frame(table(school.work$School)),
  c("School", "Freq")
)
## Join the two per-school tables, divide matches by courses, and plot
SchoolbyCourse %>%
  inner_join(nCourseBySchool, by = "School") %>%
  mutate(Freq.perCourse = Freq / n.courses.schools) %>%
  ggplot(mapping = aes(x = School, y = Freq.perCourse)) +
  geom_col(fill = "steelblue2") +
  # label each bar with the ratio rounded to one decimal, just past the bar end
  geom_text(
    mapping = aes(
      label = round(Freq.perCourse, digits = 1),
      y = round(Freq.perCourse, digits = 1) + 0.2
    ),
    size = 3,
    position = position_dodge(width = 0.9),
    vjust = 0
  ) +
  coord_flip() +
  labs(
    title = "Education Results by School",
    x = "School",
    y = "Skills per Course"
  )
To answer our main question of which skills are most valued in academia, we want to examine the results by skill.
## By Skill
## For every skill (grouped with its skill type): the number of matched rows
## divided by n.courses, rounded to 5 decimals and expressed as a percentage.
## The result is rendered as a striped HTML table.
school.work %>%
  group_by(Soft.or.technical, Skill) %>%
  summarise(
    Skill.Percent = 100 * round(sum(Appears == TRUE) / n.courses, digits = 5)
  ) %>%
  kable(format = "html") %>%
  kable_styling(bootstrap_options = "striped")
| Soft.or.technical | Skill | Skill.Percent |
|---|---|---|
| soft | collaboration | 2.930 |
| soft | communication | 7.326 |
| soft | creativity | 0.366 |
| soft | critical thinking | 1.099 |
| soft | customer service | 0.366 |
| soft | decision making | 6.593 |
| soft | leadership | 5.128 |
| soft | presentation | 9.158 |
| soft | problem solving | 1.099 |
| technical | a/b testing | 0.366 |
| technical | big data | 5.128 |
| technical | c++ | 2.930 |
| technical | d3.js | 0.366 |
| technical | data manipulation | 1.832 |
| technical | data mining | 10.256 |
| technical | data visualization | 5.495 |
| technical | data wrangling | 0.366 |
| technical | decision tree | 1.465 |
| technical | hadoop | 3.663 |
| technical | java | 2.930 |
| technical | machine learning | 6.227 |
| technical | matlab | 0.733 |
| technical | microsoft excel | 0.366 |
| technical | modeling | 35.531 |
| technical | powerpoint | 0.366 |
| technical | prediction | 9.158 |
| technical | python | 4.396 |
| technical | r | 7.326 |
| technical | sas | 4.762 |
| technical | scripting languages | 0.366 |
| technical | sql | 4.396 |
| technical | statistics | 17.949 |
| technical | tableau | 1.099 |
## Plot 1: percentage of courses per skill, bars coloured by skill type.
## Skills are ordered along the axis by their percentage.
school.work %>%
  group_by(Soft.or.technical, Skill) %>%
  summarise(Skill.Percent = 100 * sum(Appears == TRUE) / n.courses) %>%
  ggplot(
    mapping = aes(
      x = reorder(Skill, Skill.Percent),
      y = Skill.Percent,
      fill = Soft.or.technical
    )
  ) +
  geom_col(position = 'dodge') +
  coord_flip() +
  scale_fill_discrete(name = "Skill Type") +
  labs(
    title = "Data Science Masters Program Skills",
    x = "Skill",
    y = "Percentage of Courses with Skill"
  )
## Plot 2: the same percentages computed per school and skill, drawn as
## stacked bars coloured by school. Skills are ordered by the sum of their
## per-school percentages (FUN = sum), i.e. by total bar length.
school.work %>%
  group_by(School, Skill) %>%
  summarise(Skill.Percent = 100 * sum(Appears == TRUE) / n.courses) %>%
  ggplot(
    mapping = aes(
      x = reorder(Skill, Skill.Percent, FUN = sum),
      y = Skill.Percent,
      fill = School
    )
  ) +
  geom_col() +
  coord_flip() +
  scale_fill_discrete(name = "School") +
  labs(
    title = "Data Science Masters Program Skills",
    x = "Skill",
    y = "Percentage of Courses with Skill"
  )