This project addresses the question: “What are the most valued data science skills? To answer this question, the team took the following approach: 1. COLLECTING: mined the job board glassdoor.com to identify which skills were requested the most frequently in job descriptions. (Alvaro Bueno & Silverio Vasquez) 2. TIDYING: creating databases from the raw data. (Sarah Wigodsky) 3. ANALYZING: developing relevant dataframes from the databases. (Nathan Cooper) 4. VISUALIZING: creating graphs and descriptions of the conclusions from the analysis. (Jill Anderson)
suppressWarnings(suppressMessages(library(stringr)))
suppressWarnings(suppressMessages(library(dplyr)))
suppressWarnings(suppressMessages(library(RCurl)))
suppressWarnings(suppressMessages(library(tidyr)))
suppressWarnings(suppressMessages(library(DT)))
suppressWarnings(suppressMessages(library(ggplot2)))
suppressWarnings(suppressMessages(library(knitr)))
salary <- read.csv("https://raw.githubusercontent.com/sjv1030/group_project/master/company_salary.csv", stringsAsFactors = FALSE)
salarytidy <- salary[,-1]
salarytidy <- salarytidy %>%
separate(salary, c("salary", "pay_period"), sep="per ")
salarytidy$salary <- gsub(",", "", salarytidy$salary)
salarytidy$salary <- unlist(str_extract_all(salarytidy$salary, "[[:digit:]]{3,}|[[:digit:]]{1,}.[[:digit:]]{1,}"))
salarytidy$salary <- as.numeric(salarytidy$salary)
summary(salarytidy)
## company salary pay_period
## Length:519 Min. : 12.81 Length:519
## Class :character 1st Qu.: 98002.00 Class :character
## Mode :character Median :122895.00 Mode :character
## Mean :111381.06
## 3rd Qu.:138688.50
## Max. :245145.00
datatable(salarytidy)
for (i in 1:length(salarytidy$salary)){
if (salarytidy$pay_period[i] == "month") {salarytidy$salary[i] <- salarytidy$salary[i]*12}
if (salarytidy$pay_period[i] == "week") {salarytidy$salary[i] <- salarytidy$salary[i]*52}
if (salarytidy$pay_period[i] == "hour") {salarytidy$salary[i] <- salarytidy$salary[i]*8*5*52}
if (salarytidy$pay_period[i] == "day") {salarytidy$salary[i] <- salarytidy$salary[i]*5*52}
}
joblistma <- read.csv("https://raw.githubusercontent.com/delagroove/dataScience/master/jobOffers_final_with_dupes_reduced.csv", stringsAsFactors = FALSE)
joblistny <- read.csv("https://raw.githubusercontent.com/delagroove/dataScience/master/jobOffers_ny.csv", stringsAsFactors=FALSE)
joblistsf <- read.csv("https://raw.githubusercontent.com/delagroove/dataScience/master/jobOffers_SF.csv", stringsAsFactors=FALSE)
joblistsf <- joblistsf[,-1]
joblistchi <- read.csv("https://raw.githubusercontent.com/delagroove/dataScience/master/jobOffers_CHI.csv", stringsAsFactors=FALSE)
joblistchi <- joblistchi[,-1]
joblisttidy <- rbind(joblistma, joblistny, joblistsf, joblistchi)
joblisttidy <- joblisttidy[,-5]
joblisttidy <- subset(joblisttidy, duplicated(joblisttidy)==FALSE)
joblisttidy$location <- unlist(str_extract_all(joblisttidy$location, "[[:upper:]]{1}[[:lower:]]{2,}, [[:alpha:]]{2}|[[:upper:]]{1}[[:lower:]]{2,} [[:alpha:]]{2,}, [[:alpha:]]{2}"))
joblisttidy$description <- gsub("[^[:ascii:]]", "", joblisttidy$description, perl=T)
compskills <- joblisttidy %>%
mutate(python = grepl("python", description, ignore.case=TRUE)) %>%
mutate(perl = grepl("perl", description, ignore.case=TRUE)) %>%
mutate(Cplusplus = grepl("C++", description, fixed=TRUE)) %>%
mutate(SQL = grepl("SQL", description)) %>%
mutate(java = grepl("java\\b", description, ignore.case=TRUE)) %>%
mutate(javascript = grepl("javascript", description, ignore.case=TRUE)) %>%
mutate(R = grepl("\\bR\\b,", description)) %>%
mutate(hadoop = grepl("hadoop", description, ignore.case=TRUE)) %>%
mutate(spark = grepl("spark", description, ignore.case=TRUE)) %>%
mutate(scala = grepl("scala", description, ignore.case=TRUE)) %>%
select(job_title, company, python, perl, Cplusplus, SQL, java, javascript, R, hadoop, spark, scala)
## Warning in grepl("python", description, ignore.case = TRUE): input string 1
## is invalid in this locale
## Warning in grepl("python", description, ignore.case = TRUE): input string 2
## is invalid in this locale
## Warning in grepl("python", description, ignore.case = TRUE): input string 3
## is invalid in this locale
## Warning in grepl("python", description, ignore.case = TRUE): input string 5
## is invalid in this locale
## Warning in grepl("python", description, ignore.case = TRUE): input string 7
## is invalid in this locale
## Warning in grepl("perl", description, ignore.case = TRUE): input string 1
## is invalid in this locale
## Warning in grepl("perl", description, ignore.case = TRUE): input string 2
## is invalid in this locale
## Warning in grepl("perl", description, ignore.case = TRUE): input string 3
## is invalid in this locale
## Warning in grepl("perl", description, ignore.case = TRUE): input string 5
## is invalid in this locale
## Warning in grepl("perl", description, ignore.case = TRUE): input string 7
## is invalid in this locale
## Warning in grepl("C++", description, fixed = TRUE): input string 1 is
## invalid in this locale
## Warning in grepl("C++", description, fixed = TRUE): input string 2 is
## invalid in this locale
## Warning in grepl("C++", description, fixed = TRUE): input string 3 is
## invalid in this locale
## Warning in grepl("C++", description, fixed = TRUE): input string 5 is
## invalid in this locale
## Warning in grepl("C++", description, fixed = TRUE): input string 7 is
## invalid in this locale
## Warning in grepl("SQL", description): input string 1 is invalid in this
## locale
## Warning in grepl("SQL", description): input string 2 is invalid in this
## locale
## Warning in grepl("SQL", description): input string 3 is invalid in this
## locale
## Warning in grepl("SQL", description): input string 5 is invalid in this
## locale
## Warning in grepl("SQL", description): input string 7 is invalid in this
## locale
## Warning in grepl("java\\b", description, ignore.case = TRUE): input string
## 1 is invalid in this locale
## Warning in grepl("java\\b", description, ignore.case = TRUE): input string
## 2 is invalid in this locale
## Warning in grepl("java\\b", description, ignore.case = TRUE): input string
## 3 is invalid in this locale
## Warning in grepl("java\\b", description, ignore.case = TRUE): input string
## 5 is invalid in this locale
## Warning in grepl("java\\b", description, ignore.case = TRUE): input string
## 7 is invalid in this locale
## Warning in grepl("javascript", description, ignore.case = TRUE): input
## string 1 is invalid in this locale
## Warning in grepl("javascript", description, ignore.case = TRUE): input
## string 2 is invalid in this locale
## Warning in grepl("javascript", description, ignore.case = TRUE): input
## string 3 is invalid in this locale
## Warning in grepl("javascript", description, ignore.case = TRUE): input
## string 5 is invalid in this locale
## Warning in grepl("javascript", description, ignore.case = TRUE): input
## string 7 is invalid in this locale
## Warning in grepl("\\bR\\b,", description): input string 1 is invalid in
## this locale
## Warning in grepl("\\bR\\b,", description): input string 2 is invalid in
## this locale
## Warning in grepl("\\bR\\b,", description): input string 3 is invalid in
## this locale
## Warning in grepl("\\bR\\b,", description): input string 5 is invalid in
## this locale
## Warning in grepl("\\bR\\b,", description): input string 7 is invalid in
## this locale
## Warning in grepl("hadoop", description, ignore.case = TRUE): input string 1
## is invalid in this locale
## Warning in grepl("hadoop", description, ignore.case = TRUE): input string 2
## is invalid in this locale
## Warning in grepl("hadoop", description, ignore.case = TRUE): input string 3
## is invalid in this locale
## Warning in grepl("hadoop", description, ignore.case = TRUE): input string 5
## is invalid in this locale
## Warning in grepl("hadoop", description, ignore.case = TRUE): input string 7
## is invalid in this locale
## Warning in grepl("spark", description, ignore.case = TRUE): input string 1
## is invalid in this locale
## Warning in grepl("spark", description, ignore.case = TRUE): input string 2
## is invalid in this locale
## Warning in grepl("spark", description, ignore.case = TRUE): input string 3
## is invalid in this locale
## Warning in grepl("spark", description, ignore.case = TRUE): input string 5
## is invalid in this locale
## Warning in grepl("spark", description, ignore.case = TRUE): input string 7
## is invalid in this locale
## Warning in grepl("scala", description, ignore.case = TRUE): input string 1
## is invalid in this locale
## Warning in grepl("scala", description, ignore.case = TRUE): input string 2
## is invalid in this locale
## Warning in grepl("scala", description, ignore.case = TRUE): input string 3
## is invalid in this locale
## Warning in grepl("scala", description, ignore.case = TRUE): input string 5
## is invalid in this locale
## Warning in grepl("scala", description, ignore.case = TRUE): input string 7
## is invalid in this locale
summary(compskills)
## job_title company python perl
## Length:498 Length:498 Mode :logical Mode :logical
## Class :character Class :character FALSE:252 FALSE:483
## Mode :character Mode :character TRUE :246 TRUE :15
## Cplusplus SQL java javascript
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:446 FALSE:329 FALSE:408 FALSE:476
## TRUE :52 TRUE :169 TRUE :90 TRUE :22
## R hadoop spark scala
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:374 FALSE:391 FALSE:382 FALSE:421
## TRUE :124 TRUE :107 TRUE :116 TRUE :77
barplot(table(compskills$python),main = "Python")
barplot(table(compskills$perl),main = "Perl")
barplot(table(compskills$Cplusplus),main = "C++")
barplot(table(compskills$SQL),main = "SQL")
barplot(table(compskills$java),main = "Java")
barplot(table(compskills$javascript),main = "JavaScript")
barplot(table(compskills$R),main = "R")
barplot(table(compskills$hadoop),main = "Hadoop")
barplot(table(compskills$spark),main = "Spark")
barplot(table(compskills$scala),main = "Scala")
{r} skills <- joblisttidy %>% mutate(machinelearning = grepl("machine learning", description, ignore.case=TRUE)) %>% mutate(statisticalmodeling = grepl("statistical model", description, ignore.case=TRUE)) %>% mutate(techwriting = grepl("technical writing", description, ignore.case=TRUE)) %>% mutate(plateau = grepl("plateau", description, ignore.case=TRUE)) %>% mutate(d3 = grepl("D3", description)) %>% select(job_title, company, machinelearning, statisticalmodeling, techwriting, plateau, d3) summary(skills) barplot(table(skills$machinelearning),main = "Machine Learning") barplot(table(skills$statisticalmodeling),main = "Statistical Modeling") barplot(table(skills$techwriting),main = "Technical Writing") barplot(table(skills$plateau),main = "Plateau") barplot(table(skills$d3),main = "D3") ##softskills <- joblisttidy %>%
mutate(collaborative = grepl("collaborat", description, ignore.case=TRUE)) %>%
mutate(organized = grepl("organized", description, ignore.case=TRUE)) %>%
mutate(selfstarter = grepl("self starter", description, ignore.case=TRUE)) %>%
mutate(attndetail = grepl("attention to detail", description, ignore.case=TRUE)) %>%
mutate(communication = grepl("communicat", description, ignore.case=TRUE)) %>%
mutate(creative = grepl("creativ", description, ignore.case=TRUE)) %>%
mutate(visualization = grepl("visualization", description, ignore.case=TRUE)) %>%
select(job_title, company, collaborative, organized, selfstarter, attndetail, communication, creative, visualization)
## Warning in grepl("collaborat", description, ignore.case = TRUE): input
## string 1 is invalid in this locale
## Warning in grepl("collaborat", description, ignore.case = TRUE): input
## string 2 is invalid in this locale
## Warning in grepl("collaborat", description, ignore.case = TRUE): input
## string 3 is invalid in this locale
## Warning in grepl("collaborat", description, ignore.case = TRUE): input
## string 5 is invalid in this locale
## Warning in grepl("collaborat", description, ignore.case = TRUE): input
## string 7 is invalid in this locale
## Warning in grepl("organized", description, ignore.case = TRUE): input
## string 1 is invalid in this locale
## Warning in grepl("organized", description, ignore.case = TRUE): input
## string 2 is invalid in this locale
## Warning in grepl("organized", description, ignore.case = TRUE): input
## string 3 is invalid in this locale
## Warning in grepl("organized", description, ignore.case = TRUE): input
## string 5 is invalid in this locale
## Warning in grepl("organized", description, ignore.case = TRUE): input
## string 7 is invalid in this locale
## Warning in grepl("self starter", description, ignore.case = TRUE): input
## string 1 is invalid in this locale
## Warning in grepl("self starter", description, ignore.case = TRUE): input
## string 2 is invalid in this locale
## Warning in grepl("self starter", description, ignore.case = TRUE): input
## string 3 is invalid in this locale
## Warning in grepl("self starter", description, ignore.case = TRUE): input
## string 5 is invalid in this locale
## Warning in grepl("self starter", description, ignore.case = TRUE): input
## string 7 is invalid in this locale
## Warning in grepl("attention to detail", description, ignore.case = TRUE):
## input string 1 is invalid in this locale
## Warning in grepl("attention to detail", description, ignore.case = TRUE):
## input string 2 is invalid in this locale
## Warning in grepl("attention to detail", description, ignore.case = TRUE):
## input string 3 is invalid in this locale
## Warning in grepl("attention to detail", description, ignore.case = TRUE):
## input string 5 is invalid in this locale
## Warning in grepl("attention to detail", description, ignore.case = TRUE):
## input string 7 is invalid in this locale
## Warning in grepl("communicat", description, ignore.case = TRUE): input
## string 1 is invalid in this locale
## Warning in grepl("communicat", description, ignore.case = TRUE): input
## string 2 is invalid in this locale
## Warning in grepl("communicat", description, ignore.case = TRUE): input
## string 3 is invalid in this locale
## Warning in grepl("communicat", description, ignore.case = TRUE): input
## string 5 is invalid in this locale
## Warning in grepl("communicat", description, ignore.case = TRUE): input
## string 7 is invalid in this locale
## Warning in grepl("creativ", description, ignore.case = TRUE): input string
## 1 is invalid in this locale
## Warning in grepl("creativ", description, ignore.case = TRUE): input string
## 2 is invalid in this locale
## Warning in grepl("creativ", description, ignore.case = TRUE): input string
## 3 is invalid in this locale
## Warning in grepl("creativ", description, ignore.case = TRUE): input string
## 5 is invalid in this locale
## Warning in grepl("creativ", description, ignore.case = TRUE): input string
## 7 is invalid in this locale
## Warning in grepl("visualization", description, ignore.case = TRUE): input
## string 1 is invalid in this locale
## Warning in grepl("visualization", description, ignore.case = TRUE): input
## string 2 is invalid in this locale
## Warning in grepl("visualization", description, ignore.case = TRUE): input
## string 3 is invalid in this locale
## Warning in grepl("visualization", description, ignore.case = TRUE): input
## string 5 is invalid in this locale
## Warning in grepl("visualization", description, ignore.case = TRUE): input
## string 7 is invalid in this locale
summary(softskills)
## job_title company collaborative organized
## Length:498 Length:498 Mode :logical Mode :logical
## Class :character Class :character FALSE:328 FALSE:478
## Mode :character Mode :character TRUE :170 TRUE :20
## selfstarter attndetail communication creative
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:496 FALSE:465 FALSE:261 FALSE:401
## TRUE :2 TRUE :33 TRUE :237 TRUE :97
## visualization
## Mode :logical
## FALSE:408
## TRUE :90
barplot(table(softskills$collaborative),main = "Collaborative")
barplot(table(softskills$organized),main = "Organized")
barplot(table(softskills$selfstarter),main = "Self Starter")
barplot(table(softskills$attndetail),main = "Attention to Detail")
barplot(table(softskills$communication),main = "Communication")
barplot(table(softskills$creative),main = "Creative")
barplot(table(softskills$visualization),main = "Visualization")
softskills_df <- do.call(cbind, lapply(softskills, summary))
softskills_df <- softskills_df[-c(1),]
softskills_df <- softskills_df[,-c(1:2)]
softskills_dft <- t(softskills_df)
softskills_dft1 <- cbind(rownames(softskills_dft), data.frame(softskills_dft, row.names=NULL))
colnames(softskills_dft1) <- c("skill", "appears", "not")
datatable(softskills_dft1)
ggplot(softskills_dft1, aes(x = softskills_dft1$skill, y = softskills_dft1$appears, fill = softskills_dft1$skill)) +
geom_bar(stat = "identity") +
xlab("Soft Skill") +
ylab("Number of Times Appeared") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 65, hjust = 1)) +
ggtitle("Frequency of Soft Skills Appearing in Job Descriptions")
compskills_df <- do.call(cbind, lapply(compskills, summary))
#### Remove unnecessary columns and rows, rename the labels, transpose the dataframe, and make the row names their own column.
compskills_df <- compskills_df[-c(1),]
compskills_df <- compskills_df[,-c(1:2)]
compskills_dft <- t(compskills_df)
compskills_dft1 <- cbind(rownames(compskills_dft), data.frame(compskills_dft, row.names=NULL))
colnames(compskills_dft1) <- c("skill", "appears", "not")
datatable(compskills_dft1)
ggplot(compskills_dft1, aes(x = compskills_dft1$skill, y = compskills_dft1$appears, fill = compskills_dft1$skill)) +
geom_bar(stat = "identity") +
xlab("Computer Skill") +
ylab("Number of Times Appeared") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 65, hjust = 1)) +
ggtitle("Frequency of Computer Skills Appearing in Job Descriptions")
{r} skills_df <- do.call(cbind, lapply(skills, summary)) ####skills_df <- skills_df[-c(1),] skills_df <- skills_df[,-c(1:2)] skills_dft <- t(skills_df)
skills_dft1 <- cbind(rownames(skills_dft), data.frame(skills_dft, row.names=NULL)) colnames(skills_dft1) <- c(“skill”, “appears”, “not”) datatable(skills_dft1) ####```
{r} ggplot(skills_dft1, aes(x = skills_dft1$skill, y = skills_dft1$appears, fill = skills_dft1$skill)) + geom_bar(stat = "identity") + xlab("Analytical Skill") + ylab("Number of Times Appeared") + theme(legend.position = "none", axis.text.x = element_text(angle = 65, hjust = 1)) + ggtitle("Frequency of Analytical Skills Appearing in Job Descriptions") ####