library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(rvest)
## Warning: package 'rvest' was built under R version 3.4.2
## Loading required package: xml2
## Warning: package 'xml2' was built under R version 3.4.2
suppressWarnings(library(dplyr))
suppressWarnings(library(rvest))
# Load the summary CSV. Text columns are kept as character (not factor) so
# the string functions used further down can be applied to them directly.
sumDS <- read.csv(
  file = "C:\\temp\\project3\\links_summary_data.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)
# A vector is created with the words that are typically used to construct and describe soft-skill sets:
# Vocabulary used to recognise soft-skill n-grams: an n-gram is kept when at
# least one of its words appears in this vector.
skill_lexicon <- c(
  "problem", "solving", "verbal", "attention", "written", "oral",
  "management", "friendly", "energetic", "detail", "oriented",
  "leadership", "skill", "skills", "emotional", "intelligence",
  "communication"
)
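# Worked example: parse the trigram string of the first location into a table of n-grams and their frequencies.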
# Worked example on the first row of sumDS.
trigram <- sumDS$most_common_trigrams[1]
cityname <- sumDS$location[1]

# One row per match: "((" followed by the shortest text that ends in digits.
ds <- data.frame(str_extract_all(trigram, "\\({2}.+?[[:digit:]]{1,}"))
names(ds) <- "A"

# Drop the leading "(" so each entry starts at the parenthesised n-gram.
ds$A <- str_sub(ds$A, 2)

# Locate the n-gram once; the frequency starts two characters after the
# closing parenthesis.
ngram_span <- str_locate(ds$A, "\\(?.+?\\)")
ds$ngram <- str_sub(ds$A, ngram_span[, 1], ngram_span[, 2])
ds$freq <- as.numeric(str_sub(ds$A, ngram_span[, 2] + 2))

# Keep only the parsed columns and tag every row with the location.
ds <- ds[, c("ngram", "freq")]
ds <- mutate(ds, city = cityname)

# Print the rows that parsed cleanly (ds itself is left unchanged).
na.omit(ds)
## ngram freq city
## 1 ('main', 'menu', 'industries') 66 CHI
## 2 ('overview', 'main', 'menu') 51 CHI
## 3 ('main', 'menu', 'services') 33 CHI
## 4 ('menu', 'industries', 'healthcare') 24 CHI
## 5 ('ground', 'naval', 'air') 20 CHI
## 6 ('naval', 'air', 'service') 20 CHI
## 7 ('menu', 'services', 'tax') 18 CHI
## 8 ('main', 'menu', 'careers') 18 CHI
## 9 ('my', 'saved', 'searches') 13 CHI
## 10 ('what', 'we', 'do') 13 CHI
## 11 ('tax', 'overview', 'main') 12 CHI
## 12 ('menu', 'industries', 'banking') 12 CHI
## 13 ('main', 'menu', 'insights') 12 CHI
## 14 ('my', 'jobpage', 'basic') 11 CHI
## 15 ('jobpage', 'basic', 'search') 11 CHI
## 16 ('basic', 'search', 'advanced') 11 CHI
## 17 ('search', 'advanced', 'search') 11 CHI
## 18 ('my', 'submissions', 'my') 11 CHI
## 19 ('armed', 'forces', 'service') 10 CHI
## 20 ('forces', 'service', 'medal') 10 CHI
## 21 ('omb', 'control', 'number') 10 CHI
## 22 ('impairment', 'medical', 'condition') 10 CHI
## 23 ('qualified', 'receive', 'consideration') 9 CHI
## 24 ('search', 'matching', 'my') 9 CHI
## 25 ('matching', 'my', 'profile') 9 CHI
## 26 ('problem', 'solving', 'skills') 9 CHI
## 28 ('services', 'main', 'menu') 9 CHI
## 29 ('overview', 'audit', 'tax') 9 CHI
## 30 ('menu', 'careers', 'what') 9 CHI
## 31 ('careers', 'what', 'we') 9 CHI
## 32 ('can', 'share', 'examples') 9 CHI
## 33 ('all', 'qualified', 'receive') 8 CHI
## 34 ('receive', 'consideration', 'employment') 8 CHI
## 35 ('consideration', 'employment', 'religion') 8 CHI
## 36 ('please', 'review', 'application') 8 CHI
## 37 ('review', 'application', 'instructions') 8 CHI
## 38 ('application', 'instructions', 'applying') 8 CHI
## 39 ('advanced', 'search', 'matching') 8 CHI
## 40 ('my', 'profile', 'my') 8 CHI
## 41 ('submissions', 'my', 'cart') 8 CHI
## 42 ('my', 'cart', 'my') 8 CHI
## 43 ('cart', 'my', 'saved') 8 CHI
## 44 ('who', 'we', 'are') 7 CHI
## 45 ('structured', 'unstructured', 'data') 7 CHI
## 46 ('religion', 'age', 'ancestry') 7 CHI
## 47 ('all', 'rights', 'reserved') 7 CHI
## 48 ('data', 'visualization', 'tools') 7 CHI
## 49 ('excellent', 'communication', 'skills') 6 CHI
## 50 ('degree', 'computer', 'science') 6 CHI
# Strip the tuple punctuation (parentheses, commas, single quotes) so each
# n-gram becomes plain space-separated words.
ds$ngram <- str_replace_all(ds$ngram, "[\\(\\),']", "")

# Keep the rows whose n-gram contains at least one word from skill_lexicon.
# Vectorised instead of growing a data frame with rbind() inside a
# 1:nrow(ds) loop: this is safe when ds has zero rows and avoids repeated
# copying. When nothing matches, ds_filltered is a zero-row data frame with
# the same columns as ds (the loop version left it as NULL).
ngram_words <- strsplit(ds$ngram, " ", fixed = TRUE)
has_skill_word <- vapply(
  ngram_words,
  function(words) any(words %in% skill_lexicon),
  logical(1)
)
ds_filltered <- ds[has_skill_word, , drop = FALSE]
# Wrap the steps above in a function so they can be applied to every location:
#' Extract the soft-skill n-grams of one location.
#'
#' Parses a serialized n-gram/frequency string, drops entries that cannot be
#' parsed, and keeps only the n-grams that contain at least one lexicon word.
#'
#' @param x A single string holding the serialized n-grams. Each entry is
#'   matched as "((" followed by the shortest text that ends in digits
#'   (presumably `(('w1', 'w2', 'w3'), count)` tuples; confirm against the
#'   CSV).
#' @param y A single value identifying the location; copied to every row.
#' @param lexicon Character vector of soft-skill words. Defaults to the
#'   script-level `skill_lexicon`, so existing two-argument calls behave as
#'   before.
#' @return A data frame with columns `skills` (space-separated words), `freq`
#'   (numeric) and `city`; zero rows when nothing matches.
createDS <- function(x, y, lexicon = skill_lexicon) {
  # NOTE(review): the match stops at the first run of digits, so an entry
  # whose words contain a digit is cut short and fails to parse below.
  entries <- str_extract_all(x, "\\({2}.+?[[:digit:]]{1,}")[[1]]

  # Drop the leading "(" so each entry starts at the parenthesised n-gram.
  entries <- str_sub(entries, 2)

  # Locate the n-gram once; the frequency starts two characters after the
  # closing parenthesis.
  ngram_span <- str_locate(entries, "\\(?.+?\\)")
  ds <- data.frame(
    skills = str_sub(entries, ngram_span[, 1], ngram_span[, 2]),
    freq = as.numeric(str_sub(entries, ngram_span[, 2] + 2)),
    city = rep(y, length(entries)),
    stringsAsFactors = FALSE
  )

  # Drop entries that could not be parsed. The previous version called
  # na.omit(ds) without keeping the result, so these rows were never removed.
  parsed_ok <- !is.na(ds$skills) & !is.na(ds$freq)
  ds <- ds[parsed_ok, , drop = FALSE]

  # Strip tuple punctuation so each n-gram is plain space-separated words.
  ds$skills <- str_replace_all(ds$skills, "[\\(\\),']", "")

  # Vectorised lexicon filter; replaces the 1:nrow(ds) loop that grew the
  # result with rbind() and misbehaved on zero-row input.
  skill_words <- strsplit(ds$skills, " ", fixed = TRUE)
  has_skill_word <- vapply(
    skill_words,
    function(words) any(words %in% lexicon),
    logical(1)
  )
  ds[has_skill_word, , drop = FALSE]
}
# Build the soft-skill table for every row of sumDS and stack the results.
# seq_len() is safe when sumDS has no rows (1:nrow() would iterate over
# c(1, 0)), and a single rbind() over the collected pieces avoids re-copying
# the accumulated data frame on every iteration.
per_location <- lapply(
  seq_len(nrow(sumDS)),
  function(i) createDS(sumDS$most_common_trigrams[i], sumDS$location[i])
)
skill_cities <- do.call(rbind, per_location)

# Replace the row names carried over from the per-location tables with a
# plain 1..n sequence.
row.names(skill_cities) <- NULL
# Figure 1: frequency of each soft-skill trigram by city.
# Point plot of trigram frequency: colour encodes the city, point size the
# frequency; x-axis labels are rotated so long trigrams stay readable.
ggplot(data = skill_cities, mapping = aes(x = skills, y = freq)) +
  geom_point(mapping = aes(color = city, size = freq)) +
  labs(
    title = "Most desired soft skills for Data Scientist",
    y = "Frequency"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Bar chart of trigram frequency with one dodged bar per city. The y axis
# shows `freq`, so it is labelled "Frequency" (it was mislabelled "skills",
# which is the x-axis variable).
ggplot(skill_cities, aes(x = skills, y = freq)) +
  geom_bar(aes(fill = city), stat = "identity", position = "dodge") +
  labs(
    title = "Most desired soft skills for Data Scientist",
    y = "Frequency"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))