Import
# Read the Indeed hard-skills CSV from GitHub. The file has no header row, and
# text is kept as plain character strings rather than converted to factors.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable names.
indeed_hard <- read.csv(
  "https://raw.githubusercontent.com/Tyllis/Data607/master/indeed_hardskills.csv",
  stringsAsFactors = FALSE,
  header = FALSE
)
Indeed Hard Skills - Tidy
# Separate column V1 into two pieces, "num" and "skills", splitting at the
# leading run of digits and the space that follows it.
# The "num" piece is not needed, so only the skills column is kept.
indeed_hard_tidy <- select(
  separate(
    indeed_hard,
    col = "V1",
    into = c("num", "skills"),
    sep = "^\\d{1,} +?"
  ),
  skills
)
# Collect the distinct skill tokens across all rows: split every skills string
# on commas and spaces in a single vectorized call, flatten the pieces into one
# character vector, and de-duplicate it. This replaces an append-inside-a-loop
# version that re-copied the growing vector on every iteration; the resulting
# vector is the same.
# NOTE(review): the split pattern also breaks on spaces, so a multi-word skill
# contributes one token per word, and a comma followed by a space yields an
# empty "" token - confirm this is intended.
# These will be the row names for our document matrix.
unique_skills <- unique(unlist(strsplit(indeed_hard_tidy$skills, "[, ]")))
# Replace the space between words with an underscore so "Big Data" won't become
# two separate words. Lookarounds leave the neighbouring characters unconsumed,
# so every inner space is replaced, even around one-letter words
# ("A B C" -> "A_B_C" rather than "A_B C").
indeed_hard_tidy$skills <- str_replace_all(
  indeed_hard_tidy$skills,
  "(?<=\\w) (?=\\w)",
  "_"
)
# Rename the skill "R" to "RProgramming" (presumably so the one-letter name
# survives the later text processing - confirm). The lookbehind keeps the
# pattern from firing on other skills that merely end in a capital R
# (e.g. "OCR,"), and the "$" alternative also covers R as the last skill in a
# row, where no comma follows it.
indeed_hard_tidy$skills <- str_replace_all(
  indeed_hard_tidy$skills,
  "(?<![A-Za-z0-9])R(,|$)",
  "RProgramming\\1"
)
Make each row of skills a “document”
# Prepare the corpus: each row of skills becomes one "document".
corpus <- Corpus(VectorSource(indeed_hard_tidy$skills))
# Lower-case the text. tolower is wrapped in content_transformer() so the
# documents keep their corpus structure whichever corpus class Corpus()
# returned; a bare base function is only safe on a SimpleCorpus.
corpus <- tm_map(corpus, content_transformer(tolower))
# Strip punctuation. This also removes the underscores that joined multi-word
# skills, which is why terms such as "bigdata" appear as one word downstream.
corpus <- tm_map(corpus, removePunctuation)
# Stem each term with the English stemmer.
# NOTE(review): an earlier comment here reported a stemming error; the printed
# corpus below shows the step completing - confirm it runs in your environment.
corpus <- tm_map(corpus, stemDocument, language = "english")
# Print a summary of the corpus.
corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 2082
Build the term-document matrix (how often each skill appears in each job posting) and calculate the pairwise distances between postings
# Raw term-document matrix as a plain matrix: rows are terms, columns are
# documents.
td.mat <- corpus %>%
  TermDocumentMatrix() %>%
  as.matrix()
# Pairwise distance matrix over the rows of the transposed matrix, i.e. between
# documents, using dist()'s default settings.
dist.mat <- td.mat %>%
  t() %>%
  dist()
Add weighting: binary term frequency scaled by inverse document frequency, so skills that appear in fewer postings carry more weight
# Weight the raw matrix: binary local term weight (lw_bintf) multiplied
# element-wise by the inverse-document-frequency global weight (gw_idf).
td.mat.lsa <- lw_bintf(td.mat) * gw_idf(td.mat)
# Build the LSA space from the weighted matrix.
lsaSpace <- lsa(td.mat.lsa)
## Warning in lsa(td.mat.lsa): [lsa] - there are singular values which are
## zero.
# Distance matrix over the rows of the transposed LSA text matrix.
dist.mat.lsa <- lsaSpace %>%
  as.textmatrix() %>%
  t() %>%
  dist()
# dist.mat.lsa # uncomment to inspect the distance matrix
Plot 3D line graph
# 3D plot of "rprogram" (the lower-cased, stemmed form of "RProgramming") and
# its nearest neighbours in the raw term-document matrix, projected with PCA.
# Skills that occur together should appear close to one another.
# connect.lines = "all" draws a line between every pair of plotted skills;
# with alpha = "shade" the line darkness presumably tracks how similar the two
# skills are - see the LSAfun documentation to confirm.
# Raise n to compare more skills.
plot_neighbors(
  "rprogram",
  n = 10,
  tvectors = td.mat,
  method = "PCA",
  dims = 3,
  connect.lines = "all",
  alpha = "shade"
)
## x y z
## rprogram -0.5064869 -0.58932933 0.42035171
## python -0.7863782 -0.19455288 0.44366525
## machinelearn -0.7892852 -0.42284344 0.18950772
## sas -0.1728170 -0.85433969 -0.01507288
## hadoop -0.1998705 -0.16977449 0.84223116
## datamin -0.2381542 -0.76503363 0.11780014
## datawarehous 0.1201753 -0.61436266 0.52388812
## datasci -0.8875896 -0.05149828 0.13835809
## spark -0.1955890 -0.07569958 0.84540881
## bigdata -0.3979788 -0.07490607 0.67331644
Plot word graph
# Take the first 20 row names of the term-document matrix (matrix order, not a
# frequency ranking) and plot them in 2D using MDS.
# Skills that appear together should sit close to one another.
# Set dims = 3 for a 3D plot; raise the 20 passed to head() to compare more
# words.
words <- td.mat %>%
  rownames() %>%
  head(20)
plot_wordlist(words, tvectors = td.mat, method = "MDS", dims = 2)
## x y
## bigdata -0.06450334 0.257694070
## datasci 0.11414549 -0.097456798
## git -0.36199233 -0.008566226
## imageprocess -0.33601669 -0.149616935
## jenkin -0.47571965 -0.056720127
## jira -0.45260525 -0.052054372
## machinelearn 0.20741429 0.023005317
## matlab -0.19613779 0.246480660
## maven -0.47762767 -0.066120838
## php -0.44218536 -0.051513548
## python 0.20186877 0.075041699
## svn -0.48125822 -0.066619198
## naturallanguageprocess 0.41501405 -0.364013100
## nodej 0.39659903 -0.508324930
## tensorflow 0.41427333 -0.445869699
## datawarehous 0.25510584 0.402776252
## rprogram 0.38211582 0.411629655
## agil 0.29189268 0.153310003
## azur 0.25384632 0.070808425
## java 0.35577070 0.226129691
Create a data frame with the unique skills as row names
# Data frame whose row names are the unique skills. row.names = 1L tells
# data.frame() to use the first (and only) column as the row names, which
# leaves a frame with those row names and no remaining columns. The original
# `row.names = T` reached the same result only because the logical was coerced
# to column index 1, and T itself is a reassignable name.
skill_matrix <- data.frame(unique_skills, row.names = 1L)
Loop through each job's list of skills: if a row-name skill appears in that job's list, put a 1 in that job's column, otherwise 0.
# Build a skills-by-jobs indicator table: one row per unique skill, one column
# per job, holding 1 when the skill is among the job's comma-separated tokens
# and 0 otherwise.
# NOTE(review): job tokens are split on commas only and are not trimmed, while
# unique_skills was split on commas and spaces before the underscore
# replacement - confirm the two token sets actually line up.

# Number of unique skills = number of rows.
len <- length(unique_skills)
# Split every job's skills string into its tokens once, up front.
job_skills <- strsplit(indeed_hard_tidy$skills, "[,]")
# Preallocate the whole table instead of growing a data frame column by column.
# At least one column is kept so an empty input still yields a single all-zero
# column, as before.
indicator <- matrix(0, nrow = len, ncol = max(length(job_skills), 1L))
for (job in seq_along(job_skills)) {
  # Vectorized membership test over all skills replaces the inner while loop.
  indicator[, job] <- as.numeric(unique_skills %in% job_skills[[job]])
}
# Convert to a data frame; columns are named V1, V2, ... in job order.
word_matrix <- as.data.frame(indicator)
Add row names
# Label each row of the data frame with its skill, and make sure the first
# column is called "V1".
row.names(word_matrix) <- unique_skills
names(word_matrix)[1] <- "V1"
Which words are grouped together?
# Skill whose nearest neighbours we want to list.
# NOTE(review): neighbors() only accepts a value that exactly matches a row
# name of word_matrix; this string has a leading space and an inner space -
# confirm it matches a row name.
words <- c(" Machine Learning")
# Pass the variable itself. The earlier call passed the string literal "words",
# which is not a row name, so neighbors() warned "x must be a word in
# rownames(tvectors)" and returned NA.
neighbors(words, n = 5, tvectors = word_matrix)
## Warning in neighbors("words", n = 5, tvectors = word_matrix): x must be a
## word in rownames(tvectors)
## [1] NA