Analysis

Import

# Download the scraped Indeed hard-skills listing. The file has no header row,
# so columns receive the default names V1, V2, ...; strings are kept as
# character vectors rather than converted to factors.
indeed_hard <- read.csv(
  "https://raw.githubusercontent.com/Tyllis/Data607/master/indeed_hardskills.csv",
  stringsAsFactors = FALSE,
  header = FALSE
)

Indeed Hard Skills - Tidy

# Split V1 at its leading row number: the digits (and the space after them)
# act as the separator, leaving the comma-separated skill list in `skills`.
# Only `skills` is kept; the `num` column is not needed.
indeed_hard_tidy <- select(
  separate(
    indeed_hard,
    col = "V1",
    into = c("num", "skills"),
    sep = "^\\d{1,} +?"
  ),
  skills
)
  
# Collect every skill token across all jobs. strsplit() is vectorised over the
# whole column, so the rows are split in one call instead of growing a vector
# inside a loop.
# NOTE(review): the pattern splits on spaces as well as commas, so a multi-word
# skill contributes its individual words, and a comma followed by a space
# yields an empty token -- confirm this is intended.
unique_skills <- unlist(strsplit(indeed_hard_tidy$skills, "[, ]"))

# Deduplicate the tokens; these become the row names of the skill matrix.
unique_skills <- unique(unique_skills)
# Join the words of a multi-word skill with underscores so "Big Data" stays one
# term. Lookarounds leave the neighbouring letters unconsumed, so a
# single-letter word in the middle ("A B Testing") gets both of its spaces
# replaced.
indeed_hard_tidy$skills <- str_replace_all(
  indeed_hard_tidy$skills, "(?<=\\w) (?=\\w)", "_"
)
# Spell out the R language as "RProgramming". The word boundary leaves skills
# that merely end in a capital R untouched, and the `$` alternative also
# covers R as the last skill in a row (no trailing comma).
indeed_hard_tidy$skills <- str_replace_all(
  indeed_hard_tidy$skills, "\\bR(,|$)", "RProgramming\\1"
)

Make each row of skills a “document”

# Build the corpus: every job's skill string is treated as one document.
# Lower-casing and stripping punctuation normalise the text; removing
# punctuation also drops the commas and underscores, so an underscore-joined
# skill collapses into a single word.
# The last step stems each term with the English stemmer.
# NOTE(review): an earlier note here reported that stemming failed -- confirm
# the stemmer's dependencies are installed before relying on this step.
corpus <- indeed_hard_tidy$skills %>%
  VectorSource() %>%
  Corpus() %>%
  tm_map(tolower) %>%
  tm_map(removePunctuation) %>%
  tm_map(stemDocument, language = "english")
corpus

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 2082

Build the term-document matrix and calculate the distance between job postings based on the skills they list

# Turn the corpus into a plain term-by-document count matrix (terms in rows,
# one column per job posting).
td.mat <- corpus %>%
  TermDocumentMatrix() %>%
  as.matrix()

# Transposing puts the documents in rows, so dist() measures how far apart the
# job postings are in terms of the skills they list.
dist.mat <- td.mat %>%
  t() %>%
  dist()

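Add weighting. Each skill counts once per job (binary term frequency) and is scaled by its inverse document frequency, so skills that appear in fewer jobs carry more weight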

# Weight the raw counts before building the LSA space:
#   lw_bintf() - local weight, 1 if the term occurs in the document, else 0
#   gw_idf()   - global weight, inverse document frequency, so terms found in
#                fewer documents count for more
td.mat.lsa <- lw_bintf(td.mat) * gw_idf(td.mat)
# Build the LSA space; dimcalc_share() is lsa()'s default dimensionality rule.
lsaSpace <- lsa(td.mat.lsa, dims = dimcalc_share())

## Warning in lsa(td.mat.lsa): [lsa] - there are singular values which are
## zero.

# Map the LSA space back to a term-by-document matrix, transpose it so the
# documents are in rows, and compute the distance matrix between them.
dist.mat.lsa <- lsaSpace %>%
  as.textmatrix() %>%
  t() %>%
  dist()
# dist.mat.lsa  # uncomment to inspect the distance matrix

Plot 3D line graph

# Plot the 10 nearest neighbours of "rprogram" in three dimensions (PCA).
# Skills that are used together end up in clumps; every pair of points is
# joined by a line, and the darker the line, the more closely the two skills
# are related.
# Raise `n` to compare more skills.
plot_neighbors(
  "rprogram",
  n = 10,
  tvectors = td.mat,
  method = "PCA",
  dims = 3,
  connect.lines = "all",
  alpha = "shade"
)

##                       x           y           z
## rprogram     -0.5064869 -0.58932933  0.42035171
## python       -0.7863782 -0.19455288  0.44366525
## machinelearn -0.7892852 -0.42284344  0.18950772
## sas          -0.1728170 -0.85433969 -0.01507288
## hadoop       -0.1998705 -0.16977449  0.84223116
## datamin      -0.2381542 -0.76503363  0.11780014
## datawarehous  0.1201753 -0.61436266  0.52388812
## datasci      -0.8875896 -0.05149828  0.13835809
## spark        -0.1955890 -0.07569958  0.84540881
## bigdata      -0.3979788 -0.07490607  0.67331644

Plot word graph

# Take the first 20 terms of the term-document matrix (row order, not
# frequency) and plot them in two dimensions with MDS.
# Skills that are used together show up in clumps.
# Set dims = 3 for a 3D plot; raise the 20 passed to head() to compare more.
words <- td.mat %>%
  rownames() %>%
  head(20)
plot_wordlist(words, tvectors = td.mat, method = "MDS", dims = 2)

##                                  x            y
## bigdata                -0.06450334  0.257694070
## datasci                 0.11414549 -0.097456798
## git                    -0.36199233 -0.008566226
## imageprocess           -0.33601669 -0.149616935
## jenkin                 -0.47571965 -0.056720127
## jira                   -0.45260525 -0.052054372
## machinelearn            0.20741429  0.023005317
## matlab                 -0.19613779  0.246480660
## maven                  -0.47762767 -0.066120838
## php                    -0.44218536 -0.051513548
## python                  0.20186877  0.075041699
## svn                    -0.48125822 -0.066619198
## naturallanguageprocess  0.41501405 -0.364013100
## nodej                   0.39659903 -0.508324930
## tensorflow              0.41427333 -0.445869699
## datawarehous            0.25510584  0.402776252
## rprogram                0.38211582  0.411629655
## agil                    0.29189268  0.153310003
## azur                    0.25384632  0.070808425
## java                    0.35577070  0.226129691

Create a data frame with the unique skills as row names

# Data frame whose row names are the unique skills. Column 1 is named
# explicitly as the row-name source instead of relying on `T`, which is an
# ordinary variable that can be reassigned; the column used for the row names
# is removed from the result.
skill_matrix <- data.frame(unique_skills, row.names = 1L)

For each job, put a 1 in its column for every row-name skill that appears in the job's list of skills, and a 0 otherwise.

# Build the skill-by-job incidence matrix: one row per entry of unique_skills,
# one column per job, holding 1 when the job lists that skill and 0 otherwise.
# Each job's skill string is split on commas and the pieces are trimmed,
# because a skill that follows ", " would otherwise keep its leading space and
# fail the exact match.
# NOTE(review): rows only match when unique_skills is tokenised the same way
# as these strings -- verify for multi-word skills.
job_skills <- lapply(strsplit(indeed_hard_tidy$skills, "[,]"), trimws)

# %in% is vectorised over unique_skills, so each job yields its whole 0/1
# column in a single comparison.
skill_flags <- lapply(
  job_skills,
  function(listed) as.numeric(unique_skills %in% listed)
)

# Bind the per-job columns in job order; the data frame columns are named
# V1, V2, ... automatically.
word_matrix <- as.data.frame(
  matrix(as.numeric(unlist(skill_flags)), nrow = length(unique_skills))
)

Add row names

# Label the rows with the skill names and make sure the first job column is
# called "V1".
row.names(word_matrix) <- unique_skills
names(word_matrix)[1] <- "V1"

Which words are grouped together?

# Look up the 5 nearest neighbours of the chosen skill in the skill-by-job
# matrix. The variable is passed unquoted: quoting it made neighbors() search
# for the literal text "words".
# NOTE(review): the value has to correspond to row names of word_matrix --
# confirm the spelling (leading space, space vs underscore) against them.
words <- c(" Machine Learning")
neighbors(words, n = 5, tvectors = word_matrix)

## Warning in neighbors("words", n = 5, tvectors = word_matrix): x must be a
## word in rownames(tvectors)

## [1] NA

Analysis

Chad Smith

October 19, 2017