library(jsonlite)
library(tidyverse)
## -- Attaching packages ------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.0.0     v purrr   0.2.5
## v tibble  1.4.2     v dplyr   0.7.6
## v tidyr   0.8.1     v stringr 1.3.1
## v readr   1.1.1     v forcats 0.3.0
## Warning: package 'ggplot2' was built under R version 3.5.1
## Warning: package 'readr' was built under R version 3.5.1
## Warning: package 'dplyr' was built under R version 3.5.1
## -- Conflicts --------------------------------- tidyverse_conflicts() --
## x dplyr::filter()  masks stats::filter()
## x purrr::flatten() masks jsonlite::flatten()
## x dplyr::lag()     masks stats::lag()
library(httr)
library(xml2)
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.5.1
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(rvest)
## Warning: package 'rvest' was built under R version 3.5.1
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:Hmisc':
## 
##     html
## The following object is masked from 'package:purrr':
## 
##     pluck
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(tm)
## Warning: package 'tm' was built under R version 3.5.1
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:httr':
## 
##     content
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.5.1
## Loading required package: RColorBrewer
library(dplyr)
library(tidytext)
## Warning: package 'tidytext' was built under R version 3.5.1
library(ggplot2)

Load the Jobs

There are currently 21 pages of jobs available from GlassDoor associated with Comcast. Ask for more than that (say, 50 pages) and the site simply returns blank pages, so we stop at 21.

z=NULL #R Markdown requires z to exist before the loop can append to it

for (i in 1:21){
  
a=paste0("https://www.glassdoor.com/Jobs/Comcast-Jobs-E1280_P", i,".htm") #build the URL for page i

pg <- read_html(a) #read the html
links <- html_nodes(pg, ".jl") #scrape the node data
z1=bind_rows(lapply(xml_attrs(links), function(x) data.frame(as.list(x), stringsAsFactors=FALSE))) #build the data frame for the current page
z=rbind(z,z1) #build the global data frame
}
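
Hard-coding 1:21 works today, but the page count changes as postings come and go. A hedged variant (a sketch, assuming GlassDoor keeps the .jl class for listings) stops as soon as a page comes back with no listings:

z2=NULL
i=1
repeat{
  a=paste0("https://www.glassdoor.com/Jobs/Comcast-Jobs-E1280_P", i, ".htm")
  links=html_nodes(read_html(a), ".jl")
  if (length(links)==0) break #an empty page means we are past the last real page
  z1=bind_rows(lapply(xml_attrs(links), function(x) data.frame(as.list(x), stringsAsFactors=FALSE)))
  z2=rbind(z2, z1)
  i=i+1
}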

mydata=as.data.frame(z) #be sure that the data are in a data.frame

str(z)
## 'data.frame':    838 obs. of  13 variables:
##  $ class                    : chr  "jl" "jl" "jl" "jl" ...
##  $ data.id                  : chr  "2909924793" "2907516594" "2897682366" "2901779186" ...
##  $ data.emp.id              : chr  "1280" "1280" "1280" "1280" ...
##  $ data.is.organic.job      : chr  "false" "false" "false" "false" ...
##  $ data.ad.order.id         : chr  "293165" "293165" "293165" "293165" ...
##  $ data.sgoc.id             : chr  "1003" "1010" "1008" "1008" ...
##  $ data.purchase.ad.order.id: chr  "0" "0" "0" "0" ...
##  $ data.is.easy.apply       : chr  "false" "false" "false" "false" ...
##  $ data.normalize.job.title : chr  "data entry" "telephone interviewer" "tax analyst" "financial representative" ...
##  $ data.job.loc             : chr  "Towson, MD" "Miramar, FL" "Sterling Heights, MI" "Sunrise, FL" ...
##  $ data.job.loc.id          : chr  "1153644" "1166164" "1159809" "1166212" ...
##  $ data.job.loc.type        : chr  "C" "C" "C" "C" ...
##  $ data.njslv               : chr  "false" "false" "false" "false" ...
head(z) #look at the first few rows
##   class    data.id data.emp.id data.is.organic.job data.ad.order.id
## 1    jl 2909924793        1280               false           293165
## 2    jl 2907516594        1280               false           293165
## 3    jl 2897682366        1280               false           293165
## 4    jl 2901779186        1280               false           293165
## 5    jl 2908692225        1280               false           293165
## 6    jl 2908693234        1280               false           293165
##   data.sgoc.id data.purchase.ad.order.id data.is.easy.apply
## 1         1003                         0              false
## 2         1010                         0              false
## 3         1008                         0              false
## 4         1008                         0              false
## 5         1016                         0              false
## 6         1011                         0              false
##   data.normalize.job.title         data.job.loc data.job.loc.id
## 1               data entry           Towson, MD         1153644
## 2    telephone interviewer          Miramar, FL         1166164
## 3              tax analyst Sterling Heights, MI         1159809
## 4 financial representative          Sunrise, FL         1166212
## 5       scientific advisor       Centennial, CO         1164284
## 6               it support          Atlanta, GA         1155583
##   data.job.loc.type data.njslv
## 1                 C      false
## 2                 C      false
## 3                 C      false
## 4                 C      false
## 5                 C      false
## 6                 C      false
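
Before chasing descriptions, a quick dplyr tally (a sketch using the columns above) shows which normalized titles and locations dominate the postings:

mydata%>%
  count(data.normalize.job.title, sort=TRUE)%>%
  head(10) #ten most common normalized job titles
mydata%>%
  count(data.job.loc, sort=TRUE)%>%
  head(10) #ten most common job locations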

Now, let’s get the job descriptions

Not too hard to do: each listing’s URL can be rebuilt from the id, location id, and normalized title we just scraped…

library(purrr)
y=NULL
#replace 1:5 with 1:length(mydata$data.normalize.job.title) to pull every job
for (i in 1:5){
myjobs=paste0("https://www.glassdoor.com/job-listing/", mydata$data.normalize.job.title[i], "-JV_IC",mydata$data.job.loc.id[i],"_KO0,*,*", ".htm?jl=", mydata$data.id[i]) #rebuild the listing URL from the scraped attributes
thepage = try(readLines(myjobs)) #read the raw HTML; try() keeps a failed request from stopping the loop
y=rbind(thepage[324],y) #line 324 of these pages holds the job description text
}
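
Grabbing line 324 of the raw HTML is brittle; the description moves if GlassDoor changes its page layout. A hedged alternative (a sketch; the .jobDescriptionContent selector is an assumption about GlassDoor's markup and may need adjusting) parses each page with rvest and pulls the description node directly:

y2=NULL
for (i in 1:5){
  myjobs=paste0("https://www.glassdoor.com/job-listing/",
                URLencode(mydata$data.normalize.job.title[i]), #encode the spaces in the title
                "-JV_IC", mydata$data.job.loc.id[i], "_KO0,*,*",
                ".htm?jl=", mydata$data.id[i])
  pg=try(read_html(myjobs), silent=TRUE)
  if (inherits(pg, "try-error")) next #skip listings that fail to load
  desc=html_text(html_node(pg, ".jobDescriptionContent")) #selector is an assumption
  y2=c(y2, desc)
}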

Clean

Here, we use the tm package to mine the job descriptions. (The “transformation drops documents” warnings from tm_map are a known quirk of SimpleCorpus and can be ignored here.)

mytext=paste(y, collapse=" ") #collapse the scraped descriptions into a single string
reviewSource=VectorSource(mytext) #establish the vector as a vector source for mining
corpus=Corpus(reviewSource) #make the corpus...we are not comparing two documents.
corpus=tm_map(corpus, content_transformer(tolower)) #make sure everything is lower case
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
corpus=tm_map(corpus, removePunctuation) #remove punctuation if any
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation
## drops documents
corpus=tm_map(corpus, stripWhitespace) #strip out all white space
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation
## drops documents
corpus=tm_map(corpus, removeWords, stopwords("english")) #remove common words
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
dtm=DocumentTermMatrix(corpus) #build the document term matrix for frequency counts
dtm2=as.matrix(dtm)  #Convert to matrix
freq=colSums(dtm2)   #Get frequencies
freq=sort(freq, decreasing=TRUE) #Sort descending
freq[1:5]
##  customers   services   products   customer technology 
##         21         18         16         13         12
barplot(freq[1:5], main="Top 5 Words")
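
Since wordcloud is already loaded, the same frequency vector can also be drawn as a word cloud (a quick sketch reusing freq from above):

wordcloud(names(freq), freq, max.words=50, random.order=FALSE)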

##Sentiment Analysis of The Seven Pillars of Wisdom, Scraping then Mining

#Initial Reading and Cleaning
mytext=readLines("http://gutenberg.net.au/ebooks01/0100111h.html") #pull the HTML of the book from Project Gutenberg Australia
mytext=mytext[223:24545] #keep only the lines that hold the body of the book
mytext=gsub("<p>","", mytext) #strip opening paragraph tags
mytext=gsub("</p>","", mytext) #strip closing paragraph tags
mytext=gsub(glob2rx("<h3>*</h3>"), "", mytext) #blank out lines that are chapter headings
mystring=gsub("[\r\n]"," ", as.String(mytext)) #collapse to a single string, replacing line breaks with spaces so words don't run together

#Method 1
reviewSource2=VectorSource(as.vector(mystring)) #establish the vector as a vector source for mining
corpus2=Corpus(reviewSource2) #make the corpus...we are not comparing two documents.
corpus2=tm_map(corpus2, content_transformer(tolower)) #make sure everything is lower case
## Warning in tm_map.SimpleCorpus(corpus2, content_transformer(tolower)):
## transformation drops documents
corpus2=tm_map(corpus2, removePunctuation) #remove punctuation if any
## Warning in tm_map.SimpleCorpus(corpus2, removePunctuation): transformation
## drops documents
corpus2=tm_map(corpus2, stripWhitespace) #strip out all white space
## Warning in tm_map.SimpleCorpus(corpus2, stripWhitespace): transformation
## drops documents
corpus2=tm_map(corpus2, removeWords, stopwords("english")) #remove common words
## Warning in tm_map.SimpleCorpus(corpus2, removeWords, stopwords("english")):
## transformation drops documents
dtm2=DocumentTermMatrix(corpus2) #build the document term matrix for frequency counts
dtm2=as.matrix(dtm2)  #Convert to matrix
freq2=colSums(dtm2)   #Get frequencies
freq2=sort(freq2, decreasing=TRUE) #Sort descending
freq2[1:10]
##    men    one   made    two  turks   like   came camels little   arab 
##    557    507    398    390    377    372    355    306    289    287
barplot(freq2[1:10], main="Top 10 Words")

#Method 2, Using TidyText

mydataframe=data_frame(text=mytext) #one row per line of the cleaned text

newdf=mydataframe%>%
  unnest_tokens(word, text)%>% #break the text into one word per row
  anti_join(stop_words) #drop common English stop words
## Joining, by = "word"
newdf%>%
  count(word, sort=TRUE)%>%
  filter(n>300)%>%
  mutate(word=reorder(word,n))%>%
  ggplot(aes(word,n))+
  geom_col()+
  xlab(NULL)+
  coord_flip()

nrc=get_sentiments("nrc")%>%
  filter(sentiment=="sadness") #keep only the words the NRC lexicon tags as sadness

newdf%>%
  inner_join(nrc)%>%
  count(word, sort=TRUE)
## Joining, by = "word"
## # A tibble: 613 x 2
##    word       n
##    <chr>  <int>
##  1 desert   179
##  2 dark     115
##  3 fell     101
##  4 broken    97
##  5 lost      87
##  6 black     84
##  7 broke     78
##  8 shot      65
##  9 death     62
## 10 hollow    59
## # ... with 603 more rows
mysent=newdf%>%
  inner_join(get_sentiments("bing"))%>% #attach positive/negative labels
  count(index=row_number()%/%80, sentiment)%>% #count each label within 80-word chunks
  spread(sentiment, n, fill=0)%>%
  mutate(sentiment=positive-negative) #net sentiment per chunk
## Joining, by = "word"
ggplot(mysent, aes(index, sentiment))+
  geom_col(show.legend=FALSE)
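
For a single headline number rather than a trajectory, the same bing join collapses to an overall tally (a quick sketch using the objects above):

newdf%>%
  inner_join(get_sentiments("bing"))%>%
  count(sentiment) #overall counts of positive and negative words in the book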

newdf%>%
  count(word)%>%
  with(wordcloud(word, n, max.words=50))
## Warning in wordcloud(word, n, max.words = 50): camels could not be fit on
## page. It will not be plotted.
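
A natural follow-up is to split the cloud by sentiment. A hedged sketch, assuming the reshape2 package is installed for acast(), uses comparison.cloud() from the wordcloud package:

library(reshape2) #assumed installed; provides acast()
newdf%>%
  inner_join(get_sentiments("bing"))%>%
  count(word, sentiment, sort=TRUE)%>%
  acast(word~sentiment, value.var="n", fill=0)%>%
  comparison.cloud(colors=c("gray20","gray80"), max.words=50)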