Loading Packages
library(rvest)
library(XML)
library(RCurl)
library(splitstackshape)
library(tm)
library(xml2)
library(stringr)
library(qdapRegex)
library(plyr)
library(ggplot2)
base_url1 <- "https://jobs.aa.com"
html_data <- readLines("https://jobs.aa.com/search/?q=analyst")
html_data <- grep (".jobTitle-link",html_data,value = TRUE)
html_data <- gsub('.*(href=")',"",html_data)
html_data <- gsub('\">*',"",html_data)
html_data <- gsub('*</a>',"",html_data)
html_data <- gsub('\\s*\\w*$',"",html_data)
html_data <- unlist(strsplit(html_data,split = '/',fixed = TRUE))
html_data1 <-paste(html_data[1],html_data[2],html_data[3],html_data[4],sep = '/')
html_data1 <- paste(base_url1,html_data1,'/',sep = '')
html_data2 <- paste(html_data[6],html_data[7],html_data[8],html_data[9],sep = '/')
html_data2 <- paste(base_url1,html_data2,'/',sep = '')
html_data3 <-paste(html_data[11],html_data[12],html_data[13],html_data[14],sep = '/')
html_data3 <- paste(base_url1,html_data3,'/',sep = '')
html_data4 <-paste(html_data[16],html_data[17],html_data[18],html_data[19],sep = '/')
html_data4<- paste(base_url1,html_data4,'/',sep = '')
html_data5 <-paste(html_data[21],html_data[22],html_data[23],html_data[24],sep = '/')
html_data5<- paste(base_url1,html_data5,'/',sep = '')
html_data6 <-paste(html_data[26],html_data[27],html_data[28],html_data[29],sep = '/')
html_data6<- paste(base_url1,html_data6,'/',sep = '')
html_data7 <-paste(html_data[31],html_data[32],html_data[33],html_data[34],sep = '/')
html_data7<- paste(base_url1,html_data7,'/',sep = '')
html_data8 <-paste(html_data[36],html_data[37],html_data[38],html_data[39],sep = '/')
html_data8<- paste(base_url1,html_data8,'/',sep = '')
html_data9 <-paste(html_data[41],html_data[42],html_data[43],html_data[44],sep = '/')
html_data9<- paste(base_url1,html_data9,'/',sep = '')
html_data10 <-paste(html_data[47],html_data[48],html_data[49],html_data[50],sep = '/')
html_data10<- paste(base_url1,html_data10,'/',sep = '')
html_data11 <-paste(html_data[52],html_data[53],html_data[54],html_data[55],sep = '/')
html_data11<- paste(base_url1,html_data11,'/',sep = '')
html_data12 <-paste(html_data[57],html_data[58],html_data[59],html_data[60],sep = '/')
html_data12<- paste(base_url1,html_data12,'/',sep = '')
html_data13 <-paste(html_data[62],html_data[63],html_data[64],html_data[65],sep = '/')
html_data13<- paste(base_url1,html_data13,'/',sep = '')
html_data14 <-paste(html_data[67],html_data[68],html_data[69],html_data[70],sep = '/')
html_data14<- paste(base_url1,html_data14,'/',sep = '')
html_data15<-paste(html_data[72],html_data[73],html_data[74],html_data[75],sep = '/')
html_data15<- paste(base_url1,html_data15,'/',sep = '')
html_data16 <-paste(html_data[77],html_data[78],html_data[79],html_data[80],sep = '/')
html_data16<- paste(base_url1,html_data16,'/',sep = '')
html_data17 <-paste(html_data[83],html_data[84],html_data[85],html_data[86],sep = '/')
html_data17<- paste(base_url1,html_data17,'/',sep = '')
html_data18 <-paste(html_data[88],html_data[89],html_data[90],html_data[91],sep = '/')
html_data18<- paste(base_url1,html_data18,'/',sep = '')
html_data19 <-paste(html_data[94],html_data[95],html_data[96],html_data[97],sep = '/')
html_data19<- paste(base_url1,html_data19,'/',sep = '')
html_data20 <-paste(html_data[99],html_data[100],html_data[101],html_data[102],sep = '/')
html_data20<- paste(base_url1,html_data20,'/',sep = '')
html_data21 <-paste(html_data[104],html_data[105],html_data[106],html_data[107],sep = '/')
html_data21<- paste(base_url1,html_data21,'/',sep = '')
html_data22 <-paste(html_data[109],html_data[110],html_data[111],html_data[112],sep = '/')
html_data22<- paste(base_url1,html_data22,'/',sep = '')
html_data23 <-paste(html_data[114],html_data[115],html_data[116],html_data[117],sep = '/')
html_data23<- paste(base_url1,html_data23,'/',sep = '')
html_data24<-paste(html_data[119],html_data[120],html_data[121],html_data[122],sep = '/')
html_data24<- paste(base_url1,html_data24,'/',sep = '')
html_data25<-paste(html_data[124],html_data[125],html_data[126],html_data[127],sep = '/')
html_data25<- paste(base_url1,html_data25,'/',sep = '')
html_megaset <- rbind(html_data1,html_data2,html_data3,html_data4,html_data5,html_data6,html_data7,html_data8,html_data9,html_data10,html_data11,html_data12,html_data13,html_data14,html_data15,html_data16,html_data17,html_data18,html_data19,html_data20,html_data21,html_data22,html_data23,html_data24,html_data25)
html_megaset
## [,1]
## html_data1 "https://jobs.aa.com/job/Ft-Worth-Cargo-Revenue-Management-Analyst-TX-76101/426136700/"
## html_data2 "https://jobs.aa.com/job/Ft-Worth-Revenue-Management-Analyst-TX-76101/425350600/"
## html_data3 "https://jobs.aa.com/job/Ft-Worth-Revenue-Management-Analyst%2C-Pricing-&-Yield-Management-TX-76101/422059700/"
## html_data4 "https://jobs.aa.com/job/Ft-Worth-Financial-Analyst-Intern-TX-76101/426144100/"
## html_data5 "https://jobs.aa.com/job/Ft-Worth-Sales-Analyst-TX-76101/426085100/"
## html_data6 "https://jobs.aa.com/job/Dallas-Quality-Assurance-Analyst-%28IT%29-TX-75201/431773600/"
## html_data7 "https://jobs.aa.com/job/New-York-City-Analyst%2C-Sales-Planning-NY-10001/437678600/"
## html_data8 "https://jobs.aa.com/job/Ft-Worth-MBA-Financial-Strategy-Analyst-TX-76101/430028500/"
## html_data9 "https://jobs.aa.com/job/Ft-Worth-AnalystSenior-Analyst%2C-Division-Finance-TX-76101/431554300/"
## html_data10 "https://jobs.aa.com/job/Dallas-Operations-Research-Analyst-TX-75201/431533800/"
## html_data11 "https://jobs.aa.com/job/Dallas-Object-Orient-Programmer-Analyst-TX-75201/431550700/"
## html_data12 "https://jobs.aa.com/job/Dallas-Senior-Object-Oriented-Programmer-Analyst-TX-75201/431819800/"
## html_data13 "https://jobs.aa.com/job/Dallas-Senior-Object-Oriented-Programmer-Analyst-TX-75201/431814400/"
## html_data14 "https://jobs.aa.com/job/Phoenix-Senior-Object-Oriented-Programmer-Analyst-AZ-85001/431821500/"
## html_data15 "https://jobs.aa.com/job/Dallas-Senior-Object-Oriented-Programmer-Analyst-TX-75201/431764700/"
## html_data16 "https://jobs.aa.com/job/Phoenix-AnalystSenior-Analyst%2C-IT-Security-Intelligence-&-Response-AZ-85001/432476100/"
## html_data17 "https://jobs.aa.com/job/Ft-Worth-Manager%2C-Labor-Analysis-TX-76101/437370800/"
## html_data18 "https://jobs.aa.com/job/Ft-Worth-AnalystSenior-Analyst%2C-Supply-Chain-Performance-TX-76101/434891700/"
## html_data19 "https://jobs.aa.com/job/Ft-Worth-MBA-Commercial-Strategy-Analyst-Intern-TX-76101/430027400/"
## html_data20 "https://jobs.aa.com/job/Ft-Worth-MBA-Commercial-Strategy-Analyst-TX-76101/430028300/"
## html_data21 "https://jobs.aa.com/job/Dallas-Sr_-Object-Oriented-Programmer-Analyst-TX-75201/424366800/"
## html_data22 "https://jobs.aa.com/job/Phoenix-Tester%2C-Revenue-Accounting-Automation-AZ-85001/436535700/"
## html_data23 "https://jobs.aa.com/job/Phoenix-Senior-Business-Analyst-Payroll-Systems-AZ-85001/431712200/"
## html_data24 "https://jobs.aa.com/job/Ft-Worth-Analyst%2C-Cargo-Revenue-Management-TX-76101/432914000/"
## html_data25 "https://jobs.aa.com/job/London-Finance-and-Business-Planning-AnalystSr_-Analyst-HNS/436742200/"
t1<-as.character()
for(i in 1:length(html_megaset))
{
webpage<-getURL(html_megaset[i])
webpage <- readLines(tc <- textConnection(webpage)); close(tc)
pagetree <- htmlTreeParse(webpage, error=function(...){}, useInternalNodes = TRUE)
skillwords1 <-unlist(xpathSApply(pagetree,"/html/body/div/descendant::*[contains (text(),'skills')]",xmlValue))
skillwords2 <-unlist(xpathSApply(pagetree,"/html/body/div/descendant::*[contains (text(),'experience')]",xmlValue))
skillwords3 <-unlist(xpathSApply(pagetree,"/html/body/div/descendant::*[contains (text(),'language')]",xmlValue))
skillwords4 <-unlist(xpathSApply(pagetree,"/html/body/div/descendant::*[contains (text(),'Job Quatlifications')]",xmlValue))
if (length(paste(skillwords1,skillwords2,skillwords3,skillwords4))>0 )
{ t1[i]<-paste(skillwords1,skillwords2,skillwords3,skillwords4) }
else
t1[i]<-"0"
}
write.table(t1,file = "data1.csv")
file1<-read.csv("data1.csv",header = FALSE,stringsAsFactors = FALSE)
file1$V1<-paste(file1$V1,file1$V2,file1$V3,file1$V4,file1$V4,file1$V5,file1$V6,file1$V7,sep = "")
review_text1 <- paste(file1$V1, collapse=" ")
review_source1 <- VectorSource(review_text1)
c1 <- Corpus(review_source1)
c1 <- tm_map(c1, content_transformer(tolower))
c1 <- tm_map(c1, removePunctuation)
c1 <- tm_map(c1, stripWhitespace)
c1 <- tm_map(c1, removeWords, stopwords("english"))
dtm1 <- DocumentTermMatrix(c1)
dtm21 <- as.matrix(dtm1)
frequency1 <- colSums(dtm21)
frequency1 <- sort(frequency1, decreasing=TRUE)
head(frequency1,200)
## true
## 124
## calledsfshorturl
## 75
## experience
## 63
## false
## 50
## function
## 50
## var
## 49
## skills
## 31
## job
## 28
## information
## 27
## track
## 27
## addthiscallbackdata
## 25
## addthistoolboxmouseoverfunctione
## 25
## addthisupdateshare
## 25
## callbackaddthiscallback
## 25
## dataeventtargetattraddthisurldatashorturl
## 25
## datagaproperty
## 25
## datagasocial
## 25
## datashorturl
## 25
## datatrackclickback
## 25
## evente
## 25
## thissfshorturl
## 25
## trackvars
## 25
## ua1267643119
## 25
## ui508compliant
## 25
## uiclick
## 25
## uilanguage
## 25
## url
## 25
## addthisconfig
## 24
## uiopenwindows
## 24
## usernameaddthisj2w
## 24
## years
## 20
## degree
## 18
## performance
## 18
## data
## 17
## plus
## 16
## web
## 16
## testing
## 15
## business
## 14
## career
## 14
## development
## 14
## including
## 14
## levelposition
## 14
## computer
## 13
## details
## 13
## engineering
## 13
## provided
## 13
## referrals
## 13
## analytical
## 12
## will
## 12
## java
## 11
## management
## 11
## tools
## 11
## demonstrated
## 10
## related
## 10
## applications
## 9
## customer
## 9
## soap
## 9
## strong
## 9
## using
## 9
## ability
## 8
## analysis
## 8
## analytics
## 8
## field
## 8
## javascript
## 8
## masters
## 8
## science
## 8
## spring
## 8
## accepted
## 7
## analysts
## 7
## bachelors
## 7
## customers
## 7
## develop
## 7
## net
## 7
## services
## 7
## sql
## 7
## systems
## 7
## technology
## 7
## agile
## 6
## airlines
## 6
## alternative
## 6
## american
## 6
## chain
## 6
## communication
## 6
## digital
## 6
## framework
## 6
## marketing
## 6
## server
## 6
## supply
## 6
## team
## 6
## xml
## 6
## airline
## 5
## application
## 5
## based
## 5
## complex
## 5
## delivering
## 5
## developing
## 5
## driven
## 5
## either
## 5
## etc
## 5
## financial
## 5
## ibm
## 5
## implement
## 5
## implementation
## 5
## industry
## 5
## mathematics
## 5
## people
## 5
## planning
## 5
## proficiency
## 5
## programming
## 5
## regression
## 5
## requirements
## 5
## software
## 5
## support
## 5
## tags
## 5
## various
## 5
## across
## 4
## adobe
## 4
## andor
## 4
## automated
## 4
## cadre
## 4
## can
## 4
## css
## 4
## database
## 4
## design
## 4
## equivalent
## 4
## exceptional
## 4
## flight
## 4
## frameworks
## 4
## innovative
## 4
## media
## 4
## mobile
## 4
## operations
## 4
## oracle
## 4
## oriented
## 4
## part
## 4
## project
## 4
## projects
## 4
## quality
## 4
## research
## 4
## statistical
## 4
## supported
## 4
## tag
## 4
## talented
## 4
## technical
## 4
## test
## 4
## tfs
## 4
## tool
## 4
## visual
## 4
## within
## 4
## acquired
## 3
## analyst
## 3
## analyze
## 3
## angular
## 3
## api
## 3
## build
## 3
## classification
## 3
## code
## 3
## commitment
## 3
## company
## 3
## concepts
## 3
## control
## 3
## core
## 3
## curriculum
## 3
## customercentric
## 3
## dedicated
## 3
## educational
## 3
## ejb
## 3
## employee
## 3
## ensure
## 3
## external
## 3
## finance
## 3
## five
## 3
## forecasting
## 3
## foundation
## 3
## functional
## 3
## git
## 3
## global
## 3
## high
## 3
## html
## 3
## industrial
## 3
## internal
## 3
## jsp
## 3
## knowledge
## 3
## leadership
## 3
## maintain
## 3
## methods
## 3
## models
## 3
## multiple
## 3
## necessary
## 3
## needs
## 3
## new
## 3
## objectoriented
## 3
## organization
## 3
## pay
## 3
## plans
## 3
## prepare
## 3
## present
## 3
## presentations
## 3
## python
## 3
## quantitative
## 3
skill_data1<-as.data.frame(frequency1)
skill_data1$skill1<-rownames(skill_data1)
non_tech_skill1<-skill_data1[skill_data1$skill1 %in% c("Self","interpersonal","innovation","creative","curiosity","leadership","team","management","strategy","communication","planning","driven","willing","commitment","attention","proficiency","knowledge","teamwork","focus","selfmotivated"),]
tech_skills1 <- skill_data1[skill_data1$skill1 %in% c("technology","python","microsoft","powerpoint","tableau","simulation","javascrpts","sql","agile","objectoriented","sas","java","javascript","statistical","regression","Rstudio"),]
SoftSkill <- non_tech_skill1$skill1
SFrequency <- non_tech_skill1$frequency1
NTSkills<- tech_skills1$skill1
NTFrequency <- tech_skills1$frequency1
merger1 <- as.data.frame(cbind(SoftSkill,SFrequency))
merger1
## SoftSkill SFrequency
## 1 management 11
## 2 communication 6
## 3 team 6
## 4 driven 5
## 5 planning 5
## 6 proficiency 5
## 7 commitment 3
## 8 knowledge 3
## 9 leadership 3
## 10 strategy 3
## 11 willing 3
## 12 interpersonal 2
## 13 creative 1
## 14 teamwork 1
merger2 <- as.data.frame(cbind(NTSkills,NTFrequency))
merger2
## NTSkills NTFrequency
## 1 java 11
## 2 javascript 8
## 3 sql 7
## 4 technology 7
## 5 agile 6
## 6 regression 5
## 7 statistical 4
## 8 objectoriented 3
## 9 python 3
## 10 microsoft 2
## 11 sas 2
## 12 simulation 2
## 13 tableau 2
base_url <- "http://careers.united.com/"
dataunited <- readLines("http://careers.united.com/ListJobs/ByKeyword/analyst/")
## Warning in readLines("http://careers.united.com/ListJobs/ByKeyword/
## analyst/"): incomplete final line found on 'http://careers.united.com/
## ListJobs/ByKeyword/analyst/'
dataunited <- grep ('<a href=',dataunited,value = TRUE)
dataunited <- grep ('<a href=\"/ShowJob/Id',dataunited,value = TRUE)
dataunited <- gsub(" <a href=",'',dataunited)
dataunited <- gsub('*</a>','',dataunited)
dataunited <- str_trim(dataunited,"left")
dataunited <- unlist(strsplit(dataunited,split = '/',fixed = TRUE))
dataunited1 <- paste(dataunited[2],dataunited[3],dataunited[4],dataunited[5],sep = '/')
dataunited1 <- paste(base_url,dataunited1,'/',sep = '')
dataunited1
## [1] "http://careers.united.com/ShowJob/Id/1375308/Senior-Manager-Claims-Analytics,-Data-Security-and-Financial-Controls/"
dataunited2 <- paste(dataunited[8],dataunited[9],dataunited[10],dataunited[11],sep = '/')
dataunited2 <- paste(base_url,dataunited2,'/',sep = '')
dataunited2
## [1] "http://careers.united.com/ShowJob/Id/1284888/Senior-Analyst-–-Information-Technology/"
dataunited3 <- paste(dataunited[14],dataunited[15],dataunited[16],dataunited[17],sep = '/')
dataunited3 <- paste(base_url,dataunited3,'/',sep = '')
dataunited3
## [1] "http://careers.united.com/ShowJob/Id/1384304/Coordinator-Cargo-Claims/"
dataunited4 <- paste(dataunited[20],dataunited[21],dataunited[22],dataunited[23],sep = '/')
dataunited4 <- paste(base_url,dataunited4,'/',sep = '')
dataunited4
## [1] "http://careers.united.com/ShowJob/Id/1384303/Senior-Analyst-Technical-Operations-Finance/"
dataunited5 <- paste(dataunited[26],dataunited[27],dataunited[28],dataunited[29],sep = '/')
dataunited5 <- paste(base_url,dataunited5,'/',sep = '')
dataunited5
## [1] "http://careers.united.com/ShowJob/Id/1384145/Analyst-Merchandising/"
dataunited6 <- paste(dataunited[32],dataunited[33],dataunited[34],dataunited[35],sep = '/')
dataunited6 <- paste(base_url,dataunited6,'/',sep = '')
dataunited6
## [1] "http://careers.united.com/ShowJob/Id/1382849/Analyst-Service-Improvement-and-Solutions/"
dataunited7 <- paste(dataunited[38],dataunited[39],dataunited[40],dataunited[41],sep = '/')
dataunited7 <- paste(base_url,dataunited7,'/',sep = '')
dataunited7
## [1] "http://careers.united.com/ShowJob/Id/1382759/Senior-Analyst-Network-Operations-Analysis/"
dataunited8 <- paste(dataunited[44],dataunited[45],dataunited[46],dataunited[47],sep = '/')
dataunited8 <- paste(base_url,dataunited8,'/',sep = '')
dataunited8
## [1] "http://careers.united.com/ShowJob/Id/1382568/Program-Manager-Merchandising/"
dataunited9 <- paste(dataunited[50],dataunited[51],dataunited[52],dataunited[53],sep = '/')
dataunited9 <- paste(base_url,dataunited9,'/',sep = '')
dataunited9
## [1] "http://careers.united.com/ShowJob/Id/1381896/Senior-Analyst-Cargo-Sales-Strategy/"
dataunited10 <- paste(dataunited[56],dataunited[57],dataunited[58],dataunited[59],sep = '/')
dataunited10<- paste(base_url,dataunited10,'/',sep = '')
dataunited10
## [1] "http://careers.united.com/ShowJob/Id/1328782/Senior-Analyst-Finance-and-Accounts-Payable-Systems-Administration/"
dataunited11 <- paste(dataunited[62],dataunited[63],dataunited[64],dataunited[65],sep = '/')
dataunited11 <- paste(base_url,dataunited11,'/',sep = '')
dataunited11
## [1] "http://careers.united.com/ShowJob/Id/1339623/Senior-Analyst-Statistics-and-Operations-Research/"
dataunited12 <- paste(dataunited[68],dataunited[69],dataunited[70],dataunited[71],sep = '/')
dataunited12<- paste(base_url,dataunited12,'/',sep = '')
dataunited12
## [1] "http://careers.united.com/ShowJob/Id/1338423/Senior-Analyst-Capital-Planning-Analysis/"
dataunited13<- paste(dataunited[74],dataunited[75],dataunited[76],dataunited[77],sep = '/')
dataunited13 <- paste(base_url,dataunited13,'/',sep = '')
dataunited13
## [1] "http://careers.united.com/ShowJob/Id/1275170/Senior-Analyst-Information-Technology/"
dataunited14 <- paste(dataunited[80],dataunited[81],dataunited[82],dataunited[83],sep = '/')
dataunited14<- paste(base_url,dataunited14,'/',sep = '')
dataunited14
## [1] "http://careers.united.com/ShowJob/Id/1375303/Intern-Financial-Planning-Analysis-(MBA-Summer-2018)/"
dataunited15<- paste(dataunited[86],dataunited[87],dataunited[88],dataunited[89],sep = '/')
dataunited15 <- paste(base_url,dataunited15,'/',sep = '')
dataunited15
## [1] "http://careers.united.com/ShowJob/Id/1373003/Sr.-Analyst-Marketing/"
dataunited16<- paste(dataunited[92],dataunited[93],dataunited[94],dataunited[95],sep = '/')
dataunited16<- paste(base_url,dataunited16,'/',sep = '')
dataunited16
## [1] "http://careers.united.com/ShowJob/Id/1358013/Analyst-Loyalty/"
dataunited17<- paste(dataunited[98],dataunited[99],dataunited[100],dataunited[101],sep = '/')
dataunited17<- paste(base_url,dataunited17,'/',sep = '')
dataunited17
## [1] "http://careers.united.com/ShowJob/Id/1290320/Analyst-Statistics-and-Operations-Research/"
dataunited18<- paste(dataunited[104],dataunited[105],dataunited[106],dataunited[107],sep = '/')
dataunited18<- paste(base_url,dataunited18,'/',sep = '')
dataunited18
## [1] "http://careers.united.com/ShowJob/Id/1354184/Senior-Analyst-Financial-Planning-Analysis-(MBA)/"
dataunited19<- paste(dataunited[110],dataunited[111],dataunited[112],dataunited[113],sep = '/')
dataunited19<- paste(base_url,dataunited19,'/',sep = '')
dataunited19
## [1] "http://careers.united.com/ShowJob/Id/1354183/Sr.-Analyst-Network-Planning-and-Strategy-(MBA)/"
dataunited20<- paste(dataunited[116],dataunited[117],dataunited[118],dataunited[119],sep = '/')
dataunited20<- paste(base_url,dataunited20,'/',sep = '')
dataunited20
## [1] "http://careers.united.com/ShowJob/Id/1328972/Associate-Analyst-Pricing-Revenue-Management/"
dataunited21 <- paste(dataunited[123],dataunited[124],dataunited[125],dataunited[126],sep = '/')
dataunited21 <- paste(base_url,dataunited21,'/',sep = '')
dataunited21
## [1] "http://careers.united.com/ShowJob/Id/1341983/Associate-Analyst-Financial-Planning-Analysis/"
dataunited22<- paste(dataunited[129],dataunited[130],dataunited[131],dataunited[132],sep = '/')
dataunited22 <- paste(base_url,dataunited22,'/',sep = '')
dataunited22
## [1] "http://careers.united.com/ShowJob/Id/1341980/Associate-Analyst-Network-Planning-Scheduling/"
dataunited23<- paste(dataunited[135],dataunited[136],dataunited[137],dataunited[138],sep = '/')
dataunited23<- paste(base_url,dataunited23,'/',sep = '')
dataunited23
## [1] "http://careers.united.com/ShowJob/Id/1328973/Intern-Financial-Planning-Analysis-(Summer-2018)/"
megasetunited <- rbind(dataunited1,dataunited2,dataunited3,dataunited4,dataunited5,dataunited6,dataunited7,dataunited8,dataunited9,dataunited10,dataunited11,dataunited12,dataunited13,dataunited14,dataunited15,dataunited16,dataunited17,dataunited18,dataunited19,dataunited20,dataunited21,dataunited22,dataunited23)
x1<-as.character()
for(i in 1:length(megasetunited))
{
webpage<-getURL(megasetunited[i])
webpage <- readLines(tc <- textConnection(webpage)); close(tc)
pagetree <- htmlTreeParse(webpage, error=function(...){}, useInternalNodes = TRUE)
t1<-unlist(xpathSApply(pagetree,"/html/body/div/descendant::*[contains (text(),'skills')]",xmlValue))
t2<-unlist(xpathSApply(pagetree,"/html/body/div/descendant::*[contains (text(),'Preferred')]",xmlValue))
t3<-unlist(xpathSApply(pagetree,"/html/body/div/descendant::*[contains (text(),'Required')]",xmlValue))
t4 <- unlist(xpathSApply(pagetree,"/html/body/div/descendant::*[contains (text(),'Job overview')]",xmlValue))
t5 <- unlist(xpathSApply(pagetree,"/html/body/div/descendant::*[contains (text(),'proficiency')]",xmlValue))
if (length(paste(t1,t2,t3,t4,t5))>0 )
{ x1[i]<-paste(t1,t2,t3,t4,t5) }
else
x1[i]<-"0"
}
write.csv(x1,file = "united.csv")
file2<-read.csv("united.csv",header = FALSE,stringsAsFactors = FALSE)
file2$V1<-paste(file2$V1,file2$V2,sep = "")
review_text2 <- paste(file2$V1, collapse=" ")
review_source2 <- VectorSource(review_text2)
c2 <- Corpus(review_source2)
c2 <- tm_map(c2, content_transformer(tolower))
c2 <- tm_map(c2, removePunctuation)
c2 <- tm_map(c2, stripWhitespace)
c2 <- tm_map(c2, removeWords, stopwords("english"))
dtm2 <- DocumentTermMatrix(c2)
dtm22 <- as.matrix(dtm2)
frequency2 <- colSums(dtm22)
frequency2 <- sort(frequency2, decreasing=TRUE)
head(frequency2,200)
## skills required job
## 24 22 14
## overview responsibilities preferred
## 14 14 13
## experience microsoft powerpoint
## 7 7 7
## proficiency excel office
## 7 6 6
## proven communication analytical
## 6 5 4
## business new problem
## 4 4 4
## solving technical will
## 4 4 4
## data enable interpersonal
## 3 3 3
## knowledge roles training
## 3 3 3
## word access analysts
## 3 2 2
## appropriate attention balance
## 2 2 2
## claims department detail
## 2 2 2
## focus however industry
## 2 2 2
## information loyalty management
## 2 2 2
## needs pricing progress
## 2 2 2
## revenue sheet significant
## 2 2 2
## specifically strong teamwork
## 2 2 2
## uniteds verbal written
## 2 2 2
## 10excellent 116000 11good
## 1 1 1
## 12technical 13technical 14solid
## 1 1 1
## 15highly 16loyalty 17demonstrates
## 1 1 1
## 180 19exceptional 1job
## 1 1 1
## 20associate 21track 22exceptional
## 1 1 1
## 23solid 2analytical 3excellent
## 1 1 1
## 5excellent 6excellent 7strong
## 1 1 1
## 9presentation account achieving
## 1 1 1
## active adaptability advance
## 1 1 1
## airline also analysis
## 1 1 1
## andor apply area
## 1 1 1
## assigned assistance better
## 1 1 1
## bill build changing
## 1 1 1
## collaborating comfortable commitment
## 1 1 1
## compensation compensationworkers complex
## 1 1 1
## continue continued course
## 1 1 1
## create creation critical
## 1 1 1
## current database date
## 1 1 1
## demonstrates demonstrating depending
## 1 1 1
## development developments dynamics
## 1 1 1
## economics either environment
## 1 1 1
## excellent expectation expert
## 1 1 1
## expertise focused formal
## 1 1 1
## four fundamentals goals
## 1 1 1
## high hires improved
## 1 1 1
## improvement improvements including
## 1 1 1
## increasing individual informed
## 1 1 1
## intermediate involves keeping
## 1 1 1
## key knowing knowledgeskills
## 1 1 1
## languages lead leadership
## 1 1 1
## liability listening made
## 1 1 1
## make manager matter
## 1 1 1
## meaningful medical mentor
## 1 1 1
## modeling nax needed
## 1 1 1
## net open opportunity
## 1 1 1
## organizations organized outstanding
## 1 1 1
## part past personalized
## 1 1 1
## placed position practices
## 1 1 1
## presentation processes products
## 1 1 1
## proficiencyexperience program programming
## 1 1 1
## provided python record
## 1 1 1
## records relevant review
## 1 1 1
## rotation sales sas
## 1 1 1
## selfmotivated senior simio
## 1 1 1
## simulation sources spotfire
## 1 1 1
## spreadsheet stakeholders staying
## 1 1 1
## street structured subject
## 1 1 1
## tableau team technicalfunctional
## 1 1 1
## time tools
## 1 1
skill_data2<-as.data.frame(frequency2)
skill_data2$skill2<-rownames(skill_data2)
non_tech_skill2<-skill_data2[skill_data2$skill2 %in% c("Self","interpersonal","innovation","creative","curiosity","leadership","team","management","strategy","communication","planning","driven","willing","commitment","attention","proficiency","knowledge","teamwork","focus","selfmotivated"),]
tech_skills2 <- skill_data2[skill_data2$skill2 %in% c("technology","python","microsoft","powerpoint","tableau","simulation","javascrpts","sas","simulation","javascrpts","sql","agile","objectoriented"),]
SoftSkill <- non_tech_skill2$skill2
SFrequency <- non_tech_skill2$frequency2
NTSkills<- tech_skills2$skill2
NTFrequency <- tech_skills2$frequency2
merger3 <- as.data.frame(cbind(SoftSkill,SFrequency))
merger3
## SoftSkill SFrequency
## 1 proficiency 7
## 2 communication 5
## 3 interpersonal 3
## 4 knowledge 3
## 5 attention 2
## 6 focus 2
## 7 management 2
## 8 teamwork 2
## 9 commitment 1
## 10 leadership 1
## 11 selfmotivated 1
## 12 team 1
merger4 <- as.data.frame(cbind(NTSkills,NTFrequency))
merger4
## NTSkills NTFrequency
## 1 microsoft 7
## 2 powerpoint 7
## 3 python 1
## 4 sas 1
## 5 simulation 1
## 6 tableau 1
MegaMerger1 <- rbind(merger1,merger3)
MegaMerger1$SFrequency <- as.integer(MegaMerger1$SFrequency)
MegaMerger1 <- ddply(MegaMerger1,.(SoftSkill),summarize,sum=sum(SFrequency))
MegaMerger1 <-arrange(MegaMerger1,desc(MegaMerger1$sum),SoftSkill)
MegaMerger1$SoftSkill <- as.character(MegaMerger1$SoftSkill)
MegaMerger1
## SoftSkill sum
## 1 proficiency 12
## 2 communication 11
## 3 knowledge 8
## 4 interpersonal 7
## 5 team 7
## 6 commitment 5
## 7 driven 5
## 8 leadership 5
## 9 management 5
## 10 planning 5
## 11 strategy 4
## 12 teamwork 4
## 13 willing 4
## 14 attention 3
## 15 focus 3
## 16 creative 1
## 17 selfmotivated 1
MegaMerger1$SoftSkill <- factor(MegaMerger1$SoftSkill,levels = MegaMerger1$SoftSkill[order(MegaMerger1$sum)])
ggplot(MegaMerger1,aes(x = SoftSkill , y = sum)) + theme_bw(base_size = 16) +theme(axis.text.x=element_text(angle=90,hjust=1)) + geom_bar(stat="identity",fill = "brown")
MegaMerger2 <- rbind(merger2,merger4)
MegaMerger2$NTFrequency <- as.integer(MegaMerger2$NTFrequency)
MegaMerger2 <- ddply(MegaMerger2,.(NTSkills),summarize,sum=sum(NTFrequency))
arrange(MegaMerger2,desc(MegaMerger2$sum),NTSkills)
## NTSkills sum
## 1 python 12
## 2 sas 11
## 3 simulation 11
## 4 tableau 11
## 5 microsoft 9
## 6 javascript 8
## 7 sql 7
## 8 technology 7
## 9 powerpoint 7
## 10 agile 6
## 11 regression 5
## 12 statistical 4
## 13 objectoriented 3
## 14 java 1
MegaMerger2$NTSkills <- factor(MegaMerger2$NTSkills,levels = MegaMerger2$NTSkills[order(MegaMerger2$sum)])
ggplot(MegaMerger2,aes(x = NTSkills , y = sum)) + theme_bw(base_size = 16) +
theme(axis.text.x=element_text(angle=90,hjust=1)) + geom_bar(stat="identity",fill = "brown")