For Project 3, I chose the ‘Aviation’ industry to scrape data and make predictions. I scraped data from two airlines:

1) American Airlines

2) United Airlines

The two airlines each had around 20-25 openings for ‘data analyst’ positions across the United States. Each opening listed the skills required for the position, and these skills were used to make predictions for my task.

Loading Packages

library(rvest)
library(XML)
library(RCurl)
library(splitstackshape)
library(tm)
library(xml2)
library(stringr)
library(qdapRegex)
library(plyr)
library(ggplot2)

Loading and cleaning data

# Build the absolute URLs of the job postings returned by the American
# Airlines career-site search for "analyst".
base_url1 <- "https://jobs.aa.com"

# Download the search-result page, one element per line of HTML.
html_data <- readLines("https://jobs.aa.com/search/?q=analyst")

# Keep only the lines that match the job-title link pattern.
html_data <- grep(pattern = ".jobTitle-link", x = html_data, value = TRUE)

# Strip the markup around each link: everything up to and including
# `href="`, the `">` that follows the attribute value, the closing `</a>`,
# and finally the last word at the end of the line.
html_data <- gsub(pattern = '.*(href=")', replacement = "", x = html_data)
html_data <- gsub(pattern = '\">*', replacement = "", x = html_data)
html_data <- gsub(pattern = '*</a>', replacement = "", x = html_data)
html_data <- gsub(pattern = '\\s*\\w*$', replacement = "", x = html_data)

# Flatten every remaining line into one long vector of "/"-separated pieces.
html_data <- unlist(strsplit(html_data, split = "/", fixed = TRUE))

# Each posting URL is the base URL followed by four consecutive pieces joined
# by "/" and a trailing "/". The start offsets are hard-coded and are not
# evenly spaced (e.g. 41 -> 47, 77 -> 83, 88 -> 94).
# NOTE(review): presumably the gaps come from titles that contain an extra
# "/" -- confirm against the live page before reusing these offsets.
html_data1 <- paste0(base_url1, paste(html_data[1:4], collapse = "/"), "/")
html_data2 <- paste0(base_url1, paste(html_data[6:9], collapse = "/"), "/")
html_data3 <- paste0(base_url1, paste(html_data[11:14], collapse = "/"), "/")
html_data4 <- paste0(base_url1, paste(html_data[16:19], collapse = "/"), "/")
html_data5 <- paste0(base_url1, paste(html_data[21:24], collapse = "/"), "/")
html_data6 <- paste0(base_url1, paste(html_data[26:29], collapse = "/"), "/")
html_data7 <- paste0(base_url1, paste(html_data[31:34], collapse = "/"), "/")
html_data8 <- paste0(base_url1, paste(html_data[36:39], collapse = "/"), "/")
html_data9 <- paste0(base_url1, paste(html_data[41:44], collapse = "/"), "/")
html_data10 <- paste0(base_url1, paste(html_data[47:50], collapse = "/"), "/")
html_data11 <- paste0(base_url1, paste(html_data[52:55], collapse = "/"), "/")
html_data12 <- paste0(base_url1, paste(html_data[57:60], collapse = "/"), "/")
html_data13 <- paste0(base_url1, paste(html_data[62:65], collapse = "/"), "/")
html_data14 <- paste0(base_url1, paste(html_data[67:70], collapse = "/"), "/")
html_data15 <- paste0(base_url1, paste(html_data[72:75], collapse = "/"), "/")
html_data16 <- paste0(base_url1, paste(html_data[77:80], collapse = "/"), "/")
html_data17 <- paste0(base_url1, paste(html_data[83:86], collapse = "/"), "/")
html_data18 <- paste0(base_url1, paste(html_data[88:91], collapse = "/"), "/")
html_data19 <- paste0(base_url1, paste(html_data[94:97], collapse = "/"), "/")
html_data20 <- paste0(base_url1, paste(html_data[99:102], collapse = "/"), "/")
html_data21 <- paste0(base_url1, paste(html_data[104:107], collapse = "/"), "/")
html_data22 <- paste0(base_url1, paste(html_data[109:112], collapse = "/"), "/")
html_data23 <- paste0(base_url1, paste(html_data[114:117], collapse = "/"), "/")
html_data24 <- paste0(base_url1, paste(html_data[119:122], collapse = "/"), "/")
html_data25 <- paste0(base_url1, paste(html_data[124:127], collapse = "/"), "/")

# Stack the 25 URLs into a one-column character matrix; rbind() takes the
# row names from the variable names.
html_megaset <- rbind(
  html_data1, html_data2, html_data3, html_data4, html_data5,
  html_data6, html_data7, html_data8, html_data9, html_data10,
  html_data11, html_data12, html_data13, html_data14, html_data15,
  html_data16, html_data17, html_data18, html_data19, html_data20,
  html_data21, html_data22, html_data23, html_data24, html_data25
)
html_megaset
##             [,1]                                                                                                                  
## html_data1  "https://jobs.aa.com/job/Ft-Worth-Cargo-Revenue-Management-Analyst-TX-76101/426136700/"                               
## html_data2  "https://jobs.aa.com/job/Ft-Worth-Revenue-Management-Analyst-TX-76101/425350600/"                                     
## html_data3  "https://jobs.aa.com/job/Ft-Worth-Revenue-Management-Analyst%2C-Pricing-&amp;-Yield-Management-TX-76101/422059700/"   
## html_data4  "https://jobs.aa.com/job/Ft-Worth-Financial-Analyst-Intern-TX-76101/426144100/"                                       
## html_data5  "https://jobs.aa.com/job/Ft-Worth-Sales-Analyst-TX-76101/426085100/"                                                  
## html_data6  "https://jobs.aa.com/job/Dallas-Quality-Assurance-Analyst-%28IT%29-TX-75201/431773600/"                               
## html_data7  "https://jobs.aa.com/job/New-York-City-Analyst%2C-Sales-Planning-NY-10001/437678600/"                                 
## html_data8  "https://jobs.aa.com/job/Ft-Worth-MBA-Financial-Strategy-Analyst-TX-76101/430028500/"                                 
## html_data9  "https://jobs.aa.com/job/Ft-Worth-AnalystSenior-Analyst%2C-Division-Finance-TX-76101/431554300/"                      
## html_data10 "https://jobs.aa.com/job/Dallas-Operations-Research-Analyst-TX-75201/431533800/"                                      
## html_data11 "https://jobs.aa.com/job/Dallas-Object-Orient-Programmer-Analyst-TX-75201/431550700/"                                 
## html_data12 "https://jobs.aa.com/job/Dallas-Senior-Object-Oriented-Programmer-Analyst-TX-75201/431819800/"                        
## html_data13 "https://jobs.aa.com/job/Dallas-Senior-Object-Oriented-Programmer-Analyst-TX-75201/431814400/"                        
## html_data14 "https://jobs.aa.com/job/Phoenix-Senior-Object-Oriented-Programmer-Analyst-AZ-85001/431821500/"                       
## html_data15 "https://jobs.aa.com/job/Dallas-Senior-Object-Oriented-Programmer-Analyst-TX-75201/431764700/"                        
## html_data16 "https://jobs.aa.com/job/Phoenix-AnalystSenior-Analyst%2C-IT-Security-Intelligence-&amp;-Response-AZ-85001/432476100/"
## html_data17 "https://jobs.aa.com/job/Ft-Worth-Manager%2C-Labor-Analysis-TX-76101/437370800/"                                      
## html_data18 "https://jobs.aa.com/job/Ft-Worth-AnalystSenior-Analyst%2C-Supply-Chain-Performance-TX-76101/434891700/"              
## html_data19 "https://jobs.aa.com/job/Ft-Worth-MBA-Commercial-Strategy-Analyst-Intern-TX-76101/430027400/"                         
## html_data20 "https://jobs.aa.com/job/Ft-Worth-MBA-Commercial-Strategy-Analyst-TX-76101/430028300/"                                
## html_data21 "https://jobs.aa.com/job/Dallas-Sr_-Object-Oriented-Programmer-Analyst-TX-75201/424366800/"                           
## html_data22 "https://jobs.aa.com/job/Phoenix-Tester%2C-Revenue-Accounting-Automation-AZ-85001/436535700/"                         
## html_data23 "https://jobs.aa.com/job/Phoenix-Senior-Business-Analyst-Payroll-Systems-AZ-85001/431712200/"                         
## html_data24 "https://jobs.aa.com/job/Ft-Worth-Analyst%2C-Cargo-Revenue-Management-TX-76101/432914000/"                            
## html_data25 "https://jobs.aa.com/job/London-Finance-and-Business-Planning-AnalystSr_-Analyst-HNS/436742200/"

‘file1’ had 7 columns of raw data, which need to be combined into a single text column before analysis.

“Corpus” is provided by the ‘tm’ package and is used for reading and collecting text documents.

# Merge the seven raw-text columns of `file1` into V1, row by row.
# Each column is used exactly once: the previous version pasted V4 twice,
# which double-counted every term that appears in that column.
# NOTE(review): the columns are joined with no separator, so words at column
# boundaries may fuse together -- confirm this matches how `file1` was split.
file1$V1 <- paste0(
  file1$V1, file1$V2, file1$V3, file1$V4, file1$V5, file1$V6, file1$V7
)

# Collapse all rows into one space-separated document and wrap it in a
# single-document tm corpus.
review_text1 <- paste(file1$V1, collapse = " ")
review_source1 <- VectorSource(review_text1)
c1 <- Corpus(review_source1)

# Normalise the text: lower-case, drop punctuation, squeeze whitespace, and
# remove English stop words.
c1 <- tm_map(c1, content_transformer(tolower))
c1 <- tm_map(c1, removePunctuation)
c1 <- tm_map(c1, stripWhitespace)
c1 <- tm_map(c1, removeWords, stopwords("english"))

# Count the terms: the column sums of the document-term matrix give the total
# occurrences of each term, sorted from most to least frequent.
dtm1 <- DocumentTermMatrix(c1)
dtm21 <- as.matrix(dtm1)
frequency1 <- colSums(dtm21)
frequency1 <- sort(frequency1, decreasing = TRUE)
head(frequency1, 200)
##                                      true 
##                                       124 
##                          calledsfshorturl 
##                                        75 
##                                experience 
##                                        63 
##                                     false 
##                                        50 
##                                  function 
##                                        50 
##                                       var 
##                                        49 
##                                    skills 
##                                        31 
##                                       job 
##                                        28 
##                               information 
##                                        27 
##                                     track 
##                                        27 
##                       addthiscallbackdata 
##                                        25 
##          addthistoolboxmouseoverfunctione 
##                                        25 
##                        addthisupdateshare 
##                                        25 
##                   callbackaddthiscallback 
##                                        25 
## dataeventtargetattraddthisurldatashorturl 
##                                        25 
##                            datagaproperty 
##                                        25 
##                              datagasocial 
##                                        25 
##                              datashorturl 
##                                        25 
##                        datatrackclickback 
##                                        25 
##                                    evente 
##                                        25 
##                            thissfshorturl 
##                                        25 
##                                 trackvars 
##                                        25 
##                              ua1267643119 
##                                        25 
##                            ui508compliant 
##                                        25 
##                                   uiclick 
##                                        25 
##                                uilanguage 
##                                        25 
##                                       url 
##                                        25 
##                             addthisconfig 
##                                        24 
##                             uiopenwindows 
##                                        24 
##                        usernameaddthisj2w 
##                                        24 
##                                     years 
##                                        20 
##                                    degree 
##                                        18 
##                               performance 
##                                        18 
##                                      data 
##                                        17 
##                                      plus 
##                                        16 
##                                       web 
##                                        16 
##                                   testing 
##                                        15 
##                                  business 
##                                        14 
##                                    career 
##                                        14 
##                               development 
##                                        14 
##                                 including 
##                                        14 
##                             levelposition 
##                                        14 
##                                  computer 
##                                        13 
##                                   details 
##                                        13 
##                               engineering 
##                                        13 
##                                  provided 
##                                        13 
##                                 referrals 
##                                        13 
##                                analytical 
##                                        12 
##                                      will 
##                                        12 
##                                      java 
##                                        11 
##                                management 
##                                        11 
##                                     tools 
##                                        11 
##                              demonstrated 
##                                        10 
##                                   related 
##                                        10 
##                              applications 
##                                         9 
##                                  customer 
##                                         9 
##                                      soap 
##                                         9 
##                                    strong 
##                                         9 
##                                     using 
##                                         9 
##                                   ability 
##                                         8 
##                                  analysis 
##                                         8 
##                                 analytics 
##                                         8 
##                                     field 
##                                         8 
##                                javascript 
##                                         8 
##                                   masters 
##                                         8 
##                                   science 
##                                         8 
##                                    spring 
##                                         8 
##                                  accepted 
##                                         7 
##                                  analysts 
##                                         7 
##                                 bachelors 
##                                         7 
##                                 customers 
##                                         7 
##                                   develop 
##                                         7 
##                                       net 
##                                         7 
##                                  services 
##                                         7 
##                                       sql 
##                                         7 
##                                   systems 
##                                         7 
##                                technology 
##                                         7 
##                                     agile 
##                                         6 
##                                  airlines 
##                                         6 
##                               alternative 
##                                         6 
##                                  american 
##                                         6 
##                                     chain 
##                                         6 
##                             communication 
##                                         6 
##                                   digital 
##                                         6 
##                                 framework 
##                                         6 
##                                 marketing 
##                                         6 
##                                    server 
##                                         6 
##                                    supply 
##                                         6 
##                                      team 
##                                         6 
##                                       xml 
##                                         6 
##                                   airline 
##                                         5 
##                               application 
##                                         5 
##                                     based 
##                                         5 
##                                   complex 
##                                         5 
##                                delivering 
##                                         5 
##                                developing 
##                                         5 
##                                    driven 
##                                         5 
##                                    either 
##                                         5 
##                                       etc 
##                                         5 
##                                 financial 
##                                         5 
##                                       ibm 
##                                         5 
##                                 implement 
##                                         5 
##                            implementation 
##                                         5 
##                                  industry 
##                                         5 
##                               mathematics 
##                                         5 
##                                    people 
##                                         5 
##                                  planning 
##                                         5 
##                               proficiency 
##                                         5 
##                               programming 
##                                         5 
##                                regression 
##                                         5 
##                              requirements 
##                                         5 
##                                  software 
##                                         5 
##                                   support 
##                                         5 
##                                      tags 
##                                         5 
##                                   various 
##                                         5 
##                                    across 
##                                         4 
##                                     adobe 
##                                         4 
##                                     andor 
##                                         4 
##                                 automated 
##                                         4 
##                                     cadre 
##                                         4 
##                                       can 
##                                         4 
##                                       css 
##                                         4 
##                                  database 
##                                         4 
##                                    design 
##                                         4 
##                                equivalent 
##                                         4 
##                               exceptional 
##                                         4 
##                                    flight 
##                                         4 
##                                frameworks 
##                                         4 
##                                innovative 
##                                         4 
##                                     media 
##                                         4 
##                                    mobile 
##                                         4 
##                                operations 
##                                         4 
##                                    oracle 
##                                         4 
##                                  oriented 
##                                         4 
##                                      part 
##                                         4 
##                                   project 
##                                         4 
##                                  projects 
##                                         4 
##                                   quality 
##                                         4 
##                                  research 
##                                         4 
##                               statistical 
##                                         4 
##                                 supported 
##                                         4 
##                                       tag 
##                                         4 
##                                  talented 
##                                         4 
##                                 technical 
##                                         4 
##                                      test 
##                                         4 
##                                       tfs 
##                                         4 
##                                      tool 
##                                         4 
##                                    visual 
##                                         4 
##                                    within 
##                                         4 
##                                  acquired 
##                                         3 
##                                   analyst 
##                                         3 
##                                   analyze 
##                                         3 
##                                   angular 
##                                         3 
##                                       api 
##                                         3 
##                                     build 
##                                         3 
##                            classification 
##                                         3 
##                                      code 
##                                         3 
##                                commitment 
##                                         3 
##                                   company 
##                                         3 
##                                  concepts 
##                                         3 
##                                   control 
##                                         3 
##                                      core 
##                                         3 
##                                curriculum 
##                                         3 
##                           customercentric 
##                                         3 
##                                 dedicated 
##                                         3 
##                               educational 
##                                         3 
##                                       ejb 
##                                         3 
##                                  employee 
##                                         3 
##                                    ensure 
##                                         3 
##                                  external 
##                                         3 
##                                   finance 
##                                         3 
##                                      five 
##                                         3 
##                               forecasting 
##                                         3 
##                                foundation 
##                                         3 
##                                functional 
##                                         3 
##                                       git 
##                                         3 
##                                    global 
##                                         3 
##                                      high 
##                                         3 
##                                      html 
##                                         3 
##                                industrial 
##                                         3 
##                                  internal 
##                                         3 
##                                       jsp 
##                                         3 
##                                 knowledge 
##                                         3 
##                                leadership 
##                                         3 
##                                  maintain 
##                                         3 
##                                   methods 
##                                         3 
##                                    models 
##                                         3 
##                                  multiple 
##                                         3 
##                                 necessary 
##                                         3 
##                                     needs 
##                                         3 
##                                       new 
##                                         3 
##                            objectoriented 
##                                         3 
##                              organization 
##                                         3 
##                                       pay 
##                                         3 
##                                     plans 
##                                         3 
##                                   prepare 
##                                         3 
##                                   present 
##                                         3 
##                             presentations 
##                                         3 
##                                    python 
##                                         3 
##                              quantitative 
##                                         3

Forming skill sets based on the ‘technical’ and ‘non-technical’ categories.

# Turn the named term-frequency vector into a data frame with one row per
# term, and copy the term (held in the row names) into the `skill1` column.
skill_data1 <- as.data.frame(frequency1)
skill_data1$skill1 <- rownames(skill_data1)

# Rows whose term is in the hand-picked list of non-technical (soft) skills.
# NOTE(review): the corpus is lower-cased before counting, so the capitalised
# entry "Self" can never match -- confirm whether "self" was intended.
non_tech_skill1 <- skill_data1[
  skill_data1$skill1 %in% c(
    "Self", "interpersonal", "innovation", "creative", "curiosity",
    "leadership", "team", "management", "strategy", "communication",
    "planning", "driven", "willing", "commitment", "attention",
    "proficiency", "knowledge", "teamwork", "focus", "selfmotivated"
  ),
]

# Rows whose term is in the hand-picked list of technical skills.
# NOTE(review): "Rstudio" (capitalised) and "javascrpts" (likely a typo)
# cannot match lower-cased terms -- confirm the intended keywords.
tech_skills1 <- skill_data1[
  skill_data1$skill1 %in% c(
    "technology", "python", "microsoft", "powerpoint", "tableau",
    "simulation", "javascrpts", "sql", "agile", "objectoriented", "sas",
    "java", "javascript", "statistical", "regression", "Rstudio"
  ),
]

# Pull the skill names and their counts out as plain vectors.
SoftSkill <- non_tech_skill1$skill1
SFrequency <- non_tech_skill1$frequency1
# Despite the "NT" prefix, these two hold the *technical* skills.
NTSkills <- tech_skills1$skill1
NTFrequency <- tech_skills1$frequency1

# Build the soft-skill table with data.frame() so SFrequency stays numeric.
# The earlier as.data.frame(cbind(...)) went through a character matrix and
# turned the counts into strings.
merger1 <- data.frame(SoftSkill, SFrequency)
merger1
##        SoftSkill SFrequency
## 1     management         11
## 2  communication          6
## 3           team          6
## 4         driven          5
## 5       planning          5
## 6    proficiency          5
## 7     commitment          3
## 8      knowledge          3
## 9     leadership          3
## 10      strategy          3
## 11       willing          3
## 12 interpersonal          2
## 13      creative          1
## 14      teamwork          1
# Build the technical-skill table with data.frame() so NTFrequency stays
# numeric. The earlier as.data.frame(cbind(...)) went through a character
# matrix and turned the counts into strings.
merger2 <- data.frame(NTSkills, NTFrequency)
merger2
##          NTSkills NTFrequency
## 1            java          11
## 2      javascript           8
## 3             sql           7
## 4      technology           7
## 5           agile           6
## 6      regression           5
## 7     statistical           4
## 8  objectoriented           3
## 9          python           3
## 10      microsoft           2
## 11            sas           2
## 12     simulation           2
## 13        tableau           2

United Airlines

(A similar analysis was performed with “United Airlines”.)

# Build the absolute URLs of the job postings returned by the United careers
# keyword search for "analyst".
base_url <- "http://careers.united.com/"

# Download the search-result page, one element per line of HTML.
dataunited <- readLines("http://careers.united.com/ListJobs/ByKeyword/analyst/")
## Warning in readLines("http://careers.united.com/ListJobs/ByKeyword/
## analyst/"): incomplete final line found on 'http://careers.united.com/
## ListJobs/ByKeyword/analyst/'

# Keep the anchor lines, then narrow them to the ones linking to a job page.
dataunited <- grep(pattern = '<a href=', x = dataunited, value = TRUE)
dataunited <- grep(pattern = '<a href=\"/ShowJob/Id', x = dataunited, value = TRUE)

# Strip the anchor markup and the leading whitespace from each line.
dataunited <- gsub(pattern = " <a href=", replacement = '', x = dataunited)
dataunited <- gsub(pattern = '*</a>', replacement = '', x = dataunited)
dataunited <- str_trim(dataunited, side = "left")

# Flatten every remaining line into one long vector of "/"-separated pieces.
dataunited <- unlist(strsplit(dataunited, split = "/", fixed = TRUE))

# Each posting URL is the base URL followed by four consecutive pieces joined
# by "/" and a trailing "/"; the start offsets advance by six per posting.
dataunited1 <- paste0(base_url, paste(dataunited[2:5], collapse = "/"), "/")
dataunited1
## [1] "http://careers.united.com/ShowJob/Id/1375308/Senior-Manager-Claims-Analytics,-Data-Security-and-Financial-Controls/"
dataunited2 <- paste0(base_url, paste(dataunited[8:11], collapse = "/"), "/")
dataunited2
## [1] "http://careers.united.com/ShowJob/Id/1284888/Senior-Analyst-–-Information-Technology/"
dataunited3 <- paste0(base_url, paste(dataunited[14:17], collapse = "/"), "/")
dataunited3
## [1] "http://careers.united.com/ShowJob/Id/1384304/Coordinator-Cargo-Claims/"
dataunited4 <- paste0(base_url, paste(dataunited[20:23], collapse = "/"), "/")
dataunited4
## [1] "http://careers.united.com/ShowJob/Id/1384303/Senior-Analyst-Technical-Operations-Finance/"
dataunited5 <- paste0(base_url, paste(dataunited[26:29], collapse = "/"), "/")
dataunited5
## [1] "http://careers.united.com/ShowJob/Id/1384145/Analyst-Merchandising/"
dataunited6 <- paste0(base_url, paste(dataunited[32:35], collapse = "/"), "/")
dataunited6
## [1] "http://careers.united.com/ShowJob/Id/1382849/Analyst-Service-Improvement-and-Solutions/"
dataunited7 <- paste0(base_url, paste(dataunited[38:41], collapse = "/"), "/")
dataunited7
## [1] "http://careers.united.com/ShowJob/Id/1382759/Senior-Analyst-Network-Operations-Analysis/"
dataunited8 <- paste0(base_url, paste(dataunited[44:47], collapse = "/"), "/")
dataunited8
## [1] "http://careers.united.com/ShowJob/Id/1382568/Program-Manager-Merchandising/"
dataunited9 <- paste0(base_url, paste(dataunited[50:53], collapse = "/"), "/")
dataunited9
## [1] "http://careers.united.com/ShowJob/Id/1381896/Senior-Analyst-Cargo-Sales-Strategy/"
dataunited10 <- paste0(base_url, paste(dataunited[56:59], collapse = "/"), "/")
dataunited10
## [1] "http://careers.united.com/ShowJob/Id/1328782/Senior-Analyst-Finance-and-Accounts-Payable-Systems-Administration/"
dataunited11 <- paste0(base_url, paste(dataunited[62:65], collapse = "/"), "/")
dataunited11
## [1] "http://careers.united.com/ShowJob/Id/1339623/Senior-Analyst-Statistics-and-Operations-Research/"
dataunited12 <- paste0(base_url, paste(dataunited[68:71], collapse = "/"), "/")
dataunited12
## [1] "http://careers.united.com/ShowJob/Id/1338423/Senior-Analyst-Capital-Planning-Analysis/"
dataunited13 <- paste0(base_url, paste(dataunited[74:77], collapse = "/"), "/")
dataunited13
## [1] "http://careers.united.com/ShowJob/Id/1275170/Senior-Analyst-Information-Technology/"
dataunited14 <- paste0(base_url, paste(dataunited[80:83], collapse = "/"), "/")
dataunited14
## [1] "http://careers.united.com/ShowJob/Id/1375303/Intern-Financial-Planning-Analysis-(MBA-Summer-2018)/"
dataunited15 <- paste0(base_url, paste(dataunited[86:89], collapse = "/"), "/")
dataunited15
## [1] "http://careers.united.com/ShowJob/Id/1373003/Sr.-Analyst-Marketing/"
dataunited16 <- paste0(base_url, paste(dataunited[92:95], collapse = "/"), "/")
dataunited16
## [1] "http://careers.united.com/ShowJob/Id/1358013/Analyst-Loyalty/"
dataunited17<- paste(dataunited[98],dataunited[99],dataunited[100],dataunited[101],sep = '/')
dataunited17<- paste(base_url,dataunited17,'/',sep = '')
dataunited17
## [1] "http://careers.united.com/ShowJob/Id/1290320/Analyst-Statistics-and-Operations-Research/"
dataunited18<- paste(dataunited[104],dataunited[105],dataunited[106],dataunited[107],sep = '/')
dataunited18<- paste(base_url,dataunited18,'/',sep = '')
dataunited18
## [1] "http://careers.united.com/ShowJob/Id/1354184/Senior-Analyst-Financial-Planning-Analysis-(MBA)/"
dataunited19<- paste(dataunited[110],dataunited[111],dataunited[112],dataunited[113],sep = '/')
dataunited19<- paste(base_url,dataunited19,'/',sep = '')
dataunited19
## [1] "http://careers.united.com/ShowJob/Id/1354183/Sr.-Analyst-Network-Planning-and-Strategy-(MBA)/"
dataunited20<- paste(dataunited[116],dataunited[117],dataunited[118],dataunited[119],sep = '/')
dataunited20<- paste(base_url,dataunited20,'/',sep = '')
dataunited20
## [1] "http://careers.united.com/ShowJob/Id/1328972/Associate-Analyst-Pricing-Revenue-Management/"
dataunited21 <- paste(dataunited[123],dataunited[124],dataunited[125],dataunited[126],sep = '/')
dataunited21 <- paste(base_url,dataunited21,'/',sep = '')
dataunited21
## [1] "http://careers.united.com/ShowJob/Id/1341983/Associate-Analyst-Financial-Planning-Analysis/"
dataunited22<- paste(dataunited[129],dataunited[130],dataunited[131],dataunited[132],sep = '/')
dataunited22 <- paste(base_url,dataunited22,'/',sep = '')
dataunited22
## [1] "http://careers.united.com/ShowJob/Id/1341980/Associate-Analyst-Network-Planning-Scheduling/"
dataunited23<- paste(dataunited[135],dataunited[136],dataunited[137],dataunited[138],sep = '/')
dataunited23<- paste(base_url,dataunited23,'/',sep = '')
dataunited23
## [1] "http://careers.united.com/ShowJob/Id/1328973/Intern-Financial-Planning-Analysis-(Summer-2018)/"
megasetunited <- rbind(dataunited1,dataunited2,dataunited3,dataunited4,dataunited5,dataunited6,dataunited7,dataunited8,dataunited9,dataunited10,dataunited11,dataunited12,dataunited13,dataunited14,dataunited15,dataunited16,dataunited17,dataunited18,dataunited19,dataunited20,dataunited21,dataunited22,dataunited23)
# Scrape every United posting listed in `megasetunited`. For each page, collect
# the text of every element whose own text contains one of the skill-related
# keywords and store it as ONE string in `x1`; pages with no match get "0".
skill_keywords <- c("skills", "Preferred", "Required", "Job overview",
                    "proficiency")
x1 <- character(length(megasetunited))

for (i in seq_along(megasetunited)) {
  webpage <- getURL(megasetunited[i])
  # Split the downloaded page into lines before handing it to the parser.
  tc <- textConnection(webpage)
  webpage <- readLines(tc)
  close(tc)
  # The no-op error handler discards the parser's complaints about the markup.
  pagetree <- htmlTreeParse(webpage, error = function(...) {},
                            useInternalNodes = TRUE)

  matches <- unlist(lapply(skill_keywords, function(keyword) {
    query <- paste0("/html/body/div/descendant::*[contains (text(),'",
                    keyword, "')]")
    xpathSApply(pagetree, query, xmlValue)
  }))

  if (length(matches) > 0) {
    # Collapse to a single string: assigning a multi-element vector to x1[i]
    # would keep only its first element (with a warning) and drop the rest.
    x1[i] <- paste(matches, collapse = " ")
  } else {
    x1[i] <- "0"
  }
}
# Cache the scraped posting text on disk, then reload it for text mining.
# write.csv() stores the row numbers in the first column and a header row
# ("", "x"); reading with header = TRUE and row.names = 1 keeps both out of the
# text. Previously they were pasted onto each posting and showed up as bogus
# terms such as "10excellent" and "nax" -- the recorded output that follows
# predates this fix and still lists them.
write.csv(x1, file = "united.csv")
file2 <- read.csv("united.csv", header = TRUE, row.names = 1,
                  stringsAsFactors = FALSE)
review_text2 <- paste(file2[[1]], collapse = " ")

# Build a one-document corpus and normalise it before counting terms.
review_source2 <- VectorSource(review_text2)
c2 <- Corpus(review_source2)
c2 <- tm_map(c2, content_transformer(tolower))
c2 <- tm_map(c2, removePunctuation)
c2 <- tm_map(c2, stripWhitespace)
c2 <- tm_map(c2, removeWords, stopwords("english"))

# Term counts across the corpus, most frequent first.
dtm2 <- DocumentTermMatrix(c2)
dtm22 <- as.matrix(dtm2)
frequency2 <- colSums(dtm22)
frequency2 <- sort(frequency2, decreasing = TRUE)
head(frequency2, 200)
##                skills              required                   job 
##                    24                    22                    14 
##              overview      responsibilities             preferred 
##                    14                    14                    13 
##            experience             microsoft            powerpoint 
##                     7                     7                     7 
##           proficiency                 excel                office 
##                     7                     6                     6 
##                proven         communication            analytical 
##                     6                     5                     4 
##              business                   new               problem 
##                     4                     4                     4 
##               solving             technical                  will 
##                     4                     4                     4 
##                  data                enable         interpersonal 
##                     3                     3                     3 
##             knowledge                 roles              training 
##                     3                     3                     3 
##                  word                access              analysts 
##                     3                     2                     2 
##           appropriate             attention               balance 
##                     2                     2                     2 
##                claims            department                detail 
##                     2                     2                     2 
##                 focus               however              industry 
##                     2                     2                     2 
##           information               loyalty            management 
##                     2                     2                     2 
##                 needs               pricing              progress 
##                     2                     2                     2 
##               revenue                 sheet           significant 
##                     2                     2                     2 
##          specifically                strong              teamwork 
##                     2                     2                     2 
##               uniteds                verbal               written 
##                     2                     2                     2 
##           10excellent                116000                11good 
##                     1                     1                     1 
##           12technical           13technical               14solid 
##                     1                     1                     1 
##              15highly             16loyalty        17demonstrates 
##                     1                     1                     1 
##                   180         19exceptional                  1job 
##                     1                     1                     1 
##           20associate               21track         22exceptional 
##                     1                     1                     1 
##               23solid           2analytical            3excellent 
##                     1                     1                     1 
##            5excellent            6excellent               7strong 
##                     1                     1                     1 
##         9presentation               account             achieving 
##                     1                     1                     1 
##                active          adaptability               advance 
##                     1                     1                     1 
##               airline                  also              analysis 
##                     1                     1                     1 
##                 andor                 apply                  area 
##                     1                     1                     1 
##              assigned            assistance                better 
##                     1                     1                     1 
##                  bill                 build              changing 
##                     1                     1                     1 
##         collaborating           comfortable            commitment 
##                     1                     1                     1 
##          compensation   compensationworkers               complex 
##                     1                     1                     1 
##              continue             continued                course 
##                     1                     1                     1 
##                create              creation              critical 
##                     1                     1                     1 
##               current              database                  date 
##                     1                     1                     1 
##          demonstrates         demonstrating             depending 
##                     1                     1                     1 
##           development          developments              dynamics 
##                     1                     1                     1 
##             economics                either           environment 
##                     1                     1                     1 
##             excellent           expectation                expert 
##                     1                     1                     1 
##             expertise               focused                formal 
##                     1                     1                     1 
##                  four          fundamentals                 goals 
##                     1                     1                     1 
##                  high                 hires              improved 
##                     1                     1                     1 
##           improvement          improvements             including 
##                     1                     1                     1 
##            increasing            individual              informed 
##                     1                     1                     1 
##          intermediate              involves               keeping 
##                     1                     1                     1 
##                   key               knowing       knowledgeskills 
##                     1                     1                     1 
##             languages                  lead            leadership 
##                     1                     1                     1 
##             liability             listening                  made 
##                     1                     1                     1 
##                  make               manager                matter 
##                     1                     1                     1 
##            meaningful               medical                mentor 
##                     1                     1                     1 
##              modeling                   nax                needed 
##                     1                     1                     1 
##                   net                  open           opportunity 
##                     1                     1                     1 
##         organizations             organized           outstanding 
##                     1                     1                     1 
##                  part                  past          personalized 
##                     1                     1                     1 
##                placed              position             practices 
##                     1                     1                     1 
##          presentation             processes              products 
##                     1                     1                     1 
## proficiencyexperience               program           programming 
##                     1                     1                     1 
##              provided                python                record 
##                     1                     1                     1 
##               records              relevant                review 
##                     1                     1                     1 
##              rotation                 sales                   sas 
##                     1                     1                     1 
##         selfmotivated                senior                 simio 
##                     1                     1                     1 
##            simulation               sources              spotfire 
##                     1                     1                     1 
##           spreadsheet          stakeholders               staying 
##                     1                     1                     1 
##                street            structured               subject 
##                     1                     1                     1 
##               tableau                  team   technicalfunctional 
##                     1                     1                     1 
##                  time                 tools 
##                     1                     1
# Turn the term counts into a data frame with the term itself as a column.
skill_data2 <- as.data.frame(frequency2)
skill_data2$skill2 <- rownames(skill_data2)

# Terms treated as soft skills. The corpus is lower-cased before counting, so
# every entry must be lower case to be able to match ("Self" never could).
soft_terms2 <- c("self", "interpersonal", "innovation", "creative",
                 "curiosity", "leadership", "team", "management", "strategy",
                 "communication", "planning", "driven", "willing",
                 "commitment", "attention", "proficiency", "knowledge",
                 "teamwork", "focus", "selfmotivated")
non_tech_skill2 <- skill_data2[skill_data2$skill2 %in% soft_terms2, ]

# Terms treated as technical skills. "javascrpts" was a typo for "javascript"
# (so it could never match), and duplicated entries have been removed.
tech_terms2 <- c("technology", "python", "microsoft", "powerpoint", "tableau",
                 "simulation", "javascript", "sas", "sql", "agile",
                 "objectoriented")
tech_skills2 <- skill_data2[skill_data2$skill2 %in% tech_terms2, ]

Merging and plotting data from both airlines together.

# Pull the United skill names and counts out as plain vectors.
SoftSkill <- non_tech_skill2$skill2
SFrequency <- non_tech_skill2$frequency2
NTSkills <- tech_skills2$skill2
NTFrequency <- tech_skills2$frequency2

# Soft-skill table for United. data.frame() keeps the counts numeric, whereas
# cbind() of a character and a numeric vector coerces both columns to strings.
merger3 <- data.frame(SoftSkill, SFrequency, stringsAsFactors = FALSE)
merger3
##        SoftSkill SFrequency
## 1    proficiency          7
## 2  communication          5
## 3  interpersonal          3
## 4      knowledge          3
## 5      attention          2
## 6          focus          2
## 7     management          2
## 8       teamwork          2
## 9     commitment          1
## 10    leadership          1
## 11 selfmotivated          1
## 12          team          1
# Technical-skill table for United. data.frame() keeps the counts numeric,
# whereas cbind() of a character and a numeric vector coerces both to strings.
merger4 <- data.frame(NTSkills, NTFrequency, stringsAsFactors = FALSE)
merger4
##     NTSkills NTFrequency
## 1  microsoft           7
## 2 powerpoint           7
## 3     python           1
## 4        sas           1
## 5 simulation           1
## 6    tableau           1
# Combine the soft-skill tables of both airlines and total the counts per skill.
MegaMerger1 <- rbind(merger1, merger3)
# Convert via character: as.integer() applied directly to a factor returns the
# internal level codes rather than the counts the labels spell out.
# NOTE(review): the recorded output below was captured before this change and
# may differ.
MegaMerger1$SFrequency <- as.integer(as.character(MegaMerger1$SFrequency))
MegaMerger1 <- ddply(MegaMerger1, .(SoftSkill), summarize,
                     sum = sum(SFrequency))
# Most requested skills first; ties are broken by skill.
MegaMerger1 <- arrange(MegaMerger1, desc(sum), SoftSkill)
MegaMerger1$SoftSkill <- as.character(MegaMerger1$SoftSkill)
MegaMerger1
##        SoftSkill sum
## 1    proficiency  12
## 2  communication  11
## 3      knowledge   8
## 4  interpersonal   7
## 5           team   7
## 6     commitment   5
## 7         driven   5
## 8     leadership   5
## 9     management   5
## 10      planning   5
## 11      strategy   4
## 12      teamwork   4
## 13       willing   4
## 14     attention   3
## 15         focus   3
## 16      creative   1
## 17 selfmotivated   1
# Fix the bar order by total count (ascending), then draw the soft-skill chart.
soft_levels <- MegaMerger1$SoftSkill[order(MegaMerger1$sum)]
MegaMerger1$SoftSkill <- factor(MegaMerger1$SoftSkill, levels = soft_levels)
ggplot(MegaMerger1, aes(x = SoftSkill, y = sum)) +
  geom_bar(stat = "identity", fill = "brown") +
  theme_bw(base_size = 16) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Combine the technical-skill tables of both airlines and total the counts.
MegaMerger2 <- rbind(merger2, merger4)
# Convert via character: as.integer() applied directly to a factor returns the
# internal level codes rather than the counts the labels spell out.
# NOTE(review): the recorded output below was captured before this change and
# may differ.
MegaMerger2$NTFrequency <- as.integer(as.character(MegaMerger2$NTFrequency))
MegaMerger2 <- ddply(MegaMerger2, .(NTSkills), summarize,
                     sum = sum(NTFrequency))
# Keep the sorted table (it used to be printed and then discarded) so this
# path mirrors the soft-skill one; most requested skills come first.
MegaMerger2 <- arrange(MegaMerger2, desc(sum), NTSkills)
MegaMerger2$NTSkills <- as.character(MegaMerger2$NTSkills)
MegaMerger2
##          NTSkills sum
## 1          python  12
## 2             sas  11
## 3      simulation  11
## 4         tableau  11
## 5       microsoft   9
## 6      javascript   8
## 7             sql   7
## 8      technology   7
## 9      powerpoint   7
## 10          agile   6
## 11     regression   5
## 12    statistical   4
## 13 objectoriented   3
## 14           java   1
# Fix the bar order by total count (ascending), then draw the technical-skill
# chart.
tech_levels <- MegaMerger2$NTSkills[order(MegaMerger2$sum)]
MegaMerger2$NTSkills <- factor(MegaMerger2$NTSkills, levels = tech_levels)
ggplot(MegaMerger2, aes(x = NTSkills, y = sum)) +
  geom_bar(stat = "identity", fill = "brown") +
  theme_bw(base_size = 16) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))