library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
library(zoo)
## Warning: package 'zoo' was built under R version 4.2.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(tm)
## Warning: package 'tm' was built under R version 4.2.3
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(SnowballC)
## Warning: package 'SnowballC' was built under R version 4.2.3
library(textcat)
## Warning: package 'textcat' was built under R version 4.2.3
library(caTools)
## Warning: package 'caTools' was built under R version 4.2.3
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: lattice
library(e1071)
## Warning: package 'e1071' was built under R version 4.2.3
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.2.3
## Loading required package: RColorBrewer
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ NLP::annotate() masks ggplot2::annotate()
## ✖ dplyr::combine() masks randomForest::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(mice)
## Warning: package 'mice' was built under R version 4.2.3
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
job <- read.csv("pak_jobs.csv")
dim(job)
## [1] 6680 10
colnames(job)
## [1] "X_id" "Job.Name" "label"
## [4] "Company.Name" "Job.Type" "Experience.Required"
## [7] "Department" "JD" "City"
## [10] "Date.Posted"
job <- clean_names(job)
md.pattern(job, rotate.names = T)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
## x_id job_name label company_name job_type experience_required department
## 6680 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0
## jd city date_posted
## 6680 1 1 1 0
## 0 0 0 0
str(job)
## 'data.frame': 6680 obs. of 10 variables:
## $ x_id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ job_name : chr "Full Time New Job Positions .Net, .Netcore, Flutter, Tea Boy Jobs in Pakistan" "Full Time Senior Web Developer Jobs in Pakistan" "Full Time Russian Speakers Jobs in Pakistan" "Full Time Customer Support Specialist - International Jobs in Pakistan" ...
## $ label : chr "Premium Job" "Premium Job" "Premium Job" "Premium Job" ...
## $ company_name : chr "Nayel Solutions, Pakistan" "Eurosoft Tech Private Limited, Pakistan" "ICM JAPAN, Pakistan" "ibex, Pakistan" ...
## $ job_type : chr "Full Time Jobs" "Full Time Jobs" "Full Time Jobs" "Full Time Jobs" ...
## $ experience_required: chr "2 Years Job Exp." "2 Years Job Exp." "< 1 Year" "Job for Fresh Graduates" ...
## $ department : chr "IT Jobs" "IT Jobs" "Customer Service Jobs" "Customer Service Jobs" ...
## $ jd : chr "New Job Positions .net, .netcore, flutter, Tea boy in Nayel Solutions3 Positions for .net \\ .netcore Developer"| __truncated__ "We are looking for an experienced Web Developer.Responsible for creating the design and layout of a website or "| __truncated__ "International clients dealing exposure (B2B).Search new customers and bring business for the company.To promote"| __truncated__ "Responsible for acting as a liaison between our customer and the respective client. Provides assistance to the "| __truncated__ ...
## $ city : chr "Islamabad" "Karachi" "Karachi" "Islamabad" ...
## $ date_posted : chr "2021-03-12T00:00:00" "2021-03-12T00:00:00" "2021-03-12T00:00:00" "2021-03-09T00:00:00" ...
job$date_posted <- as.Date(job$date_posted, tryFormats = c("%Y-%m-%d"))
head(job)
## x_id
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## 6 6
## job_name
## 1 Full Time New Job Positions .Net, .Netcore, Flutter, Tea Boy Jobs in Pakistan
## 2 Full Time Senior Web Developer Jobs in Pakistan
## 3 Full Time Russian Speakers Jobs in Pakistan
## 4 Full Time Customer Support Specialist - International Jobs in Pakistan
## 5 Full Time English Speaker - International Business Development Executive - BDE Job in Pakistan
## 6 Full Time Socks Operation Manager And Socks Inspector - Textiles Job in Pakistan
## label company_name job_type
## 1 Premium Job Nayel Solutions, Pakistan Full Time Jobs
## 2 Premium Job Eurosoft Tech Private Limited, Pakistan Full Time Jobs
## 3 Premium Job ICM JAPAN, Pakistan Full Time Jobs
## 4 Premium Job ibex, Pakistan Full Time Jobs
## 5 Premium Job ICM JAPAN, Pakistan Full Time Job
## 6 Premium Job Uni Hosiery Co. Inc., Pakistan Full Time Job
## experience_required department
## 1 2 Years Job Exp. IT Jobs
## 2 2 Years Job Exp. IT Jobs
## 3 < 1 Year Customer Service Jobs
## 4 Job for Fresh Graduates Customer Service Jobs
## 5 < 1 Year Customer Service Job
## 6 5 Years Job Exp. Production Job
## jd
## 1 New Job Positions .net, .netcore, flutter, Tea boy in Nayel Solutions3 Positions for .net \\ .netcore Developer programming language c#2 Positions for Flutter dart Developer1 Position for Tea boy
## 2 We are looking for an experienced Web Developer.Responsible for creating the design and layout of a website or web pagesKnowledge Of Front End Technologies Including CSS3, JavaScript, HTML5, And jQuery.Familiarity With Word Press Development and E-Co
## 3 International clients dealing exposure (B2B).Search new customers and bring business for the company.To promote business in the given region or country and Manage Business Accounts of customers.To maintain quality services with existing customers.Sal
## 4 Responsible for acting as a liaison between our customer and the respective client. Provides assistance to the customers with their questions, issues, new orders, service delivery requirements, billing, and any other query.Location: Karachi, Lahore &
## 5 International clients dealing exposure (B2B) and (B2C).Search new customers and bring business for the company.To promote business in the given region or country and Manage Business Accounts of customers.To maintain quality services with existing cus
## 6 NOTE: Apply By Sending Your CV/Resume to: hrdpk @ unihosiery.comCandidates with less than 5 years experience will not be considered for this position.\nA US based national-leading hosiery importer is seeking for candidates who can join us as ' Soc
## city date_posted
## 1 Islamabad 2021-03-12
## 2 Karachi 2021-03-12
## 3 Karachi 2021-03-12
## 4 Islamabad 2021-03-09
## 5 Karachi 2021-03-05
## 6 Faisalabad 2021-03-05
colnames(job)
## [1] "x_id" "job_name" "label"
## [4] "company_name" "job_type" "experience_required"
## [7] "department" "jd" "city"
## [10] "date_posted"
table(job$experience_required)
##
## < 1 Year 1 Year Job Exp. 10 Years Job Exp.
## 1591 1418 36
## 12 Years Job Exp. 14 Years Job Exp. 15 Years Job Exp.
## 2 1 6
## 2 Years Job Exp. 25 Years Job Exp. 3 Years Job Exp.
## 1635 1 789
## 4 Years Job Exp. 5 Years Job Exp. 6 Years Job Exp.
## 210 323 28
## 7 Years Job Exp. 8 Years Job Exp. 9 Years Job Exp.
## 18 29 2
## Job for Fresh Graduates Job for Students
## 449 142
# Recode the experience labels as (approximate) numeric years
job$experience_required[job$experience_required == "< 1 Year"] <- 0.8
job$experience_required[job$experience_required == "1 Year Job Exp."] <- 1
job$experience_required[job$experience_required == "10 Years Job Exp."] <- 10
job$experience_required[job$experience_required == "12 Years Job Exp."] <- 12
job$experience_required[job$experience_required == "14 Years Job Exp."] <- 14
job$experience_required[job$experience_required == "15 Years Job Exp."] <- 15
job$experience_required[job$experience_required == "2 Years Job Exp."] <- 2
job$experience_required[job$experience_required == "3 Years Job Exp."] <- 3
job$experience_required[job$experience_required == "4 Years Job Exp."] <- 4
job$experience_required[job$experience_required == "5 Years Job Exp."] <- 5
job$experience_required[job$experience_required == "6 Years Job Exp."] <- 6
job$experience_required[job$experience_required == "7 Years Job Exp."] <- 7
job$experience_required[job$experience_required == "8 Years Job Exp."] <- 8
job$experience_required[job$experience_required == "9 Years Job Exp."] <- 9
job$experience_required[job$experience_required == "25 Years Job Exp."] <- 25
job$experience_required[job$experience_required == "Job for Fresh Graduates"] <- 0.5
job$experience_required[job$experience_required == "Job for Students"] <- 0
job1 <- job
#change experience class
job$experience_required <- as.numeric(as.character(job$experience_required))
table(job$experience_required)
##
## 0 0.5 0.8 1 2 3 4 5 6 7 8 9 10 12 14 15
## 142 449 1591 1418 1635 789 210 323 28 18 29 2 36 2 1 6
## 25
## 1
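For reference, the same recoding could be written more compactly with a named lookup vector instead of one assignment per label. This is only a sketch of an equivalent approach (it is not how the values above were produced, and exp_lookup is an illustrative name); it would be applied to the raw label strings before any conversion:
# Illustrative alternative: map each raw label to its numeric value in one step
exp_lookup <- c("Job for Students" = 0, "Job for Fresh Graduates" = 0.5, "< 1 Year" = 0.8,
                "1 Year Job Exp." = 1, "2 Years Job Exp." = 2, "3 Years Job Exp." = 3,
                "4 Years Job Exp." = 4, "5 Years Job Exp." = 5, "6 Years Job Exp." = 6,
                "7 Years Job Exp." = 7, "8 Years Job Exp." = 8, "9 Years Job Exp." = 9,
                "10 Years Job Exp." = 10, "12 Years Job Exp." = 12, "14 Years Job Exp." = 14,
                "15 Years Job Exp." = 15, "25 Years Job Exp." = 25)
# job$experience_required <- unname(exp_lookup[job$experience_required])  # not run here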
# Exploratory plots
ggplot(job, aes(x = job_type)) +
  geom_bar(stat = "count", fill = "red") + coord_flip()
ggplot(job, aes(x = label)) +
  geom_bar(stat = "count", fill = "red") + coord_flip()
ggplot(job, aes(x = experience_required)) +
  geom_bar(stat = "count", fill = "red") + coord_flip()
ggplot(job, aes(x = city)) +
  geom_bar(stat = "count", fill = "red") + coord_flip()
ggplot(job, aes(x = experience_required)) +
  geom_bar(stat = "bin", fill = "red") +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = 1.6, color = "white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Create a binary variable: does the posting require less than 2 years of experience?
job$unexpert <- as.factor(job$experience_required < 2)
table(job$unexpert)
##
## FALSE TRUE
## 3080 3600
# Text preprocessing: build a corpus from the job descriptions and clean it
corpus = VCorpus(VectorSource(job$jd))
# corpus = tm_map(corpus, content_transformer(tolower))
corpus = tm_map(corpus, removePunctuation)
corpus = tm_map(corpus, removeWords, stopwords("english"))
corpus = tm_map(corpus, stemDocument)
corpus[[1]]$content
## [1] "New Job Posit net netcor flutter Tea boy Nayel Solutions3 Posit net netcor Develop program languag c2 Posit Flutter dart Developer1 Posit Tea boy"
I will now use a technique called bag of words, which restructures the text data. Let's build the Document-Term Matrix: a data frame in which each term (i.e. each word appearing in the job descriptions) is a column and each job posting is a row, with the values recording how many times each term appears in each posting. We then remove infrequent terms and focus on the ones that appear in multiple postings.
frequencies = DocumentTermMatrix(corpus)
sparse = removeSparseTerms(frequencies, 0.99)
jobSparse = as.data.frame(as.matrix(sparse))
colnames(jobSparse) = make.names(colnames(jobSparse))
head(jobSparse)
## abil abl accord account achiev across activ administr agent also amp analysi
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 1 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 1 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0
## analyt analyz android answer app appli applic applicat architectur are area
## 1 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 1 0 0 0 0 0
## arrang assist assistant backend base basi basic best bonus brand build bull
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 1 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 1 0 0 0 0 0 0 0
## busi call campaign can candid career center channel client close code command
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0
## 3 3 0 0 0 0 0 0 0 1 0 0 0
## 4 0 0 0 0 0 0 0 0 1 0 0 0
## 5 3 0 0 0 0 0 0 0 1 0 0 0
## 6 0 0 0 1 1 0 0 0 0 0 0 0
## commiss commun communic compani complet comput concept confid consult contact
## 1 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0
## content coordin core corpor countri creat creativ css current custom daili
## 1 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 1 0 0 0 0 0
## 3 0 0 0 0 1 0 0 0 0 1 0
## 4 0 0 0 0 0 0 0 0 0 2 0
## 5 0 0 0 0 1 0 0 0 0 1 0
## 6 0 0 0 0 0 0 0 0 0 0 0
## data databas date day deal dedic degre deliv deliveri depart descript design
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 1
## 3 0 0 0 0 1 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 1 0 0 0
## 5 0 0 0 0 1 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0
## detail develop differ digit direct document drive duti dynam ecommerc educ
## 1 0 1 0 0 0 0 0 0 0 0 0
## 2 0 1 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0
## effect effici email employ employe encourag end energet engin engineer
## 1 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 1 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0
## english ensur enthusiast entri environ establish etc excel execut exist
## 1 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 1
## 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 1
## 6 0 0 0 0 0 0 0 0 0 0
## experi experienc expert fast femal field financi firm focus follow framework
## 1 0 0 0 0 0 0 0 0 0 0 0
## 2 0 1 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0
## 6 1 0 0 0 0 0 0 0 0 0 0
## fresh front frontend full fulltim function. game generat get give given goal
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 1 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 1 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 1 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0
## good grade graduat graphic great group grow growth hand handl head help high
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0 0
## hire home hour hous html idea ideal identifi implement improv includ individu
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0
## industri inform innov interact interest intern internat internship ios
## 1 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 1 0 0
## 4 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 1 0 0
## 6 0 0 0 0 0 0 0 0 0
## islamabad issu javascript job join karachi keep key know knowledg lahor
## 1 0 0 0 1 0 0 0 0 0 0 0
## 2 0 0 1 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0
## 4 0 1 0 0 0 1 0 0 0 0 1
## 5 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 1 0 0 0 0 0 0
## languag laravel latest lead learn least level life like limit live locat look
## 1 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 1
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0 0
## ltd main maintain mainten make male manag market may media meet minimum mobil
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 1 0 0 0 1 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 1 0 0 0 1 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0 0
## model monitor month motiv multipl must mvc nation nativ need net network new
## 1 0 0 0 0 0 0 0 0 0 0 2 0 1
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0 1
## 4 0 0 0 0 0 0 0 0 0 0 0 0 1
## 5 0 0 0 0 0 0 0 0 0 0 0 0 1
## 6 0 0 0 0 0 0 0 0 0 0 0 0 0
## night now offer offic office officer one onlin open oper operat opportun
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0
## optim order organ our outbound packag pakistan part passion peopl per perform
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 1 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0
## person phone photoshop php plan platform play pleas plus posit post potenti
## 1 0 0 0 0 0 0 0 0 0 4 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 1 0 0
## prefer prepar primari process product profession profici program project
## 1 0 0 0 0 0 0 0 1 0
## 2 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0
## promot prospect proven provid pvt qualif qualifi qualiti queri question react
## 1 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0
## 3 1 0 0 0 0 0 0 1 0 0 0
## 4 0 0 0 1 0 0 0 0 0 1 0
## 5 1 0 0 0 0 0 0 1 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0
## real record recruit region relat relationship relev report repres requir
## 1 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 1 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 1
## 5 0 0 0 1 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0
## research resourc respons result resum role salari sale search secur seek sell
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 1 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 1 0
## send senior seo server servic set share shift should skill social soft
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 1 0 0 0 0 0 0 0
## 4 0 0 0 0 1 0 0 0 0 0 0 0
## 5 0 0 0 0 1 0 0 0 0 0 0 0
## 6 1 0 0 0 0 0 0 0 0 0 0 0
## softwar solut someon special specialist stack staff standard start startup
## 1 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0
## store strategi strong success support system take talent target task team
## 1 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0
## technic technolog test the theme this time tool trade train type understand
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 1 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0
## uniqu uniti updat urgent usa use user various video visit want web websit
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 2 1
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0 0
## welcom well what will within wordpress work world write writer year you your
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 1 0 0 0 0 0 0 1 0 1
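Before modelling, it is worth glancing at which stems are most common. tm's findFreqTerms() can be run on the full document-term matrix, or column sums taken on the filtered data frame; a quick sketch (output not shown, and the threshold of 500 is arbitrary):
# Stems appearing at least 500 times across all job descriptions
findFreqTerms(frequencies, lowfreq = 500)
# The 20 most frequent stems that survived the sparsity filter
head(sort(colSums(jobSparse), decreasing = TRUE), 20)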
Now add the dependent variable, unexpert (whether the posting asks for less than two years of experience), to the data.
jobSparse$unexpert <- job$unexpert
# Split the data into training and test sets, stratified on unexpert
set.seed(124)
split = sample.split(jobSparse$unexpert, SplitRatio = 0.7)
jobSparse$split = split
train = subset(jobSparse, split == TRUE)
test = subset(jobSparse, split == FALSE)
nrow(train)
## [1] 4676
nrow(test)
## [1] 2004
#find the baseline accuracy that the model will have to surpass
table(train$unexpert)
##
## FALSE TRUE
## 2156 2520
2520 / nrow(train)
## [1] 0.5389222
head(train)
## abil abl accord account achiev across activ administr agent also amp analysi
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 1 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 1 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0
## analyt analyz android answer app appli applic applicat architectur are area
## 1 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 1 0 0 0 0 0
## arrang assist assistant backend base basi basic best bonus brand build bull
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 1 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 1 0 0 0 0 0 0 0
## busi call campaign can candid career center channel client close code command
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0
## 3 3 0 0 0 0 0 0 0 1 0 0 0
## 4 0 0 0 0 0 0 0 0 1 0 0 0
## 5 3 0 0 0 0 0 0 0 1 0 0 0
## 6 0 0 0 1 1 0 0 0 0 0 0 0
## commiss commun communic compani complet comput concept confid consult contact
## 1 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0
## content coordin core corpor countri creat creativ css current custom daili
## 1 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 1 0 0 0 0 0
## 3 0 0 0 0 1 0 0 0 0 1 0
## 4 0 0 0 0 0 0 0 0 0 2 0
## 5 0 0 0 0 1 0 0 0 0 1 0
## 6 0 0 0 0 0 0 0 0 0 0 0
## data databas date day deal dedic degre deliv deliveri depart descript design
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 1
## 3 0 0 0 0 1 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 1 0 0 0
## 5 0 0 0 0 1 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0
## detail develop differ digit direct document drive duti dynam ecommerc educ
## 1 0 1 0 0 0 0 0 0 0 0 0
## 2 0 1 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0
## effect effici email employ employe encourag end energet engin engineer
## 1 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 1 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0
## english ensur enthusiast entri environ establish etc excel execut exist
## 1 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 1
## 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 1
## 6 0 0 0 0 0 0 0 0 0 0
## experi experienc expert fast femal field financi firm focus follow framework
## 1 0 0 0 0 0 0 0 0 0 0 0
## 2 0 1 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0
## 6 1 0 0 0 0 0 0 0 0 0 0
## fresh front frontend full fulltim function. game generat get give given goal
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 1 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 1 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 1 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0
## good grade graduat graphic great group grow growth hand handl head help high
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0 0
## hire home hour hous html idea ideal identifi implement improv includ individu
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0
## industri inform innov interact interest intern internat internship ios
## 1 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 1 0 0
## 4 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 1 0 0
## 6 0 0 0 0 0 0 0 0 0
## islamabad issu javascript job join karachi keep key know knowledg lahor
## 1 0 0 0 1 0 0 0 0 0 0 0
## 2 0 0 1 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0
## 4 0 1 0 0 0 1 0 0 0 0 1
## 5 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 1 0 0 0 0 0 0
## languag laravel latest lead learn least level life like limit live locat look
## 1 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 1
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0 0
## ltd main maintain mainten make male manag market may media meet minimum mobil
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 1 0 0 0 1 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 1 0 0 0 1 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0 0
## model monitor month motiv multipl must mvc nation nativ need net network new
## 1 0 0 0 0 0 0 0 0 0 0 2 0 1
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0 1
## 4 0 0 0 0 0 0 0 0 0 0 0 0 1
## 5 0 0 0 0 0 0 0 0 0 0 0 0 1
## 6 0 0 0 0 0 0 0 0 0 0 0 0 0
## night now offer offic office officer one onlin open oper operat opportun
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0
## optim order organ our outbound packag pakistan part passion peopl per perform
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 1 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0
## person phone photoshop php plan platform play pleas plus posit post potenti
## 1 0 0 0 0 0 0 0 0 0 4 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 1 0 0
## prefer prepar primari process product profession profici program project
## 1 0 0 0 0 0 0 0 1 0
## 2 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0
## promot prospect proven provid pvt qualif qualifi qualiti queri question react
## 1 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0
## 3 1 0 0 0 0 0 0 1 0 0 0
## 4 0 0 0 1 0 0 0 0 0 1 0
## 5 1 0 0 0 0 0 0 1 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0
## real record recruit region relat relationship relev report repres requir
## 1 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 1 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 1
## 5 0 0 0 1 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0
## research resourc respons result resum role salari sale search secur seek sell
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 1 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 1 0
## send senior seo server servic set share shift should skill social soft
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 1 0 0 0 0 0 0 0
## 4 0 0 0 0 1 0 0 0 0 0 0 0
## 5 0 0 0 0 1 0 0 0 0 0 0 0
## 6 1 0 0 0 0 0 0 0 0 0 0 0
## softwar solut someon special specialist stack staff standard start startup
## 1 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0
## store strategi strong success support system take talent target task team
## 1 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0
## technic technolog test the theme this time tool trade train type understand
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 1 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0
## uniqu uniti updat urgent usa use user various video visit want web websit
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 2 1
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0 0
## welcom well what will within wordpress work world write writer year you your
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 1 0 0 0 0 0 0 1 0 1
## unexpert split
## 1 FALSE TRUE
## 2 FALSE TRUE
## 3 TRUE TRUE
## 4 TRUE TRUE
## 5 TRUE TRUE
## 6 FALSE TRUE
The baseline model, which always predicts the majority class (TRUE, i.e. a posting asking for less than two years of experience), would be about 54% accurate. So the dataset is mildly biased towards low-experience postings, and a machine learning algorithm will also be somewhat more likely to predict them. There are different ways to handle this: split the data into train and test while preserving the ratio of experienced to unexperienced postings (which sample.split already does), or remove some of the low-experience postings to bring balance. I will keep the data as it is.
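If one did want to check or adjust the balance, a first step is to confirm that sample.split preserved the class proportions in both subsets, and caret's downSample() could then be used to down-sample the majority class in the training data. A sketch only, not applied in what follows (train_bal is an illustrative name; output not shown):
# Class proportions should be roughly equal in train and test (sample.split stratifies on unexpert)
prop.table(table(train$unexpert))
prop.table(table(test$unexpert))
# Hypothetical balanced copy of the training set by down-sampling the majority class
# train_bal <- caret::downSample(x = train[, setdiff(names(train), "unexpert")],
#                                y = train$unexpert, yname = "unexpert")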
# Classification Tree
I will now build a CART model (Classification and Regression Trees; in this case, a classification tree). I prefer to try this type of model before something more complex like a random forest, because it is much more interpretable and can be visualized. This way we can see which words are used as predictors.
cartModel <- rpart(unexpert ~., data = train, method = "class")
prp(cartModel)
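To see which stems the tree relies on most, the fitted rpart object also stores variable importance scores; a quick sketch (output not shown):
# Ten most important terms according to the fitted tree
head(sort(cartModel$variable.importance, decreasing = TRUE), 10)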
Let’s evaluate the performance of CART.
predictCART <- predict(cartModel, newdata = test, type = "class")
table(test$unexpert, predictCART)
## predictCART
## FALSE TRUE
## FALSE 513 411
## TRUE 317 763
(513 + 763) / nrow(test)
## [1] 0.6367265
The CART model reaches about 64% accuracy on the test set, roughly ten percentage points above the ~54% baseline, which is a positive sign.
# Model Enhancement - Cross Validation
I will now try to improve performance by tuning the complexity parameter (cp), which controls the number of splits used to generate the tree. Cross-validation, available through caret, helps pick the optimal value: too many splits risk overfitting the model, while too few may not yield good enough accuracy.
numFolds <- trainControl(method = "cv", number = 10)
cpGrid = expand.grid(.cp = seq(0.001, 0.01, 0.001))
train(unexpert ~ ., data = train, method = "rpart", trControl = numFolds, tuneGrid = cpGrid)
## CART
##
## 4676 samples
## 368 predictor
## 2 classes: 'FALSE', 'TRUE'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 4208, 4209, 4208, 4209, 4209, 4208, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.001 0.6670304 0.3302348
## 0.002 0.6565590 0.3062429
## 0.003 0.6490703 0.2924859
## 0.004 0.6482105 0.2898908
## 0.005 0.6458592 0.2853721
## 0.006 0.6432919 0.2792755
## 0.007 0.6420066 0.2767358
## 0.008 0.6304435 0.2502913
## 0.009 0.6291582 0.2467738
## 0.010 0.6297992 0.2469164
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.001.
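Rather than reading the optimal cp off the printed table, the train() result can also be stored and queried directly. A sketch, assuming the call above is assigned to an object (cvFit and bestCp are illustrative names):
cvFit <- train(unexpert ~ ., data = train, method = "rpart",
               trControl = numFolds, tuneGrid = cpGrid)
bestCp <- cvFit$bestTune$cp  # 0.001 in the run above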
Cross-validation gave the optimal parameter cp = 0.001, so I will now rebuild the tree with this value.
cartModelimpr <- rpart(unexpert ~., data = train, method = "class", cp = 0.001)
prp(cartModelimpr)
In this case the optimal tree has many splits, which makes it harder to interpret, but it gives a good overview of which words drive the split decisions, and hence which words are most indicative of whether a posting is aimed at less-experienced candidates.
Let’s now obtain predictions using the new tree.
predictCART_impr <- predict(cartModelimpr, newdata = test, type = "class")
table(test$unexpert, predictCART_impr)
## predictCART_impr
## FALSE TRUE
## FALSE 585 339
## TRUE 334 746
(585 + 746) / nrow(test)
## [1] 0.6641717
The improved model has about 66% test-set accuracy, in line with the cross-validation estimate and a further gain over the first tree, roughly twelve percentage points above the baseline. The main reason a simple bag-of-words model works reasonably well here is that job descriptions are written to signal the expected seniority, so stems such as fresh, graduat, intern, senior and experienc plausibly carry much of the predictive information.
A simpler keyword-matching approach, for example flagging any posting that contains the phrase "fresh graduate", would be more brittle, because experience requirements are phrased in many different ways; letting the tree learn the indicative terms from the data handles that variation better.
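Finally, accuracy alone hides how the errors are distributed between the two classes. caret's confusionMatrix() reports sensitivity, specificity and related statistics from the same predictions; a sketch (output not shown), treating TRUE, i.e. low-experience postings, as the positive class:
# Fuller evaluation of the tuned tree on the test set
confusionMatrix(data = predictCART_impr, reference = test$unexpert, positive = "TRUE")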