Detecting fraudulent job posts

Data source: Fake job posting prediction

Research question: What are the important features for detecting fraudulent job posts?

Load packages

library(Hmisc)
library(dplyr)
library(stringr)
library(tidyverse)
library(skimr)
library(corrplot)
library(pscl)
library(tree)
library(rpart)
library(rpart.plot)
library(rattle)
library(viridis)
library(randomForest)
library(caret)
library(pROC)
library(jtools)
library(xgboost)
library(sjPlot)
library(RColorBrewer)

Import data

# Read the raw postings; the first row of the CSV holds the column names.
df <- read.csv(file = "fake_job_postings.csv", header = TRUE)

Summary

# Treat the 0/1 indicator columns (including the target) as categorical,
# then print a per-column overview of the data.
binary_cols <- c("telecommuting", "has_company_logo", "has_questions", "fraudulent")
df[binary_cols] <- lapply(df[binary_cols], factor)
skim(df)

Data summary
Name	df
Number of rows	17880
Number of columns	18
_______________________
Column type frequency:
character	13
factor	4
numeric	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
title	0	1	3	142	0	11231	0
location	0	1	0	161	346	3106	0
department	0	1	0	255	11547	1338	6
salary_range	0	1	0	20	15180	859	0
company_profile	0	1	0	6178	3308	1710	0
description	0	1	3	14907	0	14802	0
requirements	0	1	0	10864	2694	11970	0
benefits	2	1	0	4429	7206	6207	0
employment_type	0	1	0	9	3471	6	0
required_experience	0	1	0	16	7050	8	0
required_education	0	1	0	33	8105	14	0
industry	0	1	0	36	4903	132	0
function.	0	1	0	22	6455	38	0

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
telecommuting	1	FALSE	2	0: 17113, 1: 767
has_company_logo	1	FALSE	2	1: 14220, 0: 3660
has_questions	1	FALSE	2	0: 9088, 1: 8792
fraudulent	1	FALSE	2	0: 17014, 1: 866

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
job_id	0	1	8940.5	5161.66	1	4470.75	8940.5	13410.25	17880	▇▇▇▇▇

Target variable

# Frequency table of the target variable: only about 4.8% of the postings
# are labelled fraudulent (see the proportions printed below).
Hmisc:: describe(df$fraudulent) #imbalanced dataset

## df$fraudulent 
##        n  missing distinct 
##    17880        0        2 
##                       
## Value          0     1
## Frequency  17014   866
## Proportion 0.952 0.048

# Bar chart of the class balance, with the count printed above each bar.
# Grid lines, axis ticks and the panel border are removed for a cleaner look.
ggplot(data = df, mapping = aes(x = fraudulent, fill = fraudulent)) +
  geom_bar(width = 0.7) +
  geom_text(stat = "count", aes(label = ..count..), vjust = -0.3) +
  theme_light() +
  theme(
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank()
  ) +
  scale_fill_manual(values = c("#457b9d", "#606c38")) +
  ylim(c(0, 18000))

Create new variables

# Feature engineering on a copy of the raw data (df stays untouched).
# NOTE: these features feed every model below; after changing them, re-run the
# whole analysis so the printed results are regenerated.

# Binary indicators: "1" when the field contains any text, "0" when it is empty.
df2 <- df
df2$company_profile_specified <- ifelse(df2$company_profile == "", "0", "1")
df2$description_specified <- ifelse(df2$description == "", "0", "1")
df2$requirements_specified <- ifelse(df2$requirements == "", "0", "1")
df2$benefits_specified <- ifelse(df2$benefits == "", "0", "1")
# Use the full column name: `df2$salary` only resolved to `salary_range` through
# `$` partial matching, which breaks silently once another `salary*` column exists.
df2$salary_specified <- ifelse(df2$salary_range == "", "0", "1")
df2$location_specified <- ifelse(df2$location == "", "0", "1")
df2$function._specified <- ifelse(df2$function. == "", "0", "1")
df2$department_specified <- ifelse(df2$department == "", "0", "1")
df2$industry_specified <- ifelse(df2$industry == "", "0", "1")
df2$emptype_specified <- ifelse(df2$employment_type == "", "0", "1")
df2$reqexp_specified <- ifelse(df2$required_experience == "", "0", "1")
df2$reqedu_specified <- ifelse(df2$required_education == "", "0", "1")

# Character counts of the four free-text fields.
df2$company_profile_char <- nchar(df2$company_profile)
df2$description_char <- nchar(df2$description)
df2$requirements_char <- nchar(df2$requirements)
df2$benefits_char <- nchar(df2$benefits)

# Word counts: number of runs of non-whitespace characters.
# The previous `lengths(strsplit(x, '\\S+'))` split ON the words and counted the
# leftover separators, which over-counts by one when the text ends in whitespace
# and returns 1 for whitespace-only text. Missing text gives NA here.
count_words <- function(x) stringr::str_count(x, "\\S+")
df2$company_profile_word <- count_words(df2$company_profile)
df2$description_word <- count_words(df2$description)
df2$requirements_word <- count_words(df2$requirements)
df2$benefits_word <- count_words(df2$benefits)

# Average word length = characters / words (0 / 0 gives NaN for empty fields).
df2$company_profile_awl <- df2$company_profile_char / df2$company_profile_word
df2$description_awl <- df2$description_char / df2$description_word
df2$requirements_awl <- df2$requirements_char / df2$requirements_word
df2$benefits_awl <- df2$benefits_char / df2$benefits_word

# Same three measures on the four text fields concatenated (space-separated).
df2$text1 <- paste(df2$company_profile, df2$description, df2$requirements, df2$benefits)
df2$text1_char <- nchar(df2$text1)
df2$text1_word <- count_words(df2$text1)
df2$text1_avl <- df2$text1_char / df2$text1_word
colnames(df2)

##  [1] "job_id"                    "title"                    
##  [3] "location"                  "department"               
##  [5] "salary_range"              "company_profile"          
##  [7] "description"               "requirements"             
##  [9] "benefits"                  "telecommuting"            
## [11] "has_company_logo"          "has_questions"            
## [13] "employment_type"           "required_experience"      
## [15] "required_education"        "industry"                 
## [17] "function."                 "fraudulent"               
## [19] "company_profile_specified" "description_specified"    
## [21] "requirements_specified"    "benefits_specified"       
## [23] "salary_specified"          "location_specified"       
## [25] "function._specified"       "department_specified"     
## [27] "industry_specified"        "emptype_specified"        
## [29] "reqexp_specified"          "reqedu_specified"         
## [31] "company_profile_char"      "description_char"         
## [33] "requirements_char"         "benefits_char"            
## [35] "company_profile_word"      "description_word"         
## [37] "requirements_word"         "benefits_word"            
## [39] "company_profile_awl"       "description_awl"          
## [41] "requirements_awl"          "benefits_awl"             
## [43] "text1"                     "text1_char"               
## [45] "text1_word"                "text1_avl"

# Convert the "0"/"1" indicator columns to factors.
flag_cols <- c(
  "company_profile_specified", "description_specified", "requirements_specified",
  "benefits_specified", "salary_specified", "location_specified",
  "function._specified", "department_specified", "industry_specified",
  "emptype_specified", "reqexp_specified", "reqedu_specified"
)
df2[flag_cols] <- lapply(df2[flag_cols], factor)
# Blanket replacement of every NA/NaN in the data frame with 0 (this also covers
# the NaN average word lengths produced by 0 / 0 for empty text fields).
df2[is.na(df2)] <- 0

Check correlation

# Correlation heat map of the numeric features (job_id, the first column, is dropped).
dfnum <- df2[, -1] %>% select_if(is.numeric)
dfnum <- data.frame(lapply(dfnum, function(col) as.numeric(as.character(col))))
res <- cor(dfnum)
corrplot(res, method = "color", type = "upper", tl.col = "#636363")

Select variables for testing

# First candidate feature set: the target plus 20 predictors.
d1 <- df2 %>%
  select(
    fraudulent,
    emptype_specified, has_company_logo, has_questions, required_experience,
    company_profile_specified, requirements_specified, benefits_specified,
    salary_specified,
    company_profile_char, description_char, requirements_char,
    company_profile_word, description_word, requirements_word, benefits_word,
    reqedu_specified, location_specified, department_specified,
    function._specified, reqexp_specified
  )
dim(d1)

## [1] 17880    21

Test and train set

# 80/20 train/test split with a fixed seed for reproducibility.
# Sizes are derived from the data instead of being hard-coded; for the
# 17880-row data set this is the same draw as sample(1:17880, 14304).
set.seed(1234)
n_obs <- nrow(d1)
n_train <- round(0.8 * n_obs)
y1 <- sample(seq_len(n_obs), n_train)  # row indices of the training set
xtrain <- d1[y1, ]
xtest <- d1[-y1, ]
# Check that the class balance of the training set matches the full data.
Hmisc:: describe(xtrain$fraudulent)

## xtrain$fraudulent 
##        n  missing distinct 
##    14304        0        2 
##                       
## Value          0     1
## Frequency  13618   686
## Proportion 0.952 0.048

# Class balance of the held-out test set.
Hmisc:: describe(xtest$fraudulent)

## xtest$fraudulent 
##        n  missing distinct 
##     3576        0        2 
##                     
## Value         0    1
## Frequency  3396  180
## Proportion 0.95 0.05

Decision Tree 1

# Classification tree on all predictors in xtrain. minsplit/minbucket of 1 put
# no lower bound on node size, so the complexity parameter cp alone limits growth.
mt <- rpart(
  fraudulent ~ .,
  data = xtrain,
  method = "class",
  control = rpart.control(minsplit = 1, minbucket = 1, cp = 0.0082)
)
fancyRpartPlot(mt)

# Complexity table and the variables actually used in the splits.
printcp(mt)

## 
## Classification tree:
## rpart(formula = fraudulent ~ ., data = xtrain, method = "class", 
##     control = rpart.control(minsplit = 1, minbucket = 1, cp = 0.0082))
## 
## Variables actually used in tree construction:
##  [1] benefits_word             company_profile_specified
##  [3] department_specified      description_char         
##  [5] description_word          has_company_logo         
##  [7] has_questions             location_specified       
##  [9] required_experience       requirements_char        
## [11] requirements_specified    requirements_word        
## [13] salary_specified         
## 
## Root node error: 686/14304 = 0.047959
## 
## n= 14304 
## 
##          CP nsplit rel error  xerror     xstd
## 1 0.0082604      0   1.00000 1.00000 0.037253
## 2 0.0082000     24   0.73032 0.89213 0.035282

# Variable importance scores stored on the fitted tree (printed largest first below).
mt$variable.importance

##      company_profile_char company_profile_specified      company_profile_word 
##                111.112819                111.112819                111.112819 
##          has_company_logo       required_experience         requirements_word 
##                 97.029716                 67.188815                 64.935684 
##         requirements_char          description_char          description_word 
##                 58.498860                 51.324283                 48.584153 
##             benefits_word             has_questions          reqexp_specified 
##                 40.489576                 29.186729                 17.687038 
##          salary_specified    requirements_specified      department_specified 
##                 16.529911                 13.212371                 12.154478 
##        location_specified        benefits_specified       function._specified 
##                  9.803922                  9.572962                  6.505612 
##          reqedu_specified         emptype_specified 
##                  4.277893                  3.634955

# Predicted classes for the test set and the resulting confusion matrix.
# NOTE(review): the printed matrix reports 'Positive' Class : 0, so Sensitivity
# and F1 describe the legitimate postings; Specificity is the fraud detection rate.
tree.p <- predict(mt, newdata = xtest, type = "class")
cmt <- confusionMatrix(data = tree.p, reference = xtest$fraudulent)
cmt

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3378  136
##          1   18   44
##                                           
##                Accuracy : 0.9569          
##                  95% CI : (0.9498, 0.9634)
##     No Information Rate : 0.9497          
##     P-Value [Acc > NIR] : 0.02357         
##                                           
##                   Kappa : 0.3468          
##                                           
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.9947          
##             Specificity : 0.2444          
##          Pos Pred Value : 0.9613          
##          Neg Pred Value : 0.7097          
##              Prevalence : 0.9497          
##          Detection Rate : 0.9446          
##    Detection Prevalence : 0.9827          
##       Balanced Accuracy : 0.6196          
##                                           
##        'Positive' Class : 0               
##

# F1 score from the confusion matrix, rounded to 4 decimals.
# NOTE(review): computed for the positive class reported above ("0", non-fraud).
round(cmt$byClass["F1"], 4)

##     F1 
## 0.9777

# Keep the tree's class predictions on the test set and draw its ROC curve.
# NOTE(review): the predictor is the hard 0/1 class label, not a probability,
# so the curve has a single operating point.
xtest$tp1 <- tree.p
roc_t1 <- roc(
  response = xtest$fraudulent,
  predictor = factor(xtest$tp1, ordered = TRUE),
  plot = TRUE,
  print.auc = TRUE
)

# Horizontal bar chart of the tree's variable importance, sorted ascending so
# the most important variable ends up at the top after coord_flip().
v1 <- data.frame(imp = mt$variable.importance)
v2 <- tibble::rownames_to_column(v1, var = "variable")
v2 <- dplyr::arrange(v2, imp)
v2 <- dplyr::mutate(v2, variable = forcats::fct_inorder(variable))
ggplot2::ggplot(v2, aes(x = variable, y = imp, fill = imp)) +
  geom_col(col = "white", show.legend = FALSE) +
  coord_flip() +
  scale_fill_viridis() +
  theme_minimal() +
  labs(x = "Variable", y = "Importance")

Random Forest 1

# Random forest with default settings on the same training data; the seed
# makes the fit reproducible.
set.seed(4543)
rf <- randomForest(fraudulent ~ ., data = xtrain)
# Importance of each predictor (mean decrease in Gini, see output below).
importance(rf)

##                           MeanDecreaseGini
## emptype_specified                19.761290
## has_company_logo                 46.250640
## has_questions                    29.868083
## required_experience              52.037617
## company_profile_specified        32.258745
## requirements_specified            6.631810
## benefits_specified               14.678876
## salary_specified                 24.283376
## company_profile_char            111.420000
## description_char                160.702803
## requirements_char               118.070348
## company_profile_word             94.759215
## description_word                145.402700
## requirements_word               109.332330
## benefits_word                   102.541178
## reqedu_specified                 17.820544
## location_specified                7.079038
## department_specified             22.108278
## function._specified              16.357035
## reqexp_specified                 14.818044

# How often each predictor is used in the forest: counts (count = TRUE) summed
# over all trees (by.tree = FALSE). Presumably in the same order as the
# predictors listed by importance() above - TODO confirm.
varUsed(rf, by.tree=FALSE, count =TRUE)

##  [1]  4617  3841  6913 13365   602  2758  5821  6468 13521 31893 26723 12758
## [13] 30405 25412 21942  6158  2145  8143  6013  3583

# Dot chart of the forest's variable importance.
varImpPlot(rf)

# Test-set predictions and confusion matrix for the random forest.
# NOTE(review): as printed below, the positive class is "0" (non-fraud).
rfp <- predict(rf, newdata = xtest)
cmrf <- confusionMatrix(data = rfp, reference = xtest$fraudulent)
cmrf

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3392   96
##          1    4   84
##                                           
##                Accuracy : 0.972           
##                  95% CI : (0.9661, 0.9772)
##     No Information Rate : 0.9497          
##     P-Value [Acc > NIR] : 2.08e-11        
##                                           
##                   Kappa : 0.6141          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9988          
##             Specificity : 0.4667          
##          Pos Pred Value : 0.9725          
##          Neg Pred Value : 0.9545          
##              Prevalence : 0.9497          
##          Detection Rate : 0.9485          
##    Detection Prevalence : 0.9754          
##       Balanced Accuracy : 0.7327          
##                                           
##        'Positive' Class : 0               
##

# F1 score of the random forest, rounded to 4 decimals.
# NOTE(review): computed for the positive class reported above ("0", non-fraud).
round(cmrf$byClass["F1"], 4)

##     F1 
## 0.9855

# Keep the forest's class predictions on the test set and draw its ROC curve
# (the predictor is the hard class label, as for the decision tree).
xtest$rfp <- rfp
roc_rf <- roc(
  response = xtest$fraudulent,
  predictor = factor(xtest$rfp, ordered = TRUE),
  plot = TRUE,
  print.auc = TRUE
)

Logistic regression 1

# Logistic regression of the fraud label on every predictor in xtrain.
model1 <- glm(fraudulent ~ ., data = xtrain, family = "binomial")
# Coefficient table; reqexp_specified comes out NA ("not defined because of
# singularities") in the output below.
summary(model1)

## 
## Call:
## glm(formula = fraudulent ~ ., family = "binomial", data = xtrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3801  -0.2634  -0.1657  -0.1127   3.6107  
## 
## Coefficients: (1 not defined because of singularities)
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                         -1.018e+00  2.872e-01  -3.544 0.000393 ***
## emptype_specified1                  -2.339e-01  1.328e-01  -1.761 0.078291 .  
## has_company_logo1                   -1.198e+00  1.359e-01  -8.811  < 2e-16 ***
## has_questions1                      -5.351e-01  9.786e-02  -5.467 4.56e-08 ***
## required_experienceAssociate        -1.282e+00  2.133e-01  -6.008 1.88e-09 ***
## required_experienceDirector         -5.966e-01  3.488e-01  -1.711 0.087173 .  
## required_experienceEntry level      -8.491e-02  1.511e-01  -0.562 0.574257    
## required_experienceExecutive        -4.519e-01  4.406e-01  -1.026 0.305118    
## required_experienceInternship       -9.207e-01  3.926e-01  -2.345 0.019011 *  
## required_experienceMid-Senior level -9.934e-01  1.599e-01  -6.213 5.20e-10 ***
## required_experienceNot Applicable   -5.246e-01  1.989e-01  -2.637 0.008355 ** 
## company_profile_specified1          -1.455e+00  1.572e-01  -9.256  < 2e-16 ***
## requirements_specified1              5.283e-01  1.426e-01   3.705 0.000211 ***
## benefits_specified1                  1.542e-01  1.106e-01   1.395 0.163079    
## salary_specified1                    4.279e-01  1.126e-01   3.799 0.000145 ***
## company_profile_char                 6.218e-03  6.266e-04   9.923  < 2e-16 ***
## description_char                    -6.679e-05  1.273e-04  -0.525 0.599788    
## requirements_char                    3.496e-04  5.295e-04   0.660 0.509077    
## company_profile_word                -4.408e-02  4.601e-03  -9.580  < 2e-16 ***
## description_word                     2.994e-04  9.590e-04   0.312 0.754890    
## requirements_word                   -6.793e-03  4.041e-03  -1.681 0.092769 .  
## benefits_word                        3.140e-03  9.663e-04   3.250 0.001154 ** 
## reqedu_specified1                   -1.708e-02  1.269e-01  -0.135 0.892907    
## location_specified1                 -5.268e-01  2.743e-01  -1.920 0.054814 .  
## department_specified1                4.570e-01  9.416e-02   4.853 1.22e-06 ***
## function._specified1                 5.720e-01  1.338e-01   4.276 1.90e-05 ***
## reqexp_specified1                           NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5505.9  on 14303  degrees of freedom
## Residual deviance: 4186.0  on 14278  degrees of freedom
## AIC: 4238
## 
## Number of Fisher Scoring iterations: 7

# Pseudo-R-squared measures for the logistic model (McFadden, r2ML, r2CU in
# the output below).
pR2(model1)

## fitting null model for pseudo-r2

##           llh       llhNull            G2      McFadden          r2ML 
## -2.093001e+03 -2.752949e+03  1.319896e+03  2.397240e-01  8.814526e-02 
##          r2CU 
##  2.758893e-01

# Sequential analysis-of-deviance table with chi-square tests.
anova(model1, test = "Chisq")

# Visualize coefficients.
# `colors` is spelled out in full (the earlier `color=` only worked through
# partial argument matching); this matches the call used for model2.
theme_set(theme_light())
plot_model(model1, colors = "system")

# Classify with a probability threshold of 0.5.
prob <- predict(model1, xtest, type = "response")
# 1 when the predicted fraud probability exceeds the threshold, else 0; the
# length follows xtest instead of a hard-coded 3576.
prob1 <- as.numeric(prob > 0.5)
# Fix both factor levels so the confusion matrix still works when no (or every)
# posting crosses the threshold.
# NOTE(review): confusionMatrix() reports "0" (non-fraud) as the positive class.
cmlr <- confusionMatrix(factor(prob1, levels = c(0, 1)), xtest$fraudulent)
cmlr

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3395  177
##          1    1    3
##                                           
##                Accuracy : 0.9502          
##                  95% CI : (0.9426, 0.9571)
##     No Information Rate : 0.9497          
##     P-Value [Acc > NIR] : 0.4588          
##                                           
##                   Kappa : 0.0305          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.99971         
##             Specificity : 0.01667         
##          Pos Pred Value : 0.95045         
##          Neg Pred Value : 0.75000         
##              Prevalence : 0.94966         
##          Detection Rate : 0.94938         
##    Detection Prevalence : 0.99888         
##       Balanced Accuracy : 0.50819         
##                                           
##        'Positive' Class : 0               
##

# F1 score of the logistic model at the 0.5 threshold, rounded to 4 decimals.
# NOTE(review): computed for the positive class reported above ("0", non-fraud).
round(cmlr$byClass["F1"], 4)

##     F1 
## 0.9745

# ROC curve for the 0.5-threshold predictions. Stored as roc_lr1: it was
# previously assigned to roc_lr2, which the 0.1-threshold ROC further down
# overwrites, so this result was lost.
roc_lr1 <- roc(xtest$fraudulent, prob1, plot = TRUE, print.auc = TRUE)

# Classify with a probability threshold of 0.1 (reuses `prob` from model1).
# The length follows `prob` instead of a hard-coded 3576, and both factor levels
# are fixed so the confusion matrix works even if one class is never predicted.
prob2 <- as.numeric(prob > 0.1)
cmlr2 <- confusionMatrix(factor(prob2, levels = c(0, 1)), xtest$fraudulent)
cmlr2

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2999   67
##          1  397  113
##                                           
##                Accuracy : 0.8702          
##                  95% CI : (0.8588, 0.8811)
##     No Information Rate : 0.9497          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2735          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8831          
##             Specificity : 0.6278          
##          Pos Pred Value : 0.9781          
##          Neg Pred Value : 0.2216          
##              Prevalence : 0.9497          
##          Detection Rate : 0.8386          
##    Detection Prevalence : 0.8574          
##       Balanced Accuracy : 0.7554          
##                                           
##        'Positive' Class : 0               
##

# F1 score of the logistic model at the 0.1 threshold, rounded to 4 decimals.
# NOTE(review): computed for the positive class reported above ("0", non-fraud).
round(cmlr2$byClass["F1"], 4)

##     F1 
## 0.9282

# ROC curve for the 0.1-threshold class predictions (prob2 holds 0/1 labels,
# not probabilities).
roc_lr2 = roc(xtest$fraudulent, prob2, plot=TRUE, print.auc=TRUE)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

Select variables for testing part 2

# Second, reduced feature set: the location, department, function and
# required-experience indicators of the first set are left out.
d2 <- df2 %>%
  select(
    fraudulent,
    emptype_specified, has_company_logo, has_questions, required_experience,
    company_profile_specified, requirements_specified, benefits_specified,
    salary_specified,
    company_profile_char, description_char, requirements_char,
    company_profile_word, description_word, requirements_word, benefits_word,
    reqedu_specified
  )
dim(d2)

## [1] 17880    17

# Reuse the row indices y1 so both feature sets share the same train/test rows.
xtrain2 <- d2[y1, ]
xtest2 <- d2[-y1, ]

Decision Tree 2

# Classification tree on the reduced feature set, with the same control
# settings as the first tree.
mt2 <- rpart(
  fraudulent ~ .,
  data = xtrain2,
  method = "class",
  control = rpart.control(minsplit = 1, minbucket = 1, cp = 0.0082)
)
fancyRpartPlot(mt2)

# Complexity table and the variables actually used in the splits.
printcp(mt2)

## 
## Classification tree:
## rpart(formula = fraudulent ~ ., data = xtrain2, method = "class", 
##     control = rpart.control(minsplit = 1, minbucket = 1, cp = 0.0082))
## 
## Variables actually used in tree construction:
##  [1] benefits_word             company_profile_specified
##  [3] description_char          description_word         
##  [5] has_company_logo          has_questions            
##  [7] required_experience       requirements_char        
##  [9] requirements_word         salary_specified         
## 
## Root node error: 686/14304 = 0.047959
## 
## n= 14304 
## 
##          CP nsplit rel error  xerror     xstd
## 1 0.0082604      0   1.00000 1.00000 0.037253
## 2 0.0082000     21   0.76531 0.87609 0.034978

# Variable importance scores stored on the second tree.
mt2$variable.importance

##      company_profile_char company_profile_specified      company_profile_word 
##                111.112819                111.112819                111.112819 
##          has_company_logo          description_char          description_word 
##                 97.029716                 76.949822                 62.044814 
##       required_experience         requirements_word         requirements_char 
##                 54.450410                 52.511917                 44.140326 
##             benefits_word             has_questions          salary_specified 
##                 35.996403                 29.186729                 10.312530 
##        benefits_specified    requirements_specified          reqedu_specified 
##                  5.096677                  3.020612                  2.733902 
##         emptype_specified 
##                  1.010292

# Prediction: classes for the test set and the resulting confusion matrix.
# NOTE(review): as printed below, the positive class is "0" (non-fraud).
tree.p2 <- predict(mt2, newdata = xtest2, type = "class")
cmt2 <- confusionMatrix(data = tree.p2, reference = xtest2$fraudulent)
cmt2

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3379  136
##          1   17   44
##                                           
##                Accuracy : 0.9572          
##                  95% CI : (0.9501, 0.9636)
##     No Information Rate : 0.9497          
##     P-Value [Acc > NIR] : 0.01945         
##                                           
##                   Kappa : 0.3485          
##                                           
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.9950          
##             Specificity : 0.2444          
##          Pos Pred Value : 0.9613          
##          Neg Pred Value : 0.7213          
##              Prevalence : 0.9497          
##          Detection Rate : 0.9449          
##    Detection Prevalence : 0.9829          
##       Balanced Accuracy : 0.6197          
##                                           
##        'Positive' Class : 0               
##

# F1 score of the second tree, rounded to 4 decimals.
# NOTE(review): computed for the positive class reported above ("0", non-fraud).
round(cmt2$byClass["F1"], 4)

##     F1 
## 0.9779

# Keep the second tree's class predictions and draw its ROC curve (the
# predictor is the hard class label, not a probability).
xtest2$tp2 <- tree.p2
roc_t2 <- roc(
  response = xtest2$fraudulent,
  predictor = factor(xtest2$tp2, ordered = TRUE),
  plot = TRUE,
  print.auc = TRUE
)

# Horizontal bar chart of the second tree's variable importance, sorted
# ascending so the most important variable is at the top after coord_flip().
v3 <- data.frame(imp = mt2$variable.importance)
v4 <- tibble::rownames_to_column(v3, var = "variable")
v4 <- dplyr::arrange(v4, imp)
v4 <- dplyr::mutate(v4, variable = forcats::fct_inorder(variable))
ggplot2::ggplot(v4, aes(x = variable, y = imp, fill = imp)) +
  geom_col(col = "white", show.legend = FALSE) +
  coord_flip() +
  scale_fill_viridis() +
  theme_minimal() +
  labs(x = "Variable", y = "Importance")

Random Forest 2

# Random forest on the reduced feature set, with the same seed as the first forest.
set.seed(4543)
rf2 <- randomForest(fraudulent ~ ., data = xtrain2)
# Importance of each predictor (mean decrease in Gini, see output below).
importance(rf2)

##                           MeanDecreaseGini
## emptype_specified                21.070030
## has_company_logo                 43.892486
## has_questions                    31.880177
## required_experience              65.084655
## company_profile_specified        33.457367
## requirements_specified            6.029228
## benefits_specified               13.886519
## salary_specified                 25.886002
## company_profile_char            118.570356
## description_char                174.675219
## requirements_char               126.886685
## company_profile_word            104.304001
## description_word                161.028799
## requirements_word               116.330389
## benefits_word                   110.087861
## reqedu_specified                 19.959067

# How often each predictor is used in the second forest: counts (count = TRUE)
# summed over all trees (by.tree = FALSE).
varUsed(rf2, by.tree=FALSE, count =TRUE)

##  [1]  5281  3590  6436 15000   386  2416  5383  6196 14653 34959 28549 13342
## [13] 33330 27246 23213  6065

# Dot chart of the second forest's variable importance.
varImpPlot(rf2)

# Prediction: test-set classes and confusion matrix for the second forest.
# NOTE(review): as printed below, the positive class is "0" (non-fraud).
rfp2 <- predict(rf2, newdata = xtest2)
cmrf2 <- confusionMatrix(data = rfp2, reference = xtest2$fraudulent)
cmrf2

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3392   92
##          1    4   88
##                                           
##                Accuracy : 0.9732          
##                  95% CI : (0.9673, 0.9782)
##     No Information Rate : 0.9497          
##     P-Value [Acc > NIR] : 1.618e-12       
##                                           
##                   Kappa : 0.6346          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9988          
##             Specificity : 0.4889          
##          Pos Pred Value : 0.9736          
##          Neg Pred Value : 0.9565          
##              Prevalence : 0.9497          
##          Detection Rate : 0.9485          
##    Detection Prevalence : 0.9743          
##       Balanced Accuracy : 0.7439          
##                                           
##        'Positive' Class : 0               
##

# F1 score of the second random forest, rounded to 4 decimals.
# NOTE(review): computed for the positive class reported above ("0", non-fraud).
round(cmrf2$byClass["F1"], 4)

##    F1 
## 0.986

# Keep the second forest's class predictions and draw its ROC curve.
# Stored as roc_rf2: assigning to roc_rf here overwrote the ROC object of the
# first random forest.
xtest2$rfp2 <- rfp2
roc_rf2 <- roc(
  response = xtest2$fraudulent,
  predictor = factor(xtest2$rfp2, ordered = TRUE),
  plot = TRUE,
  print.auc = TRUE
)

Logistic regression 2

# Logistic regression on the reduced feature set.
model2 <- glm(fraudulent ~ ., data = xtrain2, family = "binomial")
summary(model2)

## 
## Call:
## glm(formula = fraudulent ~ ., family = "binomial", data = xtrain2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2525  -0.2723  -0.1703  -0.1151   3.4482  
## 
## Coefficients:
##                                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                         -1.415e+00  1.374e-01 -10.296  < 2e-16 ***
## emptype_specified1                  -1.593e-01  1.228e-01  -1.297   0.1945    
## has_company_logo1                   -1.113e+00  1.334e-01  -8.348  < 2e-16 ***
## has_questions1                      -5.173e-01  9.703e-02  -5.331 9.75e-08 ***
## required_experienceAssociate        -1.111e+00  2.101e-01  -5.287 1.24e-07 ***
## required_experienceDirector         -3.895e-01  3.458e-01  -1.126   0.2600    
## required_experienceEntry level       9.819e-02  1.450e-01   0.677   0.4984    
## required_experienceExecutive        -2.654e-01  4.371e-01  -0.607   0.5438    
## required_experienceInternship       -6.846e-01  3.887e-01  -1.761   0.0782 .  
## required_experienceMid-Senior level -8.121e-01  1.551e-01  -5.235 1.65e-07 ***
## required_experienceNot Applicable   -3.626e-01  1.950e-01  -1.859   0.0630 .  
## company_profile_specified1          -1.543e+00  1.547e-01  -9.978  < 2e-16 ***
## requirements_specified1              5.853e-01  1.416e-01   4.135 3.55e-05 ***
## benefits_specified1                  1.962e-01  1.094e-01   1.793   0.0730 .  
## salary_specified1                    5.081e-01  1.120e-01   4.537 5.72e-06 ***
## company_profile_char                 6.223e-03  6.299e-04   9.880  < 2e-16 ***
## description_char                    -9.761e-05  1.309e-04  -0.746   0.4559    
## requirements_char                    4.961e-04  5.185e-04   0.957   0.3387    
## company_profile_word                -4.362e-02  4.632e-03  -9.417  < 2e-16 ***
## description_word                     5.772e-04  9.778e-04   0.590   0.5550    
## requirements_word                   -7.453e-03  3.964e-03  -1.880   0.0601 .  
## benefits_word                        3.695e-03  9.413e-04   3.925 8.66e-05 ***
## reqedu_specified1                    1.147e-01  1.224e-01   0.937   0.3489    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5505.9  on 14303  degrees of freedom
## Residual deviance: 4235.2  on 14281  degrees of freedom
## AIC: 4281.2
## 
## Number of Fisher Scoring iterations: 7

# Pseudo-R-squared measures for the second logistic model.
pR2(model2)

## fitting null model for pseudo-r2

##           llh       llhNull            G2      McFadden          r2ML 
## -2.117584e+03 -2.752949e+03  1.270730e+03  2.307944e-01  8.500565e-02 
##          r2CU 
##  2.660625e-01

# Sequential analysis-of-deviance table with chi-square tests.
anova(model2, test = "Chisq")

# Classify model2's test-set probabilities with a threshold of 0.1.
probb <- predict(model2, xtest2, type = "response")
# BUG FIX: the threshold was applied to `prob` (model1's probabilities) instead
# of `probb`, so this section silently re-evaluated model1. The confusion matrix
# and F1 printed below were produced by the old code (they duplicate model1's
# 0.1-threshold results) and must be regenerated.
probb1 <- as.numeric(probb > 0.1)
# Both factor levels are fixed so the matrix works even if one class is never
# predicted; the length follows xtest2 instead of a hard-coded 3576.
cflr3 <- confusionMatrix(factor(probb1, levels = c(0, 1)), xtest2$fraudulent)
cflr3

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2999   67
##          1  397  113
##                                           
##                Accuracy : 0.8702          
##                  95% CI : (0.8588, 0.8811)
##     No Information Rate : 0.9497          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2735          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8831          
##             Specificity : 0.6278          
##          Pos Pred Value : 0.9781          
##          Neg Pred Value : 0.2216          
##              Prevalence : 0.9497          
##          Detection Rate : 0.8386          
##    Detection Prevalence : 0.8574          
##       Balanced Accuracy : 0.7554          
##                                           
##        'Positive' Class : 0               
##

# F1 score from the cflr3 confusion matrix, rounded to 4 decimals.
# NOTE(review): computed for the positive class reported above ("0", non-fraud).
round(cflr3$byClass["F1"], 4)

##     F1 
## 0.9282

# ROC curve for the thresholded 0/1 predictions held in probb1.
roc_lr3 = roc(xtest2$fraudulent, probb1, plot=TRUE, print.auc=TRUE)

# Visualize the coefficients of the second logistic model.
plot_model(model2, colors = "system")