Data source: Fake job posting prediction
Research question: What are the important features for detecting fraudulent job posts?
Load packages
library(Hmisc)
library(dplyr)
library(stringr)
library(tidyverse)
library(skimr)
library(corrplot)
library(pscl)
library(tree)
library(rpart)
library(rpart.plot)
library(rattle)
library(viridis)
library(randomForest)
library(caret)
library(pROC)
library(jtools)
library(xgboost)
library(sjPlot)
library(RColorBrewer)
Import data
# Read the raw job-postings CSV; its first row supplies the column names.
df <- read.csv(file = "fake_job_postings.csv", header = TRUE)
Summary
# Recode the indicator columns and the target as factors, then profile
# every column of the data set.
binary_cols <- c("telecommuting", "has_company_logo", "has_questions", "fraudulent")
df[binary_cols] <- lapply(df[binary_cols], factor)
skim(df)
| Name | df |
| Number of rows | 17880 |
| Number of columns | 18 |
| _______________________ | |
| Column type frequency: | |
| character | 13 |
| factor | 4 |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| title | 0 | 1 | 3 | 142 | 0 | 11231 | 0 |
| location | 0 | 1 | 0 | 161 | 346 | 3106 | 0 |
| department | 0 | 1 | 0 | 255 | 11547 | 1338 | 6 |
| salary_range | 0 | 1 | 0 | 20 | 15180 | 859 | 0 |
| company_profile | 0 | 1 | 0 | 6178 | 3308 | 1710 | 0 |
| description | 0 | 1 | 3 | 14907 | 0 | 14802 | 0 |
| requirements | 0 | 1 | 0 | 10864 | 2694 | 11970 | 0 |
| benefits | 2 | 1 | 0 | 4429 | 7206 | 6207 | 0 |
| employment_type | 0 | 1 | 0 | 9 | 3471 | 6 | 0 |
| required_experience | 0 | 1 | 0 | 16 | 7050 | 8 | 0 |
| required_education | 0 | 1 | 0 | 33 | 8105 | 14 | 0 |
| industry | 0 | 1 | 0 | 36 | 4903 | 132 | 0 |
| function. | 0 | 1 | 0 | 22 | 6455 | 38 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| telecommuting | 0 | 1 | FALSE | 2 | 0: 17113, 1: 767 |
| has_company_logo | 0 | 1 | FALSE | 2 | 1: 14220, 0: 3660 |
| has_questions | 0 | 1 | FALSE | 2 | 0: 9088, 1: 8792 |
| fraudulent | 0 | 1 | FALSE | 2 | 0: 17014, 1: 866 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| job_id | 0 | 1 | 8940.5 | 5161.66 | 1 | 4470.75 | 8940.5 | 13410.25 | 17880 | ▇▇▇▇▇ |
Target variable
# Class balance of the target: the data set is imbalanced.
Hmisc::describe(df$fraudulent)
## df$fraudulent
## n missing distinct
## 17880 0 2
##
## Value 0 1
## Frequency 17014 866
## Proportion 0.952 0.048
# Bar chart of the target classes with each bar's count printed above it.
ggplot(data = df, mapping = aes(x = fraudulent, fill = fraudulent)) +
  geom_bar(width = 0.7) +
  geom_text(aes(label = ..count..), stat = "count", vjust = -0.3) +
  theme_light() +
  # Remove the panel border, axis ticks, and all grid lines set here.
  theme(
    panel.border = element_blank(),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank()
  ) +
  scale_fill_manual(values = c("#457b9d", "#606c38")) +
  ylim(c(0, 18000))
Create new variables
# Engineer features from the raw posting fields on a copy of `df`.
df2 <- df

# binary
# Each entry maps a new flag column to the raw column it is derived from.
# A flag is "1" when the raw field is non-empty and "0" when it is "".
# `[[` matches column names exactly; the previous `df2$salary` only reached
# `salary_range` through `$` partial matching.
flag_sources <- c(
  company_profile_specified = "company_profile",
  description_specified = "description",
  requirements_specified = "requirements",
  benefits_specified = "benefits",
  salary_specified = "salary_range",
  location_specified = "location",
  "function._specified" = "function.",
  department_specified = "department",
  industry_specified = "industry",
  emptype_specified = "employment_type",
  reqexp_specified = "required_experience",
  reqedu_specified = "required_education"
)
for (flag in names(flag_sources)) {
  df2[[flag]] <- ifelse(df2[[flag_sources[[flag]]]] == "", "0", "1")
}

# Free-text fields that receive length-based features.
text_cols <- c("company_profile", "description", "requirements", "benefits")

# character count
for (col in text_cols) {
  df2[[paste0(col, "_char")]] <- nchar(df2[[col]])
}

# word count
# NOTE(review): splitting on "\\S+" counts the pieces left between words; a
# field that ends in whitespace appears to be counted one too high. Kept as-is
# so the engineered features (and downstream results) do not change.
for (col in text_cols) {
  df2[[paste0(col, "_word")]] <- lengths(strsplit(df2[[col]], "\\S+"))
}

# avg word length (characters per word; 0/0 gives NaN for empty fields)
for (col in text_cols) {
  df2[[paste0(col, "_awl")]] <-
    df2[[paste0(col, "_char")]] / df2[[paste0(col, "_word")]]
}

# concatenate four text variables and compute the same measures on the result
df2$text1 <- paste(
  df2$company_profile, df2$description, df2$requirements, df2$benefits
)
df2$text1_char <- nchar(df2$text1)
df2$text1_word <- lengths(strsplit(df2$text1, "\\S+"))
df2$text1_avl <- df2$text1_char / df2$text1_word
colnames(df2)
## [1] "job_id" "title"
## [3] "location" "department"
## [5] "salary_range" "company_profile"
## [7] "description" "requirements"
## [9] "benefits" "telecommuting"
## [11] "has_company_logo" "has_questions"
## [13] "employment_type" "required_experience"
## [15] "required_education" "industry"
## [17] "function." "fraudulent"
## [19] "company_profile_specified" "description_specified"
## [21] "requirements_specified" "benefits_specified"
## [23] "salary_specified" "location_specified"
## [25] "function._specified" "department_specified"
## [27] "industry_specified" "emptype_specified"
## [29] "reqexp_specified" "reqedu_specified"
## [31] "company_profile_char" "description_char"
## [33] "requirements_char" "benefits_char"
## [35] "company_profile_word" "description_word"
## [37] "requirements_word" "benefits_word"
## [39] "company_profile_awl" "description_awl"
## [41] "requirements_awl" "benefits_awl"
## [43] "text1" "text1_char"
## [45] "text1_word" "text1_avl"
# Convert the twelve "*_specified" flags to factors.
spec_cols <- c(
  "company_profile_specified", "description_specified",
  "requirements_specified", "benefits_specified", "salary_specified",
  "location_specified", "function._specified", "department_specified",
  "industry_specified", "emptype_specified", "reqexp_specified",
  "reqedu_specified"
)
df2[spec_cols] <- lapply(df2[spec_cols], factor)
# Replace every remaining missing value in the data frame with 0.
df2[is.na(df2)] <- 0
Check correlation
# Correlation heat map of the numeric columns (the first column is dropped).
dfnum <- Filter(is.numeric, df2[, -1])
dfnum <- data.frame(lapply(dfnum, function(col) as.numeric(as.character(col))))
res <- cor(dfnum)
corrplot(res, method = "color", type = "upper", tl.col = "#636363")
Select variables for testing
# Modelling table, first variant: the target plus 20 candidate predictors.
d1 <- df2[, c(
  "fraudulent", "emptype_specified", "has_company_logo", "has_questions",
  "required_experience", "company_profile_specified",
  "requirements_specified", "benefits_specified", "salary_specified",
  "company_profile_char", "description_char", "requirements_char",
  "company_profile_word", "description_word", "requirements_word",
  "benefits_word", "reqedu_specified", "location_specified",
  "department_specified", "function._specified", "reqexp_specified"
)]
dim(d1)
## [1] 17880 21
Test and train set
# Reproducible 80/20 train/test split.
set.seed(1234)
# Derive the sizes from the data instead of hard-coding them; for the
# 17880-row table this still draws 14304 training indices.
n_obs <- nrow(d1)
y1 <- sample(seq_len(n_obs), size = round(0.8 * n_obs))
xtrain <- d1[y1, ]
xtest <- d1[-y1, ]
# Check that the class balance is preserved in the training set.
Hmisc::describe(xtrain$fraudulent)
## xtrain$fraudulent
## n missing distinct
## 14304 0 2
##
## Value 0 1
## Frequency 13618 686
## Proportion 0.952 0.048
# Class balance of the held-out test set.
Hmisc::describe(xtest$fraudulent)
## xtest$fraudulent
## n missing distinct
## 3576 0 2
##
## Value 0 1
## Frequency 3396 180
## Proportion 0.95 0.05
# Classification tree on all predictors in xtrain. minsplit and minbucket are
# set to 1, so node size does not restrict splitting; cp = 0.0082 is the
# complexity threshold a split must meet.
mt <- rpart(
  fraudulent ~ .,
  data = xtrain,
  method = "class",
  control = rpart.control(minsplit = 1, minbucket = 1, cp = 0.0082)
)
fancyRpartPlot(mt)
printcp(mt)
##
## Classification tree:
## rpart(formula = fraudulent ~ ., data = xtrain, method = "class",
## control = rpart.control(minsplit = 1, minbucket = 1, cp = 0.0082))
##
## Variables actually used in tree construction:
## [1] benefits_word company_profile_specified
## [3] department_specified description_char
## [5] description_word has_company_logo
## [7] has_questions location_specified
## [9] required_experience requirements_char
## [11] requirements_specified requirements_word
## [13] salary_specified
##
## Root node error: 686/14304 = 0.047959
##
## n= 14304
##
## CP nsplit rel error xerror xstd
## 1 0.0082604 0 1.00000 1.00000 0.037253
## 2 0.0082000 24 0.73032 0.89213 0.035282
# Importance score the fitted tree assigns to each predictor.
mt[["variable.importance"]]
## company_profile_char company_profile_specified company_profile_word
## 111.112819 111.112819 111.112819
## has_company_logo required_experience requirements_word
## 97.029716 67.188815 64.935684
## requirements_char description_char description_word
## 58.498860 51.324283 48.584153
## benefits_word has_questions reqexp_specified
## 40.489576 29.186729 17.687038
## salary_specified requirements_specified department_specified
## 16.529911 13.212371 12.154478
## location_specified benefits_specified function._specified
## 9.803922 9.572962 6.505612
## reqedu_specified emptype_specified
## 4.277893 3.634955
# Classify the held-out rows with the tree and tabulate against the truth.
# NOTE(review): the printed matrix reports 'Positive' Class : 0, so the
# Sensitivity and F1 values describe the non-fraud class; passing
# positive = "1" would score fraud detection instead.
tree.p <- predict(mt, newdata = xtest, type = "class")
cmt <- confusionMatrix(data = tree.p, reference = xtest$fraudulent)
cmt
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3378 136
## 1 18 44
##
## Accuracy : 0.9569
## 95% CI : (0.9498, 0.9634)
## No Information Rate : 0.9497
## P-Value [Acc > NIR] : 0.02357
##
## Kappa : 0.3468
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.9947
## Specificity : 0.2444
## Pos Pred Value : 0.9613
## Neg Pred Value : 0.7097
## Prevalence : 0.9497
## Detection Rate : 0.9446
## Detection Prevalence : 0.9827
## Balanced Accuracy : 0.6196
##
## 'Positive' Class : 0
##
# F1 score of the tree, rounded to four decimals.
round(cmt$byClass["F1"], digits = 4)
## F1
## 0.9777
# Keep the tree's class predictions on the test frame and draw a ROC curve.
# NOTE(review): the predictor is the predicted class label, not a probability.
xtest$tp1 <- tree.p
roc_t1 <- roc(
  response = xtest$fraudulent,
  predictor = factor(xtest$tp1, ordered = TRUE),
  plot = TRUE,
  print.auc = TRUE
)
# visualize variable importance
# Move the predictor names out of the row names, sort by importance, and
# freeze that order as factor levels so the bars are drawn in sorted order.
v1 <- data.frame(imp = mt$variable.importance)
v2 <- tibble::rownames_to_column(v1, var = "variable")
v2 <- dplyr::arrange(v2, imp)
v2 <- dplyr::mutate(v2, variable = forcats::fct_inorder(variable))
ggplot2::ggplot(v2) +
  geom_col(
    aes(x = variable, y = imp, fill = imp),
    col = "white",
    show.legend = FALSE
  ) +
  coord_flip() +
  scale_fill_viridis() +
  theme_minimal() +
  labs(x = "Variable", y = "Importance")
# Random forest on the same training data, seeded for reproducibility.
set.seed(4543)
rf <- randomForest(fraudulent ~ ., data = xtrain)
# Per-predictor importance (printed as MeanDecreaseGini).
importance(rf)
## MeanDecreaseGini
## emptype_specified 19.761290
## has_company_logo 46.250640
## has_questions 29.868083
## required_experience 52.037617
## company_profile_specified 32.258745
## requirements_specified 6.631810
## benefits_specified 14.678876
## salary_specified 24.283376
## company_profile_char 111.420000
## description_char 160.702803
## requirements_char 118.070348
## company_profile_word 94.759215
## description_word 145.402700
## requirements_word 109.332330
## benefits_word 102.541178
## reqedu_specified 17.820544
## location_specified 7.079038
## department_specified 22.108278
## function._specified 16.357035
## reqexp_specified 14.818044
# How often each predictor is used across the whole forest.
varUsed(rf, by.tree = FALSE, count = TRUE)
## [1] 4617 3841 6913 13365 602 2758 5821 6468 13521 31893 26723 12758
## [13] 30405 25412 21942 6158 2145 8143 6013 3583
# Plot the forest's variable importance, then score the held-out rows.
varImpPlot(rf)
rfp <- predict(rf, newdata = xtest)
cmrf <- confusionMatrix(data = rfp, reference = xtest$fraudulent)
cmrf
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3392 96
## 1 4 84
##
## Accuracy : 0.972
## 95% CI : (0.9661, 0.9772)
## No Information Rate : 0.9497
## P-Value [Acc > NIR] : 2.08e-11
##
## Kappa : 0.6141
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9988
## Specificity : 0.4667
## Pos Pred Value : 0.9725
## Neg Pred Value : 0.9545
## Prevalence : 0.9497
## Detection Rate : 0.9485
## Detection Prevalence : 0.9754
## Balanced Accuracy : 0.7327
##
## 'Positive' Class : 0
##
# F1 score of the random forest, rounded to four decimals.
round(cmrf$byClass["F1"], digits = 4)
## F1
## 0.9855
# Keep the forest's class predictions and draw their ROC curve.
xtest$rfp <- rfp
roc_rf <- roc(
  response = xtest$fraudulent,
  predictor = factor(xtest$rfp, ordered = TRUE),
  plot = TRUE,
  print.auc = TRUE
)
# Logistic regression of the target on every predictor in xtrain.
model1 <- glm(fraudulent ~ ., family = "binomial", data = xtrain)
summary(model1)
##
## Call:
## glm(formula = fraudulent ~ ., family = "binomial", data = xtrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3801 -0.2634 -0.1657 -0.1127 3.6107
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.018e+00 2.872e-01 -3.544 0.000393 ***
## emptype_specified1 -2.339e-01 1.328e-01 -1.761 0.078291 .
## has_company_logo1 -1.198e+00 1.359e-01 -8.811 < 2e-16 ***
## has_questions1 -5.351e-01 9.786e-02 -5.467 4.56e-08 ***
## required_experienceAssociate -1.282e+00 2.133e-01 -6.008 1.88e-09 ***
## required_experienceDirector -5.966e-01 3.488e-01 -1.711 0.087173 .
## required_experienceEntry level -8.491e-02 1.511e-01 -0.562 0.574257
## required_experienceExecutive -4.519e-01 4.406e-01 -1.026 0.305118
## required_experienceInternship -9.207e-01 3.926e-01 -2.345 0.019011 *
## required_experienceMid-Senior level -9.934e-01 1.599e-01 -6.213 5.20e-10 ***
## required_experienceNot Applicable -5.246e-01 1.989e-01 -2.637 0.008355 **
## company_profile_specified1 -1.455e+00 1.572e-01 -9.256 < 2e-16 ***
## requirements_specified1 5.283e-01 1.426e-01 3.705 0.000211 ***
## benefits_specified1 1.542e-01 1.106e-01 1.395 0.163079
## salary_specified1 4.279e-01 1.126e-01 3.799 0.000145 ***
## company_profile_char 6.218e-03 6.266e-04 9.923 < 2e-16 ***
## description_char -6.679e-05 1.273e-04 -0.525 0.599788
## requirements_char 3.496e-04 5.295e-04 0.660 0.509077
## company_profile_word -4.408e-02 4.601e-03 -9.580 < 2e-16 ***
## description_word 2.994e-04 9.590e-04 0.312 0.754890
## requirements_word -6.793e-03 4.041e-03 -1.681 0.092769 .
## benefits_word 3.140e-03 9.663e-04 3.250 0.001154 **
## reqedu_specified1 -1.708e-02 1.269e-01 -0.135 0.892907
## location_specified1 -5.268e-01 2.743e-01 -1.920 0.054814 .
## department_specified1 4.570e-01 9.416e-02 4.853 1.22e-06 ***
## function._specified1 5.720e-01 1.338e-01 4.276 1.90e-05 ***
## reqexp_specified1 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5505.9 on 14303 degrees of freedom
## Residual deviance: 4186.0 on 14278 degrees of freedom
## AIC: 4238
##
## Number of Fisher Scoring iterations: 7
# Pseudo-R2 measures for the logistic model.
pscl::pR2(model1)
## fitting null model for pseudo-r2
## llh llhNull G2 McFadden r2ML
## -2.093001e+03 -2.752949e+03 1.319896e+03 2.397240e-01 8.814526e-02
## r2CU
## 2.758893e-01
# Analysis-of-deviance table for model1 with Chi-square tests.
anova(model1, test = "Chisq")
# visualize coefficients
theme_set(theme_light())
# The argument is spelled `colors` in full (as in the model2 plot); the
# shortened `color=` only worked through partial argument matching.
plot_model(model1, colors = "system")
# probability threshold 0.5
# Predicted fraud probabilities for the test rows, turned into 0/1 labels.
prob <- predict(model1, newdata = xtest, type = "response")
# Size the label vector from the predictions rather than a hard-coded 3576.
prob1 <- rep(0, length(prob))
prob1[prob > 0.5] <- 1
# Fix both levels explicitly so the factor still matches the reference even
# if no prediction crosses the threshold.
cmlr <- confusionMatrix(factor(prob1, levels = c(0, 1)), xtest$fraudulent)
cmlr
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3395 177
## 1 1 3
##
## Accuracy : 0.9502
## 95% CI : (0.9426, 0.9571)
## No Information Rate : 0.9497
## P-Value [Acc > NIR] : 0.4588
##
## Kappa : 0.0305
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.99971
## Specificity : 0.01667
## Pos Pred Value : 0.95045
## Neg Pred Value : 0.75000
## Prevalence : 0.94966
## Detection Rate : 0.94938
## Detection Prevalence : 0.99888
## Balanced Accuracy : 0.50819
##
## 'Positive' Class : 0
##
# F1 score of the logistic model at the 0.5 threshold.
round(cmlr$byClass["F1"], digits = 4)
## F1
## 0.9745
# ROC for the 0.5-threshold labels. Stored as roc_lr1: it was previously
# assigned to roc_lr2, which the 0.1-threshold ROC later overwrites.
roc_lr1 <- roc(xtest$fraudulent, prob1, plot = TRUE, print.auc = TRUE)
# probability threshold 0.1
# Size the label vector from the predictions rather than a hard-coded 3576.
prob2 <- rep(0, length(prob))
prob2[prob > 0.1] <- 1
# Fix both levels explicitly so the factor always matches the reference.
cmlr2 <- confusionMatrix(factor(prob2, levels = c(0, 1)), xtest$fraudulent)
cmlr2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2999 67
## 1 397 113
##
## Accuracy : 0.8702
## 95% CI : (0.8588, 0.8811)
## No Information Rate : 0.9497
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2735
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8831
## Specificity : 0.6278
## Pos Pred Value : 0.9781
## Neg Pred Value : 0.2216
## Prevalence : 0.9497
## Detection Rate : 0.8386
## Detection Prevalence : 0.8574
## Balanced Accuracy : 0.7554
##
## 'Positive' Class : 0
##
# F1 score of the logistic model at the 0.1 threshold.
round(cmlr2$byClass["F1"], digits = 4)
## F1
## 0.9282
# ROC curve for the 0.1-threshold labels.
roc_lr2 <- roc(
  response = xtest$fraudulent,
  predictor = prob2,
  plot = TRUE,
  print.auc = TRUE
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
Select variables for testing part 2
# Modelling table, second variant: the target plus 16 predictors.
d2 <- df2[, c(
  "fraudulent", "emptype_specified", "has_company_logo", "has_questions",
  "required_experience", "company_profile_specified",
  "requirements_specified", "benefits_specified", "salary_specified",
  "company_profile_char", "description_char", "requirements_char",
  "company_profile_word", "description_word", "requirements_word",
  "benefits_word", "reqedu_specified"
)]
dim(d2)
## [1] 17880 17
# Split d2 with the row indices in y1, then fit the tree with the same
# control settings as mt.
xtrain2 <- d2[y1, ]
xtest2 <- d2[-y1, ]
mt2 <- rpart(
  fraudulent ~ .,
  data = xtrain2,
  method = "class",
  control = rpart.control(minsplit = 1, minbucket = 1, cp = 0.0082)
)
fancyRpartPlot(mt2)
printcp(mt2)
##
## Classification tree:
## rpart(formula = fraudulent ~ ., data = xtrain2, method = "class",
## control = rpart.control(minsplit = 1, minbucket = 1, cp = 0.0082))
##
## Variables actually used in tree construction:
## [1] benefits_word company_profile_specified
## [3] description_char description_word
## [5] has_company_logo has_questions
## [7] required_experience requirements_char
## [9] requirements_word salary_specified
##
## Root node error: 686/14304 = 0.047959
##
## n= 14304
##
## CP nsplit rel error xerror xstd
## 1 0.0082604 0 1.00000 1.00000 0.037253
## 2 0.0082000 21 0.76531 0.87609 0.034978
# Importance score the second tree assigns to each predictor.
mt2[["variable.importance"]]
## company_profile_char company_profile_specified company_profile_word
## 111.112819 111.112819 111.112819
## has_company_logo description_char description_word
## 97.029716 76.949822 62.044814
## required_experience requirements_word requirements_char
## 54.450410 52.511917 44.140326
## benefits_word has_questions salary_specified
## 35.996403 29.186729 10.312530
## benefits_specified requirements_specified reqedu_specified
## 5.096677 3.020612 2.733902
## emptype_specified
## 1.010292
# prediction
# Classify the held-out rows with the second tree and tabulate the result.
tree.p2 <- predict(mt2, newdata = xtest2, type = "class")
cmt2 <- confusionMatrix(data = tree.p2, reference = xtest2$fraudulent)
cmt2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3379 136
## 1 17 44
##
## Accuracy : 0.9572
## 95% CI : (0.9501, 0.9636)
## No Information Rate : 0.9497
## P-Value [Acc > NIR] : 0.01945
##
## Kappa : 0.3485
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.9950
## Specificity : 0.2444
## Pos Pred Value : 0.9613
## Neg Pred Value : 0.7213
## Prevalence : 0.9497
## Detection Rate : 0.9449
## Detection Prevalence : 0.9829
## Balanced Accuracy : 0.6197
##
## 'Positive' Class : 0
##
# F1 score of the second tree, rounded to four decimals.
round(cmt2$byClass["F1"], digits = 4)
## F1
## 0.9779
# Keep the second tree's class predictions and draw their ROC curve.
xtest2$tp2 <- tree.p2
roc_t2 <- roc(
  response = xtest2$fraudulent,
  predictor = factor(xtest2$tp2, ordered = TRUE),
  plot = TRUE,
  print.auc = TRUE
)
# visualize variable importance
# Move the predictor names out of the row names, sort by importance, and
# freeze that order as factor levels so the bars are drawn in sorted order.
v3 <- data.frame(imp = mt2$variable.importance)
v4 <- tibble::rownames_to_column(v3, var = "variable")
v4 <- dplyr::arrange(v4, imp)
v4 <- dplyr::mutate(v4, variable = forcats::fct_inorder(variable))
ggplot2::ggplot(v4) +
  geom_col(
    aes(x = variable, y = imp, fill = imp),
    col = "white",
    show.legend = FALSE
  ) +
  coord_flip() +
  scale_fill_viridis() +
  theme_minimal() +
  labs(x = "Variable", y = "Importance")
# Random forest on the reduced predictor set, seeded for reproducibility.
set.seed(4543)
rf2 <- randomForest(fraudulent ~ ., data = xtrain2)
# Per-predictor importance (printed as MeanDecreaseGini).
importance(rf2)
## MeanDecreaseGini
## emptype_specified 21.070030
## has_company_logo 43.892486
## has_questions 31.880177
## required_experience 65.084655
## company_profile_specified 33.457367
## requirements_specified 6.029228
## benefits_specified 13.886519
## salary_specified 25.886002
## company_profile_char 118.570356
## description_char 174.675219
## requirements_char 126.886685
## company_profile_word 104.304001
## description_word 161.028799
## requirements_word 116.330389
## benefits_word 110.087861
## reqedu_specified 19.959067
# How often each predictor is used across the second forest.
varUsed(rf2, by.tree = FALSE, count = TRUE)
## [1] 5281 3590 6436 15000 386 2416 5383 6196 14653 34959 28549 13342
## [13] 33330 27246 23213 6065
# Plot the second forest's variable importance.
varImpPlot(rf2)
# prediction
rfp2 <- predict(rf2, newdata = xtest2)
cmrf2 <- confusionMatrix(data = rfp2, reference = xtest2$fraudulent)
cmrf2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3392 92
## 1 4 88
##
## Accuracy : 0.9732
## 95% CI : (0.9673, 0.9782)
## No Information Rate : 0.9497
## P-Value [Acc > NIR] : 1.618e-12
##
## Kappa : 0.6346
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9988
## Specificity : 0.4889
## Pos Pred Value : 0.9736
## Neg Pred Value : 0.9565
## Prevalence : 0.9497
## Detection Rate : 0.9485
## Detection Prevalence : 0.9743
## Balanced Accuracy : 0.7439
##
## 'Positive' Class : 0
##
# F1 score of the second random forest, rounded to four decimals.
round(cmrf2$byClass["F1"], digits = 4)
## F1
## 0.986
# Keep the second forest's class predictions and draw their ROC curve.
xtest2$rfp2 <- rfp2
# Stored as roc_rf2 so it no longer overwrites roc_rf, the ROC object of the
# first random forest.
roc_rf2 <- roc(
  response = xtest2$fraudulent,
  predictor = factor(xtest2$rfp2, ordered = TRUE),
  plot = TRUE,
  print.auc = TRUE
)
# Logistic regression of the target on every predictor in xtrain2.
model2 <- glm(fraudulent ~ ., family = "binomial", data = xtrain2)
summary(model2)
##
## Call:
## glm(formula = fraudulent ~ ., family = "binomial", data = xtrain2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2525 -0.2723 -0.1703 -0.1151 3.4482
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.415e+00 1.374e-01 -10.296 < 2e-16 ***
## emptype_specified1 -1.593e-01 1.228e-01 -1.297 0.1945
## has_company_logo1 -1.113e+00 1.334e-01 -8.348 < 2e-16 ***
## has_questions1 -5.173e-01 9.703e-02 -5.331 9.75e-08 ***
## required_experienceAssociate -1.111e+00 2.101e-01 -5.287 1.24e-07 ***
## required_experienceDirector -3.895e-01 3.458e-01 -1.126 0.2600
## required_experienceEntry level 9.819e-02 1.450e-01 0.677 0.4984
## required_experienceExecutive -2.654e-01 4.371e-01 -0.607 0.5438
## required_experienceInternship -6.846e-01 3.887e-01 -1.761 0.0782 .
## required_experienceMid-Senior level -8.121e-01 1.551e-01 -5.235 1.65e-07 ***
## required_experienceNot Applicable -3.626e-01 1.950e-01 -1.859 0.0630 .
## company_profile_specified1 -1.543e+00 1.547e-01 -9.978 < 2e-16 ***
## requirements_specified1 5.853e-01 1.416e-01 4.135 3.55e-05 ***
## benefits_specified1 1.962e-01 1.094e-01 1.793 0.0730 .
## salary_specified1 5.081e-01 1.120e-01 4.537 5.72e-06 ***
## company_profile_char 6.223e-03 6.299e-04 9.880 < 2e-16 ***
## description_char -9.761e-05 1.309e-04 -0.746 0.4559
## requirements_char 4.961e-04 5.185e-04 0.957 0.3387
## company_profile_word -4.362e-02 4.632e-03 -9.417 < 2e-16 ***
## description_word 5.772e-04 9.778e-04 0.590 0.5550
## requirements_word -7.453e-03 3.964e-03 -1.880 0.0601 .
## benefits_word 3.695e-03 9.413e-04 3.925 8.66e-05 ***
## reqedu_specified1 1.147e-01 1.224e-01 0.937 0.3489
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5505.9 on 14303 degrees of freedom
## Residual deviance: 4235.2 on 14281 degrees of freedom
## AIC: 4281.2
##
## Number of Fisher Scoring iterations: 7
# Pseudo-R2 measures for the second logistic model.
pscl::pR2(model2)
## fitting null model for pseudo-r2
## llh llhNull G2 McFadden r2ML
## -2.117584e+03 -2.752949e+03 1.270730e+03 2.307944e-01 8.500565e-02
## r2CU
## 2.660625e-01
# Analysis-of-deviance table for model2 with Chi-square tests.
anova(model2, test = "Chisq")
# Score the test rows with model2 and classify at a 0.1 probability threshold.
probb <- predict(model2, newdata = xtest2, type = "response")
# Size the label vector from the predictions rather than a hard-coded 3576.
probb1 <- rep(0, length(probb))
# Threshold model2's own probabilities (`probb`). This line previously indexed
# with `prob`, model1's predictions, so the confusion matrix rendered from it
# repeated model1's 0.1-threshold results; re-run to refresh that output.
probb1[probb > 0.1] <- 1
# Fix both levels explicitly so the factor always matches the reference.
cflr3 <- confusionMatrix(factor(probb1, levels = c(0, 1)), xtest2$fraudulent)
cflr3
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2999 67
## 1 397 113
##
## Accuracy : 0.8702
## 95% CI : (0.8588, 0.8811)
## No Information Rate : 0.9497
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2735
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8831
## Specificity : 0.6278
## Pos Pred Value : 0.9781
## Neg Pred Value : 0.2216
## Prevalence : 0.9497
## Detection Rate : 0.8386
## Detection Prevalence : 0.8574
## Balanced Accuracy : 0.7554
##
## 'Positive' Class : 0
##
# F1 score taken from cflr3, rounded to four decimals.
round(cflr3$byClass["F1"], digits = 4)
## F1
## 0.9282
# ROC curve for the thresholded labels in probb1.
roc_lr3 <- roc(
  response = xtest2$fraudulent,
  predictor = probb1,
  plot = TRUE,
  print.auc = TRUE
)
# visualize coefficients
plot_model(model2, colors = "system")