H-1B Visas request dataset.
I took a sample of the dataset (10%). To increase the accuracy of the model, I removed some of the variables.
# List the column names of `Data` before any columns are dropped.
# NOTE(review): `Data` is inspected here before the visible
# `Data <- sample_n(Data1, 300000)` assignment; presumably it was loaded
# earlier. Confirm the intended order.
colnames(Data)
## [1] "X" "CASE_STATUS" "EMPLOYER_NAME"
## [4] "SOC_NAME" "JOB_TITLE" "FULL_TIME_POSITION"
## [7] "PREVAILING_WAGE" "YEAR" "WORKSITE"
## [10] "lon" "lat"
# Draw a random sample of 300,000 rows from the full dataset.
Data <- sample_n(Data1, size = 300000)
# Drop columns by position (1, 3, 4, 5, 10 and 11). Assuming Data1 has the
# column order shown by colnames(), these are X, EMPLOYER_NAME, SOC_NAME,
# JOB_TITLE, lon and lat.
Data <- Data[, -c(1, 3:5, 10, 11)]
# Preview the first 40 rows of the reduced sample.
head(Data, n = 40)
## CASE_STATUS FULL_TIME_POSITION PREVAILING_WAGE YEAR
## 121099 CERTIFIED N 68411.0 2016
## 1589687 CERTIFIED Y 53763.0 2014
## 1570722 CERTIFIED Y 52790.0 2014
## 2843284 CERTIFIED N 45489.6 2011
## 8710 CERTIFIED Y 96450.0 2016
## 1128579 CERTIFIED Y 123479.0 2015
## 2490871 CERTIFIED Y 55794.0 2012
## 2911095 CERTIFIED Y 65707.0 2011
## 2434415 DENIED Y 47944.0 2012
## 1393218 CERTIFIED Y 48048.0 2014
## 417350 CERTIFIED Y 120390.0 2016
## 2539649 CERTIFIED-WITHDRAWN Y 50960.0 2012
## 328411 CERTIFIED N 65042.0 2016
## 1613060 CERTIFIED Y 60611.0 2014
## 1011132 DENIED Y 139714.0 2015
## 1170012 CERTIFIED-WITHDRAWN Y 118082.0 2015
## 1396464 CERTIFIED Y 45323.0 2014
## 2541459 WITHDRAWN Y 45094.4 2012
## 112347 CERTIFIED N 62234.0 2016
## 2706317 CERTIFIED-WITHDRAWN Y 43472.0 2011
## 1316415 CERTIFIED Y 94182.0 2014
## 310619 CERTIFIED Y 73590.0 2016
## 854045 CERTIFIED Y 104437.0 2015
## 2719451 CERTIFIED Y 61464.0 2011
## 2174285 CERTIFIED Y 95659.0 2013
## 156707 CERTIFIED N 56514.0 2016
## 2906848 CERTIFIED Y 66622.0 2011
## 2184841 CERTIFIED Y 65458.0 2013
## 2029925 CERTIFIED Y 99944.0 2013
## 2402467 CERTIFIED Y 64084.8 2012
## 328202 CERTIFIED Y 70491.2 2016
## 1528086 CERTIFIED Y 73154.0 2014
## 123552 CERTIFIED Y 81266.0 2016
## 256853 CERTIFIED N 67954.0 2016
## 2924818 CERTIFIED Y 81162.0 2011
## 2611877 CERTIFIED Y 112694.0 2012
## 305105 CERTIFIED Y 105602.0 2016
## 1194068 CERTIFIED Y 63190.0 2015
## 881907 CERTIFIED Y 71989.0 2015
## 2925577 CERTIFIED Y 65354.0 2011
## WORKSITE
## 121099 REDMOND, WASHINGTON
## 1589687 PEORIA, ILLINOIS
## 1570722 GREAT NECK, NEW JERSEY
## 2843284 NEW YORK, NEW YORK
## 8710 SHERMAN, TEXAS
## 1128579 MENLO PARK, CALIFORNIA
## 2490871 CHICAGO, ILLINOIS
## 2911095 STAFFORD, TEXAS
## 2434415 SHERBORN, MASSACHUSETTS
## 1393218 CLEVELAND, OHIO
## 417350 SAN FRANCISCO, CALIFORNIA
## 2539649 ATLANTA, GEORGIA
## 328411 COPPELL, TEXAS
## 1613060 WILMINGTON, DELAWARE
## 1011132 ROCKVILLE, MARYLAND
## 1170012 REDWOOD CITY, CALIFORNIA
## 1396464 MAHWAH, NEW JERSEY
## 2541459 GRAPEVINE, TEXAS
## 112347 MINNEAPOLIS, MINNESOTA
## 2706317 ST. CLOUD, MINNESOTA
## 1316415 BOSTON, MASSACHUSETTS
## 310619 BATON ROUGE, LOUISIANA
## 854045 ATLANTA, GEORGIA
## 2719451 ARLINGTON HEIGHTS, ILLINOIS
## 2174285 MARLBOROUGH, MASSACHUSETTS
## 156707 AURORA, ILLINOIS
## 2906848 HERNDON, VIRGINIA
## 2184841 PLEASANTON, CALIFORNIA
## 2029925 WOBURN, MASSACHUSETTS
## 2402467 BOISE, IDAHO
## 328202 ENGLEWOOD, COLORADO
## 1528086 MILPITAS, CALIFORNIA
## 123552 RYE, NEW YORK
## 256853 SAN DIEGO, CALIFORNIA
## 2924818 PHOENIX, ARIZONA
## 2611877 EAST HANOVER, NEW JERSEY
## 305105 NORWALK, CONNECTICUT
## 1194068 REDMOND, WASHINGTON
## 881907 SOMERVILLE, MASSACHUSETTS
## 2925577 BOCA RATON, FLORIDA
Next, I clean up the sampled dataset.
# Drop every row that has a missing value (NA) in any column.
Data <- na.exclude(Data)
# Keep only the state: strip everything up to the last comma, then trim the
# blank that follows it. Without the trim the values keep a leading space
# (" TEXAS"), so the "NA" filter below would never match anything.
Data$WORKSITE <- trimws(gsub(".*,", "", Data$WORKSITE))
# Remove rows that are unusable: applications still pending review and rows
# whose state is the literal string "NA".
Data <- Data[Data$CASE_STATUS != "PENDING QUALITY AND COMPLIANCE REVIEW - UNASSIGNED", ]
Data <- Data[Data$WORKSITE != "NA", ]
# Fold two statuses into broader classes.
Data[Data$CASE_STATUS == "INVALIDATED", "CASE_STATUS"] <- "REJECTED"
Data[Data$CASE_STATUS == "CERTIFIED-WITHDRAWN", "CASE_STATUS"] <- "WITHDRAWN"
# Convert the categorical columns to factors; factor() also discards the
# status levels that no longer occur after the steps above.
Data$CASE_STATUS <- factor(Data$CASE_STATUS)
Data$YEAR <- as.factor(Data$YEAR)
Data$WORKSITE <- as.factor(Data$WORKSITE)
I set up 3-fold cross-validation.
# Resampling settings for caret::train: cross-validation ("cv") with 3 folds.
trainOptions <- trainControl(method = "cv", number = 3)
Next, I create the training partition, using only 30% of the data for training in order to avoid overfitting (a characteristic of random forests).
# Fix the RNG seed so the partition below is reproducible.
set.seed(4644)
# Row indices for a 30% split made on CASE_STATUS; list = FALSE asks for the
# indices as a matrix instead of a list.
inTraining <- createDataPartition(
  y = Data$CASE_STATUS,
  p = 0.3,
  list = FALSE
)
## Warning in createDataPartition(y = Data$CASE_STATUS, p = 0.3, list =
## FALSE): Some classes have a single record ( REJECTED ) and these will be
## selected for the sample
# Rows not picked by the partition form the validation set; the picked rows
# form the training set.
validationData <- Data[-inTraining, ]
trainingData <- Data[inTraining, ]
I used the caret library to train a random forest model on the data.
# Fit the random forest in parallel on a 4-worker cluster.
cl <- makeCluster(4)
registerDoParallel(cl)
# Train on the training partition only. Fitting on the whole of `Data` would
# let the model see the validation/test rows and make every later accuracy
# figure an in-sample estimate. `finally` stops the workers even if train()
# fails, so no worker processes are left behind.
rfModel <- tryCatch(
  train(
    CASE_STATUS ~ .,
    data = trainingData,
    method = "rf",
    trControl = trainOptions
  ),
  finally = stopCluster(cl)
)
# Show the variable importance of the fitted model.
print(varImp(rfModel))
## rf variable importance
##
## only 20 most important variables shown (out of 59)
##
## Overall
## PREVAILING_WAGE 100.000
## WORKSITE TEXAS 3.283
## WORKSITE ILLINOIS 2.691
## WORKSITE NEW JERSEY 2.349
## WORKSITE FLORIDA 2.347
## WORKSITE MASSACHUSETTS 2.134
## WORKSITE NEW YORK 2.087
## WORKSITE VIRGINIA 1.975
## WORKSITE MICHIGAN 1.820
## WORKSITE PENNSYLVANIA 1.809
## WORKSITE WASHINGTON 1.805
## WORKSITE CALIFORNIA 1.764
## WORKSITE NORTH CAROLINA 1.758
## WORKSITE GEORGIA 1.741
## WORKSITE OHIO 1.571
## WORKSITE MARYLAND 1.461
## WORKSITE ARIZONA 1.427
## FULL_TIME_POSITIONY 1.386
## WORKSITE CONNECTICUT 1.361
## WORKSITE INDIANA 1.298
I then partition the validation set again, this time placing 60% of it in a second training set and the remaining 40% in a test set.
# Split validationData again: 60% of its rows go to trainingAgain and the
# remaining rows to testing. This overwrites the earlier inTraining indices.
inTraining <- createDataPartition(
  y = validationData$CASE_STATUS,
  p = 0.6,
  list = FALSE
)
testing <- validationData[-inTraining, ]
trainingAgain <- validationData[inTraining, ]
Next, I analyze the accuracy of the model.
# Print the fitted model with its resampling results per mtry value.
print(rfModel)
## Random Forest
##
## 299990 samples
## 4 predictor
## 4 classes: 'CERTIFIED', 'DENIED', 'REJECTED', 'WITHDRAWN'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 199994, 199993, 199993
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8719855 0.0000000
## 30 0.8724455 0.0290493
## 59 0.8518598 0.1450824
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 30.
# Predict the case status for the rows of trainingData and cross-tabulate the
# predictions against the recorded statuses.
trainPredict <- predict(rfModel, newdata = trainingData)
confusionMatrix(data = trainPredict, reference = trainingData$CASE_STATUS)
## Confusion Matrix and Statistics
##
## Reference
## Prediction CERTIFIED DENIED REJECTED WITHDRAWN
## CERTIFIED 78254 2684 1 8601
## DENIED 50 121 0 12
## REJECTED 0 0 0 0
## WITHDRAWN 104 8 0 163
##
## Overall Statistics
##
## Accuracy : 0.8727
## 95% CI : (0.8705, 0.8748)
## No Information Rate : 0.8712
## P-Value [Acc > NIR] : 0.0986
##
## Kappa : 0.0415
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: CERTIFIED Class: DENIED Class: REJECTED
## Sensitivity 0.99804 0.043015 0.000e+00
## Specificity 0.02623 0.999289 1.000e+00
## Pos Pred Value 0.87396 0.661202 NaN
## Neg Pred Value 0.66376 0.970027 1.000e+00
## Prevalence 0.87122 0.031256 1.111e-05
## Detection Rate 0.86951 0.001344 0.000e+00
## Detection Prevalence 0.99491 0.002033 0.000e+00
## Balanced Accuracy 0.51213 0.521152 5.000e-01
## Class: WITHDRAWN
## Sensitivity 0.018573
## Specificity 0.998621
## Pos Pred Value 0.592727
## Neg Pred Value 0.904005
## Prevalence 0.097513
## Detection Rate 0.001811
## Detection Prevalence 0.003056
## Balanced Accuracy 0.508597
# Evaluate on the `testing` partition that was split off from validationData,
# not on the whole of `Data`: scoring every row (training rows included) does
# not measure test performance.
testPredict <- predict(rfModel, newdata = testing)
confusionMatrix(data = testPredict, reference = testing$CASE_STATUS)
## Confusion Matrix and Statistics
##
## Reference
## Prediction CERTIFIED DENIED REJECTED WITHDRAWN
## CERTIFIED 260842 9002 1 28676
## DENIED 169 343 0 30
## REJECTED 0 0 0 0
## WITHDRAWN 349 30 0 546
##
## Overall Statistics
##
## Accuracy : 0.8725
## 95% CI : (0.8713, 0.8737)
## No Information Rate : 0.8712
## P-Value [Acc > NIR] : 0.0216
##
## Kappa : 0.0387
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: CERTIFIED Class: DENIED Class: REJECTED
## Sensitivity 0.99802 0.036587 0.000e+00
## Specificity 0.02457 0.999315 1.000e+00
## Pos Pred Value 0.87378 0.632841 NaN
## Neg Pred Value 0.64690 0.969838 1.000e+00
## Prevalence 0.87123 0.031251 3.333e-06
## Detection Rate 0.86951 0.001143 0.000e+00
## Detection Prevalence 0.99511 0.001807 0.000e+00
## Balanced Accuracy 0.51129 0.517951 5.000e-01
## Class: WITHDRAWN
## Sensitivity 0.018665
## Specificity 0.998600
## Pos Pred Value 0.590270
## Neg Pred Value 0.904014
## Prevalence 0.097511
## Detection Rate 0.001820
## Detection Prevalence 0.003083
## Balanced Accuracy 0.508633
# First test: display row 23423 of the testing partition.
testing[23423, ]
## CASE_STATUS FULL_TIME_POSITION PREVAILING_WAGE YEAR WORKSITE
## 245195 CERTIFIED N 60549 2016 VIRGINIA
# Predicted case status for row 23423 of the testing partition.
predict(rfModel, newdata = testing[23423, ])
## [1] CERTIFIED
## Levels: CERTIFIED DENIED REJECTED WITHDRAWN
# Second test: display row 668 of the testing partition.
testing[668, ]
## CASE_STATUS FULL_TIME_POSITION PREVAILING_WAGE YEAR WORKSITE
## 2688690 CERTIFIED Y 70699 2011 NEW JERSEY
# Predicted case status for row 668 of the testing partition.
predict(rfModel, newdata = testing[668, ])
## [1] CERTIFIED
## Levels: CERTIFIED DENIED REJECTED WITHDRAWN
# Third test: display row 31524 of the testing partition.
testing[31524, ]
## CASE_STATUS FULL_TIME_POSITION PREVAILING_WAGE YEAR WORKSITE
## 2899357 CERTIFIED Y 75462 2011 MASSACHUSETTS
# Predict the row that the third test displays (row 31524 of the testing
# partition); the original call scored row 65265, a different record.
predict(rfModel, newdata = testing[31524, ])
## [1] CERTIFIED
## Levels: CERTIFIED DENIED REJECTED WITHDRAWN