The present work addresses a binary classification problem: predicting classLabel ('yes.'/'no.') for a validation data set from a model fitted to a training data set.
# Required libraries
if(!require(dplyr))install.packages('dplyr'); require(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.2
if(!require(caret))install.packages('caret'); require(caret)
## Warning: package 'caret' was built under R version 3.2.3
## Warning: package 'lattice' was built under R version 3.2.2
## Warning: package 'ggplot2' was built under R version 3.2.3
if(!require(pROC))install.packages('pROC'); require(pROC)
## Warning: package 'pROC' was built under R version 3.2.4
if(!require(ggplot2))install.packages('ggplot2'); require(ggplot2)
if(!require(outliers))install.packages('outliers'); require(outliers)
## Warning: package 'outliers' was built under R version 3.2.3
if(!require(parallel))install.packages('parallel'); require(parallel)
if(!require(doParallel))install.packages('doParallel'); require(doParallel)
## Warning: package 'doParallel' was built under R version 3.2.2
## Warning: package 'foreach' was built under R version 3.2.2
## Warning: package 'iterators' was built under R version 3.2.2
cluster <- makeCluster(detectCores()-1) # convention to leave 1 core for OS
registerDoParallel(cluster)
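As an optional sanity check (not part of the original run), foreach's getDoParWorkers() reports how many workers are registered:
#getDoParWorkers() # should match detectCores()-1 after registration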
# Reproducible work
set.seed(1234)
# Load Data
train <- read.table('./Data/Training.csv',header=T,sep=';',na.strings=c('NA',''))
test<- read.table('./Data/Validation.csv',header=T,sep=';',na.strings=c('NA',''))
# Summary and structure
head(train); head(test)
## v9 v17 v29 v20 v41 v31 v36 v19 v2 v37 v12 v7 v27 v21 v39 v34
## 1 a 17,92 5,4e-05 u g c v 1,75 f t 1 f t g 80 5
## 2 b 16,92 3,35e-05 y p k v 0,29 f f 0 f f s 200 0
## 3 b 31,25 0,0001125 u g ff ff 0 f t 1 f f g 96 19
## 4 a 48,17 0,0001335 u g i o 0,335 f f 0 f f g 0 120
## 5 b 32,33 0,00035 u g k v 0,5 f f 0 f t g 232 0
## 6 a 34,83 0,000125 y p i h 0,5 f f 0 f t g 160 0
## v18 v35 classLabel
## 1 800000 t no.
## 2 2000000 <NA> no.
## 3 960000 t no.
## 4 0 <NA> no.
## 5 2320000 f no.
## 6 1600000 f no.
## v9 v17 v29 v20 v41 v31 v36 v19 v2 v37 v12 v7 v27 v21 v39 v34
## 1 b 32,33 0,00075 u g e bb 1,585 t f 0 t t s 420 0
## 2 b 23,58 0,000179 u g c v 0,54 f f 0 f t g 136 1
## 3 b 36,42 7,5e-05 y p d v 0,585 f f 0 t f g 240 3
## 4 b 18,42 0,0010415 y p aa v 0,125 t f 0 f f g 120 375
## 5 b 24,5 0,0013335 y p aa v 0,04 f f 0 t t g 120 475
## 6 a 39,08 4e-04 u g c v 3 f f 0 f f g 480 0
## v18 v35 classLabel
## 1 4200000 <NA> no.
## 2 1360000 <NA> no.
## 3 2400000 <NA> no.
## 4 1200000 <NA> no.
## 5 1200000 f no.
## 6 4800000 f no.
summary(train)
## v9 v17 v29 v20 v41
## a :1279 23,25 : 64 0 : 134 l : 32 g :3055
## b :2382 22,67 : 52 3e-04 : 121 u :3055 gg : 32
## NA's: 39 19,17 : 49 0,00065: 98 y : 549 p : 549
## 20,42 : 49 0,00015: 90 NA's: 64 NA's: 64
## 25,17 : 49 5e-04 : 87
## (Other):3398 5e-05 : 86
## NA's : 39 (Other):3084
## v31 v36 v19 v2 v37
## c : 787 v :2015 0 : 268 f: 529 f:1461
## q : 612 h : 970 1,5 : 169 t:3171 t:2239
## W : 363 bb : 339 0,04 : 127
## cc : 343 ff : 154 2,5 : 115
## x : 340 z : 49 1 : 111
## (Other):1189 (Other): 107 0,25 : 97
## NA's : 66 NA's : 66 (Other):2813
## v12 v7 v27 v21 v39
## Min. : 0.00 f: 276 f:1924 g:3430 Min. : 0.0
## 1st Qu.: 0.00 t:3424 t:1776 p: 81 1st Qu.: 0.0
## Median : 2.00 s: 189 Median : 120.0
## Mean : 4.16 Mean : 162.7
## 3rd Qu.: 6.00 3rd Qu.: 280.0
## Max. :67.00 Max. :1160.0
## NA's :100
## v34 v18 v35 classLabel
## Min. : 0 Min. : 0 f : 532 no. : 276
## 1st Qu.: 0 1st Qu.: 0 t :1023 yes.:3424
## Median : 113 Median : 1200000 NA's:2145
## Mean : 2247 Mean : 1626950
## 3rd Qu.: 1060 3rd Qu.: 2800000
## Max. :100000 Max. :11600000
## NA's :100
str(train)
## 'data.frame': 3700 obs. of 19 variables:
## $ v9 : Factor w/ 2 levels "a","b": 1 2 2 1 2 1 1 2 2 2 ...
## $ v17 : Factor w/ 288 levels "13,75","15,17",..: 17 10 142 240 153 171 98 48 123 20 ...
## $ v29 : Factor w/ 179 levels "0","0,000104",..: 160 153 5 9 40 8 152 176 154 95 ...
## $ v20 : Factor w/ 3 levels "l","u","y": 2 3 2 2 2 3 2 3 2 2 ...
## $ v41 : Factor w/ 3 levels "g","gg","p": 1 3 1 1 1 3 1 3 1 1 ...
## $ v31 : Factor w/ 14 levels "aa","c","cc",..: 2 9 6 7 9 7 8 2 2 2 ...
## $ v36 : Factor w/ 9 levels "bb","dd","ff",..: 8 8 3 7 8 4 5 4 8 4 ...
## $ v19 : Factor w/ 118 levels "0","0,04","0,085",..: 39 8 1 9 13 13 1 7 8 25 ...
## $ v2 : Factor w/ 2 levels "f","t": 1 1 1 1 1 1 1 1 1 1 ...
## $ v37 : Factor w/ 2 levels "f","t": 2 1 2 1 1 1 1 1 1 1 ...
## $ v12 : int 1 0 1 0 0 0 0 0 0 0 ...
## $ v7 : Factor w/ 2 levels "f","t": 1 1 1 1 1 1 1 1 1 1 ...
## $ v27 : Factor w/ 2 levels "f","t": 2 1 1 1 2 2 2 1 1 1 ...
## $ v21 : Factor w/ 3 levels "g","p","s": 1 3 1 1 1 1 1 1 1 1 ...
## $ v39 : int 80 200 96 0 232 160 276 280 220 320 ...
## $ v34 : int 5 0 19 120 0 0 1 204 140 13 ...
## $ v18 : num 800000 2000000 960000 0 2320000 1600000 2760000 2800000 2200000 3200000 ...
## $ v35 : Factor w/ 2 levels "f","t": 2 NA 2 NA 1 1 NA NA NA NA ...
## $ classLabel: Factor w/ 2 levels "no.","yes.": 1 1 1 1 1 1 1 1 1 1 ...
# Re-encode some factor variables as numeric
train$v17 <- as.numeric(train$v17); train$v29 <- as.numeric(train$v29); train$v19 <- as.numeric(train$v19)
test$v17 <- as.numeric(test$v17); test$v29 <- as.numeric(test$v29); test$v19 <- as.numeric(test$v19)
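Note that as.numeric() applied to a factor returns its integer level codes, not the underlying magnitudes; the raw values use a decimal comma (e.g. '17,92'), which is why these columns were read in as factors, and the thresholds in the outlier filtering below are therefore on the level-code scale. If the true values were wanted instead, a conversion along these lines (a sketch, not used by this pipeline) would recover them:
# Hypothetical alternative: parse the comma-decimal strings directly
to_num <- function(x) as.numeric(sub(',', '.', as.character(x)))
# e.g. to_num(train$v17) maps the label '17,92' to 17.92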
# Make the positive response ('yes.') the reference level
train <- train %>% mutate(classLabel= relevel(classLabel,ref = 'yes.'))
test <- test %>% mutate(classLabel= relevel(classLabel,ref = 'yes.'))
# Predictor names (everything except the class label)
Vars <- names(train %>% select(-classLabel))
Ind_num <- sapply(train[, Vars], class) %in% c('numeric', 'integer')
# Numerical variables
Vars_num<- Vars[Ind_num]
# Categorical variables
Vars_cat<-setdiff(Vars,Vars_num)
# Square-root transform to compress the heavy right tails of v18 and v34
train <- train %>% mutate(v18 = sqrt(v18), v34 = sqrt(v34))
test <- test %>% mutate(v18 = sqrt(v18), v34 = sqrt(v34))
# Exploratory outlier check with outliers::outlier(), kept for reference:
# for(v in Vars_num) {
#   print(v)
#   print(outlier(train[,v]))
# }
# Filter out supposed outliers (thresholds chosen from the inspection above)
train <- train %>% filter(v17 < 240 & v29 > 4 & v19 < 130
& v12 < 20 & v39 < 760 & v34 < 120 & v18 < 2700)
test <- test %>% filter(v17 < 240 & v29 > 4 & v19 < 130
& v12 < 20 & v39 < 760 & v34 < 120 & v18 < 2700)
# Impute missing numerical values with k-nearest neighbours (caret's knnImpute also centers and scales)
preProcValues<- preProcess(train[Vars_num], method = 'knnImpute')
train[Vars_num]<- predict(preProcValues, train[Vars_num])
# Categorical model: impact-code each level by its smoothed P(yes.)
Pred_cat <- function(Outcome, VarTrain, VarPred) {
  # Overall probability of a 'yes.' answer
  tab <- table(Outcome)
  p_yes <- as.numeric((tab/sum(tab))['yes.'])
  # Probability of 'yes.' among the NA rows
  tab <- table(Outcome[is.na(VarTrain)])
  p_na <- as.numeric((tab/sum(tab))['yes.'])
  # Smoothed probability of 'yes.' for each observed level
  tab <- table(Outcome, VarTrain)
  p <- (tab['yes.',] + p_yes/1000)/(colSums(tab) + p_yes/1000)
  # Map each prediction row to its level's probability
  pred <- p[VarPred]
  # NA levels get the NA-row probability
  pred[is.na(VarPred)] <- p_na
  # Anything still unmatched falls back to the overall rate
  pred[is.na(pred)] <- p_yes
  pred
}
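To make the smoothing concrete, here is a toy call on hypothetical data (not from the analysis). Overall, three of five outcomes are 'yes.' (p_yes = 0.6); level 'a' holds two 'yes.' in three rows, level 'b' one in two, and an NA in the prediction vector falls through to the fallbacks:
# Toy illustration (hypothetical data)
y <- factor(c('yes.','yes.','no.','yes.','no.'), levels = c('yes.','no.'))
x <- factor(c('a','a','a','b','b'))
Pred_cat(y, x, factor(c('a','b',NA))) # ~0.667, ~0.500, 0.600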
# Numerical model: bin by empirical quantiles, then impact-code the bins
Pred_num <- function(Outcome, VarTrain, VarPred) {
  # Very fine quantile grid, so most bins hold a single training value
  cuts <- unique(quantile(VarTrain, probs = seq(0, 1, 1.0e-4), na.rm = TRUE))
  # Guard against a single cut value (constant variable)
  if (length(cuts) == 1) cuts <- c(cuts - 1, cuts)
  # Discretize both vectors on the training cuts
  VarTrain <- cut(VarTrain, cuts)
  VarPred <- cut(VarPred, cuts)
  Pred_cat(Outcome, VarTrain, VarPred)
}
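A toy call on hypothetical data shows the effect: the quantile grid is so fine that almost every training value gets its own bin, so a value seen in training is encoded almost exactly by its own label; values at or below the lowest cut (or above the highest) come back NA from cut() and are caught by the fallbacks inside Pred_cat.
# Toy illustration (hypothetical data): low values are 'yes.', high are 'no.'
y <- factor(rep(c('yes.','no.'), each = 5), levels = c('yes.','no.'))
x <- 1:10
Pred_num(y, x, c(2, 9)) # ~1 (a pure-'yes.' bin), ~0.0005 (a pure-'no.' bin)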
# Build the impact-coded predictor columns (each set is encoded with its own labels)
for(v in Vars_cat) {
  pi <- paste('pred', v, sep = '')
  train[,pi] <- Pred_cat(train[,'classLabel'], train[,v], train[,v])
  test[,pi] <- Pred_cat(test[,'classLabel'], test[,v], test[,v])
}
for(v in Vars_num) {
  pi <- paste('pred', v, sep = '')
  train[,pi] <- Pred_num(train[,'classLabel'], train[,v], train[,v])
  test[,pi] <- Pred_num(test[,'classLabel'], test[,v], test[,v])
}
# New data sets
train<- train %>% select(starts_with("pred"),classLabel)
test<- test %>% select(starts_with("pred"),classLabel)
# Check for remaining NAs
sum(is.na(train)); sum(is.na(test))
## [1] 0
## [1] 0
# Check dimensions
dim(train); dim(test)
## [1] 2806 19
## [1] 183 19
# Pre-process the predictors (centering, scaling and a Box-Cox transform to reduce skewness)
xTrans <- preProcess(train %>% select(-classLabel), method= c('center','scale','BoxCox'))
train_T <- predict(xTrans, train %>% select(-classLabel))
test_T <- predict(xTrans, test %>% select(-classLabel))
## predv17
## predv29
# Add class column
train_T$classLabel<- train$classLabel ; test_T$classLabel<- test$classLabel
# Remove near-zero-variance predictors
Ind_zv <- nearZeroVar(train_T %>% select(-classLabel))
if(length(Ind_zv) > 0){
  train_T <- train_T %>% select(-Ind_zv)
  test_T <- test_T %>% select(-Ind_zv)
}
# Check dimensions
dim(train_T); dim(test_T)
## [1] 2806 19
## [1] 183 19
# Drop highly correlated predictors (pairwise |r| > 0.9)
descrCorr <- cor(train_T %>% select(-classLabel))
highCorr <- findCorrelation(descrCorr, cutoff = 0.9)
if(length(highCorr) > 0){
  train_T <- train_T %>% select(-highCorr)
  test_T <- test_T %>% select(-highCorr)
}
# Check dimensions
dim(train_T); dim(test_T)
## [1] 2806 16
## [1] 183 16
# Cross-validation control (repeated 10-fold), with ROC as the tuning metric
Control<- trainControl(method= "repeatedcv", summaryFunction= twoClassSummary, classProbs= T,
number= 10,
repeats= 5,
allowParallel= T)
# Rough heuristic for an expected k: 10 divided by the positive-class prevalence
#(10/(sum(train_T['classLabel'] == 'yes.')/length(train_T$classLabel)))
# kNN Model
#knn_fit<- train(classLabel~ ., data= train_T, method= 'knn', metric='ROC', trControl= Control,
# tuneGrid = expand.grid(.k = c(10,20)))
#plot(knn_fit)
knn_fit<- train(classLabel~ ., data= train_T, method= 'knn', metric='ROC', trControl= Control)
pred<- predict(knn_fit,test_T[setdiff(names(test_T),'classLabel')], type= 'prob')[,'yes.']
rocCurve<- roc(response= test_T$classLabel, predictor= pred)
# Area
auc(rocCurve)
## Area under the curve: 0.9926
# ROC plot (kNN)
a<- data.frame(y= rocCurve$sensitivities, x= rocCurve$specificities, model= 'KNN')
ggplot(data=a,aes(x= 1-x,y=y, color=model))+
geom_line(lwd=1)+
guides(fill=F)+theme_bw()+
xlab('1-Specificity')+
ylab('Sensitivity')+ggtitle('ROC Curve')+
geom_abline (intercept = 0, slope = 1, lwd=1,lty=2) +
theme(axis.text= element_text(size=13,face='bold'),
axis.title= element_text(size=15,vjust = 0.5,face='bold'),
title=element_text(size=15,vjust = 2))+
scale_x_continuous(limits=c(0,1),breaks=seq(0,1,0.2))+
scale_y_continuous(limits=c(0,1),breaks=seq(0,1,0.2))
# Cross-validation control (repeated 10-fold), tuning on accuracy this time
Control<- trainControl(method= "repeatedcv",
number= 10,
repeats= 5,
allowParallel= T)
# kNN model (default accuracy metric)
knn_fit<- train(classLabel~ ., data= train_T, method= 'knn', trControl= Control)
pred<- predict(knn_fit,test_T[setdiff(names(test_T),'classLabel')])
# Confusion matrix
confusionMatrix(pred,test_T$classLabel)
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes. no.
## yes. 77 0
## no. 14 92
##
## Accuracy : 0.9235
## 95% CI : (0.875, 0.9575)
## No Information Rate : 0.5027
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8469
## Mcnemar's Test P-Value : 0.000512
##
## Sensitivity : 0.8462
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.8679
## Prevalence : 0.4973
## Detection Rate : 0.4208
## Detection Prevalence : 0.4208
## Balanced Accuracy : 0.9231
##
## 'Positive' Class : yes.
##
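For reference, the headline figures follow directly from the confusion-matrix counts above:
# Recomputing the reported metrics by hand
(77 + 92)/(77 + 0 + 14 + 92) # Accuracy = 169/183 ≈ 0.9235
77/(77 + 14) # Sensitivity = 77/91 ≈ 0.8462
92/(92 + 0) # Specificity = 1.0000
92/(92 + 14) # Neg. Pred. Value = 92/106 ≈ 0.8679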
We carried out a binary classification study with a kNN predictive model, training on the training data set and scoring the validation data set. We obtained an accuracy of ~92% with an ROC area of ~0.99. Further work on this pipeline could still improve the accuracy.
# Stop parallel process
stopCluster(cluster)
# Software environment
sessionInfo()
## R version 3.2.0 (2015-04-16)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 8 x64 (build 9200)
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] doParallel_1.0.10 iterators_1.0.8 foreach_1.4.3 outliers_0.14
## [5] pROC_1.8 caret_6.0-64 ggplot2_2.0.0 lattice_0.20-33
## [9] dplyr_0.4.3
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.3 compiler_3.2.0 formatR_1.2.1
## [4] nloptr_1.0.4 plyr_1.8.3 class_7.3-14
## [7] tools_3.2.0 digest_0.6.9 lme4_1.1-11
## [10] evaluate_0.8 gtable_0.2.0 nlme_3.1-124
## [13] mgcv_1.8-11 Matrix_1.2-0 DBI_0.3.1
## [16] yaml_2.1.13 SparseM_1.7 e1071_1.6-7
## [19] stringr_1.0.0 knitr_1.12.3 MatrixModels_0.4-1
## [22] stats4_3.2.0 grid_3.2.0 nnet_7.3-12
## [25] R6_2.1.2 rmarkdown_0.9.5 minqa_1.2.4
## [28] reshape2_1.4.1 car_2.1-1 magrittr_1.5
## [31] scales_0.3.0 codetools_0.2-14 htmltools_0.3
## [34] MASS_7.3-45 splines_3.2.0 assertthat_0.1
## [37] pbkrtest_0.4-4 colorspace_1.2-6 quantreg_5.21
## [40] stringi_1.0-1 lazyeval_0.1.10 munsell_0.4.3