SVM Model

Support vector machines (SVMs) can be used for classification (distinguishing between several groups or classes) and regression (obtaining a mathematical model to predict something). They can be applied to both linear and non-linear problems. The training data was imported from Kaggle.

Data Preparation

Removing ID as it plays no role in the machine learning process

# Load the raw training data with readr (its column-spec message is shown below).
library(readr)
train <- read_csv("d:/data/train.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double()
## )
## i Use `spec()` for the full column specifications.
# Drop the ID column so it is not carried along as a predictor.
train$ID <- NULL

# Show the dimensions of the data after ID has been removed.
dim(train)
## [1] 76020   370

Removing error values

Delete the error values, then check for empty (missing) values and remove them using na.omit.

## [1] 1310
## [1] 0

Clean the dataset

Removing predictors having constant 0 value across all observations

# Drop every predictor that is 0 for all observations; TARGET is never examined.
#
# The previous test, `mean(x) == sum(x)`, is true whenever a column sums to
# zero, so a non-constant column whose positive and negative values cancel
# out would have been removed as well. Checking the values directly avoids
# that, and also removes the dependence on floating-point equality of two
# aggregates.
predictors <- setdiff(names(train), "TARGET")
is_all_zero <- vapply(
  train[predictors],
  # isTRUE(): a column containing NA is kept instead of aborting the check.
  function(col) isTRUE(all(col == 0)),
  logical(1)
)
# Select by name so that an empty removal set leaves `train` unchanged.
train <- train[, !(names(train) %in% predictors[is_all_zero]), drop = FALSE]

Removing duplicated predictors

Keep only the unique predictor variables by removing the duplicated ones. Using library(digest) gives a faster result.

Count per row

Make a new feature that is the count of the number of zeros in each row.

Removing features being a specific linear function of other features

Use library(caret) to remove the features that are a linear function of other features.

## Loading required package: lattice
## Loading required package: ggplot2

Removing the correlated features

# Find the features whose pairwise absolute correlation exceeds 0.95 and
# remove them from `train`.
#
# findCorrelation() returns integer(0) when no pair exceeds the cutoff, and
# `train[, -integer(0)]` selects zero columns, i.e. it would silently drop
# every column. The removal is therefore only performed when there is
# something to remove. `drop = FALSE` keeps the result tabular even if a
# single column remains.
# NOTE(review): cor(train) uses all columns of `train`; if TARGET is still
# among them at this point it could be selected for removal too -- confirm.
cor.features <- findCorrelation(cor(train), cutoff = 0.95, verbose = FALSE)
if (length(cor.features) > 0) {
  train <- train[, -cor.features, drop = FALSE]
}

Removing duplicate rows

Remove the duplicate rows using dup = !duplicated(train), then save the cleaned data in the file train-c.csv.

# `dup` is TRUE for the rows to keep (the first occurrence of each row).
dup <- !duplicated(train)
# Apply the same mask to the separately held target vector so that it stays
# aligned with the rows of `train`.
train.y <- train.y[dup]
train <- train[dup, ] # ; rm(dup)
dim(train)
## [1] 70571   170
table(train.y)
## train.y
##     0     1 
## 67768  2803
# Save the cleaned data; write.csv() also writes the row names as a first column.
write.csv(train, file = "train-c.csv")

Read clean data

One way to evaluate the performance of a model is to train it on a number of different smaller datasets and evaluate it on the remaining smaller test sets. Here, to save memory, the dataset is limited to the first 3000 observations, and the data is split for model preparation: 70% for train_data and 30% for test_data.

# Load the cleaned data, keep the first 3000 rows, and drop the first column
# (the row names that write.csv() stored).
dr1 <- read.csv("train-c.csv")
dr1 <- dr1[seq_len(3000), -1]
# Treat TARGET as a categorical (factor) response.
dr1$TARGET <- as.factor(dr1$TARGET)
# Reproducible 70/30 split: `spl` holds the row indices of the training part.
set.seed(123)
spl <- sample(nrow(dr1), nrow(dr1) * 0.7)
train_data <- dr1[spl, ]
test_data <- dr1[-spl, ]

Create SVM Model

Fit the SVM model with library(e1071), using "TARGET" as the target variable and train_data as the data.

library(e1071)
# Linear-kernel C-classification SVM that models TARGET from all other columns
# of train_data.
classifier1 <- svm(
  formula = TARGET ~ .,
  data = train_data,
  type = "C-classification",
  kernel = "linear"
)
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'ind_var6_0' and 'ind_var6' and 'ind_var13_medio_0' and
## 'ind_var33' and 'ind_var44' and 'num_var33_0' and 'saldo_var13_medio' and
## 'saldo_var33' and 'delta_imp_aport_var17_1y3' and 'delta_imp_aport_var33_1y3'
## and 'delta_imp_reemb_var17_1y3' and 'delta_imp_trasp_var17_in_1y3'
## and 'delta_imp_trasp_var33_in_1y3' and 'delta_num_aport_var17_1y3'
## and 'delta_num_aport_var33_1y3' and 'delta_num_compra_var44_1y3'
## and 'delta_num_venta_var44_1y3' and 'imp_aport_var13_ult1'
## and 'imp_aport_var17_ult1' and 'imp_aport_var33_hace3' and
## 'imp_aport_var33_ult1' and 'imp_var7_emit_ult1' and 'imp_compra_var44_hace3'
## and 'imp_compra_var44_ult1' and 'imp_trasp_var17_in_hace3' and
## 'ind_var7_emit_ult1' and 'num_aport_var13_ult1' and 'num_aport_var17_hace3'
## and 'num_aport_var17_ult1' and 'num_aport_var33_hace3' and
## 'num_aport_var33_ult1' and 'num_compra_var44_hace3' and 'num_compra_var44_ult1'
## and 'num_meses_var29_ult3' and 'saldo_medio_var17_hace3' and
## 'saldo_medio_var29_hace2' and 'saldo_medio_var33_hace2' and
## 'saldo_medio_var33_hace3' and 'saldo_medio_var44_hace3' and
## 'saldo_medio_var44_ult1' constant. Cannot scale data.

Predicting Test set

Predict with the model using the test data.

# Predict the class of every row of test_data with the fitted SVM.
y_pred1 <- predict(classifier1, newdata = test_data)
# y_pred1

Check Accuracy Model

Check the model accuracy: create the table and compute the accuracy manually.

##    y_pred1
##       0   1
##   0 793  62
##   1  45   0
## [1] 0.8811111

Confusion matrix

Use library(caret): calling confusionMatrix() gives the accuracy together with other statistics.

library(caret)
# Confusion matrix and derived statistics for the SVM on the test set.
#
# When confusionMatrix() is given a table it treats the rows as predictions
# and the columns as the reference (true) classes. svm_tbl is built as
# table(test_data$TARGET, y_pred1), i.e. actual classes in the rows, so
# passing it directly swaps the two roles: sensitivity/positive predictive
# value, specificity/negative predictive value and the no-information rate
# all come out transposed. Passing the two factors by name removes the
# ambiguity. Accuracy itself is unaffected by the orientation.
# NOTE: the printed output that follows was produced by the earlier
# confusionMatrix(svm_tbl) call; re-knit the document to refresh it.
confusionMatrix(data = y_pred1, reference = test_data$TARGET)
## Confusion Matrix and Statistics
## 
##    y_pred1
##       0   1
##   0 793  62
##   1  45   0
##                                           
##                Accuracy : 0.8811          
##                  95% CI : (0.8582, 0.9015)
##     No Information Rate : 0.9311          
##     P-Value [Acc > NIR] : 1.0000          
##                                           
##                   Kappa : -0.0615         
##                                           
##  Mcnemar's Test P-Value : 0.1219          
##                                           
##             Sensitivity : 0.9463          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.9275          
##          Neg Pred Value : 0.0000          
##              Prevalence : 0.9311          
##          Detection Rate : 0.8811          
##    Detection Prevalence : 0.9500          
##       Balanced Accuracy : 0.4732          
##                                           
##        'Positive' Class : 0               
## 

Logistic Model

Logistic regression predicts the probability that an observation belongs to a class, and the class is then derived from that probability. It is well suited to predicting a binary outcome.

Importing the dataset

Import clean dataset

library(readr)
# Re-import the cleaned data that was saved as CSV.
dataset <- read_csv("d:/Y/train-c.csv")
## Warning: Missing column names filled in: 'X1' [1]
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double()
## )
## i Use `spec()` for the full column specifications.
# Discard the first column (the unnamed column that readr filled in as 'X1').
dataset <- dataset[, -1]

Splitting the dataset into the Training set and Test set

Split using random sampling with set.seed(123) into 70% train data and 30% test data.

# Reproducible 70/30 split of `dataset`; `spl` holds the training row indices.
set.seed(123)
spl <- sample(nrow(dataset), nrow(dataset) * 0.7)
train <- dataset[spl, ]
test <- dataset[-spl, ]

Model logistic regression to the training set

GLM (generalized linear model) with a binomial family, well known as the logistic regression model. In this case the target variable "TARGET" takes the values "1" and "0", so the family is binomial.

# Logistic regression: binomial GLM of TARGET on all other columns of `train`.
log_model <- glm(
  formula = TARGET ~ .,
  family = binomial,
  data = train
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

Predicting the test set results

Use type = 'response' for logistic regression, which gives the predicted probabilities in a single vector. The prediction is made on new data, the test data. prob_pred > 0.5 means the predicted "TARGET" is "1"; otherwise it is "0".

# type = "response" requests predictions on the probability scale.
prob_pred <- predict(log_model, newdata = test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# prob_pred
# Classify as 1 when the predicted probability is above 0.5, otherwise as 0.
y_pred <- ifelse(prob_pred > 0.5, 1, 0)
# y_pred

Check accuracy

Create the table and compute the accuracy manually.

# Cross-tabulate actual TARGET (rows) against the predicted class (columns).
log_tbl <- table(test$TARGET, y_pred)
print(log_tbl)
##    y_pred
##         0     1
##   0 18730  1624
##   1   434   384
# Accuracy = share of observations on the diagonal of the table.
log_accuracy <- sum(diag(log_tbl)) / sum(log_tbl)
print(log_accuracy)
## [1] 0.9027961

Making the confusion matrix

Use library(caret); calling confusionMatrix() reports the accuracy and related statistics of the model.

library(caret)
# Confusion matrix and derived statistics for the logistic model on `test`.
#
# When confusionMatrix() is given a table it treats the rows as predictions
# and the columns as the reference (true) classes. log_tbl is built as
# table(test$TARGET, y_pred), i.e. actual classes in the rows, so passing it
# directly swaps the two roles and transposes sensitivity/positive
# predictive value, specificity/negative predictive value and the
# no-information rate. Passing both vectors by name, as factors with the
# same levels, removes the ambiguity. Accuracy is unaffected.
# NOTE: the printed output that follows was produced by the earlier
# confusionMatrix(log_tbl) call; re-knit the document to refresh it.
confusionMatrix(
  data = factor(y_pred, levels = c(0, 1)),
  reference = factor(test$TARGET, levels = c(0, 1))
)
## Confusion Matrix and Statistics
## 
##    y_pred
##         0     1
##   0 18730  1624
##   1   434   384
##                                           
##                Accuracy : 0.9028          
##                  95% CI : (0.8987, 0.9068)
##     No Information Rate : 0.9052          
##     P-Value [Acc > NIR] : 0.8817          
##                                           
##                   Kappa : 0.2295          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9774          
##             Specificity : 0.1912          
##          Pos Pred Value : 0.9202          
##          Neg Pred Value : 0.4694          
##              Prevalence : 0.9052          
##          Detection Rate : 0.8847          
##    Detection Prevalence : 0.9614          
##       Balanced Accuracy : 0.5843          
##                                           
##        'Positive' Class : 0               
## 

Performance Model Accuracy

# Rebuild both confusion tables (rows = actual TARGET, columns = predicted
# class) and compute each model's accuracy as the share of observations on
# the table diagonal, then combine the two figures into one summary string.
svm_tbl <- table(test_data$TARGET, y_pred1)
svm_accuracy <- sum(diag(svm_tbl)) / sum(svm_tbl)
# Fixed: this line previously read `log_tbl < table(...)`, an element-wise
# comparison whose all-FALSE result was printed and discarded; an assignment
# was intended.
log_tbl <- table(test$TARGET, y_pred)
log_accuracy <- sum(diag(log_tbl)) / sum(log_tbl)
Accuracy_model <- paste(
  "SVM Model Accuracy :", svm_accuracy,
  "Logistic model Accuracy :", log_accuracy
)
Accuracy_model
## [1] "SVM Model Accuracy : 0.881111111111111 Logistic model Accuracy : 0.902796145853013"