Support vector machines (SVMs) can be used for classification (distinguishing between several groups or classes) and regression (obtaining a mathematical model to predict something). They can be applied to both linear and non-linear problems. The training data was imported from Kaggle.
Remove the ID column, as it plays no role in the machine learning process.
# Read the raw training set into `train` with readr.
library(readr)
train <- read_csv(file = "d:/data/train.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double()
## )
## i Use `spec()` for the full column specifications.
# Remove the identifier column, then report the remaining rows x columns.
train[["ID"]] <- NULL
dim(train)
## [1] 76020 370
Remove bad records: check for missing (empty) values and delete the rows that contain them using na.omit.
## [1] 1310
## [1] 0
Remove predictors that have a constant value of 0 across all observations.
# Drop every predictor whose value is 0 in all observations; such a column
# carries no information for the models. TARGET is never a removal candidate.
#
# The earlier test, mean(x) == sum(x), only established that the column sums
# to zero. It therefore also removed informative columns whose positive and
# negative values cancel (e.g. c(-1, 0, 1)), removed every column of a
# one-row data set, and stopped with an error on a column containing NA
# (the `if` condition evaluated to NA).
for (f in setdiff(names(train), "TARGET")) {
  column <- train[[f]]
  # A column with missing values is not provably all-zero, so it is kept.
  if (!anyNA(column) && all(column == 0)) {
    # cat(f, "is constant in train.\n")
    train[[f]] <- NULL
  }
}
Keep only the unique variables: duplicated predictor variables should be removed. Using library(digest) gives a faster result.
Create a new feature: the count of the number of zeros in each row.
Use library(caret) to remove the odd linear functions.
## Loading required package: lattice
## Loading required package: ggplot2
Remove the duplicate rows with dup=!duplicated(train), then save the cleaned data to the file train-c.csv.
# `dup` is TRUE for the first occurrence of each distinct row, i.e. it marks
# the rows to KEEP. The same mask is applied to the label vector so labels
# and rows stay aligned.
dup <- !duplicated(train)
train.y <- train.y[dup]
train <- train[dup, ] # ; rm(dup)
dim(train)
## [1] 70571 170
# Class balance of the labels after de-duplication.
table(train.y)
## train.y
## 0 1
## 67768 2803
# Persist the cleaned data; write.csv also emits the row names as an
# unnamed first column.
write.csv(train, file = "train-c.csv")
One way to evaluate the performance of a model is to train it on a number of different smaller datasets and evaluate it on the other, smaller testing sets. Here, to save memory, the dataset is limited to the first 3000 observations and then split for model preparation: 70% for train_data and 30% for test_data.
# Reload the cleaned file, keeping only the first 3000 rows and dropping the
# first column (the row names that write.csv stored).
dr1 <- read.csv(file = "train-c.csv")[seq_len(3000), -1]

# The SVM classifier needs a categorical response.
dr1[["TARGET"]] <- as.factor(dr1[["TARGET"]])

# Reproducible 70/30 split: `spl` holds the row indices of the training part.
set.seed(123)
spl <- sample(x = nrow(dr1), size = nrow(dr1) * 0.7)
train_data <- dr1[spl, ]
test_data <- dr1[-spl, ]
SVM model: the target variable is "TARGET", the data is train_data, and the model is fitted with library(e1071).
# Fit a linear-kernel support vector classifier of TARGET on every other
# column of the training split.
library(e1071)
classifier1 <- svm(
  formula = TARGET ~ .,
  data = train_data,
  type = "C-classification",
  kernel = "linear"
)
## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'ind_var6_0' and 'ind_var6' and 'ind_var13_medio_0' and
## 'ind_var33' and 'ind_var44' and 'num_var33_0' and 'saldo_var13_medio' and
## 'saldo_var33' and 'delta_imp_aport_var17_1y3' and 'delta_imp_aport_var33_1y3'
## and 'delta_imp_reemb_var17_1y3' and 'delta_imp_trasp_var17_in_1y3'
## and 'delta_imp_trasp_var33_in_1y3' and 'delta_num_aport_var17_1y3'
## and 'delta_num_aport_var33_1y3' and 'delta_num_compra_var44_1y3'
## and 'delta_num_venta_var44_1y3' and 'imp_aport_var13_ult1'
## and 'imp_aport_var17_ult1' and 'imp_aport_var33_hace3' and
## 'imp_aport_var33_ult1' and 'imp_var7_emit_ult1' and 'imp_compra_var44_hace3'
## and 'imp_compra_var44_ult1' and 'imp_trasp_var17_in_hace3' and
## 'ind_var7_emit_ult1' and 'num_aport_var13_ult1' and 'num_aport_var17_hace3'
## and 'num_aport_var17_ult1' and 'num_aport_var33_hace3' and
## 'num_aport_var33_ult1' and 'num_compra_var44_hace3' and 'num_compra_var44_ult1'
## and 'num_meses_var29_ult3' and 'saldo_medio_var17_hace3' and
## 'saldo_medio_var29_hace2' and 'saldo_medio_var33_hace2' and
## 'saldo_medio_var33_hace3' and 'saldo_medio_var44_hace3' and
## 'saldo_medio_var44_ult1' constant. Cannot scale data.
Predict with the model using the test data.
# Predicted class for every row of the held-out split.
y_pred1 <- predict(classifier1, newdata = test_data)
# y_pred1
Check the model accuracy: create a table and compute the accuracy manually.
## y_pred1
## 0 1
## 0 793 62
## 1 45 0
## [1] 0.8811111
Use library(caret): just call confusionMatrix to get the accuracy results.
library(caret)
# Give caret the predicted and the true classes explicitly instead of the
# pre-built `svm_tbl`:
#   * in the visible script `svm_tbl` is only assigned further down, so this
#     call could not run top to bottom;
#   * `svm_tbl` is table(actual, predicted), while confusionMatrix() reads a
#     table as predictions in rows and reference in columns, which swaps
#     sensitivity with positive predictive value, specificity with negative
#     predictive value, and derives prevalence / no-information rate from the
#     predictions instead of the truth.
# NOTE(review): the statistics printed after this call in the transcript were
# produced from the transposed table; re-knit to refresh them. Accuracy and
# kappa are unaffected by the orientation.
confusionMatrix(data = y_pred1, reference = test_data$TARGET)
## Confusion Matrix and Statistics
##
## y_pred1
## 0 1
## 0 793 62
## 1 45 0
##
## Accuracy : 0.8811
## 95% CI : (0.8582, 0.9015)
## No Information Rate : 0.9311
## P-Value [Acc > NIR] : 1.0000
##
## Kappa : -0.0615
##
## Mcnemar's Test P-Value : 0.1219
##
## Sensitivity : 0.9463
## Specificity : 0.0000
## Pos Pred Value : 0.9275
## Neg Pred Value : 0.0000
## Prevalence : 0.9311
## Detection Rate : 0.8811
## Detection Prevalence : 0.9500
## Balanced Accuracy : 0.4732
##
## 'Positive' Class : 0
##
Logistic regression is used to predict a class by estimating the probability that an observation belongs to it. Logistic regression can predict a binary outcome accurately.
Import the cleaned dataset.
# Read the cleaned data written earlier into `dataset` with readr.
library(readr)
dataset <- read_csv(file = "d:/Y/train-c.csv")
## Warning: Missing column names filled in: 'X1' [1]
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double()
## )
## i Use `spec()` for the full column specifications.
# Discard the first column: the unnamed row-name column that read_csv
# reported as 'X1'.
dataset <- dataset[-1]
Split the data by random sampling, with set.seed(123), into 70% training data and 30% test data.
# Reproducible 70/30 split of the full cleaned data; `spl` holds the row
# indices that go to the training part.
set.seed(123)
spl <- sample(x = nrow(dataset), size = nrow(dataset) * 0.7)
train <- dataset[spl, ]
test <- dataset[-spl, ]
A GLM (generalized linear model) with a binomial family is better known as the logistic regression model. In this case the target variable "TARGET" takes the values "1" and "0", so the family is binomial.
# Logistic regression of TARGET on every other column of the training split.
log_model <- glm(
  TARGET ~ .,
  data = train,
  family = binomial
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Use type = 'response' for logistic regression; this returns the predicted probabilities as a single vector. The prediction is made on new data, the test data. prob_pred > 0.5 means the predicted "TARGET" is "1"; otherwise it is "0".
# Predicted probabilities for the held-out rows.
prob_pred <- predict(log_model, newdata = test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# prob_pred
# Label a row 1 when its probability exceeds 0.5, otherwise 0.
y_pred <- ifelse(prob_pred > 0.5, yes = 1, no = 0)
# y_pred
Create the table and compute the accuracy manually.
# Cross-tabulate the true classes (rows) against the predicted classes
# (columns).
log_tbl <- table(test[["TARGET"]], y_pred)
log_tbl
## y_pred
## 0 1
## 0 18730 1624
## 1 434 384
# Accuracy = agreeing cells on the diagonal divided by all cases.
log_accuracy <- sum(diag(log_tbl)) / sum(log_tbl)
log_accuracy
## [1] 0.9027961
Use library(caret): just call confusionMatrix to get the accuracy of the model.
library(caret)
# Give caret the predicted and the true classes explicitly instead of
# `log_tbl`. That table is table(actual, predicted), while confusionMatrix()
# reads a table as predictions in rows and reference in columns, so the
# per-class statistics came out transposed (sensitivity swapped with positive
# predictive value, specificity with negative predictive value, prevalence
# and no-information rate taken from the predictions).
# Both vectors are numeric 0/1 here, so they are converted to factors with
# the same two levels; fixing the levels also keeps the matrix 2 x 2 when a
# class is never predicted.
# NOTE(review): the statistics printed after this call in the transcript were
# produced from the transposed table; re-knit to refresh them. Accuracy and
# kappa are unaffected by the orientation.
confusionMatrix(
  data = factor(y_pred, levels = c(0, 1)),
  reference = factor(test$TARGET, levels = c(0, 1))
)
## Confusion Matrix and Statistics
##
## y_pred
## 0 1
## 0 18730 1624
## 1 434 384
##
## Accuracy : 0.9028
## 95% CI : (0.8987, 0.9068)
## No Information Rate : 0.9052
## P-Value [Acc > NIR] : 0.8817
##
## Kappa : 0.2295
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9774
## Specificity : 0.1912
## Pos Pred Value : 0.9202
## Neg Pred Value : 0.4694
## Prevalence : 0.9052
## Detection Rate : 0.8847
## Detection Prevalence : 0.9614
## Balanced Accuracy : 0.5843
##
## 'Positive' Class : 0
##
# Side-by-side accuracy summary of the two models.
# Each table cross-tabulates the true classes (rows) against the predicted
# classes (columns); accuracy is the diagonal share of all cases.
svm_tbl <- table(test_data$TARGET, y_pred1)
svm_accuracy <- sum(diag(svm_tbl)) / sum(svm_tbl)
# This line used `<` instead of `<-`: it compared the existing log_tbl with a
# fresh table (printing a matrix of FALSE) and never assigned anything.
log_tbl <- table(test$TARGET, y_pred)
log_accuracy <- sum(diag(log_tbl)) / sum(log_tbl)
Accuracy_model <- paste(
  "SVM Model Accuracy :", svm_accuracy,
  "Logistic model Accuracy :", log_accuracy
)
Accuracy_model
## [1] "SVM Model Accuracy : 0.881111111111111 Logistic model Accuracy : 0.902796145853013"