# Support Vector Machine
##Link: https://www.geeksforgeeks.org/r-language/classifying-data-using-support-vector-machinessvms-in-r/
# NOTE: the script intentionally does not call rm(list = ls()). That call
# deletes every object in the caller's workspace (destructive when the file is
# source()d from an interactive session) yet leaves loaded packages and
# options untouched, so it does not give a clean session anyway. Restart R to
# get a fresh session instead. Every object used below is created by this
# script.
#
# One-time setup: uncomment to install any package that is missing.
# install.packages('e1071')
# install.packages('caTools')
# install.packages('ggplot2')
# install.packages('caret')
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(e1071)
##
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
##
## element
library(caTools)
library(ggplot2)
# Read the data set from a local CSV file into `data`.
# Alternative location of the same file (kept for reference):
# data = read.csv("~/Desktop/CAU/AI4OPT/Data Engineering and Mining II/social.csv")
data <- read.csv(file = "/Users/maxineharlemon/AIOpt/social.csv")

# Show the first rows, then per-column summary statistics.
head(x = data)
summary(object = data)
## User.ID Gender Age EstimatedSalary
## Min. :15566689 Length:400 Min. :18.00 Min. : 15000
## 1st Qu.:15626764 Class :character 1st Qu.:29.75 1st Qu.: 43000
## Median :15694342 Mode :character Median :37.00 Median : 70000
## Mean :15691540 Mean :37.66 Mean : 69742
## 3rd Qu.:15750363 3rd Qu.:46.00 3rd Qu.: 88000
## Max. :15815236 Max. :60.00 Max. :150000
## Purchased
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3575
## 3rd Qu.:1.0000
## Max. :1.0000
# Fix the RNG seed so the random train/test split below is reproducible.
set.seed(123)

# Encode Gender numerically. as.numeric() on a factor returns the integer
# level codes, so the result is "Male" -> 1 and "Female" -> 2; any other
# value becomes NA.
data$Gender <- as.numeric(
  factor(data$Gender, levels = c("Male", "Female"))
)

# Standardise Age and EstimatedSalary (centre and scale each column).
# The statistics are computed over all rows, i.e. before the split.
data[, c("Age", "EstimatedSalary")] <- scale(
  data[, c("Age", "EstimatedSalary")]
)

# Logical mask over the rows: TRUE marks the training rows (ratio 0.75),
# FALSE marks the test rows.
split <- sample.split(data$Purchased, SplitRatio = 0.75)
training_set <- subset(data, split)
test_set <- subset(data, !split)
# Count missing values in the training data (none were found in the recorded
# run).
sum(is.na(training_set))
## [1] 0

# Copy of the training data with incomplete rows dropped. It is not used by
# the model fit below; it is kept so that any code relying on this object
# keeps working.
training_set_cleaned <- na.omit(training_set)

# Fit a C-classification SVM with a radial kernel on the three predictors.
# The kernel argument was previously misspelt as `kernal`, which svm() does
# not recognise as `kernel`, so the setting never reached the fit. The model
# still used a radial kernel only because that is the documented default.
# Spelling it correctly makes the choice explicit and effective.
classifier <- svm(
  Purchased ~ Age + EstimatedSalary + Gender,
  data = training_set,
  type = "C-classification",
  kernel = "radial",
  gamma = 0.1
)

# Predicted class for every row of the held-out test set.
y_pred <- predict(classifier, newdata = test_set)
# Cross-tabulate the actual labels (rows) against the predictions (columns).
table(test_set$Purchased, y_pred)
## y_pred
## 0 1
## 0 59 5
## 1 4 32

# Accuracy is the share of test rows on the diagonal of that table.
# The table is built once inside local() so no helper object is left behind.
accuracy <- local({
  confusion <- table(test_set$Purchased, y_pred)
  sum(diag(confusion)) / sum(confusion)
})
cat("Accuracy: ", accuracy)
## Accuracy: 0.91
# caret's confusionMatrix() reads a table as predictions in the rows and the
# reference (true) labels in the columns. The table was previously passed the
# other way round (actual x predicted), which swaps Sensitivity with Pos Pred
# Value and Specificity with Neg Pred Value, and reports the predicted rate as
# Prevalence. The table is now built as predicted x actual.
# NOTE: the recorded output that follows this call was produced with the old
# orientation (its column header is y_pred), so those class-wise figures are
# the swapped ones; Accuracy and Kappa are unaffected.
confusionMatrix(table(Predicted = y_pred, Actual = test_set$Purchased))
## Confusion Matrix and Statistics
##
## y_pred
## 0 1
## 0 59 5
## 1 4 32
##
## Accuracy : 0.91
## 95% CI : (0.836, 0.958)
## No Information Rate : 0.63
## P-Value [Acc > NIR] : 1.623e-10
##
## Kappa : 0.8059
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9365
## Specificity : 0.8649
## Pos Pred Value : 0.9219
## Neg Pred Value : 0.8889
## Prevalence : 0.6300
## Detection Rate : 0.5900
## Detection Prevalence : 0.6400
## Balanced Accuracy : 0.9007
##
## 'Positive' Class : 0
##
# Axis values for the prediction grid: the training range of each feature,
# padded by 1 on both sides, in steps of 0.5.
X1 <- seq(
  from = min(training_set$Age) - 1,
  to = max(training_set$Age) + 1,
  by = 0.5
)
X2 <- seq(
  from = min(training_set$EstimatedSalary) - 1,
  to = max(training_set$EstimatedSalary) + 1,
  by = 0.5
)

# Every (X1, X2) combination, one row per grid point.
grid_set <- expand.grid(X1, X2)

# Default Gender value for the grid: a constant 1.5 for every point
# (alternative considered: median(training_set$Gender)).
grid_set$Gender <- 1.5

# Rename the columns to the predictor names used in the model formula.
colnames(grid_set) <- c("Age", "EstimatedSalary", "Gender")

# Predicted class at each grid point.
y_grid <- predict(classifier, newdata = grid_set)
# Plot the classifier's prediction at every grid point as a text label and
# overlay the training observations as hollow circles coloured by their
# actual Purchased value. Both layers share the same x/y mapping.
ggplot(mapping = aes(x = Age, y = EstimatedSalary)) +
  # Predicted class label at each grid point.
  geom_text(
    data = grid_set,
    mapping = aes(label = as.factor(y_grid))
  ) +
  # Training points; shape 21 is a circle whose outline takes the colour.
  geom_point(
    data = training_set,
    mapping = aes(color = as.factor(Purchased)),
    shape = 21,
    size = 3
  ) +
  # No layer maps a fill aesthetic, so this scale currently has no visible
  # effect; it is kept unchanged.
  scale_fill_manual(values = c('coral1', 'aquamarine')) +
  scale_color_manual(values = c('green4', 'red3')) +
  labs(
    title = 'SVM Decision Boundary (Training set)',
    x = 'Age',
    y = 'Estimated Salary'
  ) +
  theme_minimal() +
  theme(legend.position = "none")
