The Salary data set is used to build a support vector machine model that predicts the salary classification from factors such as age, occupation, workclass, marital status, etc.
library(kernlab)
## Warning: package 'kernlab' was built under R version 3.5.1
library(caret)
## Warning: package 'caret' was built under R version 3.5.1
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.1
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:kernlab':
##
## alpha
library(plyr)
## Warning: package 'plyr' was built under R version 3.5.1
library(ggplot2)
library(psych)
## Warning: package 'psych' was built under R version 3.5.1
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## The following object is masked from 'package:kernlab':
##
## alpha
library(e1071)
## Warning: package 'e1071' was built under R version 3.5.1
# Data (Train)
# Prompt interactively for the training CSV.
# stringsAsFactors = TRUE keeps the text columns as factors on R >= 4.0, where
# the read.csv() default changed to FALSE; the str() output recorded below and
# the factor-vs-factor plots later in the script expect factor columns.
train_sal <- read.csv(file.choose(), stringsAsFactors = TRUE)
str(train_sal)
## 'data.frame': 30161 obs. of 14 variables:
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ workclass : Factor w/ 7 levels " Federal-gov",..: 6 5 3 3 3 3 3 5 3 3 ...
## $ education : Factor w/ 16 levels " 10th"," 11th",..: 10 10 12 2 10 13 7 12 13 10 ...
## $ educationno : int 13 13 9 7 13 14 5 9 14 13 ...
## $ maritalstatus: Factor w/ 7 levels " Divorced"," Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
## $ occupation : Factor w/ 14 levels " Adm-clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
## $ relationship : Factor w/ 6 levels " Husband"," Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
## $ race : Factor w/ 5 levels " Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
## $ sex : Factor w/ 2 levels " Female"," Male": 2 2 2 2 1 1 1 2 1 2 ...
## $ capitalgain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capitalloss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hoursperweek : int 40 13 40 40 40 40 16 45 50 40 ...
## $ native : Factor w/ 40 levels " Cambodia"," Canada",..: 38 38 38 38 5 38 22 38 38 38 ...
## $ Salary : Factor w/ 2 levels " <=50K"," >50K": 1 1 1 1 1 1 1 2 2 2 ...
# Open the training data in the interactive data viewer.
View(train_sal)
# Store the education-level code as a factor instead of an integer.
train_sal[["educationno"]] <- as.factor(train_sal[["educationno"]])
class(train_sal)
## [1] "data.frame"
# Data (Test)
# Prompt interactively for the test CSV.
# stringsAsFactors = TRUE keeps the text columns as factors on R >= 4.0, where
# the read.csv() default changed to FALSE, so the test columns have the same
# types as the str() output recorded below.
test_sal <- read.csv(file.choose(), stringsAsFactors = TRUE)
str(test_sal)
## 'data.frame': 15060 obs. of 14 variables:
## $ age : int 25 38 28 44 34 63 24 55 65 36 ...
## $ workclass : Factor w/ 7 levels " Federal-gov",..: 3 3 2 3 3 5 3 3 3 1 ...
## $ education : Factor w/ 16 levels " 10th"," 11th",..: 2 12 8 16 1 15 16 6 12 10 ...
## $ educationno : int 7 9 12 10 6 15 10 4 9 13 ...
## $ maritalstatus: Factor w/ 7 levels " Divorced"," Married-AF-spouse",..: 5 3 3 3 5 3 5 3 3 3 ...
## $ occupation : Factor w/ 14 levels " Adm-clerical",..: 7 5 11 7 8 10 8 3 7 1 ...
## $ relationship : Factor w/ 6 levels " Husband"," Not-in-family",..: 4 1 1 1 2 1 5 1 1 1 ...
## $ race : Factor w/ 5 levels " Amer-Indian-Eskimo",..: 3 5 5 3 5 5 5 5 5 5 ...
## $ sex : Factor w/ 2 levels " Female"," Male": 2 2 2 2 2 2 1 2 2 2 ...
## $ capitalgain : int 0 0 0 7688 0 3103 0 0 6418 0 ...
## $ capitalloss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hoursperweek : int 40 50 40 40 30 32 40 10 40 40 ...
## $ native : Factor w/ 40 levels " Cambodia"," Canada",..: 38 38 38 38 38 38 38 38 38 38 ...
## $ Salary : Factor w/ 2 levels " <=50K"," >50K": 1 1 2 2 1 2 1 1 2 1 ...
# Open the test data in the interactive data viewer.
View(test_sal)
# Store the education-level code as a factor instead of an integer.
test_sal[["educationno"]] <- as.factor(test_sal[["educationno"]])
class(test_sal)
## [1] "data.frame"
# Visualization
# Plot and ggplot
# Box plots show each numeric column split by Salary class; plot() on two
# factors shows each categorical column against Salary.
# Inside aes() the columns are referenced by bare name (resolved in `data`)
# rather than as train_sal$col, so the axis and legend labels read "Salary",
# "age", ... instead of "train_sal$Salary", "train_sal$age", ...
ggplot(data = train_sal, aes(x = Salary, y = age, fill = Salary)) +
  geom_boxplot() +
  ggtitle("Box Plot")
plot(train_sal$workclass,train_sal$Salary)
plot(train_sal$education,train_sal$Salary)
plot(train_sal$educationno,train_sal$Salary)
plot(train_sal$maritalstatus,train_sal$Salary)
plot(train_sal$occupation,train_sal$Salary)
plot(train_sal$relationship,train_sal$Salary)
plot(train_sal$race,train_sal$Salary)
plot(train_sal$sex,train_sal$Salary)
ggplot(data = train_sal, aes(x = Salary, y = capitalgain, fill = Salary)) +
  geom_boxplot() +
  ggtitle("Box Plot")
ggplot(data = train_sal, aes(x = Salary, y = capitalloss, fill = Salary)) +
  geom_boxplot() +
  ggtitle("Box Plot")
ggplot(data = train_sal, aes(x = Salary, y = hoursperweek, fill = Salary)) +
  geom_boxplot() +
  ggtitle("Box Plot")
plot(train_sal$native,train_sal$Salary)
# Density Plot
# One density plot per column, filled by Salary class.
# Each ggtitle() is now joined to its plot with `+`. Previously the `+` was
# missing, so ggtitle() ran as a separate statement: the plot was drawn
# without a title and the bare labels object was printed to the console.
# Columns are referenced by bare name inside aes() so they resolve in `data`.
# NOTE(review): several of these columns are factors; a density estimate on a
# categorical x is of doubtful value - consider geom_bar() for those.
ggplot(data = train_sal, aes(x = age, fill = Salary)) +
  geom_density(alpha = 0.9, color = "Violet") +
  ggtitle("Age - Density Plot")
ggplot(data = train_sal, aes(x = workclass, fill = Salary)) +
  geom_density(alpha = 0.9, color = "Violet") +
  ggtitle("Workclass Density Plot")
ggplot(data = train_sal, aes(x = education, fill = Salary)) +
  geom_density(alpha = 0.9, color = "Violet") +
  ggtitle("education Density Plot")
ggplot(data = train_sal, aes(x = educationno, fill = Salary)) +
  geom_density(alpha = 0.9, color = "Violet") +
  ggtitle("educationno Density Plot")
ggplot(data = train_sal, aes(x = maritalstatus, fill = Salary)) +
  geom_density(alpha = 0.9, color = "Violet") +
  ggtitle("maritalstatus Density Plot")
ggplot(data = train_sal, aes(x = occupation, fill = Salary)) +
  geom_density(alpha = 0.9, color = "Violet") +
  ggtitle("occupation Density Plot")
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
ggplot(data = train_sal, aes(x = sex, fill = Salary)) +
  geom_density(alpha = 0.9, color = "Violet") +
  ggtitle("sex Density Plot")
ggplot(data = train_sal, aes(x = relationship, fill = Salary)) +
  geom_density(alpha = 0.9, color = "Violet") +
  ggtitle("Relationship Density Plot")
ggplot(data = train_sal, aes(x = race, fill = Salary)) +
  geom_density(alpha = 0.9, color = "Violet") +
  ggtitle("Race Density Plot")
ggplot(data = train_sal, aes(x = capitalgain, fill = Salary)) +
  geom_density(alpha = 0.9, color = "Violet") +
  ggtitle("Capitalgain Density Plot")
ggplot(data = train_sal, aes(x = capitalloss, fill = Salary)) +
  geom_density(alpha = 0.9, color = "Violet") +
  ggtitle("Capitalloss Density Plot")
ggplot(data = train_sal, aes(x = hoursperweek, fill = Salary)) +
  geom_density(alpha = 0.9, color = "Violet") +
  ggtitle("Hoursperweek Density Plot")
ggplot(data = train_sal, aes(x = native, fill = Salary)) +
  geom_density(alpha = 0.9, color = "Violet") +
  ggtitle("native Density Plot")
## Warning: Groups with fewer than two data points have been dropped.
# Building model
# Fit an SVM with the linear ("vanilladot") kernel, predicting Salary from
# every other column of train_sal.
# The response is written as the column name `Salary` so the whole formula is
# resolved inside `data`, instead of reaching for the global train_sal object
# on the left-hand side.
model1 <- ksvm(Salary ~ ., data = train_sal, kernel = "vanilladot")
## Setting default kernel parameters
model1
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 10599
##
## Objective Function Value : -10535.76
## Training error : 0.15162
# Predict the salary class for the test rows with model1 and cross-tabulate
# the predictions (rows) against the recorded classes (columns).
Salary_prediction <- predict(model1, newdata = test_sal)
table(Salary_prediction, test_sal[["Salary"]])
##
## Salary_prediction <=50K >50K
## <=50K 10601 1554
## >50K 759 2146
# Element-wise flag: TRUE where the prediction equals the recorded class.
agreement <- test_sal[["Salary"]] == Salary_prediction
table(agreement)
## agreement
## FALSE TRUE
## 2313 12747
# The same counts as proportions; the TRUE share is the test-set accuracy.
prop.table(table(agreement))
## agreement
## FALSE TRUE
## 0.1535857 0.8464143
# Different types of kernels
# "rbfdot", "polydot", "tanhdot", "vanilladot", "laplacedot",
# "besseldot", "anovadot", "splinedot", "matrix"
# kernel = rbfdot
# Fit an SVM with the Gaussian RBF kernel and report the share of test rows
# whose predicted class equals the recorded Salary.
# The response is written as the column name `Salary` so the formula is
# resolved inside `data`.
# NOTE(review): no kernel parameters are supplied here; presumably kernlab
# estimates sigma from a random sample, so the accuracy may vary between runs
# unless a seed is set - confirm against the ksvm documentation.
model_rfdot <- ksvm(Salary ~ ., data = train_sal, kernel = "rbfdot")
pred_rfdot <- predict(model_rfdot, newdata = test_sal)
mean(pred_rfdot == test_sal$Salary) # 85.20
## [1] 0.851992
# kernel = vanilladot
# Fit an SVM with the linear kernel and report the share of test rows whose
# predicted class equals the recorded Salary. This is the same specification
# as model1 above.
# The response is written as the column name `Salary` so the formula is
# resolved inside `data`.
model_vanilla <- ksvm(Salary ~ ., data = train_sal, kernel = "vanilladot")
## Setting default kernel parameters
pred_vanilla <- predict(model_vanilla, newdata = test_sal)
mean(pred_vanilla == test_sal$Salary) # 84.64
## [1] 0.8464143