STA-6543 Assignment #8
Problem 5
We have seen that we can fit an SVM with a non-linear kernel in order to perform classification using a non-linear decision boundary. We will now see that we can also obtain a non-linear decision boundary by performing logistic regression using non-linear transformations of the features.
(a) Generate a data set with n = 500 and p = 2, such that the observations belong to two classes with a quadratic decision boundary between them.
set.seed(1)
x1 = runif(500) - 0.5
x2 = runif(500) - 0.5
y = 1 * (x1^2 - x2^2 > 0)
plot(x1[y==0],x2[y==0],col="red",xlab="X1",ylab="X2")
points(x1[y==1], x2[y==1], col="blue")
df = data.frame(x1 = x1, x2 = x2, y = as.factor(y))
library(ggplot2)
glm_fit = glm(y~., data=df, family='binomial')
glm_prob=predict(glm_fit,newdata=df,type='response')
glm_pred=ifelse(glm_prob>0.5,1,0)
ggplot(data = df, mapping = aes(x1, x2)) +
geom_point(data = df, mapping = aes(colour = glm_pred))
glm_fit_2 = glm(y~poly(x1,2)+poly(x2,2), data=df, family='binomial')
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm_prob_2=predict(glm_fit_2,newdata=df,type='response')
glm_pred_2=ifelse(glm_prob_2>0.5,1,0)
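These warnings are expected here: the quadratic terms can separate the two classes (nearly) perfectly, so maximum likelihood drives the fitted probabilities toward 0 and 1. A quick check (a sketch, using the objects computed above):
range(glm_prob_2)      # fitted probabilities pile up near 0 and 1
mean(glm_pred_2 == y)  # training accuracy of the quadratic logistic fit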
ggplot(data = df, mapping = aes(x1, x2)) +
geom_point(data = df, mapping = aes(colour = glm_pred_2))
library(e1071)
svm_lin=svm(y~.,data=df,kernel='linear',cost=0.01)
plot(svm_lin, df)
svm_lin_2 = svm(y~., data=df, kernel='radial', gamma=1)
plot(svm_lin_2, data=df)
- The radial-kernel SVM and the logistic regression with quadratic terms both recover the non-linear (quadratic) decision boundary; the linear-kernel SVM and the logistic regression on the raw features produce only linear boundaries, as the training-error sketch below confirms.
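A minimal sketch that quantifies this comparison via training error rates (assumes x1, x2, y, df, glm_pred, glm_pred_2, svm_lin, and svm_lin_2 from the chunks above are still in the workspace):
# Training error rate of each classifier on the simulated data.
err <- function(pred) mean(pred != y)
c(logistic_linear    = err(glm_pred),
  logistic_quadratic = err(glm_pred_2),
  svm_linear         = err(as.numeric(as.character(predict(svm_lin, df)))),
  svm_radial         = err(as.numeric(as.character(predict(svm_lin_2, df)))))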
Problem 7
In this problem, you will use support vector approaches in order to predict whether a given car gets high or low gas mileage based on the Auto data set. (a) Create a binary variable that takes on a 1 for cars with gas mileage above the median, and a 0 for cars with gas mileage below the median.
library(ISLR2)
attach(Auto)
mpg_med = median(Auto$mpg)
bin.var = ifelse(Auto$mpg > mpg_med, 1, 0)
Auto$mpglevel = as.factor(bin.var)
library(e1071)
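A quick sanity check (a sketch; this output is not part of the original): the median split should leave the two classes roughly balanced.
table(Auto$mpglevel)  # cars at exactly the median mpg fall into class 0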
set.seed(1)
tune_out = tune(svm, mpg~., data = Auto, kernel = "linear", ranges = list(cost = c(0.01,
0.1, 1, 5, 10, 100)))
summary(tune_out)
Parameter tuning of 'svm':
- sampling method: 10-fold cross validation
- best parameters:
cost
0.1
- best performance: 8.981009
- Detailed performance results:
cost error dispersion
1 1e-02 10.305990 5.295587
2 1e-01 8.981009 4.750742
3 1e+00 9.647184 4.313908
4 5e+00 10.149220 4.755080
5 1e+01 10.306219 4.953047
6 1e+02 10.684083 5.080506
set.seed(2)
tune_out = tune(svm, mpg ~ ., data = Auto, kernel = "polynomial", ranges = list(cost = c(0.1,
1, 5, 10), degree = c(2, 3, 4)))
summary(tune_out)
Parameter tuning of 'svm':
- sampling method: 10-fold cross validation
- best parameters:
cost degree
10 2
- best performance: 50.95606
- Detailed performance results:
cost degree error dispersion
1 0.1 2 61.59446 13.60292
2 1.0 2 60.15304 13.79293
3 5.0 2 55.06386 15.19391
4 10.0 2 50.95606 15.72388
5 0.1 3 61.71831 13.56940
6 1.0 3 61.39833 13.54758
7 5.0 3 59.99304 13.43208
8 10.0 3 58.28857 13.27760
9 0.1 4 61.75343 13.57197
10 1.0 4 61.74822 13.57317
11 5.0 4 61.72510 13.57851
12 10.0 4 61.69626 13.58520
set.seed(33)
tune_out = tune(svm, mpg ~ ., data = Auto, kernel = "radial", ranges = list(cost = c(0.1,
1, 5, 10), degree = c(2, 3, 4)))
summary(tune_out)
Parameter tuning of 'svm':
- sampling method: 10-fold cross validation
- best parameters:
cost degree
10 2
- best performance: 7.294024
- Detailed performance results:
cost degree error dispersion
1 0.1 2 22.211472 8.440215
2 1.0 2 10.118616 4.286046
3 5.0 2 7.649506 2.993534
4 10.0 2 7.294024 2.818545
5 0.1 3 22.211472 8.440215
6 1.0 3 10.118616 4.286046
7 5.0 3 7.649506 2.993534
8 10.0 3 7.294024 2.818545
9 0.1 4 22.211472 8.440215
10 1.0 4 10.118616 4.286046
11 5.0 4 7.649506 2.993534
12 10.0 4 7.294024 2.818545
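The rows above are identical across degree values because degree is ignored by the radial kernel; the natural second tuning parameter for kernel = "radial" is gamma. A hypothetical sketch of that tuning (not run in the original):
set.seed(33)
tune_rad = tune(svm, mpg ~ ., data = Auto, kernel = "radial",
                ranges = list(cost = c(0.1, 1, 5, 10), gamma = c(0.01, 0.1, 1)))
summary(tune_rad)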
svm_lin = svm(mpglevel ~ ., data = Auto, kernel = "linear", cost = 1)
svm_poly = svm(mpglevel ~ ., data = Auto, kernel = "polynomial", cost = 10,
degree = 2)
svm_radial = svm(mpglevel ~ ., data = Auto, kernel = "radial", cost = 10, gamma = 0.01)
# Plot the fitted SVM against mpg and each remaining predictor in turn,
# skipping the response-related columns and the car name.
plotpairs = function(fit) {
  for (name in names(Auto)[!(names(Auto) %in% c("mpg", "mpglevel", "name"))]) {
    plot(fit, Auto, as.formula(paste("mpg~", name, sep = "")))
  }
}
plotpairs(svm_lin)
plotpairs(svm_radial)
Problem 8
This problem involves the OJ data set which is part of the ISLR2 package.
attach(OJ)
set.seed(1)
data_Train = sample(nrow(OJ), 800)
oj_train = OJ[data_Train,]
oj_test = OJ[-data_Train,]
svc = svm(Purchase~., data=oj_train, kernel='linear', cost=0.01)
summary(svc)
Call:
svm(formula = Purchase ~ ., data = oj_train, kernel = "linear", cost = 0.01)
Parameters:
SVM-Type: C-classification
SVM-Kernel: linear
cost: 0.01
Number of Support Vectors: 435
( 219 216 )
Number of Classes: 2
Levels:
CH MM
pred_train = predict(svc, oj_train)
(t <- table(oj_train$Purchase, pred_train))
    pred_train
      CH  MM
  CH 420  65
  MM  75 240
pred_test = predict(svc, oj_test)
table(oj_test$Purchase, pred_test)
    pred_test
      CH  MM
  CH 153  15
  MM  33  69
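Error rates implied by the two tables above (simple arithmetic on the confusion matrices):
(65 + 75) / 800   # training error rate = 0.175
(15 + 33) / 270   # test error rate ~ 0.178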
set.seed(1)
tune_svc = tune(svm, Purchase ~ ., data = oj_train, kernel = "linear", ranges = list(cost = c(0.01,0.1,1,10)))
summary(tune_svc)
Parameter tuning of 'svm':
- sampling method: 10-fold cross validation
- best parameters:
cost
0.1
- best performance: 0.1725
- Detailed performance results:
cost error dispersion
1 0.01 0.17625 0.02853482
2 0.10 0.17250 0.03162278
3 1.00 0.17500 0.02946278
4 10.00 0.17375 0.03197764
svm_lin_1 = svm(Purchase ~ ., kernel = "linear", data = oj_train, cost = tune_svc$best.parameters$cost)
pred_train_1 = predict(svm_lin_1, oj_train)
table(oj_train$Purchase, pred_train_1)
    pred_train_1
      CH  MM
  CH 423  62
  MM  69 246
test_pred_1 = predict(svm_lin_1, oj_test)
(t <- table(oj_test$Purchase, test_pred_1))
    test_pred_1
      CH  MM
  CH 156  12
  MM  28  74
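With the tuned cost, both error rates improve:
(62 + 69) / 800   # training error rate ~ 0.164
(12 + 28) / 270   # test error rate ~ 0.148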
set.seed(1)
svm_rad_1 = svm(Purchase ~ ., data = oj_train, kernel = "radial")
summary(svm_rad_1)
Call:
svm(formula = Purchase ~ ., data = oj_train, kernel = "radial")
Parameters:
SVM-Type: C-classification
SVM-Kernel: radial
cost: 1
Number of Support Vectors: 373
( 188 185 )
Number of Classes: 2
Levels:
CH MM
pred_train_2 = predict(svm_rad_1, oj_train)
table(oj_train$Purchase, pred_train_2)
    pred_train_2
      CH  MM
  CH 441  44
  MM  77 238
test_pred_2 = predict(svm_rad_1, oj_test)
table(oj_test$Purchase, test_pred_2)
    test_pred_2
      CH  MM
  CH 151  17
  MM  33  69
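For the default radial fit:
(44 + 77) / 800   # training error rate ~ 0.151
(17 + 33) / 270   # test error rate ~ 0.185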
svm_rad_1 = svm(Purchase ~ ., data = oj_train, kernel = "radial", cost = tune_svc$best.parameters$cost)
pred_train = predict(svm_rad_1, oj_train)
table(oj_train$Purchase, pred_train_1)
    pred_train_1
      CH  MM
  CH 423  62
  MM  69 246
test_pred_3 = predict(svm_rad_1, oj_test)
(t <- table(oj_test$Purchase, test_pred_3))
    test_pred_3
      CH  MM
  CH 150  18
  MM  37  65
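Test error for the radial fit refit at the cost chosen by the linear tuning:
(18 + 37) / 270   # test error rate ~ 0.204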
svm_pol_1 = svm(Purchase ~ ., kernel = "poly", data = oj_train, degree=2)
summary(svm_pol_1)
Call:
svm(formula = Purchase ~ ., data = oj_train, kernel = "poly", degree = 2)
Parameters:
SVM-Type: C-classification
SVM-Kernel: polynomial
cost: 1
degree: 2
coef.0: 0
Number of Support Vectors: 447
( 225 222 )
Number of Classes: 2
Levels:
CH MM
pred_train_2 = predict(svm_pol_1, oj_train)
(t <- table(oj_train$Purchase, pred_train_2))
    pred_train_2
      CH  MM
  CH 449  36
  MM 110 205
test_pred_3 = predict(svm_pol_1, oj_test)
(t <- table(oj_test$Purchase, test_pred_3))
    test_pred_3
      CH  MM
  CH 153  15
  MM  45  57
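For the degree-2 polynomial fit:
(36 + 110) / 800   # training error rate ~ 0.183
(15 + 45) / 270    # test error rate ~ 0.222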
set.seed(1)
tune_svc = tune(svm, Purchase ~ ., data = oj_train, kernel = "poly", degree = 2, ranges = list(cost = 10^seq(-2, 1, by = 0.25)))
summary(tune_svc)
Parameter tuning of 'svm':
- sampling method: 10-fold cross validation
- best parameters:
cost
3.162278
- best performance: 0.1775
- Detailed performance results:
cost error dispersion
1 0.01000000 0.39125 0.04210189
2 0.01778279 0.37125 0.03537988
3 0.03162278 0.36500 0.03476109
4 0.05623413 0.33750 0.04714045
5 0.10000000 0.32125 0.05001736
6 0.17782794 0.24500 0.04758034
7 0.31622777 0.19875 0.03972562
8 0.56234133 0.20500 0.03961621
9 1.00000000 0.20250 0.04116363
10 1.77827941 0.18500 0.04199868
11 3.16227766 0.17750 0.03670453
12 5.62341325 0.18375 0.03064696
13 10.00000000 0.18125 0.02779513
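The analysis stops at the tuning output; a hypothetical sketch of the natural follow-up (not run in the original) would refit the degree-2 polynomial at the CV-chosen cost and compute its test error:
svm_pol_best = svm(Purchase ~ ., data = oj_train, kernel = "polynomial",
                   degree = 2, cost = tune_svc$best.parameters$cost)
mean(predict(svm_pol_best, oj_test) != oj_test$Purchase)   # test error rate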
- Overall, the linear kernel gives the best results on this data: its tuned test error rate is about 14.8% ((12 + 28)/270), versus roughly 20.4% for the radial kernel and 22.2% for the degree-2 polynomial kernel on the same test set.