Problem 1:
1a) First classify the data, treating the last column (the wine taster's score) as an ordered factor.
1b) Using the best version, choose two attributes and a slice through the data to plot. Then choose a different set of attributes and another set of slices to plot.
1c) Compare and contrast the best version of the SVM with the ridge regression model.
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
library(e1071)
winequality <- read.csv("/home/archana/ML_works_ucsc/winequality-red.csv", header = TRUE,sep = ";")
model1 <- svm(factor(quality, ordered = TRUE) ~ ., data = winequality, gamma = 1,
cost = 4, cross = 10)
print(model1)
##
## Call:
## svm(formula = factor(quality, ordered = TRUE) ~ ., data = winequality,
## gamma = 1, cost = 4, cross = 10)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 4
## gamma: 1
##
## Number of Support Vectors: 1365
summary(model1)
##
## Call:
## svm(formula = factor(quality, ordered = TRUE) ~ ., data = winequality,
## gamma = 1, cost = 4, cross = 10)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 4
## gamma: 1
##
## Number of Support Vectors: 1365
##
## ( 570 543 172 53 17 10 )
##
##
## Number of Classes: 6
##
## Levels:
## 3 4 5 6 7 8
##
## 10-fold cross-validation on training data:
##
## Total Accuracy: 66.79
## Single Accuracies:
## 69.81 63.75 65.62 68.75 65.62 64.38 67.5 70 66.88 65.62
# convert the 10-fold CV accuracy (%) into an error rate
svm1Err <- (100 - model1$tot.accuracy)/100
svm1Err
## [1] 0.3321
plot(model1, winequality, fixed.acidity ~ alcohol)
plot(model1, winequality, fixed.acidity ~ volatile.acidity)
plot(model1, winequality, free.sulfur.dioxide ~ total.sulfur.dioxide)
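In plot.svm, the predictors not on the axes are held constant at 0 by default, which is rarely a representative slice of this data. A sketch fixing them at their medians instead (the choice of medians is an assumption):
others <- setdiff(names(winequality), c("fixed.acidity", "alcohol", "quality"))
medSlice <- lapply(winequality[others], median)  # hold the other predictors at their medians
plot(model1, winequality, fixed.acidity ~ alcohol, slice = medSlice)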
x1 <- subset(winequality, select = -quality)
y1 <- winequality$quality
# evaluate on the training data
pred1 <- predict(model1, x1)
# Check accuracy:
table(pred1, y1)
## y1
## pred1 3 4 5 6 7 8
## 3 10 0 0 0 0 0
## 4 0 52 0 0 0 0
## 5 0 1 676 4 0 0
## 6 0 0 5 633 1 0
## 7 0 0 0 1 198 1
## 8 0 0 0 0 0 17
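The nearly diagonal table shows the gamma = 1 model comes close to memorizing the training set. A quick check of the resubstitution error, to contrast with the 33% cross-validation error above (a large gap between the two indicates overfitting):
mean(as.character(pred1) != as.character(y1))  # about 13/1599, under 1%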
obj <- tune(svm, quality ~ ., data = winequality, ranges = list(gamma = 2^(-4:0),
cost = 2^(2:4)), tunecontrol = tune.control(sampling = "cross"))
summary(obj)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## gamma cost
## 0.0625 4
##
## - best performance: 0.3887
##
## - Detailed performance results:
## gamma cost error dispersion
## 1 0.0625 4 0.3887 0.04457
## 2 0.1250 4 0.3990 0.04939
## 3 0.2500 4 0.4118 0.05455
## 4 0.5000 4 0.4064 0.04287
## 5 1.0000 4 0.4035 0.04666
## 6 0.0625 8 0.3933 0.04820
## 7 0.1250 8 0.4188 0.05544
## 8 0.2500 8 0.4382 0.05451
## 9 0.5000 8 0.4220 0.04235
## 10 1.0000 8 0.4088 0.04742
## 11 0.0625 16 0.4063 0.05256
## 12 0.1250 16 0.4467 0.06442
## 13 0.2500 16 0.4804 0.05026
## 14 0.5000 16 0.4421 0.04493
## 15 1.0000 16 0.4099 0.04781
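Note that this tune() call leaves quality numeric, so the best performance of 0.3887 is a mean squared error, not a classification error. tune() also stores the model refit at the best parameters, so it can be reused directly rather than refit by hand (a sketch):
bestSvm <- obj$best.model             # refit at gamma = 0.0625, cost = 4
predBest <- predict(bestSvm, x1)
mean((predBest - y1)^2)               # resubstitution MSE on the quality scale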
# re-read the data to start the numeric-quality treatment afresh
winequality <- read.csv("/home/archana/ML_works_ucsc/winequality-red.csv", header = TRUE,sep = ";")
str(winequality)
## 'data.frame': 1599 obs. of 12 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
names(winequality)
## [1] "fixed.acidity" "volatile.acidity" "citric.acid"
## [4] "residual.sugar" "chlorides" "free.sulfur.dioxide"
## [7] "total.sulfur.dioxide" "density" "pH"
## [10] "sulphates" "alcohol" "quality"
# gamma = 0.0909 is 1/11, the package default for 11 predictors
model2 <- svm(quality ~ ., data = winequality, gamma = 0.0909, cost = 1, cross = 10)
print(model2)
##
## Call:
## svm(formula = quality ~ ., data = winequality, gamma = 0.0909,
## cost = 1, cross = 10)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: radial
## cost: 1
## gamma: 0.0909
## epsilon: 0.1
##
##
## Number of Support Vectors: 1328
summary(model2)
##
## Call:
## svm(formula = quality ~ ., data = winequality, gamma = 0.0909,
## cost = 1, cross = 10)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: radial
## cost: 1
## gamma: 0.0909
## epsilon: 0.1
##
##
## Number of Support Vectors: 1328
##
##
##
## 10-fold cross-validation on training data:
##
## Total Mean Squared Error: 0.391
## Squared Correlation Coefficient: 0.402
## Mean Squared Errors:
## 0.396 0.3598 0.3788 0.5145 0.4527 0.369 0.4026 0.2584 0.3885 0.39
# tot.accuracy is only set for classification models; for eps-regression the
# cross-validation summary reports MSE instead, so this comes back empty
svm2Err <- (100 - model2$tot.accuracy)/100
svm2Err
## numeric(0)
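For regression models the cross-validation fields are MSE-based; a sketch reading them directly (field names per e1071's documented svm return value):
model2$tot.MSE        # overall 10-fold mean squared error, 0.391 above
sqrt(model2$tot.MSE)  # RMSE, in quality-score units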
plot(model2, winequality, density ~ alcohol)
plot(model2, winequality, residual.sugar ~ pH)
x2 <- subset(winequality, select = -quality)
y2 <- winequality$quality
# evaluate on the training data, rounding predictions to the nearest level
pred2 <- round(as.numeric(predict(model2, x2)), 0)
# Check accuracy:
table(pred2, y2)
## y2
## pred2 3 4 5 6 7 8
## 4 1 0 0 0 0 0
## 5 9 44 550 165 6 0
## 6 0 9 129 453 109 8
## 7 0 0 2 20 84 10
# sum the off-diagonal entries of the pred2 confusion table (training misclassifications)
s <- 1 + 9 + 44 + 9 + 129 + 2 + 165 + 20 + 6 + 109 + 8 + 10
s
## [1] 512
perSmallError2 <- round(s/nrow(winequality), 2)
perSmallError2
## [1] 0.32
# entries still counted as errors under the one-level tolerance
l <- 9 + 9 + 2 + 20 + 6 + 8
l
## [1] 54
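The hand-summed cells above can be checked programmatically. A sketch counting predictions more than one level from the truth, which may differ slightly from the cells selected by hand:
offPred2 <- sum(abs(pred2 - y2) > 1)                            # numeric model
offPred1 <- sum(abs(as.numeric(as.character(pred1)) - y1) > 1)  # factor model
c(offPred2, offPred1) / nrow(winequality)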
table(pred1, y1)
## y1
## pred1 3 4 5 6 7 8
## 3 10 0 0 0 0 0
## 4 0 52 0 0 0 0
## 5 0 1 676 4 0 0
## 6 0 0 5 633 1 0
## 7 0 0 0 1 198 1
## 8 0 0 0 0 0 17
table(pred2, y2)
## y2
## pred2 3 4 5 6 7 8
## 4 1 0 0 0 0 0
## 5 9 44 550 165 6 0
## 6 0 9 129 453 109 8
## 7 0 0 2 20 84 10
Treating quality as a factor or as a numeric response (rounded to the nearest level) had little effect on the classifier: 33% and 32% error, respectively. I also used a second evaluation technique that accepted a one-level difference as a correct classification. Under this relaxed criterion the numeric model's training error drops to about 3% (54/1599), while the factor model's confusion matrix shows that all of its training misclassifications already fall within one level. A sketch of the ridge-regression comparison asked for in 1c follows.
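The ridge regression model from earlier homework is not reproduced here; a minimal sketch of such a baseline, assuming MASS::lm.ridge with lambda chosen by generalized cross-validation (the lambda grid is an assumption):
library(MASS)
ridgeFit <- lm.ridge(quality ~ ., data = winequality, lambda = seq(0, 10, by = 0.1))
best <- which.min(ridgeFit$GCV)        # pick lambda by generalized cross-validation
beta <- coef(ridgeFit)[best, ]         # intercept and slopes on the original scale
predRidge <- as.matrix(cbind(1, x2)) %*% beta
mean((y2 - predRidge)^2)               # training MSE, comparable to the SVM's 0.391
table(round(predRidge), y2)            # confusion table after rounding, as for model2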
Problem 2: Classify the sonar data set.
2a) Use a support vector machine to classify the sonar data set. First tune an SVM employing the radial basis function kernel (the default). Next tune an SVM employing a linear kernel. Compare the results.
2b) In past homework, trees were used to classify the sonar data. Compare the best result using trees with the best result using an SVM.
rm(list = ls())
oldpar <- par(no.readonly = TRUE) # record current setting
par(mar = rep(1, 4)) # make the margins smaller for RStudio
library(e1071)
library(rpart)
library(MASS)
sonar <- read.csv(file = "/home/archana/ML_works_ucsc/sonar_train.csv", header = FALSE)
sonar$V61 <- as.factor(sonar$V61)
# head(sonar) summary(sonar)
model <- svm(V61 ~ ., data = sonar)
print(model)
##
## Call:
## svm(formula = V61 ~ ., data = sonar)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.01667
##
## Number of Support Vectors: 97
plot(model, sonar, V60 ~ V59)
plot(model, sonar, V2 ~ V1)
x <- subset(sonar, select = -V61)
y <- sonar$V61
# evaluate on the training data
pred <- predict(model, x)
class(pred)
## [1] "factor"
table(pred, y)
## y
## pred -1 1
## -1 64 0
## 1 2 64
obj <- tune(svm, V61 ~ ., data = sonar, ranges = list(gamma = 2^(-1:1), cost = 2^(2:4)),
tunecontrol = tune.control(sampling = "cross"))
summary(obj)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## gamma cost
## 0.5 4
##
## - best performance: 0.5538
##
## - Detailed performance results:
## gamma cost error dispersion
## 1 0.5 4 0.5538 0.13471
## 2 1.0 4 0.6385 0.08919
## 3 2.0 4 0.6538 0.07476
## 4 0.5 8 0.5538 0.13471
## 5 1.0 8 0.6385 0.08919
## 6 2.0 8 0.6538 0.07476
## 7 0.5 16 0.5538 0.13471
## 8 1.0 16 0.6385 0.08919
## 9 2.0 16 0.6538 0.07476
plot(obj)
obj$best.parameters
## gamma cost
## 1 0.5 4
model <- svm(V61 ~ ., data = sonar, gamma = 0.5, cost = 4)
print(model)
##
## Call:
## svm(formula = V61 ~ ., data = sonar, gamma = 0.5, cost = 4)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 4
## gamma: 0.5
##
## Number of Support Vectors: 130
summary(model)
##
## Call:
## svm(formula = V61 ~ ., data = sonar, gamma = 0.5, cost = 4)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 4
## gamma: 0.5
##
## Number of Support Vectors: 130
##
## ( 66 64 )
##
##
## Number of Classes: 2
##
## Levels:
## -1 1
plot(model, sonar, V60 ~ V59)
plot(model, sonar, V2 ~ V1)
x <- subset(sonar, select = -V61)
y <- sonar$V61
# evaluate on the training data
pred <- predict(model, x)
# Check accuracy:
table(pred, y)
## y
## pred -1 1
## -1 66 0
## 1 0 64
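Perfect resubstitution accuracy at gamma = 0.5 says little about generalization; indeed the cross-validation error above was 55%. A sketch evaluating this model on the held-out test file that is used later for the trees:
sonarTest <- read.csv("/home/archana/ML_works_ucsc/sonar_test.csv", header = FALSE)
sonarTest$V61 <- as.factor(sonarTest$V61)
predTest <- predict(model, subset(sonarTest, select = -V61))
mean(predTest != sonarTest$V61)  # held-out test error rate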
C <- 2^(-9)  # start the linear-kernel exploration with a very small cost
C
## [1] 0.001953
model <- svm(V61 ~ ., data = sonar, kernel = "linear", cost = C)
print(model)
##
## Call:
## svm(formula = V61 ~ ., data = sonar, kernel = "linear", cost = C)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.001953
## gamma: 0.01667
##
## Number of Support Vectors: 113
summary(model)
##
## Call:
## svm(formula = V61 ~ ., data = sonar, kernel = "linear", cost = C)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.001953
## gamma: 0.01667
##
## Number of Support Vectors: 113
##
## ( 57 56 )
##
##
## Number of Classes: 2
##
## Levels:
## -1 1
plot(model, sonar, V60 ~ V59)
plot(model, sonar, V2 ~ V1)
x <- subset(sonar, select = -V61)
y <- sonar$V61
# evaluate on the training data
pred <- predict(model, x)
# Check accuracy:
table(pred, y)
## y
## pred -1 1
## -1 47 13
## 1 19 51
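With cost = 0.001953 the linear kernel makes 32 training errors. Before hand-picking a larger cost, the linear kernel can be tuned over a cost grid just as the radial kernel was (a sketch; the grid is an assumption):
objLin <- tune(svm, V61 ~ ., data = sonar, kernel = "linear",
               ranges = list(cost = 2^(-8:2)),
               tunecontrol = tune.control(sampling = "cross"))
objLin$best.parameters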
C <- 0.65
model <- svm(V61 ~ ., data = sonar, kernel = "linear", cost = C)
print(model)
##
## Call:
## svm(formula = V61 ~ ., data = sonar, kernel = "linear", cost = C)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.65
## gamma: 0.01667
##
## Number of Support Vectors: 51
summary(model)
##
## Call:
## svm(formula = V61 ~ ., data = sonar, kernel = "linear", cost = C)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.65
## gamma: 0.01667
##
## Number of Support Vectors: 51
##
## ( 24 27 )
##
##
## Number of Classes: 2
##
## Levels:
## -1 1
plot(model, sonar, V60 ~ V59)
plot(model, sonar, V2 ~ V1)
With cost = 0.65 the linear kernel made 17 training errors. The classification improved over the smaller cost, but errors remain, and the linear kernel does not match the tuned radial kernel. Next, compare the sonar SVM results with the tree models of varying depth from past homework.
library(rpart)
train <- read.csv("/home/archana/ML_works_ucsc/sonar_train.csv", header = FALSE)
y <- as.factor(train[, 61])
x <- train[, 1:60]
test <- read.csv("/home/archana/ML_works_ucsc/sonar_test.csv", header = FALSE)
y_test <- as.factor(test[, 61])
x_test <- test[, 1:60]
train_error <- rep(0, 6)
test_error <- rep(0, 6)
for (dep in 1:6) {
fit <- rpart(y ~ ., x, control = rpart.control(minsplit = 0, minbucket = 0,
cp = -1, maxcompete = 0, maxsurrogate = 0, usesurrogate = 0, xval = 0,
maxdepth = dep))
train_error[dep] <- 1 - sum(y == predict(fit, x, type = "class"))/length(y)
test_error[dep] <- 1 - sum(y_test == predict(fit, x_test, type = "class"))/length(y_test)
}
plot(seq(1, 6), test_error, type = "o", pch = 19, ylim = c(0, 0.5), ylab = "Error Rate",
xlab = "Tree Depth", main = "Err Rate versus Tree Depth Plot")
points(train_error, type = "o", pch = 19, lwd = 4, col = "blue")
legend(4, 0.5, c("Test Error", "Training Error"), col = c("black", "blue"),
pch = 19, lwd = c(1, 4))
train_error
## [1] 0.22308 0.19231 0.10769 0.06154 0.01538 0.00000
test_error
## [1] 0.2821 0.2949 0.3333 0.2821 0.2564 0.2692
min(train_error)
## [1] 0
min(test_error)
## [1] 0.2564
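For 2b: the best tree reaches a 25.6% test error at depth 5, while the tuned RBF SVM separates the training set perfectly. A fair comparison needs the SVM's held-out error, as sketched after the tuned model above.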
Problem 3:
The in-class example (svm1.r) used the glass data set. Use the random forest technique on the glass data. Compare the random forest results with the results obtained in class with the SVM.
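A minimal sketch of the requested fit, assuming the Glass data ships with the mlbench package as in the class example (the seed and ntree values are arbitrary choices):
library(randomForest)
library(mlbench)
data(Glass)
set.seed(1)                                   # forests are stochastic; fix the seed
rfGlass <- randomForest(Type ~ ., data = Glass, ntree = 500)
print(rfGlass)                                # OOB error estimate and confusion matrix
importance(rfGlass)                           # variable importance, for the SVM comparison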