library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
library(e1071)
setwd("C:/Users/Manjari/Desktop/Machine learning/Home Work Solutions")
winequality <- read.csv("winequality-red.csv", header = TRUE,sep = ";")
model1 <- svm(factor(quality, ordered = TRUE) ~ ., data = winequality, gamma = 1, cost = 4, cross = 10)
print(model1)
##
## Call:
## svm(formula = factor(quality, ordered = TRUE) ~ ., data = winequality,
## gamma = 1, cost = 4, cross = 10)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 4
## gamma: 1
##
## Number of Support Vectors: 1365
summary(model1)
##
## Call:
## svm(formula = factor(quality, ordered = TRUE) ~ ., data = winequality,
## gamma = 1, cost = 4, cross = 10)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 4
## gamma: 1
##
## Number of Support Vectors: 1365
##
## ( 570 543 172 53 17 10 )
##
##
## Number of Classes: 6
##
## Levels:
## 3 4 5 6 7 8
##
## 10-fold cross-validation on training data:
##
## Total Accuracy: 66.79174
## Single Accuracies:
## 62.89308 66.875 70 65.625 66.875 68.125 71.25 64.375 70 61.875
svm1Err = (100 - model1$tot.accuracy)/100
svm1Err
## [1] 0.3320826
plot(model1, winequality, fixed.acidity ~ alcohol)
plot(model1, winequality, fixed.acidity ~ volatile.acidity)
plot(model1, winequality, free.sulfur.dioxide ~ total.sulfur.dioxide)
x1 <- subset(winequality, select = -quality)
y1 <- winequality$quality
pred1 <- predict(model1, x1)
table(pred1, y1)
## y1
## pred1 3 4 5 6 7 8
## 3 10 0 0 0 0 0
## 4 0 52 0 0 0 0
## 5 0 1 676 4 0 0
## 6 0 0 5 633 1 0
## 7 0 0 0 1 198 1
## 8 0 0 0 0 0 17
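The training error of the categorical model can also be read off this confusion table directly; a minimal sketch (not part of the original run), using the pred1 and y1 objects created above:
# fraction of training rows misclassified by model1 (off-diagonal of the table above)
trainErr1 <- mean(as.character(pred1) != as.character(y1))
trainErr1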
obj <- tune(svm, quality ~ ., data = winequality, ranges = list(gamma = 2^(-4:0), cost = 2^(2:4)), tunecontrol = tune.control(sampling = "cross"))
summary(obj)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## gamma cost
## 0.0625 4
##
## - best performance: 0.3780183
##
## - Detailed performance results:
## gamma cost error dispersion
## 1 0.0625 4 0.3780183 0.05638519
## 2 0.1250 4 0.3847228 0.06737206
## 3 0.2500 4 0.3957970 0.07105080
## 4 0.5000 4 0.3943695 0.08031540
## 5 1.0000 4 0.3945748 0.06662639
## 6 0.0625 8 0.3802658 0.06855827
## 7 0.1250 8 0.4005225 0.07415958
## 8 0.2500 8 0.4210195 0.08246570
## 9 0.5000 8 0.4073099 0.07762405
## 10 1.0000 8 0.3993351 0.06689450
## 11 0.0625 16 0.3934438 0.08258105
## 12 0.1250 16 0.4286154 0.08013054
## 13 0.2500 16 0.4554170 0.08695392
## 14 0.5000 16 0.4254551 0.07582938
## 15 1.0000 16 0.4002628 0.06667417
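The tuned model does not need to be refit by hand: with the default tune.control(best.model = TRUE), tune() keeps the refit on the full data. A sketch of how to retrieve it:
# model refit with the best gamma/cost reported above
best.svm1 <- obj$best.model
summary(best.svm1)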
str(winequality)
## 'data.frame': 1599 obs. of 12 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
names(winequality)
## [1] "fixed.acidity" "volatile.acidity" "citric.acid"
## [4] "residual.sugar" "chlorides" "free.sulfur.dioxide"
## [7] "total.sulfur.dioxide" "density" "pH"
## [10] "sulphates" "alcohol" "quality"
model2 <- svm(quality ~ ., data = winequality, gamma = 0.0909, cost = 1, cross = 10)  # gamma set to ~1/11, the e1071 default for 11 predictors
print(model2)
##
## Call:
## svm(formula = quality ~ ., data = winequality, gamma = 0.0909,
## cost = 1, cross = 10)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: radial
## cost: 1
## gamma: 0.0909
## epsilon: 0.1
##
##
## Number of Support Vectors: 1328
summary(model2)
##
## Call:
## svm(formula = quality ~ ., data = winequality, gamma = 0.0909,
## cost = 1, cross = 10)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: radial
## cost: 1
## gamma: 0.0909
## epsilon: 0.1
##
##
## Number of Support Vectors: 1328
##
##
##
## 10-fold cross-validation on training data:
##
## Total Mean Squared Error: 0.3923175
## Squared Correlation Coefficient: 0.4003188
## Mean Squared Errors:
## 0.4934818 0.3799656 0.3101665 0.4353687 0.2939571 0.345708 0.3990553 0.3718168 0.4352961 0.4589919
svm2Err = (100 - model2$tot.accuracy)/100  # tot.accuracy is NULL for an eps-regression fit, so this yields numeric(0)
svm2Err
## numeric(0)
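tot.accuracy is only populated for classification; for an eps-regression fit the 10-fold results are stored in MSE and tot.MSE, which is why the expression above returns numeric(0). A sketch of the regression-appropriate summary, assuming those components are present as printed above:
# cross-validation error for the regression model, on the original quality scale
svm2MSE  <- model2$tot.MSE   # total 10-fold CV mean squared error
svm2RMSE <- sqrt(svm2MSE)    # root mean squared error
svm2RMSE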
plot(model2, winequality, density ~ alcohol)
plot(model2, winequality, residual.sugar ~ pH)
x2 <- subset(winequality, select = -quality)
y2 <- winequality$quality
# evaluate on the training data, rounding the regression predictions to the nearest quality level
pred2 <- round(as.numeric(predict(model2, x2)), 0)
# check accuracy against the true quality scores:
table(pred2, y2)
## y2
## pred2 3 4 5 6 7 8
## 4 1 0 0 0 0 0
## 5 9 44 550 165 6 0
## 6 0 9 129 453 109 8
## 7 0 0 2 20 84 10
s <- 1 + 9 + 44 + 9 + 129 + 2 + 165 + 20 + 6 + 109 + 8 + 10
s
## [1] 512
perSmallError2 <- round(s/nrow(winequality), 2)
perSmallError2
## [1] 0.32
l <- 9 + 9 + 2 + 20 + 6 + 8
l
## [1] 54
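The same one-level-tolerance scoring can be computed programmatically instead of summing table cells by hand; a sketch using the pred1, pred2, y1, and y2 objects created above:
# treat a prediction within one quality level of the truth as correct
relaxedErr <- function(pred, truth) {
  mean(abs(as.numeric(as.character(pred)) - as.numeric(as.character(truth))) > 1)
}
relaxedErr(pred1, y1)   # categorical (factor) model
relaxedErr(pred2, y2)   # rounded regression model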
table(pred1, y1)
## y1
## pred1 3 4 5 6 7 8
## 3 10 0 0 0 0 0
## 4 0 52 0 0 0 0
## 5 0 1 676 4 0 0
## 6 0 0 5 633 1 0
## 7 0 0 0 1 198 1
## 8 0 0 0 0 0 17
Treating quality as a factor or as a (rounded) numeric response had little effect on the classifier, yielding about 33% and 32% error, respectively. I also used a second evaluation technique that accepted a one-level difference as a correct classification; using this approach, the error for both the categorical and the numeric model is roughly the same, about 3%.
2) Classify the sonar data set.
2a) Use a support vector machine to classify the sonar data set. First tune an SVM employing a radial basis function kernel (the default). Next tune an SVM employing a linear kernel. Compare the results.
2b) In past homework, trees were used to classify the sonar data. Compare the best result using trees with the best result using an SVM.
library(e1071)
library(rpart)
library(MASS)
setwd("C:/Users/Manjari/Desktop/Machine learning/Home Work Solutions")
oldpar <- par(no.readonly = TRUE) # record current setting
par(mar = rep(1, 4))
sonar <- read.csv("sonar_train.csv", header = FALSE)
sonar$V61 <- as.factor(sonar$V61)
model <- svm(V61 ~ ., data = sonar)
print(model)
##
## Call:
## svm(formula = V61 ~ ., data = sonar)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.01666667
##
## Number of Support Vectors: 97
plot(model, sonar, V60 ~ V59)
obj <- tune(svm, V61 ~ ., data = sonar, ranges = list(gamma = 2^(-1:1), cost = 2^(2:4)), tunecontrol = tune.control(sampling = "cross"))
summary(obj)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## gamma cost
## 0.5 4
##
## - best performance: 0.5615385
##
## - Detailed performance results:
## gamma cost error dispersion
## 1 0.5 4 0.5615385 0.13099528
## 2 1.0 4 0.6307692 0.07944581
## 3 2.0 4 0.6307692 0.07944581
## 4 0.5 8 0.5615385 0.13099528
## 5 1.0 8 0.6307692 0.07944581
## 6 2.0 8 0.6307692 0.07944581
## 7 0.5 16 0.5615385 0.13099528
## 8 1.0 16 0.6307692 0.07944581
## 9 2.0 16 0.6307692 0.07944581
plot(obj)
obj$best.parameters
## gamma cost
## 1 0.5 4
model <- svm(V61 ~ ., data = sonar, gamma = 0.5, cost = 4)
print(model)
##
## Call:
## svm(formula = V61 ~ ., data = sonar, gamma = 0.5, cost = 4)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 4
## gamma: 0.5
##
## Number of Support Vectors: 130
summary(model)
##
## Call:
## svm(formula = V61 ~ ., data = sonar, gamma = 0.5, cost = 4)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 4
## gamma: 0.5
##
## Number of Support Vectors: 130
##
## ( 66 64 )
##
##
## Number of Classes: 2
##
## Levels:
## -1 1
model <- svm(V61 ~ ., data = sonar, kernel = "linear")
print(model)
##
## Call:
## svm(formula = V61 ~ ., data = sonar, kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
## gamma: 0.01666667
##
## Number of Support Vectors: 47
summary(model)
##
## Call:
## svm(formula = V61 ~ ., data = sonar, kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
## gamma: 0.01666667
##
## Number of Support Vectors: 47
##
## ( 21 26 )
##
##
## Number of Classes: 2
##
## Levels:
## -1 1
plot(model, sonar, V60 ~ V59)
plot(model, sonar, V2 ~ V1)
x <- subset(sonar, select = -V61)
y <- sonar$V61
pred <- predict(model, x)
table(pred, y)
## y
## pred -1 1
## -1 66 2
## 1 0 62
C <- 0.65
model <- svm(V61 ~ ., data = sonar, kernel = "linear", cost = C)
print(model)
##
## Call:
## svm(formula = V61 ~ ., data = sonar, kernel = "linear", cost = C)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.65
## gamma: 0.01666667
##
## Number of Support Vectors: 51
summary(model)
##
## Call:
## svm(formula = V61 ~ ., data = sonar, kernel = "linear", cost = C)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.65
## gamma: 0.01666667
##
## Number of Support Vectors: 51
##
## ( 24 27 )
##
##
## Number of Classes: 2
##
## Levels:
## -1 1
plot(model, sonar, V60 ~ V59)
plot(model, sonar, V2 ~ V1)
17 errors: the linear-kernel classification improved, but some observations are still misclassified. Next, compare the sonar SVM results with decision trees of varying depth from earlier homework.
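For a held-out comparison with the trees below, the linear-kernel SVM fit above can also be scored on the sonar test file; a sketch assuming sonar_test.csv has the same layout as the training file:
# read the test set and compute the test error of the linear-kernel SVM ("model")
sonar_test <- read.csv("sonar_test.csv", header = FALSE)
sonar_test$V61 <- as.factor(sonar_test$V61)
mean(predict(model, sonar_test) != sonar_test$V61)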
library(rpart)
setwd("C:/Users/Manjari/Desktop/Machine learning/Home Work Solutions")
train <- read.csv("sonar_train.csv", header = FALSE)
y <- as.factor(train[, 61])
x <- train[, 1:60]
test <- read.csv("sonar_test.csv", header = FALSE)
y_test <- as.factor(test[, 61])
x_test <- test[, 1:60]
train_error <- rep(0, 6)
test_error <- rep(0, 6)
for (dep in 1:6)
{
fit <- rpart(y ~ ., x,
             control = rpart.control(minsplit = 0, minbucket = 0, cp = -1,
                                     maxcompete = 0, maxsurrogate = 0,
                                     usesurrogate = 0, xval = 0, maxdepth = dep))
train_error[dep] <- 1 - sum(y == predict(fit, x, type = "class"))/length(y)
test_error[dep] <- 1 - sum(y_test == predict(fit, x_test, type = "class"))/length(y_test)
}
plot(seq(1, 6), test_error, type = "o", pch = 19, ylim = c(0, 0.5), ylab = "Error Rate",xlab = "Tree Depth", main = "Err Rate versus Tree Depth Plot")
points(train_error, type = "o", pch = 19, lwd = 4, col = "blue")
legend(4, 0.5, c("Test Error", "Training Error"), col = c("black", "blue"), pch = 19, lwd = c(1, 4))
train_error
## [1] 0.22307692 0.19230769 0.10769231 0.06153846 0.01538462 0.00000000
test_error
## [1] 0.2820513 0.2948718 0.3333333 0.2820513 0.2564103 0.2692308
min(train_error)
## [1] 0
min(test_error)
## [1] 0.2564103
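The tree depth that achieves this minimum test error can be read off programmatically (a one-line sketch):
which.min(test_error)   # depth with the lowest test error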
data(Glass, package = "mlbench")
setwd("C:/Users/Manjari/Desktop/Machine learning/Home Work Solutions")
str(Glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
Glass$Type
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [106] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [141] 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 5 5 5 5 5 5 5 5 5 5 5 5
## [176] 5 6 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
## [211] 7 7 7 7
## Levels: 1 2 3 5 6 7
index <- 1:nrow(Glass)
set.seed(pi)
testindex <- sample(index, trunc(length(index)/3))
testset <- Glass[testindex, ]
trainset <- Glass[-testindex, ]
svm.model <- svm(Type ~ ., data = trainset, cost = 100, gamma = 1)
svm.pred <- predict(svm.model, testset[, -10])
table(pred = svm.pred, true = testset[, 10])
## true
## pred 1 2 3 5 6 7
## 1 18 7 1 0 0 0
## 2 5 18 0 5 0 2
## 3 2 1 2 0 0 0
## 5 0 0 0 2 0 0
## 6 0 0 0 0 1 0
## 7 0 0 0 0 0 7
length(testset[, 10])
## [1] 71
svmErr <- 1 - sum(svm.pred == testset[, 10])/length(testset[, 10])
svmErr
## [1] 0.3239437
rpart.model <- rpart(Type ~ ., data = trainset)
rpart.pred <- predict(rpart.model, testset[, -10], type = "class")
table(pred = rpart.pred, true = testset[, 10])
## true
## pred 1 2 3 5 6 7
## 1 20 3 0 0 0 0
## 2 4 19 2 1 1 0
## 3 1 3 1 0 0 0
## 5 0 1 0 6 0 0
## 6 0 0 0 0 0 0
## 7 0 0 0 0 0 9
1 - sum(rpart.pred == testset[, 10])/length(testset[, 10])
## [1] 0.2253521
obj <- tune(svm, Type ~ ., data = trainset, ranges = list(gamma = 2^(-4:0), cost = 2^(2:4)), tunecontrol = tune.control(sampling = "cross"))
summary(obj)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## gamma cost
## 0.0625 16
##
## - best performance: 0.3142857
##
## - Detailed performance results:
## gamma cost error dispersion
## 1 0.0625 4 0.3333333 0.10830935
## 2 0.1250 4 0.3342857 0.07165045
## 3 0.2500 4 0.3490476 0.06044473
## 4 0.5000 4 0.3485714 0.10953071
## 5 1.0000 4 0.3557143 0.12433851
## 6 0.0625 8 0.3209524 0.07148851
## 7 0.1250 8 0.3633333 0.06834803
## 8 0.2500 8 0.3566667 0.09543498
## 9 0.5000 8 0.3628571 0.11715533
## 10 1.0000 8 0.3557143 0.12433851
## 11 0.0625 16 0.3142857 0.08571429
## 12 0.1250 16 0.3357143 0.09718253
## 13 0.2500 16 0.3566667 0.09091927
## 14 0.5000 16 0.3490476 0.11584148
## 15 1.0000 16 0.3633333 0.11097611
plot(obj)
obj$best.parameters
## gamma cost
## 11 0.0625 16
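Before moving to the random forest, the tuned parameters can be applied to the held-out test set; a sketch using the best parameters reported above (a refit with nearby parameter values is done explicitly later in the write-up):
# refit with the tuned parameters and score on the test set
svm.tuned <- svm(Type ~ ., data = trainset, gamma = 0.0625, cost = 16)
mean(predict(svm.tuned, testset[, -10]) != testset[, 10])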
glass.rf <- randomForest(Type ~ ., data = Glass, mtry = 2)
# with no newdata, predict() on a randomForest object returns its out-of-bag (OOB) predictions
table(Glass$Type, glass.pred <- predict(randomForest(Type ~ ., data = Glass, ntree = 60)))
##
## 1 2 3 5 6 7
## 1 62 6 2 0 0 0
## 2 10 60 1 3 2 0
## 3 7 4 6 0 0 0
## 5 0 2 0 10 0 1
## 6 1 1 0 0 7 0
## 7 1 3 0 0 0 25
rfErr <- 1 - sum(Glass$Type == glass.pred)/length(glass.pred)
rfErr
## [1] 0.2056075
data(iris)
setwd("C:/Users/Manjari/Desktop/Machine learning/Home Work Solutions")
obj <- tune(svm, Species ~ ., data = iris, ranges = list(gamma = 2^(-1:1), cost = 2^(2:4)), tunecontrol = tune.control(sampling = "fix"))
summary(obj)
##
## Parameter tuning of 'svm':
##
## - sampling method: fixed training/validation set
##
## - best parameters:
## gamma cost
## 0.5 4
##
## - best performance: 0.02
##
## - Detailed performance results:
## gamma cost error dispersion
## 1 0.5 4 0.02 NA
## 2 1.0 4 0.04 NA
## 3 2.0 4 0.04 NA
## 4 0.5 8 0.04 NA
## 5 1.0 8 0.04 NA
## 6 2.0 8 0.04 NA
## 7 0.5 16 0.04 NA
## 8 1.0 16 0.04 NA
## 9 2.0 16 0.04 NA
plot(obj)
x <- iris[,-5]
y <- iris[,5]
obj2 <- tune.knn(x, y, k = 1:5, tunecontrol = tune.control(sampling = "boot"))
summary(obj2)
##
## Parameter tuning of 'knn.wrapper':
##
## - sampling method: bootstrapping
##
## - best parameters:
## k
## 4
##
## - best performance: 0.04726544
##
## - Detailed performance results:
## k error dispersion
## 1 1 0.04732794 0.02165832
## 2 2 0.05213906 0.02490950
## 3 3 0.04729779 0.02432917
## 4 4 0.04726544 0.03014710
## 5 5 0.04898860 0.02657732
data(mtcars)
obj3 <- tune.rpart(mpg~., data = mtcars, minsplit = c(5,10,15))
summary(obj3)
##
## Parameter tuning of 'rpart.wrapper':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## minsplit
## 5
##
## - best performance: 9.088153
##
## - Detailed performance results:
## minsplit error dispersion
## 1 5 9.088153 7.463697
## 2 10 11.649977 10.104875
## 3 15 17.655931 10.144265
plot(obj3)
tune(lm, mpg~., data = mtcars)
##
## Error estimation of 'lm' using 10-fold cross validation: 13.11232
tune.control(random = FALSE, nrepeat = 1, repeat.aggregate = min,
             sampling = c("cross", "fix", "bootstrap"), sampling.aggregate = mean,
             sampling.dispersion = sd, cross = 10, fix = 2/3, nboot = 10,
             boot.size = 9/10, best.model = TRUE, performances = TRUE, error.fun = NULL)
## $random
## [1] FALSE
##
## $nrepeat
## [1] 1
##
## $repeat.aggregate
## function (..., na.rm = FALSE) .Primitive("min")
##
## $sampling
## [1] "cross"
##
## $sampling.aggregate
## function (x, ...)
## UseMethod("mean")
## <bytecode: 0x000000000769da20>
## <environment: namespace:base>
##
## $sampling.dispersion
## function (x, na.rm = FALSE)
## sqrt(var(if (is.vector(x)) x else as.double(x), na.rm = na.rm))
## <bytecode: 0x000000000769d188>
## <environment: namespace:stats>
##
## $cross
## [1] 10
##
## $fix
## [1] 0.6666667
##
## $nboot
## [1] 10
##
## $boot.size
## [1] 0.9
##
## $best.model
## [1] TRUE
##
## $performances
## [1] TRUE
##
## $error.fun
## NULL
##
## attr(,"class")
## [1] "tune.control"
data(iris)
attach(iris)
setwd("C:/Users/Manjari/Desktop/Machine learning/Home Work Solutions")
model <- svm(Species ~ ., data = iris)
x <- subset(iris, select = -Species)
y <- Species
model <- svm(x, y)
print(model)
##
## Call:
## svm.default(x = x, y = y)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.25
##
## Number of Support Vectors: 51
summary(model)
##
## Call:
## svm.default(x = x, y = y)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.25
##
## Number of Support Vectors: 51
##
## ( 8 22 21 )
##
##
## Number of Classes: 3
##
## Levels:
## setosa versicolor virginica
pred <- predict(model, x)
# alternatively, the fitted values on the training data give the same result:
pred <- fitted(model)
table(pred, y)
## y
## pred setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 2
## virginica 0 2 48
pred <- predict(model, x, decision.values = TRUE)
attr(pred, "decision.values")[1:4,]
## setosa/versicolor setosa/virginica versicolor/virginica
## 1 1.196152 1.091757 0.6708810
## 2 1.064621 1.056185 0.8483518
## 3 1.180842 1.074542 0.6439798
## 4 1.110699 1.053012 0.6782041
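Class probabilities can be obtained in a similar way if the model is refit with probability estimation enabled; a minimal sketch:
# refit with probability estimates and inspect the first rows of the class probabilities
model.prob <- svm(x, y, probability = TRUE)
pred.prob  <- predict(model.prob, x, probability = TRUE)
head(attr(pred.prob, "probabilities"))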
# 2-D MDS projection of the iris data; support vectors are plotted as "+"
plot(cmdscale(dist(iris[, -5])), col = as.integer(iris[, 5]),
     pch = c("o", "+")[1:150 %in% model$index + 1])
x <- seq(0.1, 5, by = 0.05)
y <- log(x) + rnorm(x, sd = 0.2)
m <- svm(x, y)
new <- predict(m, x)
plot(x, y)
points(x, log(x), col = 2)
points(x, new, col = 4)
X <- data.frame(a = rnorm(1000), b = rnorm(1000))
attach(X)
# with no response supplied, svm() performs novelty detection (one-classification)
m <- svm(X, gamma = 0.1)
# the same model via the formula interface
m <- svm(~ a + b, gamma = 0.1)
newdata <- data.frame(a = c(0, 4), b = c(0, 4))
predict (m, newdata)
## 1 2
## TRUE FALSE
plot(X, col = 1:1000 %in% m$index + 1, xlim = c(-5,5), ylim=c(-5,5))
points(newdata, pch = "+", col = 2, cex = 5)
i2 <- iris
# merge virginica into versicolor to create an unbalanced two-class problem
levels(i2$Species)[3] <- "versicolor"
summary(i2$Species)
## setosa versicolor
## 50 100
wts <- 100 / table(i2$Species)  # inverse-frequency weights: 2 for setosa, 1 for versicolor
wts
##
## setosa versicolor
## 2 1
m <- svm(Species ~ ., data = i2, class.weights = wts)
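To see the effect of the 2:1 weighting, the weighted fit can be compared against the collapsed labels; a sketch (not run in the original analysis):
# confusion of the class-weighted model on its training data
table(predict(m, i2), i2$Species)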
model <- svm(Species ~ ., data = iris, gamma = 0.5, cost = 4)
print(model)
##
## Call:
## svm(formula = Species ~ ., data = iris, gamma = 0.5, cost = 4)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 4
## gamma: 0.5
##
## Number of Support Vectors: 49
summary(model)
##
## Call:
## svm(formula = Species ~ ., data = iris, gamma = 0.5, cost = 4)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 4
## gamma: 0.5
##
## Number of Support Vectors: 49
##
## ( 11 18 20 )
##
##
## Number of Classes: 3
##
## Levels:
## setosa versicolor virginica
x <- subset(iris, select = -Species)
y <- Species
pred <- predict(model, x)
table(pred, y)
## y
## pred setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 1
## virginica 0 2 49
obj <- tune(svm, Species ~ ., data = iris, ranges = list(gamma = seq(0.5, 1.5, 0.1), cost = seq(7, 9, 0.5)), tunecontrol = tune.control(sampling = "cross"))
summary(obj)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## gamma cost
## 0.5 7
##
## - best performance: 0.06
##
## - Detailed performance results:
## gamma cost error dispersion
## 1 0.5 7.0 0.06000000 0.07981460
## 2 0.6 7.0 0.06000000 0.07981460
## 3 0.7 7.0 0.06000000 0.07981460
## 4 0.8 7.0 0.06000000 0.07981460
## 5 0.9 7.0 0.06000000 0.07981460
## 6 1.0 7.0 0.06000000 0.07981460
## 7 1.1 7.0 0.06000000 0.07981460
## 8 1.2 7.0 0.06000000 0.07981460
## 9 1.3 7.0 0.06666667 0.07698004
## 10 1.4 7.0 0.06000000 0.07981460
## 11 1.5 7.0 0.06000000 0.07981460
## 12 0.5 7.5 0.06000000 0.07981460
## 13 0.6 7.5 0.06000000 0.07981460
## 14 0.7 7.5 0.06000000 0.07981460
## 15 0.8 7.5 0.06000000 0.07981460
## 16 0.9 7.5 0.06000000 0.07981460
## 17 1.0 7.5 0.06000000 0.07981460
## 18 1.1 7.5 0.06666667 0.07698004
## 19 1.2 7.5 0.06666667 0.07698004
## 20 1.3 7.5 0.06666667 0.07698004
## 21 1.4 7.5 0.06000000 0.07981460
## 22 1.5 7.5 0.06000000 0.07981460
## 23 0.5 8.0 0.06000000 0.07981460
## 24 0.6 8.0 0.06000000 0.07981460
## 25 0.7 8.0 0.06000000 0.07981460
## 26 0.8 8.0 0.06000000 0.07981460
## 27 0.9 8.0 0.06000000 0.07981460
## 28 1.0 8.0 0.06000000 0.07981460
## 29 1.1 8.0 0.06666667 0.07698004
## 30 1.2 8.0 0.06666667 0.07698004
## 31 1.3 8.0 0.06666667 0.07698004
## 32 1.4 8.0 0.06000000 0.07981460
## 33 1.5 8.0 0.06000000 0.07981460
## 34 0.5 8.5 0.06000000 0.07981460
## 35 0.6 8.5 0.06000000 0.07981460
## 36 0.7 8.5 0.06000000 0.07981460
## 37 0.8 8.5 0.06000000 0.07981460
## 38 0.9 8.5 0.06000000 0.07981460
## 39 1.0 8.5 0.06666667 0.07698004
## 40 1.1 8.5 0.06666667 0.07698004
## 41 1.2 8.5 0.06666667 0.07698004
## 42 1.3 8.5 0.06666667 0.07698004
## 43 1.4 8.5 0.06000000 0.07981460
## 44 1.5 8.5 0.06000000 0.07981460
## 45 0.5 9.0 0.06000000 0.07981460
## 46 0.6 9.0 0.06000000 0.07981460
## 47 0.7 9.0 0.06000000 0.07981460
## 48 0.8 9.0 0.06000000 0.07981460
## 49 0.9 9.0 0.06000000 0.07981460
## 50 1.0 9.0 0.06000000 0.07981460
## 51 1.1 9.0 0.06666667 0.07698004
## 52 1.2 9.0 0.06666667 0.07698004
## 53 1.3 9.0 0.06666667 0.07698004
## 54 1.4 9.0 0.06000000 0.07981460
## 55 1.5 9.0 0.06000000 0.07981460
plot(obj)
obj$best.parameters
## gamma cost
## 1 0.5 7
model <- svm(Species ~ ., data = iris, gamma = 1.0, cost = 8)
print(model)
##
## Call:
## svm(formula = Species ~ ., data = iris, gamma = 1, cost = 8)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 8
## gamma: 1
##
## Number of Support Vectors: 55
summary(model)
##
## Call:
## svm(formula = Species ~ ., data = iris, gamma = 1, cost = 8)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 8
## gamma: 1
##
## Number of Support Vectors: 55
##
## ( 13 20 22 )
##
##
## Number of Classes: 3
##
## Levels:
## setosa versicolor virginica
obj$best.parameters
## gamma cost
## 1 0.5 7
model$index
## [1] 14 15 16 19 21 23 26 32 33 37 42 44 45 51 54 58 60
## [18] 61 63 64 65 68 69 71 73 77 78 79 80 84 85 86 99 101
## [35] 107 109 110 111 115 118 119 120 124 126 128 130 132 134 135 136 139
## [52] 142 147 149 150
pred <- predict(model, x)
table(pred, y)
## y
## pred setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 0
## virginica 0 1 50
149/150 # SVM correctly classifies 99.3% of Iris Data
## [1] 0.9933333
which(pred != iris[, 5]) # 84
## [1] 84
model <- svm(Species ~ ., data = iris, gamma = 2.0, cost = 16)
pred <- predict(model, x)
which(pred != iris[, 5])
## integer(0)
table(pred, y)
## y
## pred setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 50 0
## virginica 0 0 50
model <- svm(Species ~ ., data = iris, gamma = 3.0, cost = 4)
pred <- predict(model, x)
which(pred != iris[, 5]) # this also gave NO errors
## integer(0)
table(pred, y)
## y
## pred setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 50 0
## virginica 0 0 50
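Zero training error at such large gamma values is a warning sign of overfitting rather than a better model; a quick cross-validated check (sketch), to compare against the 100% training accuracy above:
# 10-fold CV accuracy for the large-gamma model
svm(Species ~ ., data = iris, gamma = 3, cost = 4, cross = 10)$tot.accuracy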
data(Glass, package = "mlbench")
str(Glass)
## 'data.frame': 214 obs. of 10 variables:
## $ RI : num 1.52 1.52 1.52 1.52 1.52 ...
## $ Na : num 13.6 13.9 13.5 13.2 13.3 ...
## $ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ Si : num 71.8 72.7 73 72.6 73.1 ...
## $ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
Glass$Type
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [106] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [141] 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 5 5 5 5 5 5 5 5 5 5 5 5
## [176] 5 6 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
## [211] 7 7 7 7
## Levels: 1 2 3 5 6 7
index <- 1:nrow(Glass)
set.seed(pi)
testindex <- sample(index, trunc(length(index)/3))
testset <- Glass[testindex, ]
trainset <- Glass[-testindex, ]
svm.model <- svm(Type ~ ., data = trainset, cost = 100, gamma = 1)
svm.pred <- predict(svm.model, testset[, -10])
table(pred = svm.pred, true = testset[, 10])
## true
## pred 1 2 3 5 6 7
## 1 18 7 1 0 0 0
## 2 5 18 0 5 0 2
## 3 2 1 2 0 0 0
## 5 0 0 0 2 0 0
## 6 0 0 0 0 1 0
## 7 0 0 0 0 0 7
length(testset[,10])
## [1] 71
1-sum(svm.pred == testset[,10])/length(testset[,10])
## [1] 0.3239437
rpart.model <- rpart(Type ~ ., data = trainset)
rpart.pred <- predict(rpart.model, testset[, -10], type = "class")
table(pred = rpart.pred, true = testset[, 10])
## true
## pred 1 2 3 5 6 7
## 1 20 3 0 0 0 0
## 2 4 19 2 1 1 0
## 3 1 3 1 0 0 0
## 5 0 1 0 6 0 0
## 6 0 0 0 0 0 0
## 7 0 0 0 0 0 9
1-sum(rpart.pred == testset[,10])/length(testset[,10])
## [1] 0.2253521
obj <- tune(svm, Type ~ ., data = trainset, ranges = list(gamma = 2^(-4:0), cost = 2^(2:4)), tunecontrol = tune.control(sampling = "cross"))
summary(obj)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## gamma cost
## 0.0625 16
##
## - best performance: 0.3142857
##
## - Detailed performance results:
## gamma cost error dispersion
## 1 0.0625 4 0.3333333 0.10830935
## 2 0.1250 4 0.3342857 0.07165045
## 3 0.2500 4 0.3490476 0.06044473
## 4 0.5000 4 0.3485714 0.10953071
## 5 1.0000 4 0.3557143 0.12433851
## 6 0.0625 8 0.3209524 0.07148851
## 7 0.1250 8 0.3633333 0.06834803
## 8 0.2500 8 0.3566667 0.09543498
## 9 0.5000 8 0.3628571 0.11715533
## 10 1.0000 8 0.3557143 0.12433851
## 11 0.0625 16 0.3142857 0.08571429
## 12 0.1250 16 0.3357143 0.09718253
## 13 0.2500 16 0.3566667 0.09091927
## 14 0.5000 16 0.3490476 0.11584148
## 15 1.0000 16 0.3633333 0.11097611
plot(obj)
obj$best.parameters
## gamma cost
## 11 0.0625 16
svm.model <- svm(Type ~ ., data = trainset, cost = 8, gamma = 0.25)
svm.pred <- predict(svm.model, testset[, -10])
table(pred = svm.pred, true = testset[, 10])
## true
## pred 1 2 3 5 6 7
## 1 22 9 1 0 0 0
## 2 2 15 2 3 0 1
## 3 1 1 0 0 0 0
## 5 0 0 0 4 0 0
## 6 0 1 0 0 1 0
## 7 0 0 0 0 0 8
length(testset[,10])
## [1] 71
1-sum(svm.pred == testset[,10])/length(testset[,10])
## [1] 0.2957746
(6+3+1+2+4+4+1)/71 # = 0.2957746
## [1] 0.2957746
obj <- tune(svm, Type ~ ., data = trainset, ranges = list(gamma = seq(0.1, 0.15, 0.01), cost = seq(15, 17, 1)), tunecontrol = tune.control(sampling = "cross"))
summary(obj)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## gamma cost
## 0.13 16
##
## - best performance: 0.2728571
##
## - Detailed performance results:
## gamma cost error dispersion
## 1 0.10 15 0.2800000 0.1174775
## 2 0.11 15 0.2938095 0.1240342
## 3 0.12 15 0.2866667 0.1265827
## 4 0.13 15 0.2733333 0.1189566
## 5 0.14 15 0.2800000 0.1125485
## 6 0.15 15 0.3014286 0.1280521
## 7 0.10 16 0.2800000 0.1174775
## 8 0.11 16 0.2938095 0.1240342
## 9 0.12 16 0.2866667 0.1265827
## 10 0.13 16 0.2728571 0.1143970
## 11 0.14 16 0.2800000 0.1125485
## 12 0.15 16 0.3014286 0.1280521
## 13 0.10 17 0.2938095 0.1240342
## 14 0.11 17 0.2938095 0.1240342
## 15 0.12 17 0.2800000 0.1256036
## 16 0.13 17 0.2728571 0.1143970
## 17 0.14 17 0.2800000 0.1125485
## 18 0.15 17 0.3014286 0.1280521
plot(obj)
obj$best.parameters
## gamma cost
## 10 0.13 16
svm.model <- svm(Type ~ ., data = trainset, cost = 16, gamma = 0.13)
svm.pred <- predict(svm.model, testset[, -10])
table(pred = svm.pred, true = testset[, 10])
## true
## pred 1 2 3 5 6 7
## 1 22 9 1 0 0 0
## 2 2 15 1 3 0 0
## 3 1 1 1 0 0 0
## 5 0 0 0 4 0 0
## 6 0 1 0 0 1 1
## 7 0 0 0 0 0 8
length(testset[,10])
## [1] 71
1-sum(svm.pred == testset[,10])/length(testset[,10])
## [1] 0.2816901
(9+1+2+1+3+1+1+1+1)/length(testset[,10]) #[1] 0.2816901
## [1] 0.2816901
obj <- tune(svm, Type ~ ., data = trainset, ranges = list(gamma = seq(0.1, 0.15, 0.01), cost = seq(15, 17, 1)), tunecontrol = tune.control(sampling = "cross"))
summary(obj)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## gamma cost
## 0.13 17
##
## - best performance: 0.2657143
##
## - Detailed performance results:
## gamma cost error dispersion
## 1 0.10 15 0.2938095 0.1466009
## 2 0.11 15 0.3076190 0.1325411
## 3 0.12 15 0.3076190 0.1325411
## 4 0.13 15 0.2866667 0.1331972
## 5 0.14 15 0.2800000 0.1435398
## 6 0.15 15 0.2942857 0.1265827
## 7 0.10 16 0.2866667 0.1409359
## 8 0.11 16 0.3076190 0.1325411
## 9 0.12 16 0.3076190 0.1325411
## 10 0.13 16 0.2800000 0.1395346
## 11 0.14 16 0.2800000 0.1435398
## 12 0.15 16 0.2942857 0.1265827
## 13 0.10 17 0.2938095 0.1302178
## 14 0.11 17 0.3076190 0.1325411
## 15 0.12 17 0.3076190 0.1325411
## 16 0.13 17 0.2657143 0.1608243
## 17 0.14 17 0.2800000 0.1435398
## 18 0.15 17 0.3009524 0.1186724
plot(obj)
obj$best.parameters
## gamma cost
## 16 0.13 17
svm.model <- svm(Type ~ ., data = trainset, cost = 16, gamma = 0.13)
svm.pred <- predict(svm.model, testset[, -10])
table(pred = svm.pred, true = testset[, 10])
## true
## pred 1 2 3 5 6 7
## 1 22 9 1 0 0 0
## 2 2 15 1 3 0 0
## 3 1 1 1 0 0 0
## 5 0 0 0 4 0 0
## 6 0 1 0 0 1 1
## 7 0 0 0 0 0 8
length(testset[,10])
## [1] 71
1-sum(svm.pred == testset[,10])/length(testset[,10])
## [1] 0.2816901
(9+1+2+1+3+1+1+1+1)/length(testset[,10]) #[1] 0.2816901
## [1] 0.2816901
x <- seq(0.1, 5, by = 0.05)
y <- log(x) + rnorm(x, sd = 0.2)
m <- svm(x, y)
new <- predict(m, x)
plot(x, y, col = 1)
points(x, log(x), col = 2)
points(x, new, col = 4)
legend(3, -1, c("actual y", "log(x)", "predicted"), col = c(1,2,4), pch=1)