使用「iris」資料集,比較 SVM、Logistic Regression、Tree 的分類準確率
1. SVM
library(e1071)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Restrict iris to the two species under study: virginica and versicolor.
iris.small <- iris %>%
  filter(Species %in% c("virginica", "versicolor"))
# Random hold-out split: 10% of the rows for testing, the remaining 90% for
# training.
n <- nrow(iris.small) * 0.1
index <- sample(seq_len(nrow(iris.small)), n)
test <- iris.small[index, ]
train <- iris.small[-index, ]
# Fit the SVM classifier on the training rows, using every other column as a
# predictor of Species.
model <- svm(Species ~ ., data = train)
# Predict the species of the held-out rows.
test_results <- predict(model, newdata = test)
# Confusion matrix on the test rows (true label in rows, prediction in columns).
true_value <- test$Species
confus.matrix <- table(true_value, test_results)
confus.matrix
## test_results
## true_value setosa versicolor virginica
## setosa 0 0 0
## versicolor 0 7 1
## virginica 0 0 2
# Test accuracy: correctly classified rows (the diagonal) over all test rows.
accuracy.svm.iris <- sum(diag(confus.matrix)) / sum(confus.matrix)
accuracy.svm.iris
## [1] 0.9
2. Logistic Regression
library(dplyr)
# Restrict iris to the two species under study: virginica and versicolor.
iris.small <- iris %>%
  filter(Species %in% c("virginica", "versicolor"))
# Recode the response as numeric: versicolor becomes 1, virginica becomes 0.
iris.small$Species <- as.numeric(iris.small$Species == "versicolor")
head(iris.small)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 7.0 3.2 4.7 1.4 1
## 2 6.4 3.2 4.5 1.5 1
## 3 6.9 3.1 4.9 1.5 1
## 4 5.5 2.3 4.0 1.3 1
## 5 6.5 2.8 4.6 1.5 1
## 6 5.7 2.8 4.5 1.3 1
# Random hold-out split: 10% of the rows for testing, the remaining 90% for
# training.
n <- nrow(iris.small) * 0.1
index <- sample(seq_len(nrow(iris.small)), n)
test <- iris.small[index, ]
train <- iris.small[-index, ]
# Fit the logistic regression (binomial family, logit link) on the training
# rows, using every other column as a predictor of the 0/1 Species.
model.glm <- glm(Species ~ ., family = binomial(link = "logit"), data = train)
model.glm
##
## Call: glm(formula = Species ~ ., family = binomial(link = "logit"),
## data = train)
##
## Coefficients:
## (Intercept) Sepal.Length Sepal.Width Petal.Length Petal.Width
## 39.160 2.156 7.004 -8.651 -17.941
##
## Degrees of Freedom: 89 Total (i.e. Null); 85 Residual
## Null Deviance: 124.8
## Residual Deviance: 11.73 AIC: 21.73
# Predict, for every held-out row, the fitted probability of class 1.
test.pred <- predict(model.glm, test, type = "response")
# Rounded copy kept for display only. It must not be used for classification:
# rounding first would turn e.g. 0.504 into 0.5 and flip the predicted class.
pred.round <- round(test.pred, 2)
# Classify from the unrounded probabilities with a 0.5 cut-off.
pred.results <- ifelse(test.pred > 0.5, 1, 0)
pred.results
## 3 58 32 60 18 52 83 82 25 19
## 1 0 1 0 1 0 0 0 1 1
# 1: versicolor 0: virginica
# Both dimensions are given the fixed levels 0 and 1 so the table is always
# 2x2. Without this, a class missing from the predictions (or from the test
# labels) yields a non-square table whose diag() no longer holds the
# correctly classified counts.
confus.matrix <- table(
  predict = factor(pred.results, levels = c(0, 1)),
  real = factor(test$Species, levels = c(0, 1))
)
confus.matrix
## real
## predict 0 1
## 0 5 0
## 1 0 5
# Test accuracy: correctly classified rows (the diagonal) over all test rows.
accuracy.glm.iris <- sum(diag(confus.matrix)) / sum(confus.matrix)
accuracy.glm.iris
## [1] 1
3. Tree
library(tree)
## Warning: package 'tree' was built under R version 3.4.4
library(dplyr)
# Restrict iris to the two species under study: virginica and versicolor.
iris.small <- iris %>%
  filter(Species %in% c("virginica", "versicolor"))
# Random hold-out split: 10% of the rows for testing, the remaining 90% for
# training.
n <- nrow(iris.small) * 0.1
index <- sample(seq_len(nrow(iris.small)), n)
test <- iris.small[index, ]
train <- iris.small[-index, ]
# Fit the classification tree on the training rows and draw it with its
# split labels.
iris.tree <- tree(Species ~ ., data = train)
plot(iris.tree)
text(iris.tree)

# Predict the class label of every held-out row.
model.tree <- predict(iris.tree, newdata = test, type = "class")
# Test accuracy: pred.tree is 1 where the predicted label equals the true
# label and 0 otherwise, so its average is the share of correct predictions.
pred.tree <- as.numeric(model.tree == test$Species)
accuracy.tree.iris <- sum(pred.tree) / length(pred.tree)
accuracy.tree.iris
## [1] 0.8
使用「Titanic」資料集,比較 SVM、Logistic Regression、Tree 的分類準確率
1. SVM
library(e1071)
# Load the Titanic passenger table; empty fields are read in as NA.
# The argument is spelled out in full (na.strings) instead of relying on
# partial matching, and TRUE is used instead of the reassignable T.
titanic <- read.csv("titanic.csv", header = TRUE, na.strings = "")
# Helper for inspecting missing data: returns how many entries of x are NA.
num_na <- function(x) {
  missing_mask <- is.na(x)
  sum(missing_mask)
}
# Number of missing values in each column of titanic. vapply() pins the
# result to one integer per column, so the return type cannot silently change
# the way it can with sapply().
vapply(titanic, num_na, integer(1))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 687 2
# Build a reduced data set that holds only the columns used for modelling.
names(titanic)
## [1] "PassengerId" "Survived" "Pclass" "Name" "Sex"
## [6] "Age" "SibSp" "Parch" "Ticket" "Fare"
## [11] "Cabin" "Embarked"
# Columns are selected by name (the same eight that sit at positions
# 2, 3, 5, 6, 7, 8, 10 and 12 in the listing above) so the selection does not
# depend on the column order of the CSV.
data <- titanic[, c("Survived", "Pclass", "Sex", "Age", "SibSp", "Parch",
                    "Fare", "Embarked")]
names(data)
## [1] "Survived" "Pclass" "Sex" "Age" "SibSp" "Parch"
## [7] "Fare" "Embarked"
# Missing values: replace a missing Age by the mean of the observed ages, and
# drop the rows whose Embarked is missing.
mean_age <- mean(data$Age, na.rm = TRUE)
data$Age[is.na(data$Age)] <- mean_age
data <- data[!is.na(data$Embarked), ]
# Store the categorical predictors as factors. Since R 4.0 read.csv() returns
# them as character columns by default; as factors, train and test carry the
# same level set no matter which rows end up in which split.
data$Sex <- factor(data$Sex)
data$Embarked <- factor(data$Embarked)
# Random hold-out split: 10% of the rows for testing, the remaining 90% for
# training. The test size is floored explicitly because 10% of the row count
# is generally not a whole number.
n <- floor(0.1 * nrow(data))
index <- sample(seq_len(nrow(data)), n)
train <- data[-index, ]
test <- data[index, ]
# Fit an SVM classifier for Survived using every other column as a predictor.
# Survived is stored as numeric 0/1, and for a non-factor response svm()
# defaults to eps-regression, so C-classification is requested explicitly.
model <- svm(Survived ~ ., data = train, type = "C-classification")
# Predict the class label of every held-out passenger (a factor with the
# labels "0" and "1").
test_results <- predict(model, test)
# Test accuracy: convert the predicted labels back to numeric 0/1 so they can
# be compared with the numeric Survived column.
test_rt_f <- as.numeric(as.character(test_results))
true_value <- test$Survived
table(true_value, test_rt_f)
## test_rt_f
## true_value 0 1
## 0 46 6
## 1 12 24
accuracy.svm.titanic <- mean(test_rt_f == true_value)
accuracy.svm.titanic
## [1] 0.7954545
2. Logistic Regression
# Load the Titanic passenger table; empty fields are read in as NA.
# The argument is spelled out in full (na.strings) instead of relying on
# partial matching, and TRUE is used instead of the reassignable T.
titanic <- read.csv("titanic.csv", header = TRUE, na.strings = "")
# Helper for inspecting missing data: returns how many entries of x are NA.
num_na <- function(x) {
  missing_mask <- is.na(x)
  sum(missing_mask)
}
# Number of missing values in each column of titanic. vapply() pins the
# result to one integer per column, so the return type cannot silently change
# the way it can with sapply().
vapply(titanic, num_na, integer(1))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 687 2
# Build a reduced data set that holds only the columns used for modelling.
names(titanic)
## [1] "PassengerId" "Survived" "Pclass" "Name" "Sex"
## [6] "Age" "SibSp" "Parch" "Ticket" "Fare"
## [11] "Cabin" "Embarked"
# Columns are selected by name (the same eight that sit at positions
# 2, 3, 5, 6, 7, 8, 10 and 12 in the listing above) so the selection does not
# depend on the column order of the CSV.
data <- titanic[, c("Survived", "Pclass", "Sex", "Age", "SibSp", "Parch",
                    "Fare", "Embarked")]
names(data)
## [1] "Survived" "Pclass" "Sex" "Age" "SibSp" "Parch"
## [7] "Fare" "Embarked"
# Missing values: replace a missing Age by the mean of the observed ages, and
# drop the rows whose Embarked is missing.
mean_age <- mean(data$Age, na.rm = TRUE)
data$Age[is.na(data$Age)] <- mean_age
data <- data[!is.na(data$Embarked), ]
# Store the categorical predictors as factors. Since R 4.0 read.csv() returns
# them as character columns by default; as factors, train and test carry the
# same level set no matter which rows end up in which split.
data$Sex <- factor(data$Sex)
data$Embarked <- factor(data$Embarked)
# Random hold-out split: 10% of the rows for testing, the remaining 90% for
# training. The test size is floored explicitly because 10% of the row count
# is generally not a whole number.
n <- floor(0.1 * nrow(data))
index <- sample(seq_len(nrow(data)), n)
train <- data[-index, ]
test <- data[index, ]
# Fit the logistic regression (binomial family, logit link) for Survived,
# using every other column of the training rows as a predictor.
model.glm <- glm(Survived ~ ., family = binomial(link = "logit"), data = train)
# Predict, for every held-out passenger, the fitted probability of class 1.
test.pred <- predict(model.glm, newdata = test, type = "response")
# Test accuracy: probabilities above 0.5 are classified as 1, all others as 0;
# the predictions are then cross-tabulated against the true labels.
pred.results <- ifelse(test.pred <= 0.5, 0, 1)
table(predict = pred.results, real = test$Survived)
## real
## predict 0 1
## 0 47 6
## 1 10 25
# Share of held-out passengers whose predicted class equals the true class.
accuracy.glm.titanic <- mean(pred.results == test$Survived)
accuracy.glm.titanic
## [1] 0.8181818
3. Tree
library(tree)
# Load the Titanic passenger table; empty fields are read in as NA.
# The argument is spelled out in full (na.strings) instead of relying on
# partial matching, and TRUE is used instead of the reassignable T.
titanic <- read.csv("titanic.csv", header = TRUE, na.strings = "")
# Helper for inspecting missing data: returns how many entries of x are NA.
num_na <- function(x) {
  missing_mask <- is.na(x)
  sum(missing_mask)
}
# Number of missing values in each column of titanic. vapply() pins the
# result to one integer per column, so the return type cannot silently change
# the way it can with sapply().
vapply(titanic, num_na, integer(1))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 687 2
# Build a reduced data set that holds only the columns used for modelling.
names(titanic)
## [1] "PassengerId" "Survived" "Pclass" "Name" "Sex"
## [6] "Age" "SibSp" "Parch" "Ticket" "Fare"
## [11] "Cabin" "Embarked"
# Columns are selected by name (the same eight that sit at positions
# 2, 3, 5, 6, 7, 8, 10 and 12 in the listing above) so the selection does not
# depend on the column order of the CSV.
data <- titanic[, c("Survived", "Pclass", "Sex", "Age", "SibSp", "Parch",
                    "Fare", "Embarked")]
names(data)
## [1] "Survived" "Pclass" "Sex" "Age" "SibSp" "Parch"
## [7] "Fare" "Embarked"
# Missing values: replace a missing Age by the mean of the observed ages, and
# drop the rows whose Embarked is missing.
mean_age <- mean(data$Age, na.rm = TRUE)
data$Age[is.na(data$Age)] <- mean_age
data <- data[!is.na(data$Embarked), ]
# Store the categorical predictors as factors. Since R 4.0 read.csv() returns
# them as character columns by default; as factors, train and test carry the
# same level set no matter which rows end up in which split.
data$Sex <- factor(data$Sex)
data$Embarked <- factor(data$Embarked)
# Random hold-out split: 10% of the rows for testing, the remaining 90% for
# training. The test size is floored explicitly because 10% of the row count
# is generally not a whole number.
n <- floor(0.1 * nrow(data))
index <- sample(seq_len(nrow(data)), n)
train <- data[-index, ]
test <- data[index, ]
# Fit a classification tree for Survived using every other column as a
# predictor. Survived is stored as numeric 0/1, so it is wrapped in factor();
# with a numeric response tree() would grow a regression tree instead.
titanic.tree <- tree(factor(Survived) ~ ., data = train)
plot(titanic.tree)
text(titanic.tree)

# Predict the class label of every held-out passenger.
model.tree <- predict(titanic.tree, test, type = "class")
# Test accuracy: convert the predicted labels back to numeric 0/1, then
# cross-tabulate them against the true labels. Both dimensions are given the
# fixed levels 0 and 1 so the table is always 2x2; otherwise a class that is
# never predicted yields a non-square table whose diag() no longer holds the
# correctly classified counts.
pred.tree <- as.numeric(as.character(model.tree))
confus.matrix <- table(
  factor(test$Survived, levels = c(0, 1)),
  factor(pred.tree, levels = c(0, 1))
)
accuracy.tree.titanic <- sum(diag(confus.matrix)) / sum(confus.matrix)
accuracy.tree.titanic
## [1] 0.6931818