# Decision Tree ----
# Load the built-in iris data set and the rpart package.
data(iris)
#View(iris)
library(rpart)
# Open the rpart help page (interactive convenience only).
?rpart
## starting httpd help server ... done
# Formula notation: the response (Species) sits to the left of `~`, the four
# measurements used as predictors sit to the right.
fit <- rpart(
  Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
  data = iris
)
# Print the fitted splits.
fit
## n= 150
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)
## 2) Petal.Length< 2.45 50 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.45 100 50 versicolor (0.00000000 0.50000000 0.50000000)
## 6) Petal.Width< 1.75 54 5 versicolor (0.00000000 0.90740741 0.09259259) *
## 7) Petal.Width>=1.75 46 1 virginica (0.00000000 0.02173913 0.97826087) *
# Draw the tree, then label its nodes.
plot(fit, margin = 0.1)
text(fit)

# Score one made-up flower; the recorded result is setosa with probability 1.
predict(
  fit,
  data.frame(
    Sepal.Width = 100,
    Sepal.Length = 100,
    Petal.Length = 2,
    Petal.Width = 100
  )
)
## setosa versicolor virginica
## 1 1 0 0
# Default predict() output (per-class probabilities, as printed above).
predicted_prob_ <- predict(fit, iris)
# Hard class labels, compared against the true species for overall accuracy.
predicted <- predict(fit, iris, type = "class")
sum(predicted == iris$Species) / nrow(iris)
## [1] 0.96
# Persist the fitted tree to disk, then restore it under the same name.
save(fit, file = "tree.RData")
data(iris)
load("tree.RData")
#predict(fit, iris)
# Scatter of the two petal measurements with the tree's cut points overlaid.
plot(Petal.Width ~ Petal.Length, data = iris, col = Species)
abline(v = 2.45, col = "orange")
abline(h = 1.75, col = "blue")

# Evaluate Tree Model ----
# Score the fitted tree on the full iris data and report the hit rate.
predicted <- predict(fit, newdata = iris, type = "class")
sum(predicted == iris$Species) / nrow(iris)
## [1] 0.96
# Cross-tabulate actual species (rows) against predicted class (columns).
table(iris$Species, predicted)
## predicted
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 1
## virginica 0 5 45
# install.packages('caret')
# install.packages('e1071')
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.4
# Cross-tabulation with actual species on the rows and predictions on the
# columns (kept under its existing name and orientation).
tb <- table(iris$Species, predicted)
# caret::confusionMatrix() treats the rows of a table as predictions and the
# columns as the reference (true) classes. Feeding it `tb` (truth on the rows)
# swaps sensitivity with positive predictive value and specificity with
# negative predictive value, so pass the two factors explicitly instead.
confusionMatrix(data = predicted, reference = iris$Species)
# Expected output for the corrected orientation (rows = predictions),
# derived from the counts in `tb`:
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 5
## virginica 0 1 45
##
## Overall Statistics
##
## Accuracy : 0.96
## 95% CI : (0.915, 0.9852)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.94
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9800 0.9000
## Specificity 1.0000 0.9500 0.9900
## Pos Pred Value 1.0000 0.9074 0.9783
## Neg Pred Value 1.0000 0.9896 0.9519
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3267 0.3000
## Detection Prevalence 0.3333 0.3600 0.3067
## Balanced Accuracy 1.0000 0.9650 0.9450
# Toy example of logical indexing: scores paired with a group label.
students <- c(80, 50, 90, 60, 88, 47)
groups <- c(1, 2, 1, 2, 1, 2)
# Scores whose label is 1.
students[groups == 1]
## [1] 80 90 88
# Scores whose label is 2.
students[groups == 2]
## [1] 50 60 47
# Fix the RNG so the random split is reproducible.
set.seed(567)
# Draw a label (1 or 2) for every row of iris, with weights 0.7 and 0.3.
idx <- sample.int(
  n = 2,
  size = nrow(iris),
  replace = TRUE,
  prob = c(0.7, 0.3)
)
table(idx)
## idx
## 1 2
## 107 43
# Rows labelled 1 form the training set, rows labelled 2 the test set.
trainset <- iris[idx == 1, ]
testset <- iris[idx == 2, ]
dim(trainset)
## [1] 107 5
dim(testset)
## [1] 43 5
# Fit the tree on the training rows only, using every other column as a
# predictor, then classify the test rows.
fit <- rpart(Species ~ ., data = trainset)
predicted <- predict(fit, newdata = testset, type = "class")
sum(predicted == testset$Species) / nrow(testset)
## [1] 0.9534884
# Actual species (rows) versus predicted class (columns) on the test rows.
table(testset$Species, predicted)
## predicted
## setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 13 0
## virginica 0 2 13
# Compare accuracy on the training rows with accuracy on the test rows.
train_predicted <- predict(fit, newdata = trainset, type = "class")
test_predicted <- predict(fit, newdata = testset, type = "class")
sum(train_predicted == trainset$Species) / nrow(trainset)
## [1] 0.9626168
sum(test_predicted == testset$Species) / nrow(testset)
## [1] 0.9534884
# Draw the tree fitted on the training rows, then label its nodes.
plot(fit, margin = 0.1)
text(fit)

# Keep only the rows that are not setosa and plot their petal measurements
# with a horizontal reference line at Petal.Width = 1.75.
dataset <- iris[iris$Species != "setosa", ]
plot(Petal.Width ~ Petal.Length, data = dataset, col = Species)
abline(h = 1.75, col = "orange")

# Logistic Regression ----
# Binary logistic regression on the rows that are not setosa, using all
# remaining columns as predictors.
dataset <- iris[iris$Species != "setosa", ]
fit <- glm(Species ~ ., data = dataset, family = binomial)
# Use `newdata`: predict.glm() has no `data` argument, so `data = ...` would be
# swallowed by `...` and ignored, and the call would always return the fitted
# values regardless of the data frame supplied.
# With the default type the result is on the link (log-odds) scale.
predicted <- predict(fit, newdata = dataset)
predicted
## 51 52 53 54 55 56
## -11.3544818 -9.9326130 -6.7253803 -10.0730364 -6.5638417 -9.1918314
## 57 58 59 60 61 62
## -6.6396889 -21.3484037 -11.1356798 -11.1201500 -17.0366939 -10.1926410
## 63 64 65 66 67 68
## -16.1233989 -7.1315175 -18.0998447 -12.7756425 -6.6242594 -18.0278599
## 69 70 71 72 73 74
## -2.7586819 -16.2559018 -0.3853463 -14.8926121 -1.2377160 -10.1206562
## 75 76 77 78 79 80
## -13.4714513 -11.8610318 -7.2461444 -0.9640817 -6.9422588 -22.7708802
## 81 82 83 84 85 86
## -16.2842296 -19.0557818 -16.2565095 1.8801634 -6.1312154 -8.4540886
## 87 88 89 90 91 92
## -8.1182133 -8.2734585 -14.0532409 -11.4092139 -10.1341622 -8.7425447
## 93 94 95 96 97 98
## -14.6454823 -20.9268370 -11.1060362 -15.1854381 -12.6887357 -12.9784072
## 99 100 101 102 103 104
## -23.5097383 -12.9635855 22.0760350 7.8590485 13.8507316 8.1763399
## 105 106 107 108 109 110
## 16.2155389 19.2186911 2.0990656 12.3116893 11.7484836 18.7960092
## 111 112 113 114 115 116
## 4.6215645 8.2657934 10.8185436 10.3274231 16.3340282 12.2398047
## 117 118 119 120 121 122
## 6.0722686 16.3990117 28.1305669 2.4490547 14.7789486 7.6267405
## 123 124 125 126 127 128
## 19.4226714 2.9119477 10.9466766 7.7251297 1.5474425 1.4007256
## 129 130 131 132 133 134
## 14.0837476 3.5182027 12.6759926 9.4199248 15.9123613 -1.3561051
## 135 136 137 138 139 140
## 3.3591953 17.9147039 15.8075785 5.6507019 0.7043091 8.9609944
## 141 142 143 144 145 146
## 16.8257565 9.7894062 7.8590485 16.9113477 18.2611313 11.8934775
## 147 148 149 150
## 7.0196773 6.9006805 12.3396098 3.7796467
# Turn the linear predictor into a label: values above 0 are called
# virginica, everything else versicolor.
res <- ifelse(predicted > 0, "virginica", "versicolor")
sum(dataset$Species == res) / nrow(dataset)
## [1] 0.98
# Actual species (rows) versus assigned label (columns). The empty setosa row
# appears because the Species factor still carries that unused level.
table(dataset$Species, res)
## res
## versicolor virginica
## setosa 0 0
## versicolor 49 1
## virginica 1 49
# Print the fitted model (call, coefficients, deviances).
fit
##
## Call: glm(formula = Species ~ ., family = binomial, data = dataset)
##
## Coefficients:
## (Intercept) Sepal.Length Sepal.Width Petal.Length Petal.Width
## -42.638 -2.465 -6.681 9.429 18.286
##
## Degrees of Freedom: 99 Total (i.e. Null); 95 Residual
## Null Deviance: 138.6
## Residual Deviance: 11.9 AIC: 21.9
# SVM ----
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.4
# Open the svm help page (interactive convenience only).
?svm
# Fit one SVM per kernel on the full iris data.
fit_linear <- svm(Species ~ ., data = iris, kernel = "linear")
fit_rbf <- svm(Species ~ ., data = iris, kernel = "radial")
fit_poly <- svm(Species ~ ., data = iris, kernel = "polynomial")
# The predict() methods used below take `newdata`, not `data`; an argument
# named `data` is swallowed by `...` and ignored, so the call would return the
# training-set fitted values no matter which data frame was supplied.
# All accuracies in this section are measured on the training data itself.

# Linear kernel.
predicted <- predict(fit_linear, newdata = iris)
sum(iris$Species == predicted) / nrow(iris)
## [1] 0.9666667
table(iris$Species, predicted)
## predicted
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 46 4
## virginica 0 1 49
# Radial basis kernel.
predicted <- predict(fit_rbf, newdata = iris)
sum(iris$Species == predicted) / nrow(iris)
## [1] 0.9733333
table(iris$Species, predicted)
## predicted
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 2
## virginica 0 2 48
# Polynomial kernel.
predicted <- predict(fit_poly, newdata = iris)
sum(iris$Species == predicted) / nrow(iris)
## [1] 0.9533333
table(iris$Species, predicted)
## predicted
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 50 0
## virginica 0 7 43
# Decision tree on the same data, for comparison with the SVMs above.
fit_tree <- rpart(Species ~ ., data = iris)
predicted <- predict(fit_tree, newdata = iris, type = "class")
sum(iris$Species == predicted) / nrow(iris)
## [1] 0.96
# Two-class, two-feature subset: sepal measurements of setosa and virginica.
iris.subset <- subset(
  iris,
  subset = Species %in% c("setosa", "virginica"),
  select = c("Sepal.Length", "Sepal.Width", "Species")
)
# Linear SVM on the unscaled features with cost = 1.
svm.model <- svm(
  Species ~ .,
  data = iris.subset,
  kernel = "linear",
  cost = 1,
  scale = FALSE
)
plot(
  x = iris.subset$Sepal.Length,
  y = iris.subset$Sepal.Width,
  col = iris.subset$Species,
  pch = 19
)
# Circle the support vectors in blue.
points(iris.subset[svm.model$index, c(1, 2)], col = "blue", cex = 2)
# Weight vector and offset of the separating line w1 * x + w2 * y + b = 0,
# drawn as y = -b / w2 - (w1 / w2) * x.
w <- t(svm.model$coefs) %*% svm.model$SV
b <- -svm.model$rho
abline(a = -b / w[1, 2], b = -w[1, 1] / w[1, 2], col = "red", lty = 5)

# Refit the linear SVM on the same subset with a much larger cost (10000)
# and redraw the points, support vectors and separating line.
svm.model <- svm(
  Species ~ .,
  data = iris.subset,
  type = "C-classification",
  kernel = "linear",
  cost = 10000,
  scale = FALSE
)
plot(
  x = iris.subset$Sepal.Length,
  y = iris.subset$Sepal.Width,
  col = iris.subset$Species,
  pch = 19
)
# Circle the support vectors in blue.
points(iris.subset[svm.model$index, c(1, 2)], col = "blue", cex = 2)
# Separating line w1 * x + w2 * y + b = 0, drawn in slope-intercept form.
w <- t(svm.model$coefs) %*% svm.model$SV
b <- -svm.model$rho
abline(a = -b / w[1, 2], b = -w[1, 1] / w[1, 2], col = "red", lty = 5)
