A better-formatted version of these notes: https://rpubs.com/potentialwjy/MachineLearning3
set.seed(1)
library(rpart)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
# Load the Titanic training and test sets
train <- read.csv("train.csv", header = TRUE)
test <- read.csv("test.csv", header = TRUE)
str(train)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
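The str() output shows that Age contains missing values (row 6 is already NA); it is worth counting them up front, since rpart handles NAs through surrogate splits while knn() cannot handle them at all:

sum(is.na(train$Age))  # 177 missing ages in the standard Kaggle training set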
tree <- rpart(Survived ~ Pclass + Sex + Age, data = train, method = "class")
# Draw the decision tree
fancyRpartPlot(tree)
# Train a tree on all predictors; exclude identifier-like factors (PassengerId,
# Name, Ticket, Cabin), whose levels differ between train and test and would
# break predict()
tree <- rpart(Survived ~ . - PassengerId - Name - Ticket - Cabin, train, method = "class")
pred <- predict(tree, test, type = "class")
conf <- table(test$Survived, pred)  # assumes test.csv carries Survived labels
sum(diag(conf)) / sum(conf)  # print out the accuracy
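If your test.csv lacks the Survived column (as the raw Kaggle file does), a labeled holdout can be carved out of train.csv instead; a minimal sketch, assuming a 70/30 split:

set.seed(1)
idx <- sample(nrow(train), round(0.7 * nrow(train)))
holdout_train <- train[idx, ]   # 70% for fitting
holdout_test <- train[-idx, ]   # 30% for measuring accuracy
# Fit on holdout_train and evaluate on holdout_test exactly as above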
set.seed(1)
# Grow a deliberately complex tree by setting the complexity penalty cp near zero
tree <- rpart(Survived ~ . - PassengerId - Name - Ticket - Cabin, train,
              method = "class", control = rpart.control(cp = 0.00001))
# Draw the complex tree
fancyRpartPlot(tree)
# Prune the tree: pruned
pruned <- prune(tree, cp=0.01)
# Draw pruned
fancyRpartPlot(pruned)
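Rather than hard-coding cp = 0.01, a common alternative is to prune at the cp value with the lowest cross-validated error, which rpart stores in the fitted tree's cptable:

# Pick the cp minimizing the cross-validated error (xerror) and prune with it
best_cp <- tree$cptable[which.min(tree$cptable[, "xerror"]), "CP"]
pruned_cv <- prune(tree, cp = best_cp)
fancyRpartPlot(pruned_cv)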
set.seed(1)
# The next snippets come from a different exercise: they assume a spam
# classification data frame (factor column spam) split into train and test
# Train and test a tree with the default Gini splitting criterion
tree_g <- rpart(spam ~ ., train, method = "class")
pred_g <- predict(tree_g, test, type = "class")
conf_g <- table(test$spam, pred_g)
acc_g <- sum(diag(conf_g)) / sum(conf_g)
# Train and test a tree with information gain as the splitting criterion
tree_i <- rpart(spam ~ ., train, method = "class", parms = list(split = "information"))
pred_i <- predict(tree_i, test, type = "class")
conf_i <- table(test$spam, pred_i)
acc_i <- sum(diag(conf_i)) / sum(conf_i)
# Draw a fancy plot of both tree_g and tree_i
fancyRpartPlot(tree_g)
fancyRpartPlot(tree_i)
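To put a number on the comparison, print both accuracies side by side (which criterion wins depends on the data; on many datasets the two trees are identical):

c(gini = acc_g, information = acc_i)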
# Back to the Titanic data: keep the class labels aside for kNN
# (again assuming a labeled test set)
train_labels <- train$Survived
test_labels <- test$Survived
# knn() needs an all-numeric feature matrix with no missing values, so keep only
# the numeric predictors normalized below instead of copying the whole data frames
knn_train <- train[, c("Pclass", "Age")]
knn_test <- test[, c("Pclass", "Age")]
# Age has NAs (see str() above); impute them with the training median so knn() works
median_age <- median(knn_train$Age, na.rm = TRUE)
knn_train$Age[is.na(knn_train$Age)] <- median_age
knn_test$Age[is.na(knn_test$Age)] <- median_age
# Normalize Pclass to [0, 1]; kNN is distance-based, so features must share a scale
min_class <- min(knn_train$Pclass)
max_class <- max(knn_train$Pclass)
knn_train$Pclass <- (knn_train$Pclass - min_class) / (max_class - min_class)
knn_test$Pclass <- (knn_test$Pclass - min_class) / (max_class - min_class)
# Normalize Age the same way
min_age <- min(knn_train$Age)
max_age <- max(knn_train$Age)
knn_train$Age <- (knn_train$Age - min_age) / (max_age - min_age)
knn_test$Age <- (knn_test$Age - min_age) / (max_age - min_age)
library(class)
pred <- knn(train = knn_train, test = knn_test, cl = train_labels, k = 5)
conf <- table(test_labels, pred)
conf
# Try k from 1 up to 20% of the training-set size
range <- 1:round(0.2 * nrow(knn_train))
accs <- rep(0, length(range))
for (k in range) {
  pred <- knn(train = knn_train, test = knn_test, cl = train_labels, k = k)
  conf <- table(test_labels, pred)
  accs[k] <- sum(diag(conf)) / sum(conf)
}
plot(range, accs, xlab = "k", ylab = "accuracy")
# Calculate the best k (range starts at 1, so the index of the maximum equals k)
which.max(accs)
Data from the UCI Census Income (Adult) data set:
train <- read.table("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
                    sep = ",", fill = TRUE, strip.white = TRUE)
cols <- c("age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
          "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
          "hours-per-week", "native-country", "income")
colnames(train) <- cols
# adult.test starts with a stray comment line, so skip it
test <- read.table("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
                   sep = ",", fill = TRUE, strip.white = TRUE, skip = 1)
colnames(test) <- cols
# Test labels carry a trailing period ("<=50K." / ">50K."); strip it so the
# factor levels match the training data
test$income <- factor(sub("\\.$", "", test$income))
tree <- rpart(income ~ ., train, method = "class")
all_probs <- predict(tree, test, type = "prob")
head(all_probs)
# Second column = predicted probability of the second factor level (">50K")
probs <- all_probs[, 2]
library(ROCR)
# Build the ROC curve from the test-set probabilities computed above
pred <- prediction(probs, labels = test$income)
perf <- performance(pred, "tpr", "fpr")
plot(perf)
# Compute the area under the ROC curve
perf_auc <- performance(pred, "auc")
# Print out the AUC
perf_auc@y.values[[1]]
Compare a decision tree model and a k-Nearest Neighbor model on the spam data:
library(ROCR)
# probs_t and probs_k are assumed to hold the test-set probabilities of the
# positive class from the tree and kNN models on the spam data
# Make the prediction objects for both models: pred_t, pred_k
pred_t <- prediction(probs_t, labels = test$spam)
pred_k <- prediction(probs_k, labels = test$spam)
# Make the performance objects for both models: perf_t, perf_k
perf_t <- performance(pred_t, "tpr", "fpr")
perf_k <- performance(pred_k, "tpr", "fpr")
# Draw the ROC lines with draw_roc_lines(), a plotting helper supplied by the
# original exercise (not part of ROCR)
draw_roc_lines(perf_t, perf_k)
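If draw_roc_lines() is not available, ROCR's own plot method can overlay the two curves; a minimal stand-in:

# Overlay both ROC curves with base ROCR plotting
plot(perf_t, col = "red", main = "ROC: tree vs. kNN")
plot(perf_k, col = "blue", add = TRUE)
abline(0, 1, lty = 2)  # dashed diagonal = random guessing
legend("bottomright", legend = c("tree", "kNN"), col = c("red", "blue"), lty = 1)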