A better-formatted version of these notes: https://rpubs.com/potentialwjy/MachineLearning3
set.seed(1)
library(rpart)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
# Load the Titanic training and test sets
train <- read.csv("train.csv", header = TRUE)
test <- read.csv("test.csv", header = TRUE)
str(train)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
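The str() output shows that Age contains missing values (row 6 is already NA); it is worth counting them up front, since rpart handles NAs through surrogate splits while knn() cannot handle them at all:

sum(is.na(train$Age))  # 177 missing ages in the standard Kaggle training set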
tree <- rpart(Survived ~ Pclass + Sex + Age, data = train, method = "class")
# Draw the decision tree
fancyRpartPlot(tree)
# Train a tree on all predictors; exclude identifier-like factors (PassengerId,
# Name, Ticket, Cabin), whose levels differ between train and test and would
# break predict()
tree <- rpart(Survived ~ . - PassengerId - Name - Ticket - Cabin, train, method = "class")
pred <- predict(tree, test, type = "class")
conf <- table(test$Survived, pred)  # assumes test.csv carries Survived labels
sum(diag(conf)) / sum(conf)  # print out the accuracy
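If your test.csv lacks the Survived column (as the raw Kaggle file does), a labeled holdout can be carved out of train.csv instead; a minimal sketch, assuming a 70/30 split:

set.seed(1)
idx <- sample(nrow(train), round(0.7 * nrow(train)))
holdout_train <- train[idx, ]   # 70% for fitting
holdout_test <- train[-idx, ]   # 30% for measuring accuracy
# Fit on holdout_train and evaluate on holdout_test exactly as above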
set.seed(1)
# Grow a deliberately complex tree by setting the complexity penalty cp near zero
tree <- rpart(Survived ~ . - PassengerId - Name - Ticket - Cabin, train,
              method = "class", control = rpart.control(cp = 0.00001))
# Draw the complex tree
fancyRpartPlot(tree)
# Prune the tree: pruned
pruned <- prune(tree, cp=0.01)
# Draw pruned
fancyRpartPlot(pruned)
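Rather than hard-coding cp = 0.01, a common alternative is to prune at the cp value with the lowest cross-validated error, which rpart stores in the fitted tree's cptable:

# Pick the cp minimizing the cross-validated error (xerror) and prune with it
best_cp <- tree$cptable[which.min(tree$cptable[, "xerror"]), "CP"]
pruned_cv <- prune(tree, cp = best_cp)
fancyRpartPlot(pruned_cv)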
set.seed(1)
# The next snippets come from a different exercise: they assume a spam
# classification data frame (factor column spam) split into train and test
# Train and test a tree with the default Gini splitting criterion
tree_g <- rpart(spam ~ ., train, method = "class")
pred_g <- predict(tree_g, test, type = "class")
conf_g <- table(test$spam, pred_g)
acc_g <- sum(diag(conf_g)) / sum(conf_g)
# Train and test a tree with information gain as the splitting criterion
tree_i <- rpart(spam ~ ., train, method = "class", parms = list(split = "information"))
pred_i <- predict(tree_i, test, type = "class")
conf_i <- table(test$spam, pred_i)
acc_i <- sum(diag(conf_i)) / sum(conf_i)
# Draw a fancy plot of both tree_g and tree_i
fancyRpartPlot(tree_g)
fancyRpartPlot(tree_i)
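To put a number on the comparison, print both accuracies side by side (which criterion wins depends on the data; on many datasets the two trees are identical):

c(gini = acc_g, information = acc_i)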
# Back to the Titanic data: keep the class labels aside for kNN
# (again assuming a labeled test set)
train_labels <- train$Survived
test_labels <- test$Survived
# knn() needs an all-numeric feature matrix with no missing values, so keep only
# the numeric predictors normalized below instead of copying the whole data frames
knn_train <- train[, c("Pclass", "Age")]
knn_test <- test[, c("Pclass", "Age")]
# Age has NAs (see str() above); impute them with the training median so knn() works
median_age <- median(knn_train$Age, na.rm = TRUE)
knn_train$Age[is.na(knn_train$Age)] <- median_age
knn_test$Age[is.na(knn_test$Age)] <- median_age
# Normalize Pclass to [0, 1]; kNN is distance-based, so features must share a scale
min_class <- min(knn_train$Pclass)
max_class <- max(knn_train$Pclass)
knn_train$Pclass <- (knn_train$Pclass - min_class) / (max_class - min_class)
knn_test$Pclass <- (knn_test$Pclass - min_class) / (max_class - min_class)
# Normalize Age the same way
min_age <- min(knn_train$Age)
max_age <- max(knn_train$Age)
knn_train$Age <- (knn_train$Age - min_age) / (max_age - min_age)
knn_test$Age <- (knn_test$Age - min_age) / (max_age - min_age)
library(class)
pred <- knn(train = knn_train, test = knn_test, cl = train_labels, k = 5)
conf <- table(test_labels, pred)
conf
# Try k from 1 up to 20% of the training-set size
range <- 1:round(0.2 * nrow(knn_train))
accs <- rep(0, length(range))
for (k in range) {
  pred <- knn(train = knn_train, test = knn_test, cl = train_labels, k = k)
  conf <- table(test_labels, pred)
  accs[k] <- sum(diag(conf)) / sum(conf)
}
plot(range, accs, xlab = "k", ylab = "accuracy")
# Calculate the best k (range starts at 1, so the index of the maximum equals k)
which.max(accs)
Data from the UCI Census Income (Adult) data set:
train <- read.table("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
                    sep = ",", fill = TRUE, strip.white = TRUE)
cols <- c("age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
          "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
          "hours-per-week", "native-country", "income")
colnames(train) <- cols
# adult.test starts with a stray comment line, so skip it
test <- read.table("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
                   sep = ",", fill = TRUE, strip.white = TRUE, skip = 1)
colnames(test) <- cols
# Test labels carry a trailing period ("<=50K." / ">50K."); strip it so the
# factor levels match the training data
test$income <- factor(sub("\\.$", "", test$income))
tree <- rpart(income ~ ., train, method = "class")
all_probs <- predict(tree, test, type = "prob")
head(all_probs)
# Second column = predicted probability of the second factor level (">50K")
probs <- all_probs[, 2]
library(ROCR)
# Build the ROC curve from the test-set probabilities computed above
pred <- prediction(probs, labels = test$income)
perf <- performance(pred, "tpr", "fpr")
plot(perf)
# Compute the area under the ROC curve
perf_auc <- performance(pred, "auc")
# Print out the AUC
perf_auc@y.values[[1]]
Compare a decision tree model and a k-Nearest Neighbor model on the spam data:
library(ROCR)
# probs_t and probs_k are assumed to hold the test-set probabilities of the
# positive class from the tree and kNN models on the spam data
# Make the prediction objects for both models: pred_t, pred_k
pred_t <- prediction(probs_t, labels = test$spam)
pred_k <- prediction(probs_k, labels = test$spam)
# Make the performance objects for both models: perf_t, perf_k
perf_t <- performance(pred_t, "tpr", "fpr")
perf_k <- performance(pred_k, "tpr", "fpr")
# Draw the ROC lines with draw_roc_lines(), a plotting helper supplied by the
# original exercise (not part of ROCR)
draw_roc_lines(perf_t, perf_k)
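If draw_roc_lines() is not available, ROCR's own plot method can overlay the two curves; a minimal stand-in:

# Overlay both ROC curves with base ROCR plotting
plot(perf_t, col = "red", main = "ROC: tree vs. kNN")
plot(perf_k, col = "blue", add = TRUE)
abline(0, 1, lty = 2)  # dashed diagonal = random guessing
legend("bottomright", legend = c("tree", "kNN"), col = c("red", "blue"), lty = 1)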