train <- read.csv("train.csv")
# Set random seed. Don't remove this line.
set.seed(1)
# Load the rpart, rattle, rpart.plot and RColorBrewer packages
library(rpart)
library(rattle)
## Rattle: A free graphical interface for data mining with R.
## Version 4.1.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart.plot)
library(RColorBrewer)
# Build a tree model: tree
tree <- rpart(Survived ~ ., data = train, method = "class")
# Draw the decision tree
fancyRpartPlot(tree)
test <- read.csv("test.csv")
# Code from previous exercise
set.seed(1)
library(rpart)
## Use tree to predict the labels of the test set with the predict() function; store the resulting prediction in pred.
tree <- rpart(Survived ~ ., train, method = "class")
# Predict the values of the test set: pred
pred <- predict(tree, test, type="class")
## Create a confusion matrix, conf, of your predictions on the test set. The true values, test$Survived, should be on the rows.
# Construct the confusion matrix: conf
conf <- table(test$Survived, pred)
## Use the confusion matrix to print out the accuracy: the number of correctly classified instances divided by the total number of classified instances.
# Print out the accuracy
sum(diag(conf)) / sum(conf)
## [1] 0.7990654
# Fit a complex tree (very low cp)
set.seed(1)
tree <- rpart(Survived ~ ., train, method = "class", control = rpart.control(cp=0.00001))
# Draw the complex tree
fancyRpartPlot(tree)
# Prune the tree: pruned
## Use the prune() function to shrink tree to a more compact tree, pruned, specifying the cp argument as 0.01. This complexity parameter tells the algorithm to remove node splits that do not sufficiently decrease the impurity.
pruned <- prune(tree, cp=0.01)
# Draw pruned
fancyRpartPlot(pruned)
Another way to check whether you have overfit your model is to compare the accuracy on the training set with the accuracy on the test set: the difference between the two is smaller for the simpler tree, as the sketch below illustrates. You can also set the cp argument while learning the tree with rpart(), via rpart.control.
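A minimal sketch of that comparison, reusing tree (the complex tree) and pruned from above; the accuracy() helper is illustrative, not part of the exercise:
# Illustrative helper: fraction of correctly classified instances
accuracy <- function(model, data) {
  conf <- table(data$Survived, predict(model, data, type = "class"))
  sum(diag(conf)) / sum(conf)
}
# The train/test gap should be larger for the complex tree than for pruned
accuracy(tree, train) - accuracy(tree, test)      # complex tree
accuracy(pruned, train) - accuracy(pruned, test)  # pruned tree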
# Set random seed. Don't remove this line.
set.seed(1)
# Train and test tree with the Gini criterion (the train data is not available, so the code is commented out)
##tree_g <- rpart(spam ~ ., train, method = "class")
##pred_g <- predict(tree_g, test, type = "class")
##conf_g <- table(test$spam, pred_g)
##acc_g <- sum(diag(conf_g)) / sum(conf_g)
# Change the first line of code to use information gain as the splitting criterion
##tree_i <- rpart(spam ~ ., train, method = "class", parms = list(split = "information"))
##pred_i <- predict(tree_i, test, type = "class")
##conf_i <- table(test$spam, pred_i)
##acc_i <- sum(diag(conf_i)) / sum(conf_i)
# Draw a fancy plot of both tree_g and tree_i
##fancyRpartPlot(tree_g)
##fancyRpartPlot(tree_i)
# Print out acc_g and acc_i
##acc_g
##acc_i
# (x - min(x)) / (max(x) - min(x))
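This commented formula is min-max rescaling. As a sketch, it could be wrapped in a small helper (normalize_minmax is a hypothetical name, not used in the exercise code below):
# Hypothetical helper implementing (x - min(x)) / (max(x) - min(x)),
# with bounds taken from the input by default
normalize_minmax <- function(x, min_x = min(x), max_x = max(x)) {
  (x - min_x) / (max_x - min_x)
}
For the test set you would pass the training-set bounds explicitly, e.g. normalize_minmax(knn_test$Age, min_age, max_age).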
train <- read.csv("train3.csv")
test <- read.csv("test3.csv")
# Store the Survived column of train and test in train_labels and test_labels
train_labels <- train$Survived
test_labels <- test$Survived
# Copy train and test to knn_train and knn_test
knn_train <- train
knn_test <- test
# Drop Survived column for knn_train and knn_test
knn_train$Survived <- NULL
knn_test$Survived <- NULL
# normalize Pclass
## Pclass is an ordinal variable with values between 1 and 3. Have a look at the code that normalizes this variable in both the training and the test set. To define the minimum and maximum, only the training set is used: we can't use information from the test set (like its minimum or maximum) to normalize the data.
min_class <- min(knn_train$Pclass)
max_class <- max(knn_train$Pclass)
knn_train$Pclass <- (knn_train$Pclass - min_class) / (max_class - min_class)
knn_test$Pclass <- (knn_test$Pclass - min_class) / (max_class - min_class)
# normalize Age
## In a similar fashion, normalize the Age column of knn_train as well as knn_test. Again, only use statistics from the training set to decide on the normalization. Use the intermediate variables min_age and max_age.
min_age <- min(knn_train$Age)
max_age <- max(knn_train$Age)
knn_train$Age <- (knn_train$Age - min_age) / (max_age - min_age)
knn_test$Age <- (knn_test$Age - min_age) / (max_age - min_age)
head(knn_train$Age)
## [1] 0.2926614 0.1916130 0.4442339 0.4821271 0.3684476 0.3684476
head(knn_test$Age)
## [1] 0.520020210 0.002147278 0.254768220 0.374763168 0.355816597 0.418971833
# Set random seed. Don't remove this line.
set.seed(1)
# Load the class package
library(class)
# Make predictions using knn: pred
pred <- knn(knn_train, knn_test, train_labels, k=5)
# Construct the confusion matrix: conf
conf <- table(test_labels, pred)
# Print out the confusion matrix
conf
##            pred
## test_labels   0   1
##           0 113  16
##           1  26  59
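From this matrix, the accuracy follows the same diagonal-over-total pattern as before: (113 + 59) / 214, about 0.80.
# Accuracy of the k = 5 model
sum(diag(conf)) / sum(conf)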
# Set random seed. Don't remove this line.
set.seed(1)
# Load the class package, define range and accs
library(class)
range <- 1:round(0.2 * nrow(knn_train))
accs <- rep(0, length(range))
for (k in range) {
  # Make predictions using knn: pred
  pred <- knn(knn_train, knn_test, train_labels, k = k)
  # Construct the confusion matrix: conf
  conf <- table(test_labels, pred)
  # Calculate the accuracy and store it in accs[k]
  accs[k] <- sum(diag(conf)) / sum(conf)
}
# Plot the accuracies, labeling the x-axis "k"
plot(range, accs, xlab="k")
# Calculate the best k
which.max(accs)
## [1] 1
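Note that which.max() returns the position in accs rather than k itself; the two coincide here only because range starts at 1. Indexing back into range is safer:
# Best k, robust to ranges that do not start at 1
range[which.max(accs)]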
train4 <- read.csv("train4.csv")
test4 <- read.csv("test4.csv")
# Set random seed. Don't remove this line.
set.seed(1)
# Build a tree on the training set: tree
tree <- rpart(income ~ ., train4, method = "class")
# Predict probability values using the model: all_probs
all_probs <- predict(tree, test4, type="prob")
# Print out all_probs
head(all_probs)
##       <=50K       >50K
## 1 0.6989796 0.30102041
## 2 0.9500130 0.04998705
## 3 0.9500130 0.04998705
## 4 0.9500130 0.04998705
## 5 0.2836489 0.71635112
## 6 0.6989796 0.30102041
# Select second column of all_probs: probs
probs <- all_probs[,2]
head(probs)
##          1          2          3          4          5          6
## 0.30102041 0.04998705 0.04998705 0.04998705 0.71635112 0.30102041
# Code from previous exercise
set.seed(1)
tree <- rpart(income ~ ., train4, method = "class")
probs <- predict(tree, test4, type = "prob")[,2]
# Load the ROCR library
library(ROCR)
# Make a prediction object: pred
pred <- prediction(probs, test4$income)
# Make a performance object: perf
perf <- performance(pred, "tpr" , "fpr")
# Plot this curve
plot(perf)
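A diagonal reference line (the ROC curve of a random classifier) makes the plot easier to read; one way to add it after plotting:
# Add the random-classifier diagonal for reference
abline(0, 1, lty = 2)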
# Build tree and predict probability values for the test set
set.seed(1)
tree <- rpart(income ~ ., train4, method = "class")
probs <- predict(tree, test4, type = "prob")[,2]
# Load the ROCR library
library(ROCR)
# Make a prediction object: pred
pred <- prediction(probs, test4$income)
# Make a performance object: perf
perf <- performance(pred, "auc")
# Print out the AUC
perf@y.values[[1]]
## [1] 0.8481775
test5 <- read.csv("test5.csv")
# Make the prediction objects for both models: pred_t, pred_k (probs_t and probs_k are not available, so the code is commented out)
##pred_t <- prediction(probs_t, test5$spam)
##pred_k <- prediction(probs_k, test5$spam)
# Make the performance objects for both models: perf_t, perf_k
##perf_t <- performance(pred_t, "tpr","fpr")
##perf_k <- performance(pred_k, "tpr", "fpr")
# Draw the ROC lines using draw_roc_lines()
##draw_roc_lines(perf_t, perf_k)
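draw_roc_lines() is a helper supplied by the exercise environment; a minimal stand-in built on ROCR's plot method (the colors and legend are arbitrary choices) could look like this:
# Hypothetical stand-in for the provided draw_roc_lines() helper
draw_roc_lines <- function(perf_t, perf_k) {
  plot(perf_t, col = "red")                # ROC curve of the tree model
  plot(perf_k, col = "blue", add = TRUE)   # overlay the k-NN ROC curve
  legend("bottomright", legend = c("tree", "k-NN"),
         col = c("red", "blue"), lty = 1)
}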