# Set random seed. Don't remove this line
set.seed(1)
# Have a look at the structure of titanic
titanic = read.csv("titanic_train.csv")
str(titanic)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
library(rpart)
## Warning: package 'rpart' was built under R version 3.2.2
tree <- rpart(Survived ~ ., data = titanic, method = "class")
pred <- predict(tree, titanic, type="class")
table(titanic$Survived, pred)
## pred
## 0 1
## 0 549 0
## 1 0 342
# P N
# p TP FN
# n FP TN
##Accuracy
# TP+TN/TP+FN+FP+TN
##Precision
# TP/TP+FP
##Recall
# TP/FP+FN
PredictionPositive <- c(212, 53)
PredictionNegative <- c(78,371)
conf <- data.frame(PredictionPositive, PredictionNegative)
conf
## PredictionPositive PredictionNegative
## 1 212 78
## 2 53 371
# Assign TP, FN, FP and TN using conf
TP=212
FN=78
FP=53
TN=371
# Calculate and print the accuracy: acc
acc=(TP+TN)/(TP+FN+FP+TN)
acc
## [1] 0.8165266
# Calculate and print out the precision: prec
prec=TP/(TP+FP)
prec
## [1] 0.8
# Calculate and print out the recall: rec
rec=TP/(TP+FN)
rec
## [1] 0.7310345
air <- read.csv("airfoil_self_noise.csv")
str(air)
## 'data.frame': 1503 obs. of 6 variables:
## $ freq : int 800 1000 1250 1600 2000 2500 3150 4000 5000 6300 ...
## $ angle : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ch_length: num 0.305 0.305 0.305 0.305 0.305 ...
## $ velocity : num 71.3 71.3 71.3 71.3 71.3 71.3 71.3 71.3 71.3 71.3 ...
## $ thickness: num 0.00266 0.00266 0.00266 0.00266 0.00266 ...
## $ dec : num 126 125 126 128 127 ...
fit <- lm(dec ~ freq + angle + ch_length, data = air)
pred <- predict(fit)
rmse <- sqrt(sum((air$dec-pred)^2)/nrow(air))
rmse
## [1] 5.215778
fit2 <- lm(dec ~ freq + angle + ch_length + velocity + thickness, data = air)
pred2 <- predict(fit2)
rmse2 <- sqrt(sum((air$dec - pred2) ^ 2) / nrow(air))
rmse2
## [1] 4.799244
seeds <- read.csv("seeds_dataset.csv")
str(seeds)
## 'data.frame': 210 obs. of 8 variables:
## $ area : num 15.3 14.9 14.3 13.8 16.1 ...
## $ perimeter : num 14.8 14.6 14.1 13.9 15 ...
## $ compactness : num 0.871 0.881 0.905 0.895 0.903 ...
## $ length : num 5.76 5.55 5.29 5.32 5.66 ...
## $ width : num 3.31 3.33 3.34 3.38 3.56 ...
## $ asymmetry : num 2.22 1.02 2.7 2.26 1.35 ...
## $ groove_length: num 5.22 4.96 4.83 4.8 5.17 ...
## $ X : int 1 1 1 1 1 1 1 1 1 1 ...
set.seed(1)
km_seeds <- kmeans(seeds, 3)
km_seeds
## K-means clustering with 3 clusters of sizes 75, 61, 74
##
## Cluster means:
## area perimeter compactness length width asymmetry groove_length
## 1 11.90907 13.25027 0.8515493 5.222333 2.865093 4.722187 5.093040
## 2 18.72180 16.29738 0.8850869 6.208934 3.722672 3.603590 6.066098
## 3 14.63203 14.45324 0.8790973 5.561784 3.274892 2.744043 5.184932
## X
## 1 2.866667
## 2 1.983607
## 3 1.135135
##
## Clustering vector:
## [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [36] 3 3 2 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 3 3 3 3 3 3 3
## [71] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2
## [106] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 2 2 2 2 2 2 2 3 3 3 3 2 3 3 3
## [141] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 217.4687 185.0922 223.1591
## (between_SS / total_SS = 78.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
plot(length ~ compactness, data = seeds, col=km_seeds$cluster)
# Print out the ratio of the WSS to the BSS
km_seeds$tot.withinss/km_seeds$betweenss
## [1] 0.2800729
# Defining performance metrics for unsupervised learning (cf. clustering) is easy and straightforward.
# You can use the RMSE to determine if a classification was good.
# If you want to build a system that can automatically categorize mails as spam or not, a confusion matrix can help you assess its quality. <----RIGHT!
# The classification model you learned on a dataset shows an error rate of 0.1. You're sure this model will be useful.
# Set random seed. Don't remove this line.
set.seed(1)
# Shuffle the dataset, call the result shuffled
n <- nrow(titanic)
shuffled <- titanic[sample(n), ]
train_indices <- 1:round(0.7 * n)
# Split the data in train and test
train <- shuffled[train_indices, ]
test_indices <- (round(0.7 * n) + 1):n
test <- shuffled[test_indices, ]
# Print the structure of train and test
str(train)
## 'data.frame': 624 obs. of 12 variables:
## $ PassengerId: int 237 332 510 807 179 796 837 585 556 55 ...
## $ Survived : int 0 0 1 0 0 0 0 0 0 0 ...
## $ Pclass : int 2 1 3 1 2 2 3 3 1 1 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 375 642 460 35 325 626 643 645 881 625 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 2 2 2 2 2 2 2 2 2 ...
## $ Age : num 44 45.5 26 39 30 39 21 NA 62 65 ...
## $ SibSp : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Parch : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 199 21 81 13 170 224 269 292 53 30 ...
## $ Fare : num 26 28.5 56.5 0 13 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 58 1 13 1 1 1 1 1 25 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 4 4 4 4 4 4 2 4 2 ...
str(test)
## 'data.frame': 267 obs. of 12 variables:
## $ PassengerId: int 561 316 468 786 306 677 559 790 82 789 ...
## $ Survived : int 0 1 0 0 1 0 1 0 1 1 ...
## $ Pclass : int 3 3 1 3 1 3 1 1 3 3 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 563 595 771 335 17 735 805 315 742 212 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 2 2 2 2 1 2 2 2 ...
## $ Age : num NA 26 56 25 0.92 24.5 39 46 29 1 ...
## $ SibSp : int 0 0 0 0 1 0 1 0 0 1 ...
## $ Parch : int 0 0 0 0 2 0 1 0 0 2 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 472 344 45 475 38 295 2 593 312 549 ...
## $ Fare : num 7.75 7.85 26.55 7.25 151.55 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 1 1 1 64 1 136 46 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 3 4 4 4 4 4 4 2 4 4 ...
tree <- rpart(Survived ~ ., train, method = "class")
# Predict the outcome on the test set with tree: pred
pred <- predict(tree, test, type="class")
conf <- table(test$Survived, pred)
conf
## pred
## 0 1
## 0 137 27
## 1 36 67
str(shuffled)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 237 332 510 807 179 796 837 585 556 55 ...
## $ Survived : int 0 0 1 0 0 0 0 0 0 0 ...
## $ Pclass : int 2 1 3 1 2 2 3 3 1 1 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 375 642 460 35 325 626 643 645 881 625 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 2 2 2 2 2 2 2 2 2 ...
## $ Age : num 44 45.5 26 39 30 39 21 NA 62 65 ...
## $ SibSp : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Parch : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 199 21 81 13 170 224 269 292 53 30 ...
## $ Fare : num 26 28.5 56.5 0 13 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 58 1 13 1 1 1 1 1 25 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 4 4 4 4 4 4 2 4 2 ...
# Set random seed. Don't remove this line.
set.seed(1)
# Initialize the accs vector
accs <- rep(0,6)
for (i in 1:6) {
# These indices indicate the interval of the test set
indices <- (((i-1) * round((1/6)*nrow(shuffled))) + 1):((i*round((1/6) * nrow(shuffled))))
# Exclude them from the train set
train <- shuffled[-indices,]
# Include them in the test set
test <- shuffled[indices,]
# A model is learned using each training set
tree <- rpart(Survived ~ ., train, method = "class")
# Make a prediction on the test set using tree
pred <- predict(tree, test, type="class")
# Assign the confusion matrix to conf
conf <- table(test$Survived, pred)
# Assign the accuracy of this model to the ith index in accs
accs[i] <-sum(diag(conf))/sum(conf)
}
# Print out the mean of accs
accs
## [1] 0.8040541 0.7770270 0.7972973 0.7702703 0.7500000 0.7905405
mean(accs)
## [1] 0.7815315
# 18
emails_full <- read.csv("emails_full.csv")
# The spam filter that has been 'learned' for you
spam_classifier <- function(x){
prediction <- rep(NA,length(x))
prediction[x > 4] <- 1
prediction[x >= 3 & x <= 4] <- 0
prediction[x >= 2.2 & x < 3] <- 1
prediction[x >= 1.4 & x < 2.2] <- 0
prediction[x > 1.25 & x < 1.4] <- 1
prediction[x <= 1.25] <- 0
return(factor(prediction, levels=c("1","0")))
}
# Apply spam_classifier to emails_full: pred_full
pred_full <- spam_classifier(emails_full$avg_capital_seq)
# Build confusion matrix for classifier on emails_full: conf_full
conf_full <- table(emails_full$spam, pred_full)
# Calculate the accuracy with conf_full: acc_full
acc_full <- sum(diag(conf_full)) / sum(conf_full)
# Print out acc_full
acc_full
## [1] 0.3438383
##HINT
# The confusion matrix is given by
## P N
## p TP FN
## n FP TN
# The accuracy is given by
## Accuracy=(TP+TN)/(TP+FN+FP+TN)
emails_small <- read.csv("emails_small.csv")
# You should change the code of the classifier, simplifying it
spam_classifier <- function(x){
prediction <- rep(NA,length(x))
prediction[x > 4] <- 1
prediction[x <= 4] <- 0
return(factor(prediction, levels=c("1","0")))
}
# conf_small and acc_small have been calculated for you
conf_small <- table(emails_small$spam, spam_classifier(emails_small$avg_capital_seq))
acc_small <- sum(diag(conf_small)) / sum(conf_small)
acc_small
## [1] 0.2307692
# Apply spam_classifier to emails_full and calculate the confusion matrix: conf_full
pred_full <- spam_classifier(emails_full$avg_capital_seq)
conf_full <- table(emails_full$spam, pred_full)
# Calculate acc_full
acc_full <- sum(diag(conf_full)) / sum(conf_full)
# Print acc_full
acc_full
## [1] 0.2740709
the official answer is [1] 0.7259291!!
Great! Your model no longer fits the small dataset perfectly but it fits the big dataset better. You increased the bias on the model and caused it to generalize better over the complete dataset. While the first classifier overfits the data, an accuracy of 73% is far from satisfying for a spam filter