Homework on Trees

1) This question uses the following ages for a set of trees: 19, 23, 30, 30, 45, 25, 24, 20. Store them in R using the syntax ages <- c(19,23,30,30,45,25,24,20).
a) Compute the standard deviation in R using the sd() function. Also compute the mean and median.
setwd("C:/Users/Manjari/Desktop/Machine learning/Home Work Solutions")
ages<-c(19,23,30,30,45,25,24,20)
ages_sd=sd(ages)
ages_mean=mean(ages)
ages_median=median(ages)
cat("Standard deviation:",ages_sd,"|Mean",ages_mean,"|Median",ages_median)
## Standard deviation: 8.315218 |Mean 27 |Median 24.5
b) Compute the same value in R without the sd() function.
my_sd <- function(x){
  my_mean <- sum(x)/length(x)
  # sd() returns the sample standard deviation, so divide by n-1 rather than n
  sqrt(sum((x - my_mean)^2)/(length(x) - 1))
}
cat("Standard deviation:", my_sd(ages))
## Standard deviation: 8.315218
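For contrast, dividing by n instead of n-1 gives the population standard deviation, which is smaller by a factor of sqrt((n-1)/n); a quick check:
n <- length(ages)
# dividing by n (population SD) instead of n-1 (sample SD, which is what sd() computes)
pop_sd <- sqrt(sum((ages - mean(ages))^2)/n)
cat("Population SD:", pop_sd, "| Sample SD:", sd(ages))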
Adding a constant to every age shifts the values but leaves the spread unchanged:
shifted_sd <- sd(ages + 10)
cat("Old SD:", ages_sd, "| Shifted SD:", shifted_sd)
## Old SD: 8.315218 | Shifted SD: 8.315218
Multiplying every age by a constant scales the standard deviation by the same factor:
multiple_sd <- sd(ages * 100)
cat("Old SD:", ages_sd, "| Scaled SD:", multiple_sd)
## Old SD: 8.315218 | Scaled SD: 831.5218
Appending a 70-year-old tree shows how a single outlier affects the mean and the median:
ages[length(ages) + 1] <- 70
Nmedian <- median(ages)
Nmean <- mean(ages)
difference_mean <- Nmean - ages_mean
difference_median <- Nmedian - ages_median
cat("Old mean:", ages_mean, "| New mean:", Nmean, "| Difference:", difference_mean, "\n")
cat("Old median:", ages_median, "| New median:", Nmedian, "| Difference:", difference_median)
## Old mean: 27 | New mean: 31.77778 | Difference: 4.777778
## Old median: 24.5 | New median: 25 | Difference: 0.5
The outlier pulls the mean up by almost 4.8 years but moves the median by only 0.5, illustrating that the median is far more robust to outliers than the mean.
2) The following data table (read from table4_8pg199.txt) is used to build a decision tree:
require("rpart")
## Loading required package: rpart
setwd("C:/Users/Manjari/Desktop/Machine learning/Home Work Solutions")
data<-read.table("table4_8pg199.txt",header=TRUE,sep=",")
print(data)
## Instance a1 a2 a3 Target
## 1 1 TRUE TRUE 1 1
## 2 2 TRUE TRUE 6 1
## 3 3 TRUE FALSE 5 0
## 4 4 FALSE FALSE 4 1
## 5 5 FALSE TRUE 7 0
## 6 6 FALSE TRUE 3 0
## 7 7 FALSE FALSE 8 0
## 8 8 TRUE FALSE 7 1
## 9 9 FALSE TRUE 5 0
The following tree was created using rpart for the data table given above.
y=as.factor(data$Target)
x=data[,2:4]
fit<- rpart(y~.,x,control=rpart.control(minsplit = 0, minbucket = 0, maxdepth = 5))
error_training = 1-sum(y==predict(fit,x, type="class"))/length(y)
cat("Training Error:",error_training)
## Training Error: 0
print(fit)
## n= 9
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 9 4 0 (0.5555556 0.4444444)
## 2) a1< 0.5 5 1 0 (0.8000000 0.2000000)
## 4) a2>=0.5 3 0 0 (1.0000000 0.0000000) *
## 5) a2< 0.5 2 1 0 (0.5000000 0.5000000)
## 10) a3>=6 1 0 0 (1.0000000 0.0000000) *
## 11) a3< 6 1 0 1 (0.0000000 1.0000000) *
## 3) a1>=0.5 4 1 1 (0.2500000 0.7500000)
## 6) a2< 0.5 2 1 0 (0.5000000 0.5000000)
## 12) a3< 6 1 0 0 (1.0000000 0.0000000) *
## 13) a3>=6 1 0 1 (0.0000000 1.0000000) *
## 7) a2>=0.5 2 0 1 (0.0000000 1.0000000) *
plot(fit)
text(fit)
The plot above shows the fitted decision tree; it classifies all nine training instances correctly, consistent with the zero training error. The tree can now be applied to unseen test cases:
test<- data.frame(Observation = 1:4, a1 = c(TRUE,TRUE,FALSE,FALSE), a2 = c(TRUE,FALSE,TRUE,FALSE), a3 = c(2.5,5.5,2.5,8.5))
print(test)
## Observation a1 a2 a3
## 1 1 TRUE TRUE 2.5
## 2 2 TRUE FALSE 5.5
## 3 3 FALSE TRUE 2.5
## 4 4 FALSE FALSE 8.5
result = predict(fit,test[,2:4], type="class")
print(result)
## 1 2 3 4
## 1 0 0 0
## Levels: 0 1
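To see the leaf probabilities behind these predictions, predict() can also be asked for per-class probabilities (a quick check on the same fitted tree):
# probability of class 0 and class 1 at the leaf each test case falls into
predict(fit, test[,2:4], type = "prob")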
The decision tree is thus used to predict the class of unseen data; only the first test case is predicted as class 1.
3) The next question builds a decision tree for a second small data table:
library(rpart)
data_3 <- data.frame(
  Observation = 1:10,
  A = c(TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, TRUE),
  B = c(FALSE, TRUE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE),
  "Class Label" = as.factor(c(1, 1, 1, 0, 1, 0, 0, 0, 0, 0))
)
y = data_3$Class.Label
x = data_3[,2:3]
fit<-rpart(y~., x, control=rpart.control(minsplit=0,minbucket=0,maxdepth=5))
error_training = 1-sum(y==predict(fit,x,type="class"))/length(y)
cat("Training error:", error_training)
## Training error: 0.2
print(fit)
## n= 10
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 10 4 0 (0.6000000 0.4000000)
## 2) B< 0.5 6 1 0 (0.8333333 0.1666667) *
## 3) B>=0.5 4 1 1 (0.2500000 0.7500000) *
plot(fit)
text(fit)
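The 0.2 training error corresponds to two misclassified rows; a confusion table makes the breakdown explicit:
# rows are the true classes, columns the predicted classes
table(actual = y, predicted = predict(fit, x, type = "class"))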
4) The UC Irvine web site has many interesting data sets. The Sonar Data is described at http://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.names. The sonar data has been divided into a training set (sonar_train.csv) and a test set (sonar_test.csv). The file sonar_test.csv should be used as the hold-out set, while sonar_train.csv should be used to build the tree. Use R to compute the classification error on the test set when training on the training set for a tree of depth 5 using control=rpart.control(maxdepth=5). Remember that the 61st column is the response and the other 60 columns are the predictors. What is the error on the training set? What is the error on the test set? What is the difference between these errors? Documentation for the rpart package can be found at http://cran.r-project.org/web/packages/rpart/rpart.pdf.
sonar.train <- read.csv("sonar_train.csv", header = FALSE)
sonar.test <- read.csv("sonar_test.csv", header = FALSE)
y <- as.factor(sonar.train[,61])
x <- sonar.train[, 1:60]
# note: the depth is capped at 3 here rather than the 5 the question specifies
fit <- rpart(y ~ ., x, control = rpart.control(minsplit = 0, minbucket = 0, maxdepth = 3))
plot(fit)
text(fit)
print(fit)
## n= 130
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 130 64 -1 (0.50769231 0.49230769)
## 2) V11>=0.17095 79 21 -1 (0.73417722 0.26582278)
## 4) V27>=0.8191 37 2 -1 (0.94594595 0.05405405)
## 8) V9>=0.0889 34 0 -1 (1.00000000 0.00000000) *
## 9) V9< 0.0889 3 1 1 (0.33333333 0.66666667) *
## 5) V27< 0.8191 42 19 -1 (0.54761905 0.45238095)
## 10) V54>=0.02075 12 0 -1 (1.00000000 0.00000000) *
## 11) V54< 0.02075 30 11 1 (0.36666667 0.63333333) *
## 3) V11< 0.17095 51 8 1 (0.15686275 0.84313725)
## 6) V52>=0.0209 6 1 -1 (0.83333333 0.16666667)
## 12) V1>=0.02225 5 0 -1 (1.00000000 0.00000000) *
## 13) V1< 0.02225 1 0 1 (0.00000000 1.00000000) *
## 7) V52< 0.0209 45 3 1 (0.06666667 0.93333333)
## 14) V19>=0.8351 5 2 -1 (0.60000000 0.40000000) *
## 15) V19< 0.8351 40 0 1 (0.00000000 1.00000000) *
plotcp(fit)
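printcp() prints the same complexity-parameter table that plotcp() visualizes, which is useful when choosing a pruning level:
# text version of the cp table plotted above
printcp(fit)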
# predict() returns a factor with levels -1 and 1; as.numeric() yields the level codes 1 and 2
predictions <- as.numeric(predict(fit, sonar.test, type = "class"))
# map the level codes back to the original -1/1 labels
predictions <- replace(predictions, predictions == 1, -1)
predictions <- replace(predictions, predictions == 2, 1)
predictions
## [1] 1 -1 -1 1 -1 -1 1 1 1 -1 -1 1 1 1 -1 -1 -1 1 1 1 -1 1 1
## [24] 1 1 1 -1 -1 1 -1 -1 1 1 1 1 1 1 1 1 -1 -1 -1 1 -1 1 -1
## [47] 1 1 1 1 -1 1 1 1 -1 1 1 -1 1 1 1 -1 -1 1 -1 1 1 -1 1
## [70] -1 1 1 1 -1 1 -1 1 1
actuals <- sonar.test[, 61]
compare <- predictions == actuals
# fraction of hold-out cases classified correctly (accuracy, not error)
length(compare[compare == TRUE])/length(compare)
## [1] 0.6666667
About 66.7% of the hold-out cases are classified correctly, so the test error is 1 - 0.6667 = 0.3333.
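The question also asks for the training error and the gap between the two errors; a minimal sketch, reusing the fitted tree and the comparison above:
# training error: fraction of training rows the tree misclassifies
train_pred <- predict(fit, x, type = "class")
error_train <- 1 - sum(as.character(train_pred) == as.character(y))/length(y)
# test error: one minus the accuracy computed above
error_test <- 1 - length(compare[compare == TRUE])/length(compare)
cat("Training error:", error_train, "| Test error:", error_test, "| Difference:", error_test - error_train)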
5) The red wine quality data (winequality-red.txt) is used to choose a regression-tree depth by 3-fold cross-validation:
data_5 <- read.table("winequality-red.txt", header=TRUE, sep=";")
num_sample = nrow(data_5)
# shuffle the rows so that the k folds are random
data_5_mixed = data_5[sample(num_sample, num_sample, replace=FALSE),]
k = 3
# pre-allocate the training block with (k-1)/k of the rows; its contents are overwritten each fold
wine_train = data_5_mixed[1:(num_sample*(2/3)),]
max_depth = 10
error_training_total = rep(0, times = max_depth)
error_cross_total = rep(0, times = max_depth)
for(idepth in 1:max_depth){
  pick = k
  for(j in 1:k){
    # assemble the training block from the k-1 folds that are not held out
    i_tmp = 1
    for(i in 1:k){
      if(i == pick){
        wine_cross = data_5_mixed[((i-1)*num_sample/k+1):(num_sample*(i/k)),]
      } else {
        wine_train[((i_tmp-1)*num_sample/k+1):(num_sample*(i_tmp/k)),] = data_5_mixed[((i-1)*num_sample/k+1):(num_sample*(i/k)),]
        i_tmp = i_tmp + 1
      }
    }
    pick = pick - 1
    y_wine_train = wine_train[,12]
    x_wine_train = wine_train[,1:11]
    fit <- rpart(y_wine_train~., x_wine_train, control=rpart.control(maxdepth=idepth))
    # RMSE on the training folds
    error_training = sqrt(sum((y_wine_train-predict(fit,x_wine_train))^2)/length(y_wine_train))
    error_training_total[idepth] = error_training_total[idepth] + error_training
    # RMSE on the held-out fold
    y_wine_cross = wine_cross[,12]
    x_wine_cross = wine_cross[,1:11]
    error_cross = sqrt(sum((y_wine_cross-predict(fit,x_wine_cross))^2)/length(y_wine_cross))
    error_cross_total[idepth] = error_cross_total[idepth] + error_cross
  }
  # average the k fold errors for this depth
  error_training_total[idepth] = error_training_total[idepth]/k
  error_cross_total[idepth] = error_cross_total[idepth]/k
  # keep the fit from the depth with the lowest cross-validation error seen so far
  if(idepth == 1 || error_cross_total[idepth] < error_cross_total[min_depth]){
    fit_save = fit
    min_depth = idepth
  }
  # also keep the depth-5 fit separately
  if(idepth == 5){
    opt_fit = fit
  }
}
# training RMSE (black) and cross-validation RMSE (red) as a function of tree depth
plot(1:max_depth, error_training_total, xlab = "maxdepth", ylab = "RMSE")
points(1:max_depth, error_cross_total, col = 'red')
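To read off the depth with the lowest averaged cross-validation error, one can do, for example:
best_depth <- which.min(error_cross_total)
cat("Depth with lowest CV error:", best_depth, "| CV RMSE:", error_cross_total[best_depth])
This should agree with the min_depth tracked inside the loop, whose fit was saved in fit_save.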