Homework on Trees

1) This question uses the following ages for a set of trees: 19, 23, 30, 30, 45, 25, 24, 20. Store them in R using the syntax ages <- c(19,23,30,30,45,25,24,20).

a) Compute the standard deviation in R using the sd() function. Also compute the mean and median.

setwd("C:/Users/Manjari/Desktop/Machine learning/Home Work Solutions")
ages<-c(19,23,30,30,45,25,24,20)
ages_sd <- sd(ages)
ages_mean <- mean(ages)
ages_median <- median(ages)
cat("Standard deviation:", ages_sd, "| Mean:", ages_mean, "| Median:", ages_median)
## Standard deviation: 8.315218 | Mean: 27 | Median: 24.5

b) Compute the same value in R without the sd function.

my_sd <- function(x) {
  # Divide by n - 1 to match the sample standard deviation computed by sd()
  my_mean <- sum(x) / length(x)
  sqrt(sum((x - my_mean)^2) / (length(x) - 1))
}
cat("Standard deviation:", my_sd(ages))
## Standard deviation: 8.315218
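A quick check confirms that the hand-rolled function agrees with R's built-in sd():

all.equal(my_sd(ages), sd(ages))
## [1] TRUE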
c) Using R, how does the standard deviation from part a) change if you add 10 to all the values?

shifted_sd <- sd(ages + 10)
cat("Old SD:", ages_sd, "| Shifted SD:", shifted_sd)
## Old SD: 8.315218 | Shifted SD: 8.315218

Adding a constant to every value shifts the whole sample without changing its spread, so the standard deviation is unchanged.
d) Using R, how does the standard deviation in part a) change if you multiply all the values by 100?

scaled_sd <- sd(ages * 100)
cat("Old SD:", ages_sd, "| Scaled SD:", scaled_sd)
## Old SD: 8.315218 | Scaled SD: 831.5218

Multiplying every value by a constant scales the spread by the same factor, so the standard deviation is multiplied by 100.
e) Next add another tree of age 70 to the sample. Compute the mean and median with this tree added to the sample. How have the mean and median changed?

ages <- c(ages, 70)
new_mean <- mean(ages)
new_median <- median(ages)
difference_mean <- new_mean - ages_mean
difference_median <- new_median - ages_median
cat("Old mean:", ages_mean, "| New mean:", new_mean, "| Difference:", difference_mean, "\n")
cat("Old median:", ages_median, "| New median:", new_median, "| Difference:", difference_median)
## Old mean: 27 | New mean: 31.77778 | Difference: 4.777778 
## Old median: 24.5 | New median: 25 | Difference: 0.5

The outlier pulls the mean up by almost five years, while the median rises by only half a year: the median is far more robust to extreme values than the mean.
2) Here is the data table for question 2.


require("rpart")
## Loading required package: rpart
setwd("C:/Users/Manjari/Desktop/Machine learning/Home Work Solutions")
data<-read.table("table4_8pg199.txt",header=TRUE,sep=",")
print(data)
##   Instance    a1    a2 a3 Target
## 1        1  TRUE  TRUE  1      1
## 2        2  TRUE  TRUE  6      1
## 3        3  TRUE FALSE  5      0
## 4        4 FALSE FALSE  4      1
## 5        5 FALSE  TRUE  7      0
## 6        6 FALSE  TRUE  3      0
## 7        7 FALSE FALSE  8      0
## 8        8  TRUE FALSE  7      1
## 9        9 FALSE  TRUE  5      0

The following tree was created using rpart for the data table given above.

y <- as.factor(data$Target)
x <- data[, 2:4]
fit <- rpart(y ~ ., x, control = rpart.control(minsplit = 0, minbucket = 0, maxdepth = 5))
error_training <- 1 - sum(y == predict(fit, x, type = "class")) / length(y)
cat("Training Error:", error_training)
## Training Error: 0
print(fit)
## n= 9 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 9 4 0 (0.5555556 0.4444444)  
##    2) a1< 0.5 5 1 0 (0.8000000 0.2000000)  
##      4) a2>=0.5 3 0 0 (1.0000000 0.0000000) *
##      5) a2< 0.5 2 1 0 (0.5000000 0.5000000)  
##       10) a3>=6 1 0 0 (1.0000000 0.0000000) *
##       11) a3< 6 1 0 1 (0.0000000 1.0000000) *
##    3) a1>=0.5 4 1 1 (0.2500000 0.7500000)  
##      6) a2< 0.5 2 1 0 (0.5000000 0.5000000)  
##       12) a3< 6 1 0 0 (1.0000000 0.0000000) *
##       13) a3>=6 1 0 1 (0.0000000 1.0000000) *
##      7) a2>=0.5 2 0 1 (0.0000000 1.0000000) *
plot(fit)
text(fit)

The plot above shows the fitted decision tree; it classifies all nine training instances correctly, which matches the training error of 0.

test <- data.frame(Observation = 1:4,
                   a1 = c(TRUE, TRUE, FALSE, FALSE),
                   a2 = c(TRUE, FALSE, TRUE, FALSE),
                   a3 = c(2.5, 5.5, 2.5, 8.5))
print(test)
##   Observation    a1    a2  a3
## 1           1  TRUE  TRUE 2.5
## 2           2  TRUE FALSE 5.5
## 3           3 FALSE  TRUE 2.5
## 4           4 FALSE FALSE 8.5
result = predict(fit,test[,2:4], type="class")
print(result)
## 1 2 3 4 
## 1 0 0 0 
## Levels: 0 1

The decision tree is then used to predict the class of the unseen test observations. For example, observation 1 (a1 = TRUE, a2 = TRUE) follows the a1 >= 0.5 branch and then the a2 >= 0.5 leaf, so it is predicted as class 1; the other three observations all land in leaves that predict class 0.

3) Consider the table given in the text on page 200 of the book, exercise number five (copied below). It is a binary class problem. Would it be possible to create a model which would correctly classify this training data? If it is possible, create a tree which gives the correct answer (either + or -) for each training observation. Otherwise, give the reason that it is not possible to do so.
library(rpart)
data_3 <- data.frame(
  Observation = 1:10,
  A = c(TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, TRUE),
  B = c(FALSE, TRUE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE),
  Class.Label = as.factor(c(1, 1, 1, 0, 1, 0, 0, 0, 0, 0))
)
y <- data_3$Class.Label
x <- data_3[, 2:3]
fit <- rpart(y ~ ., x, control = rpart.control(minsplit = 0, minbucket = 0, maxdepth = 5))
error_training <- 1 - sum(y == predict(fit, x, type = "class")) / length(y)
cat("Training error:", error_training)
## Training error: 0.2
print(fit)
## n= 10 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 10 4 0 (0.6000000 0.4000000)  
##   2) B< 0.5 6 1 0 (0.8333333 0.1666667) *
##   3) B>=0.5 4 1 1 (0.2500000 0.7500000) *
plot(fit)
text(fit)

It is not possible to classify this training set perfectly. Observations 1 and 4 have identical attribute values (A = TRUE, B = FALSE) but different class labels, as do observations 2, 3 and 5 versus observation 9 (A = TRUE, B = TRUE). No tree built from A and B alone can separate such observations, so the best achievable training error is 2/10 = 0.2, which is exactly what the tree above attains.

4) The UC Irvine web site has many interesting data sets. The Sonar Data is described at the web site: http://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.names . The sonar data has been divided into a training set (sonar_train.csv) and a test set (sonar_test.csv). The file sonar_test.csv should be used as the hold-out set, while the file sonar_train.csv should be used to build the tree. Use R to compute the classification error on the test set when training on the training set for a tree of depth 5 using control=rpart.control(maxdepth=5). Remember that the 61st column is the response and the other 60 columns are the predictors. What is the error on the training set? What is the error on the test set? What is the difference in these errors? Documentation for the rpart package can be found at http://cran.r-project.org/web/packages/rpart/rpart.pdf

sonar.train <- read.csv("sonar_train.csv", header = FALSE)
sonar.test <- read.csv("sonar_test.csv", header = FALSE)

y <- as.factor(sonar.train[,61]) 
x <- sonar.train[, 1:60]

fit <- rpart(y ~ ., x, control = rpart.control(minsplit = 0, minbucket = 0,maxdepth = 3))

plot(fit)
text(fit)

print(fit)
## n= 130 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 130 64 -1 (0.50769231 0.49230769)  
##    2) V11>=0.17095 79 21 -1 (0.73417722 0.26582278)  
##      4) V27>=0.8191 37  2 -1 (0.94594595 0.05405405)  
##        8) V9>=0.0889 34  0 -1 (1.00000000 0.00000000) *
##        9) V9< 0.0889 3  1 1 (0.33333333 0.66666667) *
##      5) V27< 0.8191 42 19 -1 (0.54761905 0.45238095)  
##       10) V54>=0.02075 12  0 -1 (1.00000000 0.00000000) *
##       11) V54< 0.02075 30 11 1 (0.36666667 0.63333333) *
##    3) V11< 0.17095 51  8 1 (0.15686275 0.84313725)  
##      6) V52>=0.0209 6  1 -1 (0.83333333 0.16666667)  
##       12) V1>=0.02225 5  0 -1 (1.00000000 0.00000000) *
##       13) V1< 0.02225 1  0 1 (0.00000000 1.00000000) *
##      7) V52< 0.0209 45  3 1 (0.06666667 0.93333333)  
##       14) V19>=0.8351 5  2 -1 (0.60000000 0.40000000) *
##       15) V19< 0.8351 40  0 1 (0.00000000 1.00000000) *
plotcp(fit)

# Map the factor predictions ("-1"/"1") back to numeric -1/1
predictions <- as.numeric(as.character(predict(fit, sonar.test, type = "class")))
predictions
##  [1]  1 -1 -1  1 -1 -1  1  1  1 -1 -1  1  1  1 -1 -1 -1  1  1  1 -1  1  1
## [24]  1  1  1 -1 -1  1 -1 -1  1  1  1  1  1  1  1  1 -1 -1 -1  1 -1  1 -1
## [47]  1  1  1  1 -1  1  1  1 -1  1  1 -1  1  1  1 -1 -1  1 -1  1  1 -1  1
## [70] -1  1  1  1 -1  1 -1  1  1
actuals <- sonar.test[, 61]
error_test <- mean(predictions != actuals)
cat("Test error:", error_test)
## Test error: 0.3333333
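The question also asks for the training error and the difference between the two errors. A minimal follow-up using the same approach; the training-error value can be read off the printed tree above, whose terminal nodes misclassify 14 of the 130 training rows:

error_train <- mean(predict(fit, x, type = "class") != y)
cat("Training error:", error_train, "| Difference:", error_test - error_train)
## Training error: 0.1076923 | Difference: 0.225641

The test error is roughly three times the training error, the usual sign that the tree overfits the training sample.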
5) Check out the web page which describes a wine quality data set: http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality.names Use the Red Wine data set: winequality-red.csv. This data set contains 1599 observations of 11 attributes. The median score of the wine tasters is given in the last column. Note also that the delimiter used in this file is a semicolon and not a comma. Use rpart on this data to create trees for a range of different tree depths. Use cross validation to generate training error and test error. Plot these errors as a function of tree depth. Which tree depth results in the best test error? What is that test error? Hint: look at the cross validation example given in the lecture. Are you considering the tasters' score as a class, an ordered factor, or a numeric? How did you calculate the error? Which attribute is at the root node? Make a scatter plot of the wine quality score vs. this root node attribute. What is the correlation between each of the eleven attributes and the wine quality (hint: cor(wineData[, 1:11], wineData$quality))? Which attribute has the highest (in absolute value) correlation with the wine quality? Make a scatter plot of the wine quality score vs. this attribute.
data_5 <- read.table("winequality-red.csv", header = TRUE, sep = ";")
num_sample <- nrow(data_5)
# Shuffle the rows before splitting into k folds
data_5_mixed <- data_5[sample(num_sample), ]

k <- 3
# Pre-allocate the training block (k - 1 folds of the data)
wine_train <- data_5_mixed[1:(num_sample * (k - 1) / k), ]

max_depth <- 10

error_training_total <- rep(0, times = max_depth)
error_cross_total <- rep(0, times = max_depth)
best_error <- Inf  # tracks the lowest cross-validation error across depths

for(idepth in 1:max_depth){
  pick = k
  
  for(j in 1:k){
    
    i_tmp = 1
    for(i in 1:k){
      
      if(i == pick){
        wine_cross = data_5_mixed[((i-1)*num_sample/k+1):(num_sample*(i/k)),]
      } else {
        wine_train[((i_tmp-1)*num_sample/k+1):(num_sample*(i_tmp/k)), ] = data_5_mixed[((i-1)*num_sample/k+1):(num_sample*(i/k)),]
        i_tmp = i_tmp + 1
      }
    }
    pick = pick - 1
    
    y_wine_train = wine_train[,12]
    x_wine_train = wine_train[,1:11]
    
    fit<-rpart(y_wine_train~., x_wine_train, control=rpart.control(maxdepth=idepth))
    error_training = sqrt(sum((y_wine_train-predict(fit,x_wine_train))^2)/length(y_wine_train))
    error_training_total[idepth] = error_training_total[idepth] + error_training
    
    y_wine_cross = wine_cross[,12]
    x_wine_cross = wine_cross[,1:11]
    
    error_cross = sqrt(sum((y_wine_cross-predict(fit,x_wine_cross))^2)/length(y_wine_cross))
    error_cross_total[idepth] = error_cross_total[idepth] + error_cross
    
  }
  
  error_training_total[idepth] = error_training_total[idepth]/k
  error_cross_total[idepth] = error_cross_total[idepth]/k
  
  # Keep the tree from the depth with the lowest cross-validation error so far
  # (fit is the model from the last fold at this depth)
  if (error_cross_total[idepth] < best_error) {
    best_error <- error_cross_total[idepth]
    fit_save <- fit
    min_depth <- idepth
  }
  
}

plot(1:max_depth, error_training_total, type = "b", col = "blue",
     xlab = "Tree depth", ylab = "RMSE",
     ylim = range(c(error_training_total, error_cross_total)))
points(1:max_depth, error_cross_total, col = "red", type = "b")
legend("topright", legend = c("Training error", "CV error"),
       col = c("blue", "red"), lty = 1)
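The remaining parts of the question (best depth, root-node attribute, correlations, and scatter plots) can be read off from the objects built above. A minimal sketch, assuming fit_save and min_depth were stored by the loop; no output is shown because the exact numbers depend on the random fold assignment:

cat("Best depth:", min_depth, "| CV RMSE:", error_cross_total[min_depth], "\n")

# Attribute used at the root of the best tree
root_attr <- as.character(fit_save$frame$var[1])
plot(data_5[[root_attr]], data_5$quality, xlab = root_attr, ylab = "quality")

# Correlation of each attribute with quality; pick the strongest in absolute value
correlations <- cor(data_5[, 1:11], data_5$quality)
print(correlations)
best_attr <- rownames(correlations)[which.max(abs(correlations))]
plot(data_5[[best_attr]], data_5$quality, xlab = best_attr, ylab = "quality")

Note that the taster score is treated throughout as a numeric response, so rpart fits regression trees and the error reported is the root-mean-square error.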