1) This question uses the following ages for a set of trees: 19, 23, 30, 30, 45, 25, 24, 20.
a) Compute the standard deviation in R using the sd() function. Also compute the mean and median.
b) Compute the same value in R without the sd() function.
c) Using R, how does the standard deviation from part a) change if you add 10 to all the values?
ages<-c(19,23,30,30,45,25,24,20)
sd(ages)
## [1] 8.315
mean(ages)
## [1] 27
median(ages)
## [1] 24.5
# Sample standard deviation computed manually: square root of the sum of
# squared deviations from the mean, divided by (n - 1)
std_dev <- function(x) {
  x <- as.numeric(x)
  mean.x <- mean(x)
  sumofsqmean <- 0
  for (i in x) {
    sumofsqmean <- sumofsqmean + (i - mean.x)^2
  }
  sqrt(sumofsqmean / (length(x) - 1))
}
std_dev(ages)
## [1] 8.315
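The loop can equally be written as a vectorized one-liner; this is just a sketch of the same (n - 1) sample formula applied to ages:
# vectorized form of the sample standard deviation; identical to sd(ages) above
sqrt(sum((ages - mean(ages))^2) / (length(ages) - 1))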
age_10<-ages+10
age_10
## [1] 29 33 40 40 55 35 34 30
sd(age_10)
## [1] 8.315
There is no change to the standard deviation: adding a constant offsets every observation equally, so each deviation from the (equally shifted) mean, and hence the standard deviation, is unchanged.
age_100 <-ages*100
age_100
## [1] 1900 2300 3000 3000 4500 2500 2400 2000
sd(age_100)
## [1] 831.5
The standard deviation scales in proportion to the multiplier: sd(c * x) = |c| * sd(x), so multiplying every age by 100 multiplies the standard deviation by 100 (8.315 to 831.5).
ages70 <- c(ages, 70)
mean(ages)
## [1] 27
mean(ages70)
## [1] 31.78
median(ages)
## [1] 24.5
median(ages70)
## [1] 25
Both the mean and the median increase, though neither is guaranteed to. The median might not change if 24.5 were a very frequent value in the middle of the data, and the mean would not change if the appended value happened to equal the existing mean. Note that the single outlier 70 moves the mean from 27 to 31.78 but the median only from 24.5 to 25; the median is far more robust to outliers.
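As a quick check of that last point, appending a value equal to the current mean leaves the mean where it was:
mean(c(ages, mean(ages)))  # still 27, since the appended value is the mean itself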
library(rpart)
train <- read.csv("/home/archana/ML works_ucsc/textdata.csv", header = TRUE)
test <- read.csv("/home/archana/ML works_ucsc/test.csv", header = TRUE)
y <- as.factor(train[, 5])  # class labels 0 or 1
x <- train[, 2:4]           # attribute columns
fit <- rpart(y ~ ., x, control = rpart.control(minsplit = 0, minbucket = 0,
                                               maxdepth = 5))
plot(fit)
text(fit)
predictions <- predict(fit, test, type = "class")
predictions
## 1 2 3 4
## 1 0 0 0
## Levels: 0 1
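If test.csv also carries the true class labels (an assumption here; the file layout is not shown), the predictions could be checked against them with a confusion table:
# hypothetical check, assuming the test labels sit in column 5 as in the training file
truth <- as.factor(test[, 5])
table(predicted = predictions, actual = truth)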
Consider the table given in the text on page 200 of the book, exercise number five (copied below). It is a binary class problem. Would it be possible to create a model which would correctly classify this training data? If it is possible, create a tree which gives the correct answer (either + or -) for each training observation. Otherwise, give the reason that it is not possible to do so.
It is not possible to reach a training error of 0: several observations share identical attribute values but carry different class labels (for example, A = TRUE, B = TRUE occurs with both class 1 and class 0), so no tree built on A and B alone can separate them; the tabulation after the tree below verifies this. The fitted tree therefore underfits, with a training error of 0.2.
library(rpart)
data_3 <- data.frame(Observation = 1:10,
                     A = c(TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, TRUE),
                     B = c(FALSE, TRUE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE),
                     "Class Label" = as.factor(c(1, 1, 1, 0, 1, 0, 0, 0, 0, 0)))
y <- data_3$Class.Label
x <- data_3[, 2:3]
fit <- rpart(y ~ ., x, control = rpart.control(minsplit = 0, minbucket = 0, maxdepth = 5))
error_training = 1-sum(y==predict(fit,x,type="class"))/length(y)
cat("Training error:", error_training)
## Training error: 0.2
print(fit)
## n= 10
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 10 4 0 (0.6000 0.4000)
## 2) B< 0.5 6 1 0 (0.8333 0.1667) *
## 3) B>=0.5 4 1 1 (0.2500 0.7500) *
plot(fit)
text(fit)
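The impossibility can be verified directly by tabulating the class labels within each (A, B) cell; any row of the table with counts under both labels cannot be separated by a tree on A and B:
# label counts per (A, B) combination; mixed rows are the conflicting observations
with(data_3, table(interaction(A, B), Class.Label))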
4 - Train & Test with Sonar Data
library(rpart)
sonar.train <- read.csv("/home/archana/ML works_ucsc/sonar_train.csv", header = FALSE)
sonar.test <- read.csv("/home/archana/ML works_ucsc/sonar_test.csv", header = FALSE)
y <- as.factor(sonar.train[, 61])  # class labels -1 or 1
x <- sonar.train[, 1:60]
fit <- rpart(y ~ ., x, control = rpart.control(minsplit = 0, minbucket = 0,
                                               maxdepth = 3))
plot(fit)
text(fit)
print(fit)
## n= 130
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 130 64 -1 (0.50769 0.49231)
## 2) V11>=0.1709 79 21 -1 (0.73418 0.26582)
## 4) V27>=0.8191 37 2 -1 (0.94595 0.05405)
## 8) V9>=0.0889 34 0 -1 (1.00000 0.00000) *
## 9) V9< 0.0889 3 1 1 (0.33333 0.66667) *
## 5) V27< 0.8191 42 19 -1 (0.54762 0.45238)
## 10) V54>=0.02075 12 0 -1 (1.00000 0.00000) *
## 11) V54< 0.02075 30 11 1 (0.36667 0.63333) *
## 3) V11< 0.1709 51 8 1 (0.15686 0.84314)
## 6) V52>=0.0209 6 1 -1 (0.83333 0.16667)
## 12) V1>=0.02225 5 0 -1 (1.00000 0.00000) *
## 13) V1< 0.02225 1 0 1 (0.00000 1.00000) *
## 7) V52< 0.0209 45 3 1 (0.06667 0.93333)
## 14) V19>=0.8351 5 2 -1 (0.60000 0.40000) *
## 15) V19< 0.8351 40 0 1 (0.00000 1.00000) *
plotcp(fit)
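For comparison with the test accuracy below, the training error of this tree can be computed the same way as in the previous question (a sketch reusing the objects already defined; the terminal-node losses above sum to 14 of 130):
# fraction of training observations the depth-3 tree misclassifies
1 - sum(y == predict(fit, x, type = "class")) / length(y)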
# predict() returns a factor with levels "-1" and "1"; as.numeric() yields the
# level indices 1 and 2, so map them back to the original -1/1 labels
predictions <- as.numeric(predict(fit, sonar.test, type = "class"))
predictions <- replace(predictions, predictions == 1, -1)
predictions <- replace(predictions, predictions == 2, 1)
predictions
## [1] 1 -1 -1 1 -1 -1 1 1 1 -1 -1 1 1 1 -1 -1 -1 1 1 1 -1 1 1
## [24] 1 1 1 -1 -1 1 -1 -1 1 1 1 1 1 1 1 1 -1 -1 -1 1 -1 1 -1
## [47] 1 1 1 1 -1 1 1 1 -1 1 1 -1 1 1 1 -1 -1 1 -1 1 1 -1 1
## [70] -1 1 1 1 -1 1 -1 1 1
actuals <- sonar.test[, 61]
compare <- predictions == actuals
mean(compare)  # fraction of test observations classified correctly
## [1] 0.6667
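A confusion table gives a fuller picture than the accuracy alone; both vectors are already defined above:
# cross-tabulate predicted vs. actual labels on the sonar test set
table(predicted = predictions, actual = actuals)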
The test accuracy is 0.667, i.e. a test error of about 0.33. The training error implied by the tree above is much lower (14 misclassified of 130, roughly 0.11), so the depth-3 tree fits the training data considerably better than it generalizes.
5 - Red Wine data set
http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality.names Use the Red Wine data set: winequality-red.csv. This data set contains 1599 observations of 11 attributes. The median score of the wine tasters is given in the last column. Note also that the delimiter used in this file is a semicolon and not a comma. Use rpart on this data to create trees for a range of different tree depths. Use cross validation to generate training error and test error. Plot these errors as a function of tree depth. Which tree depth results in the best test error?
data_5 <- read.table("/home/archana/ML works_ucsc/winequality-red.txt",
                     header = TRUE, sep = ";")
num_sample <- nrow(data_5)
# shuffle the rows once, then carve out k contiguous folds
data_5_mixed <- data_5[sample(num_sample, num_sample, replace = FALSE), ]
k <- 3
# template data frame for the training folds; its rows are overwritten in place each fold
wine_train <- data_5_mixed[1:(num_sample * (2/3)), ]
max_depth <- 10
error_training_total <- rep(0, times = max_depth)
error_cross_total <- rep(0, times = max_depth)
for (idepth in 1:max_depth) {
  pick <- k  # index of the fold held out for cross validation
  for (j in 1:k) {
    i_tmp <- 1
    for (i in 1:k) {
      if (i == pick) {
        wine_cross <- data_5_mixed[((i-1)*num_sample/k + 1):(num_sample*(i/k)), ]
      } else {
        wine_train[((i_tmp-1)*num_sample/k + 1):(num_sample*(i_tmp/k)), ] <-
          data_5_mixed[((i-1)*num_sample/k + 1):(num_sample*(i/k)), ]
        i_tmp <- i_tmp + 1
      }
    }
    pick <- pick - 1
    y_wine_train <- wine_train[, 12]   # quality score (numeric, so rpart fits a regression tree)
    x_wine_train <- wine_train[, 1:11]
    fit <- rpart(y_wine_train ~ ., x_wine_train,
                 control = rpart.control(maxdepth = idepth))
    # RMSE on the training folds
    error_training <- sqrt(sum((y_wine_train - predict(fit, x_wine_train))^2) / length(y_wine_train))
    error_training_total[idepth] <- error_training_total[idepth] + error_training
    y_wine_cross <- wine_cross[, 12]
    x_wine_cross <- wine_cross[, 1:11]
    # RMSE on the held-out fold
    error_cross <- sqrt(sum((y_wine_cross - predict(fit, x_wine_cross))^2) / length(y_wine_cross))
    error_cross_total[idepth] <- error_cross_total[idepth] + error_cross
  }
  error_training_total[idepth] <- error_training_total[idepth] / k
  error_cross_total[idepth] <- error_cross_total[idepth] / k
  # track the depth with the lowest cross-validation error seen so far
  if (idepth == 1 || error_cross_total[idepth] < error_cross_total[min_depth]) {
    fit_save <- fit
    min_depth <- idepth
  }
  if (idepth == 5) {  # keep the depth-5 fit, which turns out to be the best depth
    opt_fit <- fit
  }
}
plot(1:max_depth, error_training_total)
points(1:max_depth, error_cross_total, col='red')
cat(error_cross_total, "\n", min_depth, "th cross validation error has the best value.")
## 0.743 0.7176 0.6874 0.6717 0.6704 0.6704 0.6704 0.6704 0.6704 0.6704
## 5 th cross validation error has the best value.
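The plateau beyond depth 5 is expected: rpart's default complexity parameter (cp = 0.01) stops splitting before the maxdepth limit binds, so requesting deeper trees changes nothing. A sketch of how deeper trees could actually be grown, if one wanted to explore past depth 5 (not part of the run above):
# cp = 0 disables the complexity-based stopping rule, so maxdepth becomes the binding limit
fit_deep <- rpart(y_wine_train ~ ., x_wine_train,
                  control = rpart.control(maxdepth = 10, cp = 0))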
print(opt_fit)
## n= 1066
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 1066 686.10 5.616
## 2) alcohol< 10.53 660 283.50 5.347
## 4) volatile.acidity>=0.4175 509 193.30 5.250
## 8) volatile.acidity>=0.6525 151 47.58 5.053 *
## 9) volatile.acidity< 0.6525 358 137.40 5.332
## 18) alcohol< 9.95 259 78.63 5.236 *
## 19) alcohol>=9.95 99 50.02 5.586 *
## 5) volatile.acidity< 0.4175 151 69.10 5.675
## 10) sulphates< 0.665 78 18.68 5.397 *
## 11) sulphates>=0.665 73 37.95 5.973 *
## 3) alcohol>=10.53 406 276.80 6.054
## 6) volatile.acidity>=0.87 19 16.74 4.526 *
## 7) volatile.acidity< 0.87 387 213.50 6.129
## 14) sulphates< 0.615 136 66.24 5.794
## 28) volatile.acidity>=0.385 98 40.20 5.653
## 56) free.sulfur.dioxide< 8.5 27 14.67 5.222 *
## 57) free.sulfur.dioxide>=8.5 71 18.62 5.817 *
## 29) volatile.acidity< 0.385 38 19.05 6.158 *
## 15) sulphates>=0.615 251 123.80 6.311
## 30) alcohol< 11.55 148 59.03 6.081 *
## 31) alcohol>=11.55 103 45.71 6.641 *
plot(opt_fit)
text(opt_fit)
plot_frame <- data.frame(data_5_mixed$alcohol, data_5_mixed$quality)
plot(plot_frame)
cor_wine=abs(cor(data_5_mixed[, 1:11], data_5_mixed$quality))
cor_wine
## [,1]
## fixed.acidity 0.12405
## volatile.acidity 0.39056
## citric.acid 0.22637
## residual.sugar 0.01373
## chlorides 0.12891
## free.sulfur.dioxide 0.05066
## total.sulfur.dioxide 0.18510
## density 0.17492
## pH 0.05773
## sulphates 0.25140
## alcohol 0.47617
plot(cor_wine[,1],xaxt="n")
axis(1, at=1:11, labels=row.names(cor_wine))
plot_frame2 <- data.frame(data_5_mixed$volatile.acidity, data_5_mixed$quality)
plot(plot_frame2)
Alcohol (|r| = 0.476) and volatile acidity (|r| = 0.391) have the strongest absolute correlations with quality, consistent with the tree above splitting on those two variables first.