par(mar=rep(2, 4)) # make the margins smaller for RStudio
## Modified ridge regression for HW3, Problem 4
setwd("C:/Users/Manjari/Desktop/Machine learning/Home Work Solutions")
wine_data <- read.table("winequality-red.txt", header=TRUE, sep=";")
wine_train = wine_data[1:1400,]
wine_test = wine_data[1401:1599,]
wine_train_cv = wine_train
# Root mean square error between predictions X and targets Y
rmse <- function(X, Y)
{
  return( sqrt(sum((X - Y)^2)/length(Y)) )
}
library(MASS)
# Grid of 30 ridge penalties, log-spaced from 10^3 down to 10^-2.8
xlambda <- rep(0, times = 30)
for(i in seq(from = 0, to = 29))
{
  expo <- 3 - 4*(i/20)     # exponent decreases in steps of 0.2 (renamed to avoid shadowing base::exp)
  xlambda[i+1] <- 10^expo
}
# k-fold cross validation.
#   df_train_cv : data frame with the response in the last column
#   k           : number of folds
#   args        : index into the hyper-parameter grid (xlambda or xnn)
#   method      : "linear ridge" or "k nearest neighbors"
#   err_type    : "rmse" or "class" (only used for k nearest neighbors)
# Returns c(average training error, average CV error) over the k folds.
cross_valid <- function(df_train_cv, k, args, method, err_type="rmse")
{
  error_train <- 0
  error_cv <- 0
  num_sample <- nrow(df_train_cv)
  # pre-allocate the training and validation folds
  df_train <- df_train_cv[1:(num_sample*((k-1)/k)),]
  df_cv <- df_train_cv[1:(num_sample*(1/k)),]
  pick <- k   # index of the fold held out for validation
  for(j in 1:k){
    # assemble the training folds and the held-out fold
    i_tmp <- 1
    for(i in 1:k){
      if(i == pick){
        df_cv <- df_train_cv[((i-1)*num_sample/k+1):(num_sample*(i/k)),]
      } else {
        df_train[((i_tmp-1)*num_sample/k+1):(num_sample*(i_tmp/k)), ] <-
          df_train_cv[((i-1)*num_sample/k+1):(num_sample*(i/k)),]
        i_tmp <- i_tmp + 1
      }
    }
    pick <- pick - 1
    # split each fold into predictors (x) and response (y, last column)
    y_df_train <- df_train[,ncol(df_train)]
    x_df_train <- df_train[,-ncol(df_train)]
    yx_df_train <- cbind(x_df_train, y_df_train)
    y_df_cv <- df_cv[,ncol(df_cv)]
    x_df_cv <- df_cv[,-ncol(df_cv)]
    yx_df_cv <- cbind(x_df_cv, y_df_cv)
    if(method == "linear ridge"){
      fit <- lm.ridge(y_df_train~., yx_df_train, lambda=xlambda[args])
      # back-transform the coefficients to the original predictor scale
      A <- as.array(fit$coef[1:(ncol(df_train)-1)]/fit$scales)
      X <- as.matrix(x_df_train)
      for(i in seq(from = 1, to = ncol(x_df_train))){
        X[,i] <- X[,i] - fit$xm[i]   # center with the training means
      }
      yh <- X %*% A + fit$ym
      error_train <- error_train + rmse(round(yh), y_df_train)/k
      X <- as.matrix(x_df_cv)
      for(i in seq(from = 1, to = ncol(x_df_cv))){
        X[,i] <- X[,i] - fit$xm[i]
      }
      yh <- X %*% A + fit$ym
      error_cv <- error_cv + rmse(round(yh), y_df_cv)/k
    }
    if(method == "k nearest neighbors"){
      y_df_train <- as.factor(y_df_train)
      x_df_train <- scale(x_df_train)
      # training error is resubstitution error: each point can be its own nearest neighbor
      KNN <- knn(x_df_train, x_df_train, y_df_train, k=xnn[args])
      if(err_type == "rmse"){
        error_train <- error_train +
          rmse(as.numeric(as.character(y_df_train)), as.numeric(as.character(KNN)))/k
      }
      if(err_type == "class"){
        error_train <- error_train + (1 - sum(y_df_train == KNN)/length(KNN))/k
      }
      x_df_cv <- scale(x_df_cv)   # note: the CV fold is scaled with its own mean/sd
      KNN <- knn(x_df_train, x_df_cv, y_df_train, k=xnn[args])
      if(err_type == "rmse"){
        error_cv <- error_cv + rmse(y_df_cv, as.numeric(as.character(KNN)))/k
      }
      if(err_type == "class"){
        error_cv <- error_cv + (1 - sum(y_df_cv == KNN)/length(KNN))/k
      }
    }
  }
  return(c(error_train, error_cv))
}
k <- 5
wine_train_err <- NULL
wine_cv_err <- NULL
for(ilambda in 1:length(xlambda)){
  wine_err <- cross_valid(wine_train_cv, k, ilambda, method="linear ridge")
  wine_train_err[ilambda] <- wine_err[1]
  wine_cv_err[ilambda] <- wine_err[2]
}
wine_train_err_lr <- wine_train_err
wine_cv_err_lr <- wine_cv_err
min_lambda_id <- min(which(min(wine_cv_err_lr) == wine_cv_err_lr))
min_wine_lambda <- xlambda[min_lambda_id]
sprintf("%dth lambda %f is optimal.", min_lambda_id, min_wine_lambda)
## [1] "8th lambda 39.810717 is optimal."
## [1] "8th lambda 39.810717 is optimal."
plot(1:length(xlambda),wine_train_err_lr,ylim=c(min(wine_train_err_lr, wine_cv_err_lr),max(wine_train_err_lr, wine_cv_err_lr)))
points(1:length(xlambda),wine_cv_err_lr, col='red')
points(min_lambda_id, wine_cv_err_lr[min_lambda_id],pch = 19, col="red")
library(class)
wine_train_err<-NULL
wine_cv_err<-NULL
xnn <- c(1:15)
for(inn in 1:length(xnn)){
  wine_err <- cross_valid(wine_train_cv, k, inn, method="k nearest neighbors")
  wine_train_err[inn] <- wine_err[1]
  wine_cv_err[inn] <- wine_err[2]
}
wine_train_err_knn <- wine_train_err
wine_cv_err_knn <- wine_cv_err
margin <- 0.04
# choose the smallest k whose CV error is within `margin` of the minimum
min_k_id <- min(which(min(wine_cv_err_knn) + margin > wine_cv_err_knn))
min_k_id <- ifelse(xnn[min_k_id] %% 2, min_k_id, min_k_id + 1) # k must be odd to avoid ties
min_wine_k <- xnn[min_k_id]
sprintf("k = %d is optimal.", min_wine_k)
## [1] "k = 7 is optimal."
## [1] "k = 9 is optimal."
plot(1:length(xnn),wine_train_err_knn,ylim=c(min(wine_train_err_knn, wine_cv_err_knn),max(wine_train_err_knn, wine_cv_err_knn)))
points(1:length(xnn),wine_cv_err_knn, col='red')
points(min_k_id, wine_cv_err_knn[min_k_id],pch = 19, col="red")
# Use the test set to compare the two algorithms
y_wine_train_cv <- wine_train_cv[,ncol(wine_train_cv)]
x_wine_train_cv <- wine_train_cv[,-ncol(wine_train_cv)]
yx_wine_train_cv <- cbind(x_wine_train_cv, y_wine_train_cv)
y_wine_test <- wine_test[,ncol(wine_test)]
x_wine_test <- wine_test[,-ncol(wine_test)]
yx_wine_test <- cbind(x_wine_test, y_wine_test)
fit <- lm.ridge(y_wine_train_cv~., yx_wine_train_cv, lambda=min_wine_lambda)
A <- as.array(fit$coef[1:(ncol(wine_train_cv)-1)]/fit$scales)
X <- as.matrix(x_wine_train_cv)
for(i in seq(from = 1, to = ncol(x_wine_train_cv))){
  X[,i] <- X[,i] - fit$xm[i]
}
yh <- X %*% A + fit$ym
error_wine_train_lr <- rmse(round(yh), y_wine_train_cv)
error_wine_train_lr # 0.7010197
## [1] 0.7010197
X <- as.matrix(x_wine_test)
for(i in seq(from = 1, to = ncol(x_wine_test))){
  X[,i] <- X[,i] - fit$xm[i]
}
yh <- X %*% A + fit$ym
error_wine_test_lr <- rmse(round(yh), y_wine_test)
error_wine_test_lr # 0.7263871
## [1] 0.7263871
y_wine_train_cv <- as.factor(y_wine_train_cv)
x_wine_train_cv <- scale(x_wine_train_cv)
KNN <- knn(x_wine_train_cv, x_wine_train_cv, y_wine_train_cv, k=min_wine_k)
error_wine_train_knn <- rmse(as.numeric(as.character(y_wine_train_cv)), as.numeric(as.character(KNN)))
error_wine_train_knn #0.688684
## [1] 0.6829558
x_wine_test <- scale(x_wine_test)
KNN <- knn(x_wine_train_cv, x_wine_test, y_wine_train_cv, k=min_wine_k)
error_wine_test_knn <- rmse(y_wine_test, as.numeric(as.character(KNN)))
error_wine_test_knn #0.8447294
## [1] 0.8144425
tb_wine_lr_vs_knn <- data.frame("Train err"=c(error_wine_train_lr, error_wine_train_knn),"Test err"=c(error_wine_test_lr, error_wine_test_knn))
row.names(tb_wine_lr_vs_knn) <- c("Linear Ridge", "K-Nearest Neighbors")
print(tb_wine_lr_vs_knn)
## Train.err Test.err
## Linear Ridge 0.7010197 0.7263871
## K-Nearest Neighbors 0.6829558 0.8144425
All error rates are computed as root mean square error (RMSE). Even though the k-nearest-neighbors classifier predicts wine quality as a factor level, it does not make sense to score each observation simply as right (0) or wrong (1): if the true quality is 8 and the classifier predicts 7, it has clearly done better than if it had predicted anything between 3 and 6. For this reason RMSE is used to measure error on the wine data, whether regression or classification is applied. For the iris data, in contrast, the 0/1 error is used, because iris is a genuine classification problem with unordered classes. The test error is lower for the ridge regression model because fitting wine quality is better treated as a regression problem than as a classification problem.
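As a small numeric illustration of this point (made-up predictions, using the rmse() helper defined above): the 0/1 error cannot distinguish a prediction of 7 from a prediction of 3 when the truth is 8, while RMSE can.
# Hypothetical true qualities and two sets of predictions
y_true <- c(8, 8, 8)
y_near <- c(7, 7, 7)   # off by one quality level
y_far  <- c(3, 4, 5)   # far from the true quality
# The 0/1 classification error scores both prediction sets identically
mean(y_true != y_near) # 1
mean(y_true != y_far)  # 1
# RMSE rewards predictions that are close to the true level
rmse(y_true, y_near)   # 1
rmse(y_true, y_far)    # about 4.08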
library(class)
library(e1071)
iris <- read.table("irisdata.csv",sep = ",",header = FALSE)
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ V1: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ V2: num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ V3: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ V4: num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ V5: Factor w/ 3 levels "Iris-setosa",..: 1 1 1 1 1 1 1 1 1 1 ...
train <- iris[,1:4]
labels <- iris[,5]
###Scale the data
train2 <- train
for(i in seq(from = 1, to = ncol(train))){
  v <- var(train[,i])
  m <- mean(train[,i])
  train2[,i] <- (train[,i] - m)/sqrt(v)
}
####Perform cross validation on the new data
out <- knn.cv(train2,labels,k=3)
1 - sum(labels == out)/length(out)
## [1] 0.05333333
Err <- rep(0,50)
for(kk in seq(from = 1, to = 50)){
  out <- knn.cv(train2, labels, k = kk)
  Error <- 1 - sum(labels == out)/length(out)
  Err[kk] <- Error
}
Err
## [1] 0.05333333 0.06000000 0.05333333 0.06666667 0.05333333 0.04666667
## [7] 0.04000000 0.04666667 0.04666667 0.04666667 0.04666667 0.04000000
## [13] 0.03333333 0.04000000 0.03333333 0.04000000 0.03333333 0.04000000
## [19] 0.04666667 0.04000000 0.05333333 0.04666667 0.05333333 0.04666667
## [25] 0.04000000 0.04666667 0.05333333 0.06000000 0.05333333 0.06000000
## [31] 0.05333333 0.06000000 0.05333333 0.06000000 0.05333333 0.07333333
## [37] 0.08666667 0.09333333 0.10000000 0.10000000 0.11333333 0.10666667
## [43] 0.10666667 0.10666667 0.10666667 0.11333333 0.11333333 0.12666667
## [49] 0.12666667 0.13333333
plot(Err)
In conclusion, k-nearest neighbors with k = 3 gives a better error (0.030) than Naive Bayes (0.040) for classifying species in the iris data set, so KNN with k = 3 outperforms Naive Bayes here.
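The Naive Bayes figure quoted above comes from a fit that is not echoed in this document; a minimal sketch of how the comparison could be reproduced with e1071 on the same iris frame (resubstitution error for Naive Bayes, leave-one-out CV for KNN, so the exact values may differ from those quoted) is:
library(e1071)
# Naive Bayes on the iris frame loaded above (V1-V4 predictors, V5 species)
nb_mod  <- naiveBayes(V5 ~ ., data = iris)
nb_pred <- predict(nb_mod, iris[, 1:4])
1 - sum(nb_pred == iris$V5)/nrow(iris)                            # Naive Bayes error
# KNN with k = 3 on the scaled predictors, leave-one-out cross validation
1 - sum(knn.cv(train2, labels, k = 3) == labels)/length(labels)   # KNN error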
library(klaR)
setwd("C:/Users/Manjari/Desktop/Machine learning/Home Work Solutions")
wine_data <- read.table("winequality-red.txt", header=TRUE, sep=";")
str(wine_data)
## 'data.frame': 1599 obs. of 12 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
wine_data$quality <- as.factor(wine_data$quality)
mod <- naiveBayes(quality~.,data = wine_data)
qualityHat <- predict(mod, wine_data[,1:11])
Err <- 1 - sum(qualityHat == wine_data$quality)/length(wine_data$quality)
Err
## [1] 0.4396498
pairs(wine_data[,1:11])
Question 4: Classify the sonar data using Naive Bayes. Compare the results with the methods used in class and with the last homework set. Give reasons for any discrepancies between the results for these methods. (Either in class or in homework, the following methods have been used on this data set: Trees, Linear Regression, Ridge Regression, an Ensemble Method, and now Naive Bayes.)
library(class)
library(e1071)
library(klaR)
library(MASS)
library(rpart)
sonar.train <- read.csv("sonar_train.csv", header = FALSE)
sonar.train$V61 <- as.factor(sonar.train$V61)
method <- NaiveBayes(V61 ~ ., data = sonar.train)
out <- predict(method)
Err <- 1 - sum(out$class == sonar.train$V61)/length(sonar.train$V61)
Err
## [1] 0.2384615
train.labels <- sonar.train$V61
train <- sonar.train[, -61]
Err <- rep(0, 20)
for (kk in seq(from = 1, to = 20)) {
  out <- knn.cv(train, train.labels, k = kk)
  Error <- 1 - sum(train.labels == out)/length(out)
  Err[kk] <- Error
}
Err
## [1] 0.2000000 0.2615385 0.2615385 0.2769231 0.3000000 0.3153846 0.3000000
## [8] 0.2923077 0.2923077 0.2846154 0.3000000 0.3076923 0.3153846 0.2846154
## [15] 0.3076923 0.3000000 0.3076923 0.3230769 0.3230769 0.3230769
plot(Err)
best = which.min(Err)
best
## [1] 1
K-nearest neighbors with k = 1 does best, with an error of 0.2, versus Naive Bayes, which also performs well with an error of 0.2384. Which method is "best" for classifying the sonar data set depends on the costs of the different errors and on the ease of computation for this data.
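One way to see how the errors of the two classifiers are distributed over the two classes, which matters when the costs of the two error types differ, is to compare their confusion tables on the training labels (a small optional check that reuses the Naive Bayes fit `method` from above; the KNN result can vary slightly between runs because of random tie breaking):
# Confusion tables: Naive Bayes (resubstitution) versus KNN with k = 1 (leave-one-out CV)
nb_pred  <- predict(method)$class
knn_pred <- knn.cv(train, train.labels, k = 1)
table(NaiveBayes = nb_pred, actual = train.labels)
table(KNN_k1 = knn_pred, actual = train.labels)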
5.) Scaled vs. unscaled data, and the best k value for k-nearest neighbors. Run the code in the file KfirstNearestNeighbor.R. Does KNN create a better model if the data is first scaled and normalized? What should be chosen as the best value for k and why?
require(class)
Cross-validation is used to find the best value of k. Originally 4 is chosen as the best value of k. However, an even k can produce ties in the majority vote: for example, a query point's four nearest neighbors may consist of two records labeled "0" and two labeled "1", leaving no clear decision. To avoid such ties, one is added to the best k whenever it is even, so 5 is chosen as the best k.
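Since the code in KfirstNearestNeighbor.R is not reproduced here, the following is a minimal self-contained sketch (on synthetic data, not the homework data set) of the kind of comparison described above: leave-one-out cross-validation error on raw versus scaled predictors, followed by the rule that bumps an even best k to the next odd value.
library(class)
# Synthetic two-class data in which one noise feature has a much larger scale,
# so unscaled distances are dominated by that feature
set.seed(1)
n <- 200
y <- factor(rep(c(0, 1), each = n/2))
X <- cbind(rnorm(n, mean = as.numeric(y)),   # informative feature, small scale
           rnorm(n, sd = 100))               # pure noise, large scale
X_scaled <- scale(X)
# Leave-one-out CV error over a range of k, on raw and on scaled predictors
ks <- 1:15
err_raw    <- sapply(ks, function(k) mean(knn.cv(X,        y, k = k) != y))
err_scaled <- sapply(ks, function(k) mean(knn.cv(X_scaled, y, k = k) != y))
# Pick the best k on the scaled data and bump an even k to the next odd value,
# the same rule used for the wine data above
best_k <- ks[which.min(err_scaled)]
best_k <- ifelse(best_k %% 2, best_k, best_k + 1)
best_k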