par(mar=rep(2, 4)) # make the margins smaller for RStudio
## Modified ridge regression for HW3, Problem 4
setwd("C:/Users/Manjari/Desktop/Machine learning/Home Work Solutions")
wine_data <- read.table("winequality-red.txt", header=TRUE, sep=";")
wine_train = wine_data[1:1400,]
wine_test = wine_data[1401:1599,]
wine_train_cv = wine_train
# Root mean square error between predictions X and targets Y
rmse <- function(X, Y)
{
  return( sqrt(sum((X - Y)^2)/length(Y)) )
}
library(MASS)
# Grid of 30 ridge penalties, log-spaced from 10^3 down to 10^-2.8
xlambda <- rep(0, times = 30)
for(i in seq(from = 0, to = 29))
{
  expo <- 3 - 4*(i/20)     # exponent decreases in steps of 0.2 (renamed to avoid shadowing base::exp)
  xlambda[i+1] <- 10^expo
}
# k-fold cross validation.
#   df_train_cv : data frame with the response in the last column
#   k           : number of folds
#   args        : index into the hyper-parameter grid (xlambda or xnn)
#   method      : "linear ridge" or "k nearest neighbors"
#   err_type    : "rmse" or "class" (only used for k nearest neighbors)
# Returns c(average training error, average CV error) over the k folds.
cross_valid <- function(df_train_cv, k, args, method, err_type="rmse")
{
  error_train <- 0
  error_cv <- 0
  num_sample <- nrow(df_train_cv)
  # pre-allocate the training and validation folds
  df_train <- df_train_cv[1:(num_sample*((k-1)/k)),]
  df_cv <- df_train_cv[1:(num_sample*(1/k)),]
  pick <- k   # index of the fold held out for validation
  for(j in 1:k){
    # assemble the training folds and the held-out fold
    i_tmp <- 1
    for(i in 1:k){
      if(i == pick){
        df_cv <- df_train_cv[((i-1)*num_sample/k+1):(num_sample*(i/k)),]
      } else {
        df_train[((i_tmp-1)*num_sample/k+1):(num_sample*(i_tmp/k)), ] <-
          df_train_cv[((i-1)*num_sample/k+1):(num_sample*(i/k)),]
        i_tmp <- i_tmp + 1
      }
    }
    pick <- pick - 1
    # split each fold into predictors (x) and response (y, last column)
    y_df_train <- df_train[,ncol(df_train)]
    x_df_train <- df_train[,-ncol(df_train)]
    yx_df_train <- cbind(x_df_train, y_df_train)
    y_df_cv <- df_cv[,ncol(df_cv)]
    x_df_cv <- df_cv[,-ncol(df_cv)]
    yx_df_cv <- cbind(x_df_cv, y_df_cv)
    if(method == "linear ridge"){
      fit <- lm.ridge(y_df_train~., yx_df_train, lambda=xlambda[args])
      # back-transform the coefficients to the original predictor scale
      A <- as.array(fit$coef[1:(ncol(df_train)-1)]/fit$scales)
      X <- as.matrix(x_df_train)
      for(i in seq(from = 1, to = ncol(x_df_train))){
        X[,i] <- X[,i] - fit$xm[i]   # center with the training means
      }
      yh <- X %*% A + fit$ym
      error_train <- error_train + rmse(round(yh), y_df_train)/k
      X <- as.matrix(x_df_cv)
      for(i in seq(from = 1, to = ncol(x_df_cv))){
        X[,i] <- X[,i] - fit$xm[i]
      }
      yh <- X %*% A + fit$ym
      error_cv <- error_cv + rmse(round(yh), y_df_cv)/k
    }
    if(method == "k nearest neighbors"){
      y_df_train <- as.factor(y_df_train)
      x_df_train <- scale(x_df_train)
      # training error is resubstitution error: each point can be its own nearest neighbor
      KNN <- knn(x_df_train, x_df_train, y_df_train, k=xnn[args])
      if(err_type == "rmse"){
        error_train <- error_train +
          rmse(as.numeric(as.character(y_df_train)), as.numeric(as.character(KNN)))/k
      }
      if(err_type == "class"){
        error_train <- error_train + (1 - sum(y_df_train == KNN)/length(KNN))/k
      }
      x_df_cv <- scale(x_df_cv)   # note: the CV fold is scaled with its own mean/sd
      KNN <- knn(x_df_train, x_df_cv, y_df_train, k=xnn[args])
      if(err_type == "rmse"){
        error_cv <- error_cv + rmse(y_df_cv, as.numeric(as.character(KNN)))/k
      }
      if(err_type == "class"){
        error_cv <- error_cv + (1 - sum(y_df_cv == KNN)/length(KNN))/k
      }
    }
  }
  return(c(error_train, error_cv))
}
k <- 5
wine_train_err <- NULL
wine_cv_err <- NULL
for(ilambda in 1:length(xlambda)){
  wine_err <- cross_valid(wine_train_cv, k, ilambda, method="linear ridge")
  wine_train_err[ilambda] <- wine_err[1]
  wine_cv_err[ilambda] <- wine_err[2]
}
wine_train_err_lr <- wine_train_err
wine_cv_err_lr <- wine_cv_err
min_lambda_id <- min(which(min(wine_cv_err_lr) == wine_cv_err_lr))
min_wine_lambda <- xlambda[min_lambda_id]
sprintf("%dth lambda %f is optimal.", min_lambda_id, min_wine_lambda)
## [1] "8th lambda 39.810717 is optimal."
## [1] "8th lambda 39.810717 is optimal."
plot(1:length(xlambda),wine_train_err_lr,ylim=c(min(wine_train_err_lr, wine_cv_err_lr),max(wine_train_err_lr, wine_cv_err_lr)))
points(1:length(xlambda),wine_cv_err_lr, col='red')
points(min_lambda_id, wine_cv_err_lr[min_lambda_id],pch = 19, col="red")
library(class)
wine_train_err<-NULL
wine_cv_err<-NULL
xnn <- c(1:15)
for(inn in 1:length(xnn)){
  wine_err <- cross_valid(wine_train_cv, k, inn, method="k nearest neighbors")
  wine_train_err[inn] <- wine_err[1]
  wine_cv_err[inn] <- wine_err[2]
}
wine_train_err_knn <- wine_train_err
wine_cv_err_knn <- wine_cv_err
margin <- 0.04
# choose the smallest k whose CV error is within `margin` of the minimum
min_k_id <- min(which(min(wine_cv_err_knn) + margin > wine_cv_err_knn))
min_k_id <- ifelse(xnn[min_k_id] %% 2, min_k_id, min_k_id + 1) # k must be odd to avoid ties
min_wine_k <- xnn[min_k_id]
sprintf("k = %d is optimal.", min_wine_k)
## [1] "k = 7 is optimal."
## [1] "k = 9 is optimal."
plot(1:length(xnn),wine_train_err_knn,ylim=c(min(wine_train_err_knn, wine_cv_err_knn),max(wine_train_err_knn, wine_cv_err_knn)))
points(1:length(xnn),wine_cv_err_knn, col='red')
points(min_k_id, wine_cv_err_knn[min_k_id],pch = 19, col="red")
# Use the test set to compare the two algorithms
y_wine_train_cv <- wine_train_cv[,ncol(wine_train_cv)]
x_wine_train_cv <- wine_train_cv[,-ncol(wine_train_cv)]
yx_wine_train_cv <- cbind(x_wine_train_cv, y_wine_train_cv)
y_wine_test <- wine_test[,ncol(wine_test)]
x_wine_test <- wine_test[,-ncol(wine_test)]
yx_wine_test <- cbind(x_wine_test, y_wine_test)
fit <- lm.ridge(y_wine_train_cv~., yx_wine_train_cv, lambda=min_wine_lambda)
A <- as.array(fit$coef[1:(ncol(wine_train_cv)-1)]/fit$scales)
X <- as.matrix(x_wine_train_cv)
for(i in seq(from = 1, to = ncol(x_wine_train_cv))){
  X[,i] <- X[,i] - fit$xm[i]
}
yh <- X %*% A + fit$ym
error_wine_train_lr <- rmse(round(yh), y_wine_train_cv)
error_wine_train_lr # 0.7010197
## [1] 0.7010197
X <- as.matrix(x_wine_test)
for(i in seq(from = 1, to = ncol(x_wine_test))){
  X[,i] <- X[,i] - fit$xm[i]
}
yh <- X %*% A + fit$ym
error_wine_test_lr <- rmse(round(yh), y_wine_test)
error_wine_test_lr # 0.7263871
## [1] 0.7263871
y_wine_train_cv <- as.factor(y_wine_train_cv)
x_wine_train_cv <- scale(x_wine_train_cv)
KNN <- knn(x_wine_train_cv, x_wine_train_cv, y_wine_train_cv, k=min_wine_k)
error_wine_train_knn <- rmse(as.numeric(as.character(y_wine_train_cv)), as.numeric(as.character(KNN)))
error_wine_train_knn #0.688684
## [1] 0.6829558
x_wine_test <- scale(x_wine_test)
KNN <- knn(x_wine_train_cv, x_wine_test, y_wine_train_cv, k=min_wine_k)
error_wine_test_knn <- rmse(y_wine_test, as.numeric(as.character(KNN)))
error_wine_test_knn #0.8447294
## [1] 0.8144425
tb_wine_lr_vs_knn <- data.frame("Train err"=c(error_wine_train_lr, error_wine_train_knn),"Test err"=c(error_wine_test_lr, error_wine_test_knn))
row.names(tb_wine_lr_vs_knn) <- c("Linear Ridge", "K-Nearest Neighbors")
print(tb_wine_lr_vs_knn)
## Train.err Test.err
## Linear Ridge 0.7010197 0.7263871
## K-Nearest Neighbors 0.6829558 0.8144425
All error rates are computed as root mean square error (RMSE). Even though the k-nearest-neighbors classifier predicts wine quality as a factor level, it does not make sense to score each observation simply as right (0) or wrong (1): if the true quality is 8 and the classifier predicts 7, it has clearly done better than if it had predicted anything between 3 and 6. For this reason RMSE is used to measure error on the wine data, whether regression or classification is applied. For the iris data, in contrast, the 0/1 error is used, because iris is a genuine classification problem with unordered classes. The test error is lower for the ridge regression model because fitting wine quality is better treated as a regression problem than as a classification problem.
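As a small numeric illustration of this point (made-up predictions, using the rmse() helper defined above): the 0/1 error cannot distinguish a prediction of 7 from a prediction of 3 when the truth is 8, while RMSE can.
# Hypothetical true qualities and two sets of predictions
y_true <- c(8, 8, 8)
y_near <- c(7, 7, 7)   # off by one quality level
y_far  <- c(3, 4, 5)   # far from the true quality
# The 0/1 classification error scores both prediction sets identically
mean(y_true != y_near) # 1
mean(y_true != y_far)  # 1
# RMSE rewards predictions that are close to the true level
rmse(y_true, y_near)   # 1
rmse(y_true, y_far)    # about 4.08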
library(class)
library(e1071)
iris <- read.table("irisdata.csv",sep = ",",header = FALSE)
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ V1: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ V2: num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ V3: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ V4: num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ V5: Factor w/ 3 levels "Iris-setosa",..: 1 1 1 1 1 1 1 1 1 1 ...
train <- iris[,1:4]
labels <- iris[,5]
###Scale the data
train2 <- train
for(i in seq(from = 1, to = ncol(train))){
  v <- var(train[,i])
  m <- mean(train[,i])
  train2[,i] <- (train[,i] - m)/sqrt(v)
}
####Perform cross validation on the new data
out <- knn.cv(train2,labels,k=3)
1 - sum(labels == out)/length(out)
## [1] 0.05333333
Err <- rep(0,50)
for(kk in seq(from = 1, to = 50)){
  out <- knn.cv(train2, labels, k = kk)
  Error <- 1 - sum(labels == out)/length(out)
  Err[kk] <- Error
}
Err
## [1] 0.05333333 0.06000000 0.05333333 0.06666667 0.05333333 0.04666667
## [7] 0.04000000 0.04666667 0.04666667 0.04666667 0.04666667 0.04000000
## [13] 0.03333333 0.04000000 0.03333333 0.04000000 0.03333333 0.04000000
## [19] 0.04666667 0.04000000 0.05333333 0.04666667 0.05333333 0.04666667
## [25] 0.04000000 0.04666667 0.05333333 0.06000000 0.05333333 0.06000000
## [31] 0.05333333 0.06000000 0.05333333 0.06000000 0.05333333 0.07333333
## [37] 0.08666667 0.09333333 0.10000000 0.10000000 0.11333333 0.10666667
## [43] 0.10666667 0.10666667 0.10666667 0.11333333 0.11333333 0.12666667
## [49] 0.12666667 0.13333333
plot(Err)
In conclusion, k-nearest neighbors with k = 3 gives a better error (0.030) than Naive Bayes (0.040) for classifying species in the iris data set, so KNN with k = 3 outperforms Naive Bayes here.
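The Naive Bayes figure quoted above comes from a fit that is not echoed in this document; a minimal sketch of how the comparison could be reproduced with e1071 on the same iris frame (resubstitution error for Naive Bayes, leave-one-out CV for KNN, so the exact values may differ from those quoted) is:
library(e1071)
# Naive Bayes on the iris frame loaded above (V1-V4 predictors, V5 species)
nb_mod  <- naiveBayes(V5 ~ ., data = iris)
nb_pred <- predict(nb_mod, iris[, 1:4])
1 - sum(nb_pred == iris$V5)/nrow(iris)                            # Naive Bayes error
# KNN with k = 3 on the scaled predictors, leave-one-out cross validation
1 - sum(knn.cv(train2, labels, k = 3) == labels)/length(labels)   # KNN error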
library(klaR)
setwd("C:/Users/Manjari/Desktop/Machine learning/Home Work Solutions")
wine_data <- read.table("winequality-red.txt", header=TRUE, sep=";")
str(wine_data)
## 'data.frame': 1599 obs. of 12 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
wine_data$quality <- as.factor(wine_data$quality)
mod <- naiveBayes(quality~.,data = wine_data)
qualityHat <- predict(mod, wine_data[,1:11])
Err <- 1 - sum(qualityHat == wine_data$quality)/length(wine_data$quality)
Err
## [1] 0.4396498
pairs(wine_data[,1:11])
Question 4: Classify the sonar data using Naive Bayes. Compare the results with the methods used in class and with the last homework set. Give reasons for any discrepancies between the results for these methods. (Either in class or in homework, the following methods have been used on this data set: Trees, Linear Regression, Ridge Regression, an Ensemble Method, and now Naive Bayes.)
library(class)
library(e1071)
library(klaR)
library(MASS)
library(rpart)
sonar.train <- read.csv("sonar_train.csv", header = FALSE)
sonar.train$V61 <- as.factor(sonar.train$V61)
method <- NaiveBayes(V61 ~ ., data = sonar.train)
out <- predict(method)
Err <- 1 - sum(out$class == sonar.train$V61)/length(sonar.train$V61)
Err
## [1] 0.2384615
train.labels <- sonar.train$V61
train <- sonar.train[, -61]
Err <- rep(0, 20)
for (kk in seq(from = 1, to = 20)) {
  out <- knn.cv(train, train.labels, k = kk)
  Error <- 1 - sum(train.labels == out)/length(out)
  Err[kk] <- Error
}
Err
## [1] 0.2000000 0.2615385 0.2615385 0.2769231 0.3000000 0.3153846 0.3000000
## [8] 0.2923077 0.2923077 0.2846154 0.3000000 0.3076923 0.3153846 0.2846154
## [15] 0.3076923 0.3000000 0.3076923 0.3230769 0.3230769 0.3230769
plot(Err)
best = which.min(Err)
best
## [1] 1
K-nearest neighbors with k = 1 does best, with an error of 0.2, versus Naive Bayes, which also performs well with an error of 0.2384. Which method is "best" for classifying the sonar data set depends on the costs of the different errors and on the ease of computation for this data.
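One way to see how the errors of the two classifiers are distributed over the two classes, which matters when the costs of the two error types differ, is to compare their confusion tables on the training labels (a small optional check that reuses the Naive Bayes fit `method` from above; the KNN result can vary slightly between runs because of random tie breaking):
# Confusion tables: Naive Bayes (resubstitution) versus KNN with k = 1 (leave-one-out CV)
nb_pred  <- predict(method)$class
knn_pred <- knn.cv(train, train.labels, k = 1)
table(NaiveBayes = nb_pred, actual = train.labels)
table(KNN_k1 = knn_pred, actual = train.labels)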
5.) Scaled vs. unscaled data, and the best k value for k-nearest neighbors. Run the code in the file KfirstNearestNeighbor.R. Does KNN create a better model if the data is first scaled and normalized? What should be chosen as the best value for k and why?
require(class)
Cross-validation is used to find the best value of k. Originally 4 is chosen as the best value of k. However, an even k can produce ties in the majority vote: for example, a query point's four nearest neighbors may consist of two records labeled "0" and two labeled "1", leaving no clear decision. To avoid such ties, one is added to the best k whenever it is even, so 5 is chosen as the best k.
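Since the code in KfirstNearestNeighbor.R is not reproduced here, the following is a minimal self-contained sketch (on synthetic data, not the homework data set) of the kind of comparison described above: leave-one-out cross-validation error on raw versus scaled predictors, followed by the rule that bumps an even best k to the next odd value.
library(class)
# Synthetic two-class data in which one noise feature has a much larger scale,
# so unscaled distances are dominated by that feature
set.seed(1)
n <- 200
y <- factor(rep(c(0, 1), each = n/2))
X <- cbind(rnorm(n, mean = as.numeric(y)),   # informative feature, small scale
           rnorm(n, sd = 100))               # pure noise, large scale
X_scaled <- scale(X)
# Leave-one-out CV error over a range of k, on raw and on scaled predictors
ks <- 1:15
err_raw    <- sapply(ks, function(k) mean(knn.cv(X,        y, k = k) != y))
err_scaled <- sapply(ks, function(k) mean(knn.cv(X_scaled, y, k = k) != y))
# Pick the best k on the scaled data and bump an even k to the next odd value,
# the same rule used for the wine data above
best_k <- ks[which.min(err_scaled)]
best_k <- ifelse(best_k %% 2, best_k, best_k + 1)
best_k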