1. Once again check out wine quality data set

This problem is to create an ordinary least squares linear model (use the lm function in R) for this data set using the first 1400 observations. Next check the model’s performance on the last 199 observations. How well did the model predict the results for the last 199 observations? What measure did you use to evaluate how well the model did this prediction? Next use the model to predict the results for the whole data set and measure how well your model worked.
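
Root mean squared error (RMSE) is the evaluation measure used throughout. As a hedged sketch, equivalent to the rmse() helper defined in the code below (rmse_sketch is only an illustrative name):

# RMSE: square root of the mean squared prediction error
rmse_sketch <- function(y, yhat) sqrt(mean((y - yhat)^2))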

#Q1a
wine_data <- read.table("/home/archana/ML_works_ucsc/winequality-red.txt", header=TRUE, sep=";")
wine_train = wine_data[1:1400,]
wine_test = wine_data[1401:1599,]
rmse=function(X,Y){
  return( sqrt(sum((X-Y)^2)/length(Y)) )
}
y_train = wine_train[,12]
lmodel = lm(quality~., data = wine_train)
wine_train_err = rmse(y_train, predict(lmodel, newdata = wine_train[,-12]))
y_test = wine_test[,12]
wine_test_err = rmse(y_test, predict(lmodel, newdata = wine_test[,-12]))
cat("training error: ", wine_train_err, ", test error: ", wine_test_err, "\n")
## training error:  0.6386 , test error:  0.6978
#Q1b
y_whole = c(y_train, y_test)
wine_whole_err = rmse(y_whole,
  predict(lmodel, newdata = rbind(wine_train[,-12], wine_test[,-12])))
cat("training error: ", wine_train_err, ", test error: ", wine_test_err,
    ", whole data error: ", wine_whole_err, "\n")
## training error:  0.6386 , test error:  0.6978 , whole data error:  0.6463

The model's RMSE on the whole data set (0.6463) is close to its RMSE on the held-out 199-observation test set (0.6978), which is a good indication of model performance.

2. Perform a ridge regression on the wine quality data set

Compare the results of applying the ridge regression model to the last 199 observations with the results of applying the ordinary least square model to these observations. Compare the coefficients resulting from the ridge regression with the coefficients that were obtained in problem 1. What conclusions can you make from this comparison?
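
For reference, a hedged sketch of what a ridge fit does (this is not the package's implementation): with centered and scaled predictors X and a centered response y, the ridge coefficients solve (X'X + lambda*I) b = X'y. The names X, y, lambda and beta_ridge below are illustrative, and lambda = 1 is an arbitrary value:

# Hedged sketch of the closed-form ridge estimate on the wine training data
X <- scale(as.matrix(wine_train[, -12]))            # centered, scaled predictors
y <- wine_train$quality - mean(wine_train$quality)  # centered response
lambda <- 1
beta_ridge <- solve(t(X) %*% X + lambda * diag(ncol(X)), t(X) %*% y)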

## Q2a
library("ridge")
xlambda=rep(0, times = 30)
for(i in seq(from = 0, to = 29)){
  #
  exp <- (+3 -4*(i/20))
  xlambda[i+1] <- 10^exp
}
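# (Hedged aside) The same lambda grid could be built in one vectorized step:
#   xlambda <- 10^(3 - 4 * (0:29) / 20)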
wine_train_err_lr = rep(0, times = length(xlambda))
wine_test_err_lr = rep(0, times = length(xlambda))
min_lambda = 10000   # will hold the lambda selected by the rule below
for(i in 1:length(xlambda)){
  lrmodel = linearRidge(quality~., data = wine_train, lambda = xlambda[i])
  wine_train_err_lr[i] = rmse(y_train, predict(lrmodel, newdata = wine_train[,-12]))
  wine_test_err_lr[i] = rmse(y_test, predict(lrmodel, newdata = wine_test[,-12]))
  
  # treat a drop of more than 0.005 in test RMSE as a real improvement
  if(i > 1 && wine_test_err_lr[i] < (wine_test_err_lr[i-1]-0.005)){
    min_lambda = xlambda[i]
    index_lambda=i
  }
}
plot(1:length(xlambda),wine_train_err_lr,
     ylim=c(min(wine_train_err_lr, wine_test_err_lr),
            max(wine_train_err_lr, wine_test_err_lr)))
points(1:length(xlambda),wine_test_err_lr, col='red')

[Plot: ridge training RMSE (black) and test RMSE (red) vs. lambda index]

cat(index_lambda, "th ", "lambda is optimal: ", min_lambda, "\n")
## 18 th  lambda is optimal:  0.3981
cat("ridge training error: ", wine_train_err_lr[xlambda==min_lambda],
    ", test error: ", wine_test_err_lr[xlambda==min_lambda], "\n")
## ridge training error:  0.6482 , test error:  0.7093
## Q2b
lrmodel2 = linearRidge(quality~., data = wine_train, lambda=min_lambda)
wine_train_err_lr2 = rmse(y_train, predict(lrmodel2, newdata = wine_train[,-12]))
wine_test_err_lr2 = rmse(y_test, predict(lrmodel2, newdata = wine_test[,-12]))
cat("non-ridge training error: ", wine_train_err, ", test error: ", wine_test_err, "\n")
## non-ridge training error:  0.6386 , test error:  0.6978
cat("ridge training error: ", wine_train_err_lr2, ", test error: ", wine_test_err_lr2, "\n")
## ridge training error:  0.6482 , test error:  0.7093
lrmodel0 = linearRidge(quality~., data = wine_train, lambda=0)
# coefficients of the linear (lambda = 0) and ridge regression models
lrmodel0$coef
##                         [,1]
## fixed.acidity         1.5267
## volatile.acidity     -7.2101
## citric.acid          -1.0628
## residual.sugar        0.3145
## chlorides            -3.3324
## free.sulfur.dioxide   1.3093
## total.sulfur.dioxide -4.2522
## density              -1.0535
## pH                   -1.6970
## sulphates             5.7145
## alcohol              11.2484
lrmodel2$coef
##                         [,1]
## fixed.acidity         1.7009
## volatile.acidity     -5.3273
## citric.acid           1.2137
## residual.sugar        0.5700
## chlorides            -2.5102
## free.sulfur.dioxide   0.2721
## total.sulfur.dioxide -3.0674
## density              -2.3773
## pH                   -0.3891
## sulphates             4.1572
## alcohol               7.8186
plot(abs(lrmodel0$coef))
points(abs(lrmodel2$coef),col="red")

[Plot: absolute values of the lambda = 0 coefficients (black) and the ridge coefficients (red)]

The error of the ridge regression model is slightly larger than that of the linear regression model (lambda = 0). However, the absolute values of the ridge regression coefficients are smaller than those of the linear model, so we can expect the ridge model to be more robust against overfitting.
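
A quick hedged check of this shrinkage claim is to compare the summed absolute coefficient magnitudes of the two fits:

# total absolute coefficient size: unpenalized vs. ridge fit
sum(abs(lrmodel0$coef))   # lambda = 0
sum(abs(lrmodel2$coef))   # lambda = min_lambda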

3. Perform a ridge regression on the Iris Data Set

This problem only involves the Versicolor and Virginica species (rows 51 through 150). Use cross-validated ridge regression to classify these two species, and create and plot a ROC curve for this classification method.
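
For reference, the ROC quantities used below are the true-positive rate TPR = TP/(TP+FN) and the false-positive rate FPR = FP/(FP+TN), evaluated at each decision threshold. A minimal hedged sketch (roc_point is an illustrative helper, not part of the assignment code):

# one ROC point for scores yh, labels y in {-1, +1}, and threshold th
roc_point <- function(yh, y, th) {
  c(tpr = sum(yh >= th & y > 0) / sum(y > 0),
    fpr = sum(yh >= th & y < 0) / sum(y < 0))
}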

#Q3a
library(MASS)
iris_data = read.table("/home/archana/ML_works_ucsc/irisdata.csv", sep=",", header=FALSE)
# we only need Versicolor and Virginica data.
iris_data = iris_data[51:150,]
target = rep(0,100)
target[iris_data[,5]=="Iris-versicolor"] = 1
target[iris_data[,5]!="Iris-versicolor"] = -1
row.names(iris_data)<-NULL
iris_data = cbind(iris_data, target)

k = 10
# 10-fold cross-validation is used.
num_sample = nrow(iris_data)

set.seed(123)  # fix the RNG seed so the shuffle below is reproducible
iris_data = iris_data[sample(num_sample, num_sample, replace=FALSE),]
# initial placeholders only; both are overwritten inside the CV loop below
iris_train = iris_data[1:(num_sample*((k-1)/k)),]
iris_cross = iris_data[1:(num_sample*(1/k)),]

error_train_total = matrix(0, nrow = length(xlambda), ncol = 1)
error_cross_total = matrix(0, nrow = length(xlambda), ncol = 1)
for(ilambda in 1:length(xlambda)){
  pick = k #pick kth set
  
  error_train = 0
  error_cross = 0
  
  for(j in 1:k){
    i_tmp = 1
    for(i in 1:k){
      
      #choose training set, and cross validation set 
      if(i == pick){
        iris_cross = iris_data[((i-1)*num_sample/k+1):(num_sample*(i/k)),]
      } else {
        iris_train[((i_tmp-1)*num_sample/k+1):(num_sample*(i_tmp/k)), 
                   ] = iris_data[((i-1)*num_sample/k+1):(num_sample*i/k),]
        i_tmp = i_tmp + 1
      }
    }
    pick = pick - 1
    y_iris_train = iris_train[,6]
    x_iris_train = iris_train[,1:4]
    yx_iris_train = cbind(x_iris_train, y_iris_train)
    
    y_iris_cross = iris_cross[,6]
    x_iris_cross = iris_cross[,1:4]
    
    iris_model = lm.ridge(y_iris_train~., yx_iris_train, lambda=xlambda[ilambda])
    A = as.array(iris_model$coef[1:4]/iris_model$scales)
    X_train = x_iris_train
    for( i in seq(from = 1, to = ncol(x_iris_train))){
      X_train[,i] = x_iris_train[,i] - iris_model$xm[i]
    }
    X_train=as.matrix(X_train)
    yh = X_train%*%A + iris_model$ym

    yhP = (yh >= 0.0)
    yp = (y_iris_train >= 0.0)
    error_train = error_train + sum(yhP != yp)/(length(y_iris_train)*k)
    X_cross = x_iris_cross
    for( i in seq(from = 1, to = ncol(x_iris_cross))){
      X_cross[,i] = x_iris_cross[,i] - iris_model$xm[i]
    }
    X_cross=as.matrix(X_cross)
    yh = X_cross%*%A + iris_model$ym
    
    
    yhP = (yh >= 0.0)
    yp = (y_iris_cross >= 0.0)
    
    error_cross = error_cross + sum(yhP != yp)/(length(y_iris_cross)*k)
    
  }
  
  error_train_total[ilambda,1] = error_train
  error_cross_total[ilambda,1] = error_cross
}
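
# (Hedged aside) The fold bookkeeping above could also be written more compactly:
#   folds <- rep(1:k, length.out = num_sample)
#   cross-validation set for fold f: iris_data[folds == f, ]
#   training set for fold f:         iris_data[folds != f, ]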

min_iris_lambda <- xlambda[min(which(min(error_cross_total) == error_cross_total))]
th_lambda = min(which(min(error_cross_total) == error_cross_total))
cat(th_lambda, "th lambda", min_iris_lambda, "is optimal.")
## 9 th lambda 25.12 is optimal.
plot(1:length(xlambda),error_train_total[,1],
     ylim=c(min(error_train_total, error_cross_total),
            max(error_train_total, error_cross_total)))
points(1:length(xlambda),error_cross_total[,1], col='red')

[Plot: iris training error (black) and cross-validation error (red) vs. lambda index]

##Q3b
th_list=seq(from=-0.9000, to=0.9000, by=0.1000)

tp = rep(0,times=length(th_list))
fn = rep(0,times=length(th_list))
fp = rep(0,times=length(th_list))
tn = rep(0,times=length(th_list))

y_iris_all = iris_data[,6]
x_iris_all = iris_data[,1:4]
yx_iris_all = cbind(x_iris_all, y_iris_all)

iris_th_model = lm.ridge(y_iris_all~., yx_iris_all, lambda=min_iris_lambda)
A = as.array(iris_th_model$coef[1:4]/iris_th_model$scales)
X_all = x_iris_all
for( i in seq(from = 1, to = ncol(x_iris_all))){
  X_all[,i] = x_iris_all[,i] - iris_th_model$xm[i]
}
X_all=as.matrix(X_all)
yh = X_all%*%A + iris_th_model$ym


ith = 1
for(th in th_list){
  
  tp[ith] = sum( (yh >= th) & (y_iris_all > 0) )
  fn[ith] = sum( (yh < th) & (y_iris_all > 0) )
  fp[ith] = sum( (yh >= th) & (y_iris_all < 0) )
  tn[ith] = sum( (yh < th) & (y_iris_all < 0) )
  
  ith = ith + 1
}

tpr = tp/(tp+fn)
fpr = fp/(fp+tn)

plot(fpr,tpr)
lines(spline(fpr,tpr,n=200), col="red")
points(fpr[10],tpr[10],pch = 19, col="blue")

[Plot: ROC curve (TPR vs. FPR); the blue point marks threshold 0]

cat(th_list[10], " is optimal threshold.")
## 0  is optimal threshold.

A threshold of 0 is the optimal value for separating the two Iris species (the blue point in the plot). At this threshold the performance is approximately TPR = 0.96 and FPR = 0.02.
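
As an additional hedged check (not part of the original assignment), the area under the ROC curve can be approximated from the fpr and tpr vectors above with the trapezoidal rule:

# approximate AUC by the trapezoidal rule over the computed ROC points
ord <- order(fpr)
auc <- sum(diff(fpr[ord]) * (head(tpr[ord], -1) + tail(tpr[ord], -1)) / 2)
cat("approximate AUC:", auc, "\n")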

4. Ridge regression with second degree terms

Add second degree terms to the attributes and use ridge regression to classify the data, then compare the results with those obtained in problem 3. It is fine to use brute force to add these attributes. For those who are adventurous, investigate the function mutate in the package plyr.
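
A hedged base-R alternative to plyr::mutate that builds the same second-degree terms by brute force (addTerms_base is an illustrative name, not used later):

# add all squared and pairwise-product terms of V1..V4 to the iris predictors
addTerms_base <- x_iris_all
for (a in 1:4) for (b in a:4) {
  addTermsx_base_name <- paste0("V", a, ".", b)
  addTerms_base[[addTermsx_base_name]] <- x_iris_all[[a]] * x_iris_all[[b]]
}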

##Q4
require(plyr)
## Loading required package: plyr
addTermsx = x_iris_all
addTermsx = mutate(addTermsx, V1.1 = V1*V1, V2.2 = V2*V2, V3.3 = V3*V3, V4.4 = V4*V4,
                   V1.2 = V1*V2, V1.3 = V1*V3, V1.4 = V1*V4, V2.3 = V2*V3,
                   V2.4 = V2*V4, V3.4=V3*V4)

addTermsxy = cbind(addTermsx, y_iris_all)

error_train_total2 = matrix(0, nrow = length(xlambda), ncol = 1)
error_cross_total2 = matrix(0, nrow = length(xlambda), ncol = 1)

iris_train = addTermsxy[1:(num_sample*((k-1)/k)),]
iris_cross = addTermsxy[1:(num_sample*((1)/k)),]

for(ilambda in 1:length(xlambda)){
  pick = k #pick kth set
  
  error_train = 0
  error_cross = 0
  
  for(j in 1:k){
    i_tmp = 1
    for(i in 1:k){
      
      #choose training set, and cross validation set 
      if(i == pick){
        iris_cross = addTermsxy[((i-1)*num_sample/k+1):(num_sample*(i/k)),]
      } else {
        iris_train[((i_tmp-1)*num_sample/k+1):(num_sample*(i_tmp/k)),
                   ] = addTermsxy[((i-1)*num_sample/k+1):(num_sample*(i/k)),]
        i_tmp = i_tmp + 1
      }
    }
    
    
    pick = pick - 1
    
    y_iris_train = iris_train[,15]
    x_iris_train = iris_train[,-15]
    yx_iris_train = cbind(x_iris_train, y_iris_train)
    
    y_iris_cross = iris_cross[,15]
    x_iris_cross = iris_cross[,-15]
    
    
    iris_model = lm.ridge(y_iris_train~., yx_iris_train, lambda=xlambda[ilambda])
    
    A = as.array(iris_model$coef[1:14]/iris_model$scales)
    X_train = x_iris_train
    for( i in seq(from = 1, to = ncol(x_iris_train))){
      X_train[,i] = x_iris_train[,i] - iris_model$xm[i]
    }
    X_train=as.matrix(X_train)
    yh = X_train%*%A + iris_model$ym
 
    yhP = (yh >= 0.0)
    yp = (y_iris_train >= 0.0)
    error_train = error_train + sum(yhP != yp)/(length(y_iris_train)*k)
    
    
    X_cross = x_iris_cross
    for( i in seq(from = 1, to = ncol(x_iris_cross))){
      X_cross[,i] = x_iris_cross[,i] - iris_model$xm[i]
    }
    X_cross=as.matrix(X_cross)
    yh = X_cross%*%A + iris_model$ym
    
    
    
    yhP = (yh >= 0.0)
    yp = (y_iris_cross >= 0.0)
    
    error_cross = error_cross + sum(yhP != yp)/(length(y_iris_cross)*k)
    
  }
  
  error_train_total2[ilambda,1] = error_train
  error_cross_total2[ilambda,1] = error_cross
}

min_iris_lambda2 <- xlambda[min(which(min(error_cross_total2) == error_cross_total2))]
th_lambda = min(which(min(error_cross_total2) == error_cross_total2))
cat(th_lambda, "th lambda", min_iris_lambda2, "is optimal.")
## 10 th lambda 15.85 is optimal.
plot(1:length(xlambda),error_train_total2[,1],
     ylim=c(min(error_train_total2, error_cross_total2),
            max(error_train_total2, error_cross_total2)))
points(1:length(xlambda),error_cross_total2[,1], col='red')

[Plot: training error (black) and cross-validation error (red) vs. lambda index, model with second-degree terms]

sprintf("minimum error of linear model with no additional term: %f", min(error_cross_total))
## [1] "minimum error of linear model with no additional term: 0.030000"
sprintf("minimum error of linear model with 2 dim additional term: %f", min(error_cross_total2))
## [1] "minimum error of linear model with 2 dim additional term: 0.020000"

The classification error decreases when we add second degree terms to the linear ridge model. This suggests that the Iris data set has not only linear characteristics but also nonlinear characteristics, such as second-degree combinations of its attributes.

5. Ridge regression on the multi-class problem of the Glass Identification Data Set

This problem only works with building and vehicle window glass (classes 1, 2 and 3), so it uses only the first 163 rows of data (ignore rows 164 through 214). With this setup it is a three-class problem. Use ridge regression to classify this data into the three classes: building windows float processed, building windows non-float processed, and vehicle windows float processed.
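
The code below trains three one-vs-rest ridge classifiers, one per class. A hedged sketch of how their scores could be combined into a single three-class prediction (score1, score2, score3 and predict_class are illustrative names, not part of the code below) is to take the class whose classifier gives the largest score:

# pick, per observation, the class with the largest one-vs-rest score
predict_class <- function(score1, score2, score3) {
  max.col(cbind(score1, score2, score3))   # returns 1, 2 or 3 for each row
}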

  ## Q5
  k = 10
  glass_data = read.table("/home/archana/ML_works_ucsc/glass.txt", header=FALSE, sep=',')
  glass_data = glass_data[1:163,]
  
  row.names(glass_data)<-NULL
  
  num_sample = nrow(glass_data)
  glass_data = glass_data[sample(num_sample, num_sample, replace=FALSE),]
  
  class1 = rep(0,163)
  class2 = rep(0,163)
  class3 = rep(0,163)
  
  class1[glass_data[,11]==1] = 1
  class1[glass_data[,11]!=1] = -1
  
  class2[glass_data[,11]==2] = 1
  class2[glass_data[,11]!=2] = -1
  
  class3[glass_data[,11]==3] = 1
  class3[glass_data[,11]!=3] = -1
  
  
  glass_data = cbind(glass_data, class1, class2, class3)
  
  
  min_glass_lambda=rep(1000,3)
  glass_train = glass_data[1:round((num_sample*((k-1)/k))),]
  
  error_glass_train_total = matrix(0, nrow = length(xlambda), ncol = 3)
  error_glass_cross_total = matrix(0, nrow = length(xlambda), ncol = 3)
  
  th_lambda = c(0,0,0)
  min_error = c(0,0,0)
  
  for(iy in 1:3){
    
    for(ilambda in 1:length(xlambda)){
      pick = k #pick kth set
      
      error_train = 0
      error_cross = 0
      
      for(j in 1:k){
        i_tmp = 1
        for(i in 1:k){
          
          #choose training set, and cross validation set 
          if(i == pick){
            glass_cross = glass_data[(round((i-1)*num_sample/k)+1):(round(num_sample*(i/k))),]
          } else {
            glass_train[((i_tmp-1)*num_sample/k+1):(num_sample*(i_tmp/k)),
                        ] = glass_data[((i-1)*num_sample/k+1):(num_sample*(i/k)),]
            i_tmp = i_tmp + 1
          }
        }
        
        pick = pick - 1
        
        y_glass_train = glass_train[,11+iy]
        x_glass_train = glass_train[,1:10]
        yx_glass_train = cbind(x_glass_train, y_glass_train)
        
        y_glass_cross = glass_cross[,11+iy]
        x_glass_cross = glass_cross[,1:10]
        
        glass_model = lm.ridge(y_glass_train~., yx_glass_train, lambda=xlambda[ilambda])
        A = as.array(glass_model$coef[1:10]/glass_model$scales)      
        X_train = x_glass_train
        for( i in seq(from = 1, to = ncol(x_glass_train))){
          X_train[,i] = x_glass_train[,i] - glass_model$xm[i]
        }
        X_train=as.matrix(X_train)
        yh = X_train%*%A + glass_model$ym
        
        
        yhP = (yh >= 0.0)
        yp = (y_glass_train >= 0.0)
        error_train = error_train + sum(yhP != yp)/(length(y_glass_train)*k)
        
        
        X_cross = x_glass_cross
        for( i in seq(from = 1, to = ncol(x_glass_cross))){
          X_cross[,i] = x_glass_cross[,i] - glass_model$xm[i]
        }
        X_cross=as.matrix(X_cross)
        yh = X_cross%*%A + glass_model$ym
        
        
        yhP = (yh >= 0.0)
        yp = (y_glass_cross >= 0.0)
        
        error_cross = error_cross + sum(yhP != yp)/(length(y_glass_cross)*k)
        
      }
      
      error_glass_train_total[ilambda,iy] = error_train
      error_glass_cross_total[ilambda,iy] = error_cross
      
    }
    min_glass_lambda[iy] <- xlambda[min(which(min(error_glass_cross_total[,iy
                                        ]) == error_glass_cross_total[,iy]))]
    th_lambda[iy]=min(which(min(error_glass_cross_total[,iy]) == error_glass_cross_total[,iy]))
  }
  
  sprintf("minimum classification error are %f, %f, %f",
          error_glass_cross_total[5,1], 
          error_glass_cross_total[11,2],
          error_glass_cross_total[13,3])
## [1] "minimum classification error are 0.018382, 0.165441, 0.085662"
  plot(1:length(xlambda),error_glass_train_total[,1],
       ylim=c(min(error_glass_train_total, error_glass_cross_total),
              max(error_glass_train_total, error_glass_cross_total)))
  points(1:length(xlambda),error_glass_cross_total[,1], col='red')

[Plot: class 1 (building windows float processed) training error (black) and CV error (red) vs. lambda index]

  #windows non float
  plot(1:length(xlambda),error_glass_train_total[,2],
       ylim=c(min(error_glass_train_total, error_glass_cross_total),
              max(error_glass_train_total, error_glass_cross_total)))
  points(1:length(xlambda),error_glass_cross_total[,2], col='red')

[Plot: class 2 (building windows non-float processed) training error (black) and CV error (red) vs. lambda index]

  #vehicle_float
  plot(1:length(xlambda),error_glass_train_total[,3],
       ylim=c(min(error_glass_train_total, error_glass_cross_total),
              max(error_glass_train_total, error_glass_cross_total)))
  points(1:length(xlambda),error_glass_cross_total[,3], col='red')

[Plot: class 3 (vehicle windows float processed) training error (black) and CV error (red) vs. lambda index]

The plots show that float-processed window glass can be difficult to distinguish from non-float window glass, as reflected in the higher cross-validation errors for classes 2 and 3 compared with class 1.