1(a) Once again check out wine quality data set described in the web page below: http://archive.ics.uci.edu/ml/machine-learning-databases/winequality/winequality.names Remember the Red Wine data set (winequality-red.csv) contains 1599 observations of 11 attributes. The median score of the wine tasters is given in the last column. Note also that the delimiter used in this file is a semi colon and not a comma. This problem is to create an ordinary least squares linear model (use the lm function in R) for this data set using the first 1400 observations. Don’t forget to scale each column before you create the model. Next check the model’s performance on the last 199 observations. How well did the model predict the results for the last 199 observations? What measure did you use to evaluate how well the model did this prediction? Next use the model to predict the results for the whole data set and measure how well your model worked. (hint: use the r function lm and the regression example from class)

#1(a)
setwd("C:/Users/Manjari/Desktop/Machine learning/Home Work Solutions")
wine_data <- read.table("winequality-red.txt", header=TRUE, sep=";")
wine_train = wine_data[1:1400,]
wine_test = wine_data[1401:1599,]
rmse=function(X,Y){
  return( sqrt(sum((X-Y)^2)/length(Y)) )
}
y_train = wine_train[,12]
lmodel = lm(quality~., data = wine_train)
wine_train_err = rmse(y_train, predict(lmodel, newdata = wine_train[,-12]))
y_test = wine_test[,12]
wine_test_err = rmse(y_test, predict(lmodel, newdata = wine_test[,-12]))
cat("training error: ", wine_train_err, ", test error: ", wine_test_err, "\n")
## training error:  0.638642 , test error:  0.6977601
#Q1b
y_whole = y_train
y_whole = append(y_whole, y_test)
wine_whole_err = rmse(y_whole,
  predict(lmodel, newdata = rbind(wine_train[,-12], wine_test[,-12])))
cat("training error: ", wine_train_err, ", test error: ", wine_test_err,
    ", whole data error: ", wine_whole_err, "\n")
## training error:  0.638642 , test error:  0.6977601 , whole data error:  0.6462941
  1. Perform a ridge regression on the wine quality data set from problem 1 using only the first 1400 observations. Compare the results of applying the ridge regression model to the last 199 observations with the results of applying the ordinary least square model to these observations. Compare the coefficients resulting from the ridge regression with the coefficients that were obtained in problem 1. What conclusions can you make from this comparison?
library("ridge")
xlambda=rep(0, times = 30)
for(i in seq(from = 0, to = 29)){
  
exp <- (+3 -4*(i/20))
  xlambda[i+1] <- 10^exp
}
wine_train_err_lr = rep(0, times = length(xlambda))
wine_test_err_lr = rep(0, times = length(xlambda))
min_lambda = 10000
for(i in 1:length(xlambda)){
  lrmodel = linearRidge(quality~., data = wine_train, lambda = xlambda[i])
  wine_train_err_lr[i] = rmse(y_train, predict(lrmodel, newdata = wine_train[,-12]))
  wine_test_err_lr[i] = rmse(y_test, predict(lrmodel, newdata = wine_test[,-12]))
  if(i > 1 && wine_test_err_lr[i] < (wine_test_err_lr[i-1]-0.005)){
    min_lambda = xlambda[i]
    index_lambda=i
  }
}
plot(1:length(xlambda),wine_train_err_lr,
     ylim=c(min(wine_train_err_lr, wine_test_err_lr),
            max(wine_train_err_lr, wine_test_err_lr)))
points(1:length(xlambda),wine_test_err_lr, col='red')

cat(index_lambda, "th ", "lambda is optimal: ", min_lambda, "\n")
## 18 th  lambda is optimal:  0.3981072
cat("ridge training error: ", wine_train_err_lr[xlambda==min_lambda],
    ", test error: ", wine_test_err_lr[xlambda==min_lambda], "\n")
## ridge training error:  0.6482447 , test error:  0.7092735
## Q2b
lrmodel2 = linearRidge(quality~., data = wine_train, lambda=min_lambda)
wine_train_err_lr2 = rmse(y_train, predict(lrmodel2, newdata = wine_train[,-12]))
wine_test_err_lr2 = rmse(y_test, predict(lrmodel2, newdata = wine_test[,-12]))
cat("non-ridge training error: ", wine_train_err, ", test error: ", wine_test_err, "\n")
## non-ridge training error:  0.638642 , test error:  0.6977601
cat("ridge training error: ", wine_train_err_lr2, ", test error: ", wine_test_err_lr2, "\n")
## ridge training error:  0.6482447 , test error:  0.7092735
lrmodel0 = linearRidge(quality~., data = wine_train, lambda=0)
#coeficients of the linear and ridge regression model
lrmodel0$coef
##                            [,1]
## fixed.acidity         1.5267106
## volatile.acidity     -7.2101097
## citric.acid          -1.0628381
## residual.sugar        0.3144794
## chlorides            -3.3323779
## free.sulfur.dioxide   1.3092827
## total.sulfur.dioxide -4.2521873
## density              -1.0535142
## pH                   -1.6969912
## sulphates             5.7144814
## alcohol              11.2483663
lrmodel2$coef
##                            [,1]
## fixed.acidity         1.7008522
## volatile.acidity     -5.3273062
## citric.acid           1.2136676
## residual.sugar        0.5699825
## chlorides            -2.5102442
## free.sulfur.dioxide   0.2720990
## total.sulfur.dioxide -3.0673511
## density              -2.3773154
## pH                   -0.3890982
## sulphates             4.1571833
## alcohol               7.8186197
plot(abs(lrmodel0$coef))
points(abs(lrmodel2$coef),col="red")

  1. This problem uses the Iris Data Set. It only involves the Versicolor and Virginica species (rows 51 through 150). Use cross validated ridge regression to classify these two species. Create and plot a ROC curve for this classification method.
#Q3a
library(MASS)
iris_data = read.table("irisdata.csv", sep=",", header=FALSE)
# we only need Versicolor and Virginica data.
iris_data = iris_data[51:150,]
target = rep(0,100)
target[iris_data[,5]=="Iris-versicolor"] = 1
target[iris_data[,5]!="Iris-versicolor"] = -1
row.names(iris_data)<-NULL
iris_data = cbind(iris_data, target)
set_seed <- function(i) {
  set.seed(i)
  if (exists(".Random.seed"))  oldseed <- get(".Random.seed", .GlobalEnv)
  if (exists(".Random.seed"))  assign(".Random.seed", oldseed, .GlobalEnv)
}

k = 10
# 10-fold cross valiation is used.
num_sample = nrow(iris_data)

set_seed(123)
iris_data = iris_data[sample(num_sample, num_sample, replace=FALSE),]
iris_train = iris_data[1:(num_sample*((k-1)/k)),]
iris_cross = iris_data[1:(num_sample*(1/k)),]

error_train_total = matrix(0, nrow = length(xlambda), ncol = 1)
error_cross_total = matrix(0, nrow = length(xlambda), ncol = 1)
for(ilambda in 1:length(xlambda)){
  pick = k #pick kth set
  
  error_train = 0
  error_cross = 0
  
  for(j in 1:k){
    i_tmp = 1
    for(i in 1:k){
      
      #choose training set, and cross validation set 
      if(i == pick){
        iris_cross = iris_data[((i-1)*num_sample/k+1):(num_sample*(i/k)),]
      } else {
        iris_train[((i_tmp-1)*num_sample/k+1):(num_sample*(i_tmp/k)), 
                   ] = iris_data[((i-1)*num_sample/k+1):(num_sample*i/k),]
        i_tmp = i_tmp + 1
      }
    }
    pick = pick - 1
    y_iris_train = iris_train[,6]
    x_iris_train = iris_train[,1:4]
    yx_iris_train = cbind(x_iris_train, y_iris_train)
    
    y_iris_cross = iris_cross[,6]
    x_iris_cross = iris_cross[,1:4]
    
    iris_model = lm.ridge(y_iris_train~., yx_iris_train, lambda=xlambda[ilambda])
    A = as.array(iris_model$coef[1:4]/iris_model$scales)
    X_train = x_iris_train
    for( i in seq(from = 1, to = ncol(x_iris_train))){
      X_train[,i] = x_iris_train[,i] - iris_model$xm[i]
    }
    X_train=as.matrix(X_train)
    yh = X_train%*%A + iris_model$ym

    yhP = (yh >= 0.0)
    yp = (y_iris_train >= 0.0)
    error_train = error_train + sum(yhP != yp)/(length(y_iris_train)*k*0.00001/0.00001)
    X_cross = x_iris_cross
    for( i in seq(from = 1, to = ncol(x_iris_cross))){
      X_cross[,i] = x_iris_cross[,i] - iris_model$xm[i]
    }
    X_cross=as.matrix(X_cross)
    yh = X_cross%*%A + iris_model$ym
    
    
    yhP = (yh >= 0.0)
    yp = (y_iris_cross >= 0.0)
    
    error_cross = error_cross + sum(yhP != yp)/(length(y_iris_cross)*k*0.00001/0.00001)
    
  }
  
  error_train_total[ilambda,1] = error_train
  error_cross_total[ilambda,1] = error_cross
}

min_iris_lambda <- xlambda[min(which(min(error_cross_total) == error_cross_total))]
th_lambda = min(which(min(error_cross_total) == error_cross_total))
cat(th_lambda, "th lambda", min_iris_lambda, "is optimal.")
## 9 th lambda 25.11886 is optimal.
plot(1:length(xlambda),error_train_total[,1],
     ylim=c(min(error_train_total, error_cross_total),
            max(error_train_total, error_cross_total)))
points(1:length(xlambda),error_cross_total[,1], col='red')

  1. See if you can improve on regression-based classification of the iris data that we did in class. Classify the iris data set with second degree terms added using a ridge regression. (ie supplement the original 4 attributes x1, x2, x3, and x4 by including the 10 second degree terms ( x1x1, x1x2, x1*x3, . ) for a total of 14 attributes.) Use multiclass to classify the data and then compare the results with the results obtained in class. It is fine to use brute force to add these attributes. For those who are adventurous, investigate the function mutate in the package plyr.
attach(iris)
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-2
library(plyr)

oldpar <- par(no.readonly=TRUE)
names(iris)
## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width" 
## [5] "Species"
dim(iris)
## [1] 150   5
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
##data manipulation using mutate
iris_test<-mutate(iris, X5 =Sepal.Length^2, X6= Sepal.Length* Sepal.Width,
                   X7=Sepal.Length*Petal.Length, X8=Sepal.Length*Petal.Width,
                   X9 =Sepal.Width^2, X10=Sepal.Width*Petal.Length, 
                   X11=Sepal.Width*Petal.Width, X12= Petal.Length^2, 
                   X13=Petal.Length*Petal.Width, X14=Petal.Width^2)
                   
iris_data<-iris[51:150,]                  
iris_trans1<-iris_test[, -5]   
iris_trans2<-iris_test[, 5]
iris_trans<-cbind(iris_trans1, iris_trans2)


##iris_data<-iris[51:150,]
iris_data<-iris_trans[51:150,]   


Y1 <- c(rep(-1,100))
Y2 <- Y1

Y1[1:50] <- 1        #Y1 will be used to separate out Versicolor
Y2[51:100] <- 1      #Y2 is used to separate out Virginica
I <- 1:100
target <- cbind(Y1,Y2)
head(target)
##      Y1 Y2
## [1,]  1 -1
## [2,]  1 -1
## [3,]  1 -1
## [4,]  1 -1
## [5,]  1 -1
## [6,]  1 -1
X <- iris_data[,-15]

##place holder for input and ouput attirbutes
x_trainIn1<-0.0
x_trainOut1<-0.0
x_trainIn2<-0.0
x_trainOut2<-0.0

nxval=5 ### for 5-fold cross validation

ntol=nrow(iris_data) ##number of rows
y_trainIn<-matrix(nrow = (ntol - ntol/nxval), ncol = 2)
y_trainOut<-matrix(nrow = ntol/nxval, ncol = 2)
testErr <- matrix(nrow = nxval, ncol = 2)
lambda <- matrix(nrow = nxval, ncol = 2)
prolambda <- c(rep(0,2)) # prolambda will hold these three lambdas


for(ident in seq(1,2)){  # find the best lambda for each model
    grid=10^seq(10,-2,length=100)
    trainErr <- 0.0
    bestlam <-0.0
    
    for(ixval in seq(from = 1, to = nxval)){
        Iout <- which(I%%nxval== ixval - 1)
        Xin <- X[-Iout,]
        Xout <- X[Iout,]
        Yin <- target[-Iout,ident]  # run with Y1, Y2 
        Yout <- target[Iout,ident]  # run with Y1, Y2 
        dataIn <- cbind(Xin,Yin)
        
        if(ident==1){
            y_trainIn[, 1]<-target[-Iout,ident] 
            y_trainOut[, 1]<-target[Iout,ident]
            x_trainIn1<-X[-Iout,]
            x_trainOut1<-X[Iout,]
            }
        
        else if(ident==2) {
            y_trainIn[, 2]<-target[-Iout,ident]
            y_trainOut[, 2] <- target[Iout,ident]
            x_trainIn2<-X[-Iout,]
            x_trainOut2<-X[Iout,]
            }
    
## to get best lambda
    cv.out=cv.glmnet(as.matrix(Xin),as.matrix(Yin),alpha=0,lambda=grid)
    bestlam=cv.out$lambda.min
    
    # make fair comparison of error
    ridgeMod =glmnet(as.matrix(Xin), as.matrix(Yin),alpha=0,lambda=bestlam)
    yEst <- predict(ridgeMod, newx = as.matrix(Xout))
    
    testErr[ixval, ident] <-sqrt( sum((Yout-yEst)^2))/length(yEst)
    lambda[ixval, ident]<-bestlam

}##end Y1 and Y2 loop
  
  testErr
  MintestErr<-apply(testErr, 2, min)
  MintestErr # 0.09550798 0.09550798
  index<- which(min(testErr[, ident])==testErr[, ident])
  print(index)  
  prolambda[ident]<- lambda[index, ident]
  ##prolambda1[2]<- lambda[index, 2]
  
}
## [1] 2
## [1] 2
prolambda ## [1] 0.03053856 0.02310130
## [1] 0.01000000 0.07054802
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
#Create the production model

##appropiate lambda for Y1 (versicolor)
xlambda1 <- prolambda[1]
Xin1 <- x_trainIn1
Yin1 <-y_trainIn[, 1]
dataIn1 <- cbind(Xin1,Yin1)
ridgeMod1 =glmnet(as.matrix(Xin1), as.matrix(Yin1),alpha=0,lambda=xlambda1)

pred_y1 <- predict(ridgeMod1, newx = as.matrix(x_trainOut1))
testErr1 <-sqrt( sum((y_trainOut[, 1] - pred_y1)^2))/length(pred_y1)

testErr1 #[1] 0.1417557
## [1] 0.1417557
###
##comparing predicted and test values
pred1<-prediction(pred_y1, as.matrix(y_trainOut[, 1]))
###Get TPR and FPR values 
perf1<-performance(pred1,"tpr","fpr")
##performance(pred1,"auc") #

###appropiate lambda for Y2 (virginica)
xlambda2 <- prolambda[2]
Xin2 <- x_trainIn2
Yin2 <- y_trainIn[, 2]
dataIn2 <- cbind(Xin2,Yin2)
ridgeMod2 =glmnet(as.matrix(Xin2), as.matrix(Yin2),alpha=0,lambda=xlambda2)

##
pred_y2 <- predict(ridgeMod2, newx = as.matrix(x_trainOut2))
##error
testErr2 <-sqrt( sum((y_trainOut[, 2] -pred_y2)^2))/length(pred_y2)
testErr2 #[1]  0.1430702
## [1] 0.1424886
pred2<-prediction(pred_y2, as.matrix(y_trainOut[, 2]))
perf2<-performance(pred2,"tpr","fpr")
##performance(pred2,"auc") #
##ROCR gives S4  class variable and it doesn't work like S3 which we used to
## Need to call with '@' instead of '$'. It doesn't work for points
##Need to change data.frame then as.matrix (direct transformation doesn't work)
x.values= data.frame(perf2@x.values)
y.values= data.frame(perf2@y.values)

plot(perf1,col="green", type = "l") # plot ROC curve
points(as.matrix(x.values), as.matrix(y.values),col="blue", type = "p") # plot ROC curve
abline(0,1)
legend("bottomright", c("Versicolor","Virginica"), cex = 0.6, 
       pch = c(20,20), col = c("green", "blue"))# lty = 1:2,

  1. This is a multi-class problem. Consider the Glass Identification Data Set from the UC Irvine Data Repository. The Data is located at the web site: http://archive.ics.uci.edu/ml/datasets/Glass%2BIdentification This problem will only work with building and vehicle window glass (classes 1,2 and 3), so it only uses the first 163 rows of data. (Ignore rows 164 through 214) With this set up this is a three class problem. Use ridge regression to classify this data into the three classes: building windows float processed, building windows non float processed, and vehicle windows float processed.
 ## Q5
  k = 10
  glass_data = read.table("glass.txt", header=FALSE, sep=',')
  glass_data = glass_data[1:163,]
  
  row.names(glass_data)<-NULL
  
  num_sample = nrow(glass_data)
  glass_data = glass_data[sample(num_sample, num_sample, replace=FALSE),]
  
  class1 = rep(0,163)
  class2 = rep(0,163)
  class3 = rep(0,163)
  
  class1[glass_data[,11]==1] = 1
  class1[glass_data[,11]!=1] = -1
  
  class2[glass_data[,11]==2] = 1
  class2[glass_data[,11]!=2] = -1
  
  class3[glass_data[,11]==3] = 1
  class3[glass_data[,11]!=3] = -1
  
  
  glass_data = cbind(glass_data, class1, class2, class3)
  
  
  min_glass_lambda=rep(1000,3)
  glass_train = glass_data[1:round((num_sample*((k-1)/k))),]
  
  error_glass_train_total = matrix(0, nrow = length(xlambda), ncol = 3)
  error_glass_cross_total = matrix(0, nrow = length(xlambda), ncol = 3)
  
  th_lambda = c(0,0,0)
  min_error = c(0,0,0)
  
  for(iy in 1:3){
    
    for(ilambda in 1:length(xlambda)){
      pick = k #pick kth set
      
      error_train = 0
      error_cross = 0
      
      for(j in 1:k){
        i_tmp = 1
        for(i in 1:k){
          
          #choose training set, and cross validation set 
          if(i == pick){
            glass_cross = glass_data[(round((i-1)*num_sample/k)+1):(round(num_sample*(i/k))),]
          } else {
            glass_train[((i_tmp-1)*num_sample/k+1):(num_sample*(i_tmp/k)),
                        ] = glass_data[((i-1)*num_sample/k+1):(num_sample*(i/k)),]
            i_tmp = i_tmp + 1
          }
        }
        
        pick = pick - 1
        
        y_glass_train = glass_train[,11+iy]
        x_glass_train = glass_train[,1:10]
        yx_glass_train = cbind(x_glass_train, y_glass_train)
        
        y_glass_cross = glass_cross[,11+iy]
        x_glass_cross = glass_cross[,1:10]
        
        glass_model = lm.ridge(y_glass_train~., yx_glass_train, lambda=xlambda[ilambda])
        A = as.array(glass_model$coef[1:10]/glass_model$scales)      
        X_train = x_glass_train
        for( i in seq(from = 1, to = ncol(x_glass_train))){
          X_train[,i] = x_glass_train[,i] - glass_model$xm[i]
        }
        X_train=as.matrix(X_train)
        yh = X_train%*%A + glass_model$ym
        
        
        yhP = (yh >= 0.0)
        yp = (y_glass_train >= 0.0)
        error_train = error_train + sum(yhP != yp)/(length(y_glass_train)*k*0.00001/0.00001)
        
        
        X_cross = x_glass_cross
        for( i in seq(from = 1, to = ncol(x_glass_cross))){
          X_cross[,i] = x_glass_cross[,i] - glass_model$xm[i]
        }
        X_cross=as.matrix(X_cross)
        yh = X_cross%*%A + glass_model$ym
        
        
        yhP = (yh >= 0.0)
        yp = (y_glass_cross >= 0.0)
        
        error_cross = error_cross + sum(yhP != yp)/(length(y_glass_cross)*k*0.00001/0.00001)
        
      }
      
      error_glass_train_total[ilambda,iy] = error_train
      error_glass_cross_total[ilambda,iy] = error_cross
      
    }
    min_glass_lambda[iy] <- xlambda[min(which(min(error_glass_cross_total[,iy
                                        ]) == error_glass_cross_total[,iy]))]
    th_lambda[iy]=min(which(min(error_glass_cross_total[,iy]) == error_glass_cross_total[,iy]))
  }
  
  sprintf("minimum classification error are %f, %f, %f",
          error_glass_cross_total[5,1], 
          error_glass_cross_total[11,2],
          error_glass_cross_total[13,3])
## [1] "minimum classification error are 0.018382, 0.171691, 0.091912"
## [1] "minimum classification error are 0.018382, 0.165441, 0.085662"
  plot(1:length(xlambda),error_glass_train_total[,1],
       ylim=c(min(error_glass_train_total, error_glass_cross_total),
              max(error_glass_train_total, error_glass_cross_total)))
  points(1:length(xlambda),error_glass_cross_total[,1], col='red')

plot(1:length(xlambda),error_glass_train_total[,2],
       ylim=c(min(error_glass_train_total, error_glass_cross_total),
              max(error_glass_train_total, error_glass_cross_total)))
  points(1:length(xlambda),error_glass_cross_total[,2], col='red')

plot(1:length(xlambda),error_glass_train_total[,3],
       ylim=c(min(error_glass_train_total, error_glass_cross_total),
              max(error_glass_train_total, error_glass_cross_total)))
  points(1:length(xlambda),error_glass_cross_total[,3], col='red')