rpart ridge MASS glmnet ROCR dplyr
Once again check out wine quality data set described in the web page below: http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality.names Remember the Red Wine data set (winequality-red.csv) contains 1599 observations of 11 attributes. The median score of the wine tasters is given in the last column. Note also that the delimiter used in this file is a semi colon and not a comma. This problem is to create an ordinary least squares linear model (use the lm function in R) for this data set using the first 1400 observations. Don’t forget to scale each column before you create the model. Next check the model’s performance on the last 199 observations. How well did the model predict the results for the last 199 observations? What measure did you use to evaluate how well the model did this prediction? Next use the model to predict the results for the whole data set and measure how well your model worked. (hint: use the r function lm and the regression example from class)
## Some house keep jobs:
# Set working directory:
rm(list=ls())
require(rpart)
## Loading required package: rpart
train<-read.csv("C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/winequality-red.txt",sep=";",header=TRUE)
head(train)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.4 0.70 0.00 1.9 0.076
## 2 7.8 0.88 0.00 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.70 0.00 1.9 0.076
## 6 7.4 0.66 0.00 1.8 0.075
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.20 0.68 9.8
## 3 15 54 0.9970 3.26 0.65 9.8
## 4 17 60 0.9980 3.16 0.58 9.8
## 5 11 34 0.9978 3.51 0.56 9.4
## 6 13 40 0.9978 3.51 0.56 9.4
## quality
## 1 5
## 2 5
## 3 5
## 4 6
## 5 5
## 6 5
dim(train)
## [1] 1599 12
data_scaled<-as.data.frame(scale(train[,]))
head(data_scaled)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 -0.5281944 0.9615758 -1.391037 -0.45307667 -0.24363047
## 2 -0.2984541 1.9668271 -1.391037 0.04340257 0.22380518
## 3 -0.2984541 1.2966596 -1.185699 -0.16937425 0.09632273
## 4 1.6543385 -1.3840105 1.483689 -0.45307667 -0.26487754
## 5 -0.5281944 0.9615758 -1.391037 -0.45307667 -0.24363047
## 6 -0.5281944 0.7381867 -1.391037 -0.52400227 -0.26487754
## free.sulfur.dioxide total.sulfur.dioxide density pH
## 1 -0.46604672 -0.3790141 0.55809987 1.2882399
## 2 0.87236532 0.6241680 0.02825193 -0.7197081
## 3 -0.08364328 0.2289750 0.13422152 -0.3310730
## 4 0.10755844 0.4113718 0.66406945 -0.9787982
## 5 -0.46604672 -0.3790141 0.55809987 1.2882399
## 6 -0.27484500 -0.1966174 0.55809987 1.2882399
## sulphates alcohol quality
## 1 -0.57902538 -0.9599458 -0.7875763
## 2 0.12891007 -0.5845942 -0.7875763
## 3 -0.04807379 -0.5845942 -0.7875763
## 4 -0.46103614 -0.5845942 0.4507074
## 5 -0.57902538 -0.9599458 -0.7875763
## 6 -0.57902538 -0.9599458 -0.7875763
y_train<-train[, 12]
head(y_train)
## [1] 5 5 5 6 5 5
data_trainIn<-as.data.frame(scale(train[1:1400, ]))
head(data_trainIn)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 -0.6217657 0.9709908 -1.429300 -0.49308621 -0.26869645
## 2 -0.3934452 1.9759178 -1.429300 0.05382623 0.18205794
## 3 -0.3934452 1.3059664 -1.224791 -0.18056482 0.05912493
## 4 1.5472792 -1.3738389 1.433828 -0.49308621 -0.28918528
## 5 -0.6217657 0.9709908 -1.429300 -0.49308621 -0.26869645
## 6 -0.6217657 0.7476737 -1.429300 -0.57121656 -0.28918528
## free.sulfur.dioxide total.sulfur.dioxide density pH
## 1 -0.43642897 -0.3902667 0.49738799 1.3395776
## 2 0.92958952 0.6046744 -0.03340185 -0.6548535
## 3 -0.04613797 0.2127279 0.07275612 -0.2688346
## 4 0.14900753 0.3936263 0.60354596 -0.9121995
## 5 -0.43642897 -0.3902667 0.49738799 1.3395776
## 6 -0.24128347 -0.2093684 0.49738799 1.3395776
## sulphates alcohol quality
## 1 -0.55618727 -0.9250909 -0.7970650
## 2 0.12965728 -0.5557843 -0.7970650
## 3 -0.04180386 -0.5557843 -0.7970650
## 4 -0.44187984 -0.5557843 0.4455753
## 5 -0.55618727 -0.9250909 -0.7970650
## 6 -0.55618727 -0.9250909 -0.7970650
y_trainIn<-train[1:1400, 12]
head(y_trainIn)
## [1] 5 5 5 6 5 5
## select last 199 rows for cross validation
data_trainOut<-as.data.frame(scale(train[1401:1599 , ]))
head(data_trainOut)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1401 0.72341166 0.8393421 -0.003382688 -0.23327968 0.1381207
## 1402 0.72341166 0.8393421 -0.003382688 -0.23327968 0.1381207
## 1403 0.44297220 -1.3440445 1.174638570 -0.28060145 -0.7992829
## 1404 0.06905293 -1.1760917 0.669772317 -0.42256677 -0.4979746
## 1405 0.81689148 -0.2243591 1.006349819 0.00332917 0.2050781
## 1406 0.53645202 -1.4560130 0.501483565 -0.28060145 -0.4644959
## free.sulfur.dioxide total.sulfur.dioxide density pH
## 1401 1.24569608 3.17999948 0.15918760 -0.9800461
## 1402 1.24569608 3.17999948 0.15918760 -0.9800461
## 1403 -1.10689620 -0.62101330 0.21789466 0.4892459
## 1404 -1.36829534 -0.97837347 0.04177347 -1.1347085
## 1405 -0.58409791 0.09370705 1.50945007 0.4119147
## 1406 -0.06129963 -0.29614041 -0.42788304 -0.7480527
## sulphates alcohol quality
## 1401 -1.2351520 -0.7269226 -0.7219613
## 1402 -1.2351520 -0.7269226 -0.7219613
## 1403 1.2508684 1.4405030 0.4853521
## 1404 3.4963062 -0.6185513 2.8999791
## 1405 -0.3530157 0.1400476 0.4853521
## 1406 1.8924221 0.7902753 1.6926656
y_trainOut<-train[1401:1599, 12]
head(y_trainOut)
## [1] 5 5 6 8 6 7
## predic model for last 199 observations
lmmodel <- lm(y_trainIn~., data = data_trainIn)
lmmodel
##
## Call:
## lm(formula = y_trainIn ~ ., data = data_trainIn)
##
## Coefficients:
## (Intercept) fixed.acidity volatile.acidity
## 5.641e+00 -1.733e-15 -1.423e-16
## citric.acid residual.sugar chlorides
## 1.382e-15 1.914e-16 -2.769e-16
## free.sulfur.dioxide total.sulfur.dioxide density
## 6.395e-16 -1.273e-16 3.607e-17
## pH sulphates alcohol
## -2.079e-15 1.940e-17 1.796e-15
## quality
## 8.047e-01
yEst<- predict(lmmodel, newdata = data_trainOut)
head(yEst)
## 1401 1402 1403 1404 1405 1406
## 5.060439 5.060439 6.032010 7.975152 6.032010 7.003581
# Error:
error <- sqrt((sum((y_trainOut-yEst)^2))/length(y_trainOut))
error
## [1] 0.04938217
## predic model for last whole data observations
yEst_all<- predict(lmmodel, newdata = data_scaled)
head(yEst_all)
## 1 2 3 4 5 6
## 5.007636 5.007636 5.007636 6.004130 5.007636 5.007636
error_all <- sqrt((sum((y_train-yEst_all)^2))/length(y_train))
error_all
## [1] 0.006102212
using only the first 1400 observations. Compare the results of applying the ridge regression model to the last 199 observations with the results of applying the ordinary least square model to these observations. Compare the coefficients resulting from the ridge regression with the coefficients that were obtained in problem 1. What conclusions can you make from this comparison?
# Method 1: Using ridge
require(rpart)
library(ridge)
## Warning: package 'ridge' was built under R version 3.1.2
train<-read.csv("C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/winequality-red.txt",sep=";",header=TRUE)
data_scaled<-as.data.frame(scale(train[,]))
y_train<-train[, 12]
data_trainIn<-as.data.frame(scale(train[1:1400, ]))
y_trainIn<-train[1:1400, 12]
##select last 199 rows for cross validation
data_trainOut<-as.data.frame(scale(train[1401:1599 , ]))
y_trainOut<-train[1401:1599, 12]
## predic model for last 199 observations
lmmodel <- lm(y_trainIn~., data = data_trainIn)
lmmodel
##
## Call:
## lm(formula = y_trainIn ~ ., data = data_trainIn)
##
## Coefficients:
## (Intercept) fixed.acidity volatile.acidity
## 5.641e+00 -1.733e-15 -1.423e-16
## citric.acid residual.sugar chlorides
## 1.382e-15 1.914e-16 -2.769e-16
## free.sulfur.dioxide total.sulfur.dioxide density
## 6.395e-16 -1.273e-16 3.607e-17
## pH sulphates alcohol
## -2.079e-15 1.940e-17 1.796e-15
## quality
## 8.047e-01
yEst<- predict(lmmodel, newdata = data_trainOut)
error <- sqrt(sum((y_trainOut-yEst)^2)/length(y_trainOut))
error # [1] 0.04938217
## [1] 0.04938217
lmrdgemodel<-linearRidge(y_trainIn~.,data_trainIn)
lmrdgemodel
##
## Call:
## linearRidge(formula = y_trainIn ~ ., data = data_trainIn)
##
## (Intercept) fixed.acidity volatile.acidity
## 5.641429e+00 2.162746e-05 -1.016160e-04
## citric.acid residual.sugar chlorides
## -1.491799e-05 4.473866e-06 -4.697126e-05
## free.sulfur.dioxide total.sulfur.dioxide density
## 1.843231e-05 -5.993029e-05 -1.498896e-05
## pH sulphates alcohol
## -2.383937e-05 8.056200e-05 1.584799e-04
## quality
## 8.043137e-01
yEst_ridge<- predict(lmrdgemodel, newdata = data_trainOut)
error_ridge <- sqrt(sum((y_trainOut-yEst_ridge)^2)/length(y_trainOut))
error_ridge
## [1] 0.04951383
# Very close to the Q1.
It only involves the Versicolor and Virginica species (rows 51 through 150). Use cross validated ridge regression to classify these two species. Create and plot a ROC curve for this classification method.
set.seed(pi)
attach(iris)
library(MASS)
## Warning: package 'MASS' was built under R version 3.1.2
library(ridge)
library(glmnet)
## Warning: package 'glmnet' was built under R version 3.1.2
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 3.1.2
## Loaded glmnet 1.9-8
data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
oldpar <- par(no.readonly=TRUE)
head(oldpar)
## $xlog
## [1] FALSE
##
## $ylog
## [1] FALSE
##
## $adj
## [1] 0.5
##
## $ann
## [1] TRUE
##
## $ask
## [1] FALSE
##
## $bg
## [1] "transparent"
names(iris)
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## [5] "Species"
dim(iris)
## [1] 150 5
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
iris_data<-iris[51:150,]
Y1 <- c(rep(-1,100))
Y1
## [1] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## [24] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## [47] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## [70] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## [93] -1 -1 -1 -1 -1 -1 -1 -1
Y2 <- Y1
Y1[1:50] <- 1 #Y1 will be used to separate out Versicolor
Y2[51:100] <- 1 #Y2 is used to separate out Virginica
I <- 1:100
target <- cbind(Y1,Y2)
X <- iris_data[,-5]
##place holder for input and ouput attirbutes
x_trainIn1<-0.0
x_trainOut1<-0.0
x_trainIn2<-0.0
x_trainOut2<-0.0
nxval=5 ### for 5-fold cross validation
ntol=nrow(iris_data) ##number of rows
y_trainIn<-matrix(nrow = (ntol - ntol/nxval), ncol = 2)
y_trainOut<-matrix(nrow = ntol/nxval, ncol = 2)
testErr <- matrix(nrow = nxval, ncol = 2)
lambda <- matrix(nrow = nxval, ncol = 2)
prolambda <- c(rep(0,2)) # prolambda will hold these three lambdas
for(ident in seq(1,2)){ # find the best lambda for each model
grid=10^seq(10,-2,length=100)
trainErr <- 0.0
bestlam <-0.0
for(ixval in seq(from = 1, to = nxval)){
Iout <- which(I%%nxval== ixval - 1)
Xin <- X[-Iout,]
Xout <- X[Iout,]
Yin <- target[-Iout,ident] # run with Y1, Y2
Yout <- target[Iout,ident] # run with Y1, Y2
dataIn <- cbind(Xin,Yin)
if(ident==1){
y_trainIn[, 1]<-target[-Iout,ident]
y_trainOut[, 1]<-target[Iout,ident]
x_trainIn1<-X[-Iout,]
x_trainOut1<-X[Iout,]
}
else if(ident==2) {
y_trainIn[, 2]<-target[-Iout,ident]
y_trainOut[, 2] <- target[Iout,ident]
x_trainIn2<-X[-Iout,]
x_trainOut2<-X[Iout,]
}
## to get best lambda
cv.out=cv.glmnet(as.matrix(Xin),as.matrix(Yin),alpha=0,lambda=grid)
bestlam=cv.out$lambda.min
# make fair comparison of error
ridgeMod =glmnet(as.matrix(Xin), as.matrix(Yin),alpha=0,lambda=bestlam)
yEst <- predict(ridgeMod, newx = as.matrix(Xout))
testErr[ixval, ident] <-sqrt( sum((Yout-yEst)^2))/length(yEst)
lambda[ixval, ident]<-bestlam
}##end Y1 and Y2 loop
MintestErr<-apply(testErr, 2, min)
MintestErr #0.09318130 0.09427988
index<- which(min(testErr[, ident])==testErr[, ident])
print(index)
prolambda[ident]<- lambda[index, ident]
##prolambda1[2]<- lambda[index, 2]
}
## [1] 2
## [1] 2
prolambda ## [1] 0.03053856 0.02310130
## [1] 0.01321941 0.01000000
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.1.2
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.1.2
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:stats':
##
## lowess
#Create the production model
##appropiate lambda for Y1 (versicolor)
xlambda1 <- prolambda[1]
Xin1 <- x_trainIn1
Yin1 <-y_trainIn[, 1]
dataIn1 <- cbind(Xin1,Yin1)
ridgeMod1 =glmnet(as.matrix(Xin1), as.matrix(Yin1),alpha=0,lambda=xlambda1)
pred_y1 <- predict(ridgeMod1, newx = as.matrix(x_trainOut1))
testErr1 <-sqrt( sum((y_trainOut[, 1] - pred_y1)^2))/length(pred_y1)
testErr1 #[1] 0.1435072
## [1] 0.1439283
##comparing predicted and test values
pred1<-prediction(pred_y1, as.matrix(y_trainOut[, 1]))
###Get TPR and FPR values
perf1<-performance(pred1,"tpr","fpr")
##performance(pred1,"auc") #
###appropiate lambda for Y2 (virginica)
xlambda2 <- prolambda[2]
Xin2 <- x_trainIn2
Yin2 <- y_trainIn[, 2]
dataIn2 <- cbind(Xin2,Yin2)
ridgeMod2 =glmnet(as.matrix(Xin2), as.matrix(Yin2),alpha=0,lambda=xlambda2)
##
pred_y2 <- predict(ridgeMod2, newx = as.matrix(x_trainOut2))
##error
testErr2 <-sqrt( sum((y_trainOut[, 2] -pred_y2)^2))/length(pred_y2)
testErr2 #[1] 0.1442572
## [1] 0.1442572
pred2<-prediction(pred_y2, as.matrix(y_trainOut[, 2]))
perf2<-performance(pred2,"tpr","fpr")
##performance(pred2,"auc") #
##ROCR gives S4 class variable and it doesn't work like S3 which we used to
## Need to call with '@' instead of '$'. It doesn't work for points
##Need to change data.frame then as.matrix (direct transformation doesn't work)
x.values= data.frame(perf2@x.values)
y.values= data.frame(perf2@y.values)
plot(perf1,col="red", type = "l") # plot ROC curve
points(as.matrix(x.values), as.matrix(y.values),col="blue", type = "p") # plot ROC curve
abline(0,1)
legend("bottomright", c("Versicolor","Virginica"), cex = 0.6,
pch = c(20,20), col = c("red", "blue"))# lty = 1:2,
Classify the iris data set with second degree terms added using a ridge regression. (ie supplement the original 4 attributes x1, x2, x3, and x4 by including the 10 second degree terms ( x1x1, x1x2, x1*x3, . ) for a total of 14 attributes.) Use multiclass to classify the data and then compare the results with the results obtained in class.
It is fine to use brute force to add these attributes. For those who are adventurous, investigate the function mutate in the package plyr.
##This solve probvlem 3 on Versicolor and
##Virginica species
attach(iris)
## The following objects are masked from iris (position 8):
##
## Petal.Length, Petal.Width, Sepal.Length, Sepal.Width, Species
library(glmnet)
library(plyr)
oldpar <- par(no.readonly=TRUE)
names(iris)
## [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## [5] "Species"
dim(iris)
## [1] 150 5
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
##data manipulation using mutate
iris_test<-mutate(iris, X5 =Sepal.Length^2, X6= Sepal.Length* Sepal.Width,
X7=Sepal.Length*Petal.Length, X8=Sepal.Length*Petal.Width,
X9 =Sepal.Width^2, X10=Sepal.Width*Petal.Length,
X11=Sepal.Width*Petal.Width, X12= Petal.Length^2,
X13=Petal.Length*Petal.Width, X14=Petal.Width^2)
iris_data<-iris[51:150,]
iris_trans1<-iris_test[, -5]
iris_trans2<-iris_test[, 5]
iris_trans<-cbind(iris_trans1, iris_trans2)
##iris_data<-iris[51:150,]
iris_data<-iris_trans[51:150,]
Y1 <- c(rep(-1,100))
Y2 <- Y1
Y1[1:50] <- 1 #Y1 will be used to separate out Versicolor
Y2[51:100] <- 1 #Y2 is used to separate out Virginica
I <- 1:100
target <- cbind(Y1,Y2)
head(target)
## Y1 Y2
## [1,] 1 -1
## [2,] 1 -1
## [3,] 1 -1
## [4,] 1 -1
## [5,] 1 -1
## [6,] 1 -1
X <- iris_data[,-15]
##place holder for input and ouput attirbutes
x_trainIn1<-0.0
x_trainOut1<-0.0
x_trainIn2<-0.0
x_trainOut2<-0.0
nxval=5 ### for 5-fold cross validation
ntol=nrow(iris_data) ##number of rows
y_trainIn<-matrix(nrow = (ntol - ntol/nxval), ncol = 2)
y_trainOut<-matrix(nrow = ntol/nxval, ncol = 2)
testErr <- matrix(nrow = nxval, ncol = 2)
lambda <- matrix(nrow = nxval, ncol = 2)
prolambda <- c(rep(0,2)) # prolambda will hold these three lambdas
for(ident in seq(1,2)){ # find the best lambda for each model
grid=10^seq(10,-2,length=100)
trainErr <- 0.0
bestlam <-0.0
for(ixval in seq(from = 1, to = nxval)){
Iout <- which(I%%nxval== ixval - 1)
Xin <- X[-Iout,]
Xout <- X[Iout,]
Yin <- target[-Iout,ident] # run with Y1, Y2
Yout <- target[Iout,ident] # run with Y1, Y2
dataIn <- cbind(Xin,Yin)
if(ident==1){
y_trainIn[, 1]<-target[-Iout,ident]
y_trainOut[, 1]<-target[Iout,ident]
x_trainIn1<-X[-Iout,]
x_trainOut1<-X[Iout,]
}
else if(ident==2) {
y_trainIn[, 2]<-target[-Iout,ident]
y_trainOut[, 2] <- target[Iout,ident]
x_trainIn2<-X[-Iout,]
x_trainOut2<-X[Iout,]
}
## to get best lambda
cv.out=cv.glmnet(as.matrix(Xin),as.matrix(Yin),alpha=0,lambda=grid)
bestlam=cv.out$lambda.min
# make fair comparison of error
ridgeMod =glmnet(as.matrix(Xin), as.matrix(Yin),alpha=0,lambda=bestlam)
yEst <- predict(ridgeMod, newx = as.matrix(Xout))
testErr[ixval, ident] <-sqrt( sum((Yout-yEst)^2))/length(yEst)
lambda[ixval, ident]<-bestlam
}##end Y1 and Y2 loop
testErr
MintestErr<-apply(testErr, 2, min)
MintestErr # 0.09550798 0.09550798
index<- which(min(testErr[, ident])==testErr[, ident])
print(index)
prolambda[ident]<- lambda[index, ident]
##prolambda1[2]<- lambda[index, 2]
}
## [1] 2
## [1] 2
prolambda ## [1] 0.03053856 0.02310130
## [1] 0.07054802 0.07054802
##y_trainOut
#library(arulesViz)
library(ROCR)
#Create the production model
##appropiate lambda for Y1 (versicolor)
xlambda1 <- prolambda[1]
Xin1 <- x_trainIn1
Yin1 <-y_trainIn[, 1]
dataIn1 <- cbind(Xin1,Yin1)
ridgeMod1 =glmnet(as.matrix(Xin1), as.matrix(Yin1),alpha=0,lambda=xlambda1)
pred_y1 <- predict(ridgeMod1, newx = as.matrix(x_trainOut1))
testErr1 <-sqrt( sum((y_trainOut[, 1] - pred_y1)^2))/length(pred_y1)
testErr1 #[1] 0.1417557
## [1] 0.1424886
###
##comparing predicted and test values
pred1<-prediction(pred_y1, as.matrix(y_trainOut[, 1]))
###Get TPR and FPR values
perf1<-performance(pred1,"tpr","fpr")
##performance(pred1,"auc") #
###appropiate lambda for Y2 (virginica)
xlambda2 <- prolambda[2]
Xin2 <- x_trainIn2
Yin2 <- y_trainIn[, 2]
dataIn2 <- cbind(Xin2,Yin2)
ridgeMod2 =glmnet(as.matrix(Xin2), as.matrix(Yin2),alpha=0,lambda=xlambda2)
##
pred_y2 <- predict(ridgeMod2, newx = as.matrix(x_trainOut2))
##error
testErr2 <-sqrt( sum((y_trainOut[, 2] -pred_y2)^2))/length(pred_y2)
testErr2 #[1] 0.1430702
## [1] 0.1424886
pred2<-prediction(pred_y2, as.matrix(y_trainOut[, 2]))
perf2<-performance(pred2,"tpr","fpr")
##performance(pred2,"auc") #
##ROCR gives S4 class variable and it doesn't work like S3 which we used to
## Need to call with '@' instead of '$'. It doesn't work for points
##Need to change data.frame then as.matrix (direct transformation doesn't work)
x.values= data.frame(perf2@x.values)
y.values= data.frame(perf2@y.values)
plot(perf1,col="green", type = "l") # plot ROC curve
points(as.matrix(x.values), as.matrix(y.values),col="blue", type = "p") # plot ROC curve
abline(0,1)
legend("bottomright", c("Versicolor","Virginica"), cex = 0.6,
pch = c(20,20), col = c("green", "blue"))# lty = 1:2,
The Data is located at the web site: http://archive.ics.uci.edu/ml/datasets/Glass%2BIdentification This problem will only work with building and vehicle window glass (classes 1,2 and 3), so it only uses the first 163 rows of data. (Ignore rows 164 through 214) With this set up this is a three class problem. Use ridge regression to classify this data into the three classes: building windows float processed, building windows non float processed, and vehicle windows float processed.
## Q 5 will try to solve probvlem 3 on Versicolor and Virginica species
rm(list=ls())
set.seed(pi)
library(glmnet)
oldpar <- par(no.readonly=TRUE)
#oldpar <- par(no.readonly=TRUE)
glass <- read.csv("C:/Users/Andrew/SkyDrive/AGZ_Home/workspace_R/UCSC/MachinLearning/All_data/glass.txt",sep = ",", header = FALSE)
names(glass)
## [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10" "V11"
str(glass)
## 'data.frame': 214 obs. of 11 variables:
## $ V1 : int 1 2 3 4 5 6 7 8 9 10 ...
## $ V2 : num 1.52 1.52 1.52 1.52 1.52 ...
## $ V3 : num 13.6 13.9 13.5 13.2 13.3 ...
## $ V4 : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
## $ V5 : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
## $ V6 : num 71.8 72.7 73 72.6 73.1 ...
## $ V7 : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
## $ V8 : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
## $ V9 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ V10: num 0 0 0 0 0 0.26 0 0 0 0.11 ...
## $ V11: int 1 1 1 1 1 1 1 1 1 1 ...
dim(glass)
## [1] 214 11
glass_data<-glass[1:163,]
Y1 <- c(rep(-1,163))
Y1
## [1] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## [24] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## [47] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## [70] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## [93] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## [116] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## [139] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## [162] -1 -1
Y2 <- Y1
Y3 <-Y1
Y1[1:70] <- 1 #Y1 will be used to separate out building windows float processed
Y2[71:146] <- 1 #Y2 is used to separate out building windows non float processed
Y3[147:163] <- 1 #Y3 is used to separate out vehicle windows float processed
I <- 1:163
target <-cbind(Y1, Y2, Y3)
X <- glass_data[,-11]
##place holder for input and ouput attirbutes
x_trainIn<-0.0
x_trainOut<-0.0
nxval=10 ### for 5-fold cross validation
ntol=nrow(glass_data) ##number of rows
y_trainIn<-matrix(nrow = 147, ncol = 3)
y_trainOut<-matrix(nrow = 16, ncol = 3)
testErr <- matrix(nrow = 10, ncol = 3)
lambda <- matrix(nrow = 10, ncol = 3)
y_trainIn1<-0.0
y_trainIn2<-0.0
y_trainIn3<-0.0
y_trainOut1<-0.0
y_trainOut2<-0.0
y_trainOut3<-0.0
x_trainIn1<-0.0
x_trainIn2<-0.0
x_trainIn3<-0.0
x_trainOut1<-0.0
x_trainOut2<-0.0
x_trainOut3<-0.0
prolambda <- c(rep(0,3)) # prolambda will hold these three lambdas
index<-matrix(nrow = 3, ncol=3)
for(ident in seq(1,3)){ # find the best lambda for each model
grid=10^seq(10,-2,length=50)
trainErr <- 0.0
bestlam <-0.0
for(ixval in seq(from = 1, to = 10)){
Iout <- which(I%%10== ixval - 1)
##Iout <- which(I%%10== 10%%ixval)
Xin <- X[-Iout,]
Xout <- X[Iout,]
Yin <- target[-Iout,ident] # run with Y1, Y2 ,Y3
Yout <- target[Iout,ident] # run with Y1, Y2 ,Y3
dataIn <- cbind(Xin,Yin)
x_trainIn<-Xin
x_trainOut<-Xout
## to get best lambda
cv.out=cv.glmnet(as.matrix(Xin),as.matrix(Yin),alpha=0,lambda=grid)
bestlam=cv.out$lambda.min
# make fair comparison of error
ridgeMod =glmnet(as.matrix(Xin), as.matrix(Yin),alpha=0,lambda=bestlam)
yEst <- predict(ridgeMod, newx = as.matrix(Xout))
testErr[ixval, ident] <-sqrt( sum((Yout-yEst)^2))/length(yEst)
lambda[ixval, ident]<-bestlam
if(ident==1){
##y_trainIn[, 1]<-target[-Iout, ident]
##y_trainOut[, 1]<-target[Iout, ident]
y_trainIn1<-target[-Iout, ident]
y_trainOut1<-target[Iout, ident]
x_trainIn1<-X[-Iout,]
x_trainOut1<-X[Iout,]
}
else if(ident==2) {
# y_trainIn[, 2]<-target[-Iout,ident]
#y_trainOut[, 2] <- target[Iout,ident]
y_trainIn2<-target[-Iout, ident]
y_trainOut2<-target[Iout, ident]
x_trainIn2<-X[-Iout,]
x_trainOut2<-X[Iout,]
}
else if(ident==3){
y_trainIn3<-target[-Iout, ident]
y_trainOut3<-target[Iout, ident]
x_trainIn3<-X[-Iout,]
x_trainOut3<-X[Iout,]
}
}##end Y1 and Y2 loop
testErr
MintestErr<-apply(testErr, 2, min)
MintestErr #0.1050712 0.1689575 0.1078599
index<- min(which(min(testErr[, ident])==testErr[, ident]))
##index1<- min(which(min(testErr[, 1])==testErr[, 1]))
index
##print(index)
prolambda[ident]<- lambda[index, ident]
##prolambda1[2]<- lambda[index, 2]
}
prolambda
## [1] 0.09540955 0.09540955 0.01757511
# prolambda ## [1] 0.03053856 0.02310130
#
##y_trainOut
library(ROCR)
#Create the production model
##appropiate lambda for Y1
xlambda1 <- prolambda[1]
Xin1 <- x_trainIn1
Yin1 <-y_trainIn1
#dataIn1 <- cbind(Xin,Yin1)
ridgeMod1 =glmnet(as.matrix(Xin1), as.matrix(Yin1),alpha=0,lambda=xlambda1)
pred_y1 <- predict(ridgeMod1, newx = as.matrix(x_trainOut1))
testErr1 <-sqrt( sum((y_trainOut1 - pred_y1)^2))/length(pred_y1)
testErr1
## [1] 0.1050712
###
##comparing predicted and test values
pred1<-prediction(pred_y1, as.matrix(y_trainOut1))
###Get TPR and FPR values
perf1<-performance(pred1,"tpr","fpr")
#performance(pred1,"auc") #
##appropiate lambda for Y1
xlambda2 <- prolambda[2]
Xin2 <-x_trainIn2
Yin2 <-y_trainIn2
#dataIn1 <- cbind(Xin,Yin1)
ridgeMod2 =glmnet(as.matrix(Xin2), as.matrix(Yin2),alpha=0,lambda=xlambda2)
pred_y2 <- predict(ridgeMod2, newx = as.matrix(x_trainOut2))
testErr2 <-sqrt( sum((y_trainOut2 - pred_y2)^2))/length(pred_y2)
testErr2
## [1] 0.1934413
###
##comparing predicted and test values
pred2<-prediction(pred_y2, as.matrix(y_trainOut2))
###Get TPR and FPR values
perf2<-performance(pred2,"tpr","fpr")
#performance(pred1,"auc") #
# # ##ROCR gives S4 class variable and it doesn't work like S3 which we used to
# # ## Need to call with '@' instead of '$'. It doesn't work for points
# # ##Need to change data.frame then as.matrix (direct transformation doesn't work)
x2.values= data.frame(perf2@x.values)
y2.values= data.frame(perf2@y.values)
##appropiate lambda for Y1
xlambda3 <- prolambda[3]
Xin3 <-x_trainIn3
Yin3 <-y_trainIn3
#dataIn1 <- cbind(Xin,Yin1)
ridgeMod3 =glmnet(as.matrix(Xin3), as.matrix(Yin3),alpha=0,lambda=xlambda3)
pred_y3 <- predict(ridgeMod3, newx = as.matrix(x_trainOut3))
testErr3 <-sqrt( sum((y_trainOut3 - pred_y3)^2))/length(pred_y3)
testErr3
## [1] 0.1248874
###
##comparing predicted and test values
pred3<-prediction(pred_y3, as.matrix(y_trainOut3))
###Get TPR and FPR values
perf3<-performance(pred3,"tpr","fpr")
#performance(pred1,"auc") #
x3.values= data.frame(perf3@x.values)
y3.values= data.frame(perf3@y.values)
### ROC plots
plot(perf1,col="red", type = "l") # plot ROC curve
points(as.matrix(x2.values), as.matrix(y2.values),col="blue", type = "p") # plot ROC curve
points(as.matrix(x3.values), as.matrix(y3.values),col="green", type = "p") # plot ROC curve
abline(0,1)
legend("bottomright", c("building processed","building non processed","vehicle processed" ), cex = 0.6,
pch = c(20,20), col = c("red", "blue", "green"))# lty = 1:2,