# Loading packages
library(tidyverse)
library(MASS)
library(ROCR) # Computing auc and plotting ROC-curves.
library(glmnet) # Lasso.
library(e1071) # Cross-validation & tuning parameters.
library(kernlab) # SVM.
# Loading the data.
digitmat = read.csv("digits23.csv", header=TRUE)
digitmat = as.matrix(digitmat)
digitmatsmall = read.csv("digits23small.csv", header=TRUE)
digitmatsmall = as.matrix(digitmatsmall)
digit.labels = read.csv("digits23_labels.csv", header=TRUE)
digit.labels = as.matrix(digit.labels)
# If you need to convert the labels to a factor, this can be done using factor(digit.labels).
digit.train.test = read.csv("digits23_train_test_indicator.csv", header=TRUE)
digit.train.test = as.matrix(digit.train.test)
#dim(digitmat) ##387 784
# Display the 2 digits.
par(mfrow = c(1, 2))
digit=matrix(digitmat[125,],nrow=28,ncol=28,byrow=FALSE)
image(digit[,28:1],col=gray(0:200 /200),axes=FALSE)
digit=matrix(digitmat[350,],nrow=28,ncol=28,byrow=FALSE)
image(digit[,28:1],col=gray(0:200 /200),axes=FALSE)
df_d <- data.frame(digitmatsmall, digit.labels, digit.train.test)
# Separating training and testing data.
t_dfd <- df_d %>%
filter(x.1=="Train") %>%
dplyr::select(-x.1)
test_dfd <- df_d %>%
filter(x.1=="Test") %>%
dplyr::select(-x.1)
# x: labels
m_lda <- lda(x~., data = t_dfd, prior = c(0.5, 0.5)) # Equal priors.
pr_mlda <- predict(m_lda)
# LDA - confusion matrix
lda.cm <- table(Actual.value = t_dfd$x, Classifier.Prediction = pr_mlda$class)
lda.cm
## Classifier.Prediction
## Actual.value 2 3
## 2 77 66
## 3 41 116
#sum(diag(lda.cm))/sum(lda.cm) # Accuracy = 0.6433333
LDA.test.error <- 1-sum(diag(lda.cm))/sum(lda.cm)
LDA.test.error # 0.3566667 (misclassification rate, computed on the training data; see the note below)
## [1] 0.3566667
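# Note: pr_mlda above comes from predict(m_lda) with no newdata, so this is a
# resubstitution (training) error despite the variable name. A held-out
# estimate would score the test split instead; a minimal sketch, reusing
# test_dfd from above (not run in the original analysis):
pr_mlda_test <- predict(m_lda, newdata = test_dfd)
1 - mean(pr_mlda_test$class == test_dfd$x) # Misclassification rate on the test split.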
t_dfd3 <- t_dfd %>%
rapply(function(x) ifelse(x==2,0,x), how = "replace") %>% # Changing "2" to "0"
rapply(function(x) ifelse(x==3,1,x), how = "replace") # Changing "3" to "1"
glm_d <- glm(x~., data = t_dfd3, family = "binomial")
pr_glmd <- predict(glm_d) %>%
as.data.frame()
names(pr_glmd)[1] <- "pred"
# Threshold = 0.5 on the linear predictor: predict() defaults to the link (log-odds) scale, so this is not a probability cutoff.
trans_pr_glmd3 <- mutate(pr_glmd, pred = ifelse(pred >= 0.50, 3, 2))
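# Since predict() above is on the link scale, 0.5 acts as a log-odds cutoff
# (a probability of roughly 0.62). A sketch of the probability-scale variant,
# using type = "response" (not run in the original analysis):
prob_glmd <- predict(glm_d, type = "response") # Fitted probabilities P(label = 3).
prob.classes <- ifelse(prob_glmd >= 0.50, 3, 2) # A true probability threshold of 0.5.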
# Logistic regression - Confusion matrix
glm.cm <- table(Actual.value = t_dfd$x, Classifier.Prediction = trans_pr_glmd3$pred)
glm.cm
## Classifier.Prediction
## Actual.value 2 3
## 2 91 52
## 3 57 100
#sum(diag(glm.cm))/sum(glm.cm) # Overall accuracy = 0.6366667
1-sum(diag(glm.cm))/sum(glm.cm) # Misclassification rate = 0.3633333 (also on the training data).
## [1] 0.3633333
#LDA
lda.pred.r <- as.numeric(pr_mlda$class)
labs.r <- as.numeric(t_dfd$x)
roc.pr.lda <- prediction(predictions = lda.pred.r, labels=labs.r)
perf.lda <- performance(roc.pr.lda, "tpr", "fpr")
auc.perf.lda = performance(roc.pr.lda, measure = "auc")
# AUC
auc.perf.lda@y.values #0.6386575
## [[1]]
## [1] 0.6386575
#Logistic regression
glm.pred.r <- as.numeric(trans_pr_glmd3$pred)
roc.pr.glm <- prediction(predictions = glm.pred.r, labels = labs.r)
perf.glm <- performance(roc.pr.glm, "tpr", "fpr")
auc.perf.glm = performance(roc.pr.glm, measure = "auc")
# AUC
auc.perf.glm@y.values #0.6366532
## [[1]]
## [1] 0.6366532
# Display the 2 ROC curves.
par(mfrow = c(1, 2))
plot(perf.lda, lwd= 2, main= "LDA: ROC-curve.")
abline(0,1, col="gray")
plot(perf.glm, main= "Logistic Regression: ROC-curve.", lwd= 2)
abline(0,1, col="gray")
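# Note: both curves above are built from hard class labels, so each ROC has
# only a single interior operating point. Continuous scores trace the full
# curve; a sketch reusing objects from above (the LDA posterior for class "3"
# and the glm fitted probabilities):
par(mfrow = c(1, 2))
roc.lda.score <- prediction(pr_mlda$posterior[, 2], labs.r)
plot(performance(roc.lda.score, "tpr", "fpr"), lwd = 2, main = "LDA: score-based ROC.")
abline(0, 1, col = "gray")
roc.glm.score <- prediction(predict(glm_d, type = "response"), labs.r)
plot(performance(roc.glm.score, "tpr", "fpr"), lwd = 2, main = "Logistic regression: score-based ROC.")
abline(0, 1, col = "gray")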
# Working with digitmat.
ncol(digitmat) #start with 784
## [1] 784
# Find the constant columns.
con.digitmat_a <- as.data.frame(digitmat) %>%
select_if(function(v) var(v, na.rm=TRUE) == 0)
# Return column names
con.digitmat_a %>% colnames()
## [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10"
## [11] "V11" "V12" "V13" "V14" "V15" "V16" "V17" "V18" "V19" "V20"
## [21] "V21" "V22" "V23" "V24" "V25" "V26" "V27" "V28" "V29" "V30"
## [31] "V31" "V32" "V33" "V34" "V35" "V36" "V37" "V38" "V39" "V40"
## [41] "V41" "V42" "V43" "V44" "V45" "V46" "V47" "V48" "V49" "V50"
## [51] "V51" "V52" "V53" "V54" "V55" "V56" "V57" "V58" "V59" "V60"
## [61] "V61" "V62" "V63" "V64" "V65" "V66" "V67" "V77" "V78" "V79"
## [71] "V82" "V83" "V84" "V85" "V86" "V87" "V88" "V89" "V90" "V110"
## [81] "V111" "V112" "V113" "V114" "V115" "V116" "V139" "V140" "V141" "V142"
## [91] "V143" "V167" "V168" "V169" "V170" "V171" "V195" "V196" "V197" "V199"
## [101] "V222" "V223" "V224" "V225" "V226" "V227" "V250" "V251" "V252" "V253"
## [111] "V254" "V278" "V279" "V280" "V281" "V282" "V307" "V308" "V309" "V310"
## [121] "V311" "V335" "V336" "V337" "V338" "V339" "V340" "V362" "V363" "V364"
## [131] "V365" "V366" "V367" "V368" "V390" "V391" "V392" "V393" "V394" "V395"
## [141] "V396" "V420" "V421" "V422" "V423" "V448" "V449" "V450" "V476" "V477"
## [151] "V478" "V505" "V506" "V533" "V534" "V560" "V561" "V562" "V588" "V589"
## [161] "V590" "V616" "V617" "V618" "V644" "V645" "V646" "V670" "V671" "V672"
## [171] "V673" "V674" "V675" "V697" "V698" "V699" "V700" "V701" "V702" "V703"
## [181] "V724" "V725" "V726" "V727" "V728" "V729" "V730" "V731" "V732" "V733"
## [191] "V734" "V735" "V736" "V737" "V738" "V739" "V740" "V741" "V742" "V743"
## [201] "V744" "V751" "V752" "V753" "V754" "V755" "V756" "V757" "V758" "V759"
## [211] "V760" "V761" "V762" "V763" "V764" "V765" "V766" "V767" "V768" "V769"
## [221] "V770" "V771" "V772" "V773" "V774" "V775" "V776" "V777" "V778" "V779"
## [231] "V780" "V781" "V782" "V783" "V784"
ncol(con.digitmat_a) #235
## [1] 235
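# Aside: select_if() is superseded in dplyr >= 1.0.0; the same filter can be
# written with where(). An equivalent sketch (not run here):
# as.data.frame(digitmat) %>% dplyr::select(where(~ var(.x, na.rm = TRUE) == 0))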
# Remove constant columns.
rdigitmat_a <- digitmat[, apply(digitmat, MARGIN = 2, var) != 0]
ncol(rdigitmat_a) #after constant columns have been removed: 549
## [1] 549
# Indices of perfectly correlated column pairs (squared correlation > 0.999).
corr.digitmat_a <- subset(as.data.frame(which(cor(rdigitmat_a)^2 > 0.999, arr.ind = TRUE)), row < col)
cc_a <- unique(corr.digitmat_a$col) # Some columns appear in more than one pair.
df.rdigit_a <- data.frame(rdigitmat_a)
# Find perfectly correlated columns.
correlated_a <- dplyr::select(df.rdigit_a,all_of(cc_a))
colnames(correlated_a)
## [1] "V80" "V81" "V109" "V165" "V166" "V193" "V194" "V221" "V249" "V283"
## [11] "V334" "V479" "V532" "V615" "V643" "V723" "V745" "V746" "V747" "V748"
## [21] "V749" "V750"
# Remove perfectly correlated columns.
cl.dgt_a <- dplyr::select(df.rdigit_a,-all_of(cc_a))
ncol(cl.dgt_a) # After correlated columns have been removed: 527
## [1] 527
After splitting the dataset into train/test sets, restricting to the training rows creates 8 additional constant columns and 2 additional perfectly correlated columns.
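# Since this two-step cleanup (drop constant columns, then drop one column of
# each perfectly correlated pair) is repeated below on the training split, it
# could be wrapped in a helper. A sketch; drop_degenerate() is a hypothetical
# name, not used elsewhere in this analysis:
drop_degenerate <- function(m, r2 = 0.999) {
m <- m[, apply(m, MARGIN = 2, var) != 0, drop = FALSE] # Drop constant columns.
pairs <- subset(as.data.frame(which(cor(m)^2 > r2, arr.ind = TRUE)), row < col)
if (nrow(pairs) > 0) m <- m[, -unique(pairs$col), drop = FALSE] # Drop near-duplicates.
m
}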
# Separate train data.
train.dgt_a <- data.frame(cl.dgt_a, digit.train.test) %>%
filter(x == "Train") %>%
dplyr::select(-x)
# Repeat the process on the training split.
# Check for "extra" constant columns.
con.digitmat_b <- as.data.frame(train.dgt_a) %>%
select_if(function(v) var(v, na.rm=TRUE) == 0)
ncol(con.digitmat_b) ## 8
## [1] 8
# Return "extra" column names.
e.con.col.names <- names(con.digitmat_b)
e.con.col.names
## [1] "V68" "V76" "V117" "V137" "V138" "V255" "V312" "V507"
# Remove "extra" constant columns.
rdigitmat_b <- train.dgt_a[, apply(train.dgt_a, MARGIN = 2, var) != 0]
ncol(rdigitmat_b) ## after "extra" constant columns have been removed: 519
## [1] 519
# Check for "extra" perfectly correlated elements.
corr.digitmat_b <- subset(as.data.frame(which(cor(rdigitmat_b)^2>0.999, arr.ind = TRUE)), row < col)
cc_b <- unique(corr.digitmat_b$col)
df.rdigit_b <- data.frame(rdigitmat_b)
# Find "extra" perfectly correlated columns.
correlated_b <- dplyr::select(df.rdigit_b,all_of(cc_b))
colnames(correlated_b)
## [1] "V447" "V475"
# Remove "extra" perfectly correlated columns.
cl.train.dgt <- dplyr::select(df.rdigit_b,-all_of(cc_b))
ncol(cl.train.dgt) ## After "extra" correlated columns have been removed: 517
## [1] 517
# Remove all these "extra" columns from the test data as well (train and test must have the same columns for prediction).
# Separate test data.
test.dgt_b <- data.frame(cl.dgt_a, digit.train.test) %>%
filter(x == "Test") %>%
dplyr::select(-x)
# Remove "extra"" constant columns from test data.
test.dgt2 <- test.dgt_b[ , !names(test.dgt_b) %in% e.con.col.names]
# Remove "extra" columns from test data.
test.dgt3 <- dplyr::select(test.dgt2,-all_of(cc_b))
# Check that train/test have the same columns. all_equal() compares entire data frames, so it flags only the differing row counts, as expected.
all_equal(cl.train.dgt, test.dgt3, ignore_row_order = FALSE)
## [1] "Different number of rows"
# Using cl.train.dgt & test.dgt3 from ex2.
# Convert the labels to a factor
lbls2 <- factor(digit.labels) %>%
as.data.frame() %>%
dplyr::rename(c("lbls" = "."))
# Separate labels to train/test.
train.lbls <- lbls2 %>%
cbind (digit.train.test) %>%
filter(x == "Train") %>%
dplyr::select(-x)
test.lbls <- lbls2 %>%
cbind (digit.train.test) %>%
filter(x == "Test") %>%
dplyr::select(-x)
# Training data.
l.data <- data.frame(train.lbls, cl.train.dgt)
# Testing data.
test.l.data <- data.frame(test.lbls, test.dgt3)
# Suppressed warning: "glm.fit: fitted probabilities numerically 0 or 1 occurred".
# Forward selection, step 1.
aic.list <- list()
model.list <- list()
for(i in 2:ncol(l.data)) {
ddf <- l.data[,c(1,i)] # Make a one-predictor data frame.
model.list[[i]] <- glm(lbls~., data = ddf, family = "binomial") # Fit the glm & store it in a list.
aic.list[[i]] <- AIC(model.list[[i]]) # Compute the AIC & store it in a list.
}
unl <- unlist(aic.list) #get the numbers in a vector
min.aic <- which(unl == min(unl)) #find index of the min
min.aic
## [1] 342
unl[min.aic] #minimum AIC value
## [1] 226.0825
best.model <- model.list[[min.aic]] #find best model
best.model[c("coefficients")]
## $coefficients
## (Intercept) V516
## 1.23508307 -0.01844409
# Suppressed warning: "glm.fit: fitted probabilities numerically 0 or 1 occurred".
# Step 2
# V516 chosen in step 1, index: 342.
aic.list2 <- list()
model.list2 <- list()
for(i in 2:ncol(l.data)) {
if(i == 342) next # Skip the column already selected (i = 342).
ddf2 <- l.data[,c(1,342,i)]
model.list2[[i]] <- glm(lbls~., data = ddf2, family = "binomial")
aic.list2[[i]] <- AIC(model.list2[[i]])
}
unl2 <- unlist(aic.list2)
min.aic2 <- which(unl2 == min(unl2))
min.aic2
## [1] 205
unl2[min.aic2]
## [1] 186.5549
best.model2 <- model.list2[[min.aic2]]
best.model2[c("coefficients")]
## $coefficients
## (Intercept) V516 V349
## 0.23799681 -0.01699001 0.01293073
# Suppressed warning: "glm.fit: fitted probabilities numerically 0 or 1 occurred".
# Step 3
# V349,V516 chosen in step 2, indices: 205, 342.
aic.list3 <- list()
model.list3 <- list()
for(i in 2:ncol(l.data)) {
if(i == 205 || i == 342) next # Skip the columns already selected (i = 205, 342).
ddf3 <- l.data[,c(1,205,342,i)]
model.list3[[i]] <- glm(lbls~., data = ddf3, family = "binomial")
aic.list3[[i]] <- AIC(model.list3[[i]])
}
unl3 <- unlist(aic.list3)
min.aic3 <- which(unl3 == min(unl3))
min.aic3
## [1] 461
unl3[min.aic3]
## [1] 168.1029
best.model3 <- model.list3[[min.aic3]]
best.model3[c("coefficients")]
## $coefficients
## (Intercept) V349 V516 V652
## -0.242471606 0.013641165 -0.015759235 0.008427346
# V349,V516 & V652 chosen in step 3, indexes: 205, 342 & 461.
# AIC of final model: 168.1029
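# Caveat: unlist() silently drops NULL list entries (position 1 is never
# filled, and the later steps skip columns via next), so positions in unl,
# unl2 and unl3 can be offset from the original column indices of l.data.
# That may explain why the refit below reports AIC 199.4 rather than 168.1.
# An index-safe sketch using an NA-filled vector and which.min(), which
# ignores NAs:
aic.vec <- rep(NA_real_, ncol(l.data))
for (i in 2:ncol(l.data)) {
if (i %in% c(205, 342)) next # Skip the columns already selected.
aic.vec[i] <- AIC(glm(lbls ~ ., data = l.data[, c(1, 205, 342, i)], family = "binomial"))
}
which.min(aic.vec) # The position matches the column index of l.data directly.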
fs.model <- glm(lbls ~ V349 + V516 + V652, data = l.data, family = "binomial")
fs.model
##
## Call: glm(formula = lbls ~ V349 + V516 + V652, family = "binomial",
## data = l.data)
##
## Coefficients:
## (Intercept) V349 V516 V652
## -0.242472 0.013641 -0.015759 0.008427
##
## Degrees of Freedom: 299 Total (i.e. Null); 296 Residual
## Null Deviance: 415.2
## Residual Deviance: 191.4 AIC: 199.4
fs.pred <- predict(fs.model, newdata = test.l.data) %>%
as.data.frame()
names(fs.pred)[1] <- "fs.pr" # Naming the column
# Threshold = 0.5 (again on the log-odds scale, as in ex1).
fs.pr.classes <- mutate(fs.pred, fs.pr = ifelse(fs.pr >= 0.50, 3, 2))
# fs model's test error = 0.183908
mm.testerror <- 1-mean(fs.pr.classes$fs.pr == test.l.data$lbls)
mm.testerror
## [1] 0.183908
# For comparison, the ex1 logistic model's misclassification rate was 0.3633333 (computed on the training data).
# Confusion matrix for fs.model.
table(Actual.value = test.l.data$lbls, Classifier.Prediction = fs.pr.classes$fs.pr)
## Classifier.Prediction
## Actual.value 2 3
## 2 47 7
## 3 9 24
# labels to factor: lbls2
# Separate labels to train/test.
# Train dataset: train.lbls
# Test dataset: test.lbls
# Training data.
las.data <- data.frame(train.lbls, cl.train.dgt)
# For glmnet() to work, a matrix of predictors and a vector of response are needed.
las.mat <- las.data[,-1] %>%
as.matrix()
las.resp <- las.data[[1]]
# Testing data.
test.las.data <- data.frame(test.lbls, test.dgt3)
test.las.mat <- test.las.data[,-1] %>%
as.matrix()
# Labels kept as plain values (not factors), needed to score the predictions.
df.test.las.resp <- data.frame(digit.labels, digit.train.test) %>%
filter(x.1 == "Test") %>%
dplyr::select(-x.1)
test.las.resp <- df.test.las.resp[[1]]
set.seed(1234)
lasso.dig <- cv.glmnet(x = las.mat, y = las.resp, alpha = 1, family = "binomial", nfolds = 10)
plot(lasso.dig)
min.lambda <- lasso.dig$lambda.min
min.lambda #0.01393736
## [1] 0.01393736
se1.lambda <- lasso.dig$lambda.1se
se1.lambda #0.0307337
## [1] 0.0307337
# Model with min lambda.
las.m.m <- glmnet(x = las.mat, y = las.resp, alpha = 1, family = "binomial", lambda = min.lambda)
# Prediction on test data
pred.l.min <- predict(las.m.m, newx = test.las.mat, type = "class") %>%
as.data.frame()
#test error=0.04597701
min.testerror <- 1-mean(pred.l.min[1] == test.las.resp)
min.testerror
## [1] 0.04597701
#length(which(coef(las.m.m) != 0)) # Number of non-zero coefficients = 39 (also visible in the plot).
# Model with 1se lambda.
las.m.se1 <- glmnet(x = las.mat, y = las.resp, alpha = 1, family = "binomial", lambda = se1.lambda)
# Prediction on test data
pred.l.se1 <- predict(las.m.se1, newx = test.las.mat, type = "class") %>%
as.data.frame()
#test error=0.05747126
se1.testerror <- 1-mean(pred.l.se1[1] == test.las.resp)
se1.testerror
## [1] 0.05747126
#length(which(coef(las.m.se1) != 0)) # Number of non-zero coefficients = 27.
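# Equivalently, predictions can be made straight from the cv.glmnet object,
# without refitting, via the s argument. A sketch (not run here):
# predict(lasso.dig, newx = test.las.mat, s = "lambda.min", type = "class")
# predict(lasso.dig, newx = test.las.mat, s = "lambda.1se", type = "class")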
# Scaling the training data (so that scale = FALSE can be passed to tune()).
scaled.train <- scale(cl.train.dgt)
# Scaling is only needed for tune(); there is no need to manually unscale the data afterwards.
# Training data: train.lbls
train.svm.data <- data.frame(train.lbls, scaled.train)
# Testing data: test.las.data from Ex4.
test.svm.data <- test.las.data
train_y <- as.factor(train.svm.data[,1])
train_x <- as.matrix(train.svm.data[,-1])
set.seed(1234)
tune.out <- tune(svm, train.x = train_x, train.y = train_y, data = train.svm.data, kernel = "linear", ranges = list(cost = c(0.0001, 0.001, 0.01, 0.1, 1, 5, 10)), scale = FALSE)
summary(tune.out) ## error = 0.04666667, dispersion = 0.05488484
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.001
##
## - best performance: 0.04666667
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-04 0.08333333 0.03600411
## 2 1e-03 0.04666667 0.05488484
## 3 1e-02 0.05000000 0.05499719
## 4 1e-01 0.05333333 0.05258738
## 5 1e+00 0.05333333 0.05258738
## 6 5e+00 0.05333333 0.05258738
## 7 1e+01 0.05333333 0.05258738
best.mod.t <- tune.out$best.model ## C = 0.001
summary(best.mod.t)
##
## Call:
## best.tune(method = svm, train.x = train_x, train.y = train_y, data = train.svm.data,
## ranges = list(cost = c(1e-04, 0.001, 0.01, 0.1, 1, 5, 10)), kernel = "linear",
## scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.001
##
## Number of Support Vectors: 121
##
## ( 59 62 )
##
##
## Number of Classes: 2
##
## Levels:
## 2 3
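# Note: best.mod.t was trained on scaled data, so predicting with it requires
# applying the training scaling to the test matrix first. A sketch using the
# attributes stored by scale():
ctr <- attr(scaled.train, "scaled:center")
scl <- attr(scaled.train, "scaled:scale")
scaled.test <- scale(as.matrix(test.dgt3), center = ctr, scale = scl)
pred.tuned <- predict(best.mod.t, newdata = scaled.test)
1 - mean(pred.tuned == test.las.resp) # Held-out error of the tuned linear SVM.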
# No manual scaling here: ksvm() scales the data internally by default (scaled = TRUE).
# las.data from Ex4.
# SVM classifier.
svm.dig <- ksvm(lbls ~ ., data= las.data, kernel = "vanilladot", C = 0.001)
## Setting default kernel parameters
svm.dig
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 0.001
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 120
##
## Objective Function Value : -0.0631
## Training error : 0.013333
pred.svm <- predict(svm.dig, newdata = test.svm.data)
# test error = 0.03448276
svm.testerror <- 1-mean(pred.svm == test.las.resp)
svm.testerror
## [1] 0.03448276
# Confusion matrix for the SVM.
table(Actual.value = df.test.las.resp$x, Classifier.Prediction = pred.svm)
## Classifier.Prediction
## Actual.value 2 3
## 2 53 1
## 3 2 31
which(test.las.resp != pred.svm) # Misclassified test observations: 45, 68, 78.
## [1] 45 68 78
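# To inspect the three misclassified digits, map the test-set positions back
# to rows of digitmat and reuse the display code from the top. A sketch:
test.rows <- which(digit.train.test == "Test")
par(mfrow = c(1, 3))
for (r in test.rows[c(45, 68, 78)]) {
digit <- matrix(digitmat[r, ], nrow = 28, ncol = 28, byrow = FALSE)
image(digit[, 28:1], col = gray(0:200 / 200), axes = FALSE)
}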