Problem 0: Setting up the data

To create dataset A we first fix three mean vectors and three variance-covariance matrices. The mean vectors can be chosen arbitrarily; we call them mu1, mu2 and mu3.

mu1 = c(0,0,0)
cat(paste("mu1 =","(", paste(as.character(mu1),sep= "", collapse = ","),")"), sep ="\n")
## mu1 = ( 0,0,0 )
mu2 = c(1,2,1)
cat(paste("mu2 =","(", paste(as.character(mu2),sep= "", collapse = ","),")"), sep ="\n")
## mu2 = ( 1,2,1 )
mu3 = c(-1,-2,-3)
cat(paste("mu3 =","(", paste(as.character(mu3),sep= "", collapse = ","),")"), sep ="\n")
## mu3 = ( -1,-2,-3 )

Next we choose a variance-covariance matrix to pair with each of the three means. This can be done by arbitrarily choosing three full-rank matrices of size three and multiplying each by its own transpose: for a full-rank matrix A, the product t(A) %*% A is symmetric and positive definite, and hence a valid covariance matrix. We call the results sigma1, sigma2 and sigma3.

A <- matrix(c(2,-3,2,-5,6,4,2,5,1), ncol = 3, byrow = TRUE)
# det(A) != 0, so A has full rank and t(A) %*% A is positive definite
sigma1 <- (0.01)*t(A)%*%A  # scale by 0.01 to keep the variances small
cat("sigma1", sep="\n")
## sigma1
sigma1
##       [,1]  [,2]  [,3]
## [1,]  0.33 -0.26 -0.14
## [2,] -0.26  0.70  0.23
## [3,] -0.14  0.23  0.21
B <- matrix(c(-4,3,-1,-5,4,4,2,-5,1), ncol = 3, byrow = TRUE)
# det(B) != 0, so B has full rank and t(B) %*% B is positive definite
sigma2 <- (0.01)*t(B)%*%B
cat("sigma2", sep="\n")
## sigma2
sigma2
##       [,1]  [,2]  [,3]
## [1,]  0.45 -0.42 -0.14
## [2,] -0.42  0.50  0.08
## [3,] -0.14  0.08  0.18
C <- matrix(c(5,3,-1,2,-1,4,2,8,-1), ncol = 3, byrow = TRUE)
# det(C) != 0, so C has full rank and t(C) %*% C is positive definite
sigma3 <- (0.01)*t(C)%*%C
cat("sigma3", sep="\n")
## sigma3
sigma3
##      [,1]  [,2]  [,3]
## [1,] 0.33  0.29  0.01
## [2,] 0.29  0.74 -0.15
## [3,] 0.01 -0.15  0.18
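
As a quick check, one can verify that each of these is a valid covariance matrix by confirming that all of its eigenvalues are strictly positive, for example:

# TRUE for each matrix iff it is positive definite
sapply(list(sigma1, sigma2, sigma3),
       function(S) all(eigen(S)$values > 0))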

To simulate from the above three distributions we use the MASS package, store 500 samples from each in a data frame, and add a column recording which distribution each observation comes from, with labels 1, 2 and 3.
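
The simulation code itself is not shown above; a minimal sketch, consistent with the column names one, two, three and the class label which that the later code relies on (the seed is illustrative), would be:

library(MASS)
set.seed(123)
# Draw 500 observations from each trivariate normal distribution.
classA1 <- mvrnorm(500, mu = mu1, Sigma = sigma1)
classA2 <- mvrnorm(500, mu = mu2, Sigma = sigma2)
classA3 <- mvrnorm(500, mu = mu3, Sigma = sigma3)
# Stack the samples and record which distribution each row came from.
datasetA <- data.frame(rbind(classA1, classA2, classA3),
                       which = rep(1:3, each = 500))
colnames(datasetA) <- c("one", "two", "three", "which")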

For dataset B, i.e. the Leaf data, we first import the CSV file, rename the columns and inspect the first few rows.

data <- read.csv("leaf.csv", header = FALSE)
colnames(data) <- c("species","specimen number","eccentricity","aspect ratio","elongation","solidity","stochastic convexity","isoperimetric factor", "maximal indentation depth", "lobedness","average intensity", "average contrast", "smoothness", "third moment","uniformity","entropy")
head(data)
##   species specimen number eccentricity aspect ratio elongation solidity
## 1       1               1      0.72694       1.4742    0.32396  0.98535
## 2       1               2      0.74173       1.5257    0.36116  0.98152
## 3       1               3      0.76722       1.5725    0.38998  0.97755
## 4       1               4      0.73797       1.4597    0.35376  0.97566
## 5       1               5      0.82301       1.7707    0.44462  0.97698
## 6       1               6      0.72997       1.4892    0.34284  0.98755
##   stochastic convexity isoperimetric factor maximal indentation depth lobedness
## 1              1.00000              0.83592                 0.0046566 0.0039465
## 2              0.99825              0.79867                 0.0052423 0.0050016
## 3              1.00000              0.80812                 0.0074573 0.0101210
## 4              1.00000              0.81697                 0.0068768 0.0086068
## 5              1.00000              0.75493                 0.0074280 0.0100420
## 6              1.00000              0.84482                 0.0049451 0.0044506
##   average intensity average contrast smoothness third moment uniformity entropy
## 1         0.0477900         0.127950  0.0161080   0.00523230 2.7477e-04 1.17560
## 2         0.0241600         0.090476  0.0081195   0.00270800 7.4846e-05 0.69659
## 3         0.0118970         0.057445  0.0032891   0.00092068 3.7886e-05 0.44348
## 4         0.0159500         0.065491  0.0042707   0.00115440 6.6272e-05 0.58785
## 5         0.0079379         0.045339  0.0020514   0.00055986 2.3504e-05 0.34214
## 6         0.0104870         0.058528  0.0034138   0.00112480 2.4798e-05 0.34068

We see that the second column is just the reading number within each species/class, so dataset B will be built from a selection of the remaining 14 feature columns. We also list the classes appearing in the first column, together with their frequencies, below.

table(data$species)
## 
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 22 23 24 25 26 27 28 29 30 31 32 
## 12 10 10  8 12  8 10 11 14 13 16 12 13 12 10 12 11 13  9 12 11 12 12 12 11 11 
## 33 34 35 36 
## 11 11 11 10

We see that the dataset contains no observations for species 16 through 21, so we sample our classes from the remaining thirty. Likewise we sample our feature columns from columns 3 to 16, since the second column is just the reading number.

classes <- c(1:15, 22:36)           # species actually present in the data
set.seed(123)
final_variables <- sample(3:16, 4)  # pick 4 of the 14 feature columns
set.seed(123)
final_classes <- sample(classes, 6) # pick 6 of the 30 species
dataset <- data[, c(1, final_variables)]
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
datasetB <- filter(dataset, species %in% final_classes)

Next we create (scaled) training and test sets for dataset A and dataset B.

library(caTools)
set.seed(123)
split = sample.split(datasetA$which, SplitRatio = 0.5)
trainingA = subset(datasetA, split == TRUE)
testA = subset(datasetA, split == FALSE)
# Scale the test set with the training-set means and standard deviations,
# so that no information leaks from the test set into the preprocessing.
scaledA = scale(trainingA[-4])
trainingA[-4] = scaledA
testA[-4] = scale(testA[-4],
                  center = attr(scaledA, "scaled:center"),
                  scale = attr(scaledA, "scaled:scale"))

set.seed(123)
split = sample.split(datasetB$species, SplitRatio = 0.5)
trainingB = subset(datasetB, split == TRUE)
testB = subset(datasetB, split == FALSE)
scaledB = scale(trainingB[-1])
trainingB[-1] = scaledB
testB[-1] = scale(testB[-1],
                  center = attr(scaledB, "scaled:center"),
                  scale = attr(scaledB, "scaled:scale"))

Problem 1: Plotting

We plot the simulated dataset A.

library("scatterplot3d")

hues <- c("steelblue", "violetred", "springgreen")
hues <- hues[as.numeric(datasetA$which)]
scatterplot3d(datasetA[,1:3], pch = 16, color=hues, angle = 60,
              xlab = "X1", ylab = "X2", zlab = "X3", main = "Dataset A")

Problem 2: Bayes Classifier

First for dataset A.

trainingA$which = factor(trainingA$which, levels = c(1,2,3))
testA$which = factor(testA$which, levels = c(1,2,3))
library(e1071)
classifier = naiveBayes(x = trainingA[-4],
                        y = trainingA$which)
ypredA_nb = predict(classifier, newdata = testA[-4])
cmA_nb = table(testA[, 4], ypredA_nb)

Then for dataset B.

trainingB$species = factor(trainingB$species, levels = final_classes)
testB$species = factor(testB$species, levels = final_classes)
library(e1071)
classifier = naiveBayes(x = trainingB[-1], y = trainingB$species)
ypredB_nb <- predict(classifier, newdata = testB[-1])
cmB_nb = table(testB[,1], ypredB_nb)
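
The confusion matrices can be summarised into a single number with a small helper (a sketch; it applies to every confusion matrix in this document):

# Overall accuracy: share of test observations on the diagonal.
accuracy <- function(cm) sum(diag(cm)) / sum(cm)
accuracy(cmA_nb)
accuracy(cmB_nb)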

Problem 3: LDA, QDA, SVM and k-NN

LDA and QDA

For dataset A: LDA

fit = lda(which ~ one + two + three, data = trainingA)
pred = predict(fit,testA)
predyA_lda <- pred$class
cmA_lda <- table(predyA_lda, testA$which)

For dataset A: QDA

#fit = qda(which ~ one + two + three, data = trainingA)
#pred = predict(fit, testA)
#predyA_qda <- pred$class
#cmA_qda = table(predyA_qda, testA$which)

For dataset B: LDA

fit = lda(species ~ ., data = trainingB)
pred = predict(fit, testB)
predyB_lda <- pred$class
cmB_lda <- table(predyB_lda, testB$species)

For dataset B: QDA

#fit = qda(species ~ ., data = trainingB)
#pred = predict(fit, testB)
#predyB_qda <- pred$class
#cmB_qda <- table(predyB_qda, testB$species)



SVM

For dataset A

library(e1071)
classifier = svm(formula = which ~ ., data = trainingA,
                 type = 'C-classification', kernel = 'radial')

predA_svm <- predict(classifier, newdata = testA[-4])
cmA_svm = table(testA[,4], predA_svm)

For dataset B

classifier = svm(formula = species ~ ., data = trainingB,
                 type = 'C-classification', kernel = 'radial')

predB_svm <- predict(classifier, newdata = testB[-1])
cmB_svm = table(testB[,1], predB_svm)

1-NN for dataset A

library(class)
predA_1nn <- knn(train = trainingA[,-4],
              test = testA[, -4], cl = trainingA[,4],
              k = 1)

cmA_1nn = table(testA[, 4], predA_1nn)

3-NN for dataset A

predA_3nn <- knn(train = trainingA[,-4],
              test = testA[, -4], cl = trainingA[,4],
              k = 3)

cmA_3nn = table(testA[, 4], predA_3nn)

1-NN for dataset B

predB_1nn <- knn(train = trainingB[,-1],
              test = testB[, -1], cl = trainingB[,1],
              k = 1)

cmB_1nn = table(testB[, 1], predB_1nn)

3-NN for dataset B

predB_3nn <- knn(train = trainingB[,-1],
              test = testB[, -1], cl = trainingB[,1],
              k = 3)

cmB_3nn = table(testB[, 1], predB_3nn)

Problem 4: Multiedit and Condense

MULTIEDIT

set.seed(123)
me_A <- multiedit(trainingA[,-4], trainingA[,4], k = 1)
## pass 1 size 749
## pass 2 size 749
## pass 3 size 749
## pass 4 size 749
## pass 5 size 749
set.seed(123)
me_B <- multiedit(trainingB[,-1], trainingB[,1], k = 1)
## pass 1 size 23
## pass 2 size 17
## pass 3 size 12
## Warning in multiedit(trainingB[, -1], trainingB[, 1], k = 1): retained set is
## now too small to proceed
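
multiedit() returns the indices of the retained training rows, so the edited set can be passed straight back to knn(); a sketch for dataset A:

# 1-NN trained only on the rows kept by multiedit.
predA_me <- knn(train = trainingA[me_A, -4], test = testA[, -4],
                cl = trainingA[me_A, 4], k = 1)
cmA_me <- table(testA[, 4], predA_me)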

CONDENSE

set.seed(123)
cond_A <- condense(trainingA[,-4], trainingA[,4])
## [1] 415
## [1] 415 713
## [1] 179 415 713
## [1] 179 415 695 713
## [1]  36 179 415 695 713
## [1]  36 179 373 415 695 713
## [1]  36 179 373 415 636 695 713
## [1]  36  68 179 373 415 636 695 713
## [1]  23  36  68 179 373 415 636 695 713
##  [1]  23  36  68  75 179 373 415 636 695 713
##  [1]  23  36  68  75 179 373 396 415 636 695 713
##  [1]  23  36  68  75 179 373 396 415 481 636 695 713
##  [1]  23  36  68  75 179 366 373 396 415 481 636 695 713
set.seed(123)
cond_B <- condense(trainingB[,-1], trainingB[,1])
## [1] 31
## [1] 15 31
## [1] 15 25 31
## [1] 15 20 25 31
## [1]  3 15 20 25 31
## [1]  3 15 20 24 25 31
## [1]  3  7 15 20 24 25 31
## [1]  3  7 15 18 20 24 25 31
## [1]  3  7  8 15 18 20 24 25 31
##  [1]  3  7  8 14 15 18 20 24 25 31
##  [1]  3  7  8 10 14 15 18 20 24 25 31
##  [1]  3  6  7  8 10 14 15 18 20 24 25 31
##  [1]  1  3  6  7  8 10 14 15 18 20 24 25 31
##  [1]  1  3  6  7  8 10 14 15 18 20 24 25 29 31
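
condense() likewise returns the indices of the retained rows; classifying with only the condensed prototypes of dataset A would look like:

# 1-NN using only the prototypes kept by condense.
predA_cond <- knn(train = trainingA[cond_A, -4], test = testA[, -4],
                  cl = trainingA[cond_A, 4], k = 1)
cmA_cond <- table(testA[, 4], predA_cond)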

MULTIEDIT AND CONDENSE