For fun, let's open a contest! Today there will be three participants, each demonstrating their predictive modeling skills with a different approach. Later, as judges, we will evaluate who wins the contest.
We will run this contest on a simulated data set. First we generate 10 means \(m_k\) from a bivariate Gaussian distribution \(N((1, 0)^T , I)\) and label this class BLUE. Similarly, 10 more means are drawn from \(N((0, 1)^T , I)\) and labeled class ORANGE. Then, for each class, we generate 100 observations as follows: for each observation, we pick an \(m_k\) at random with probability 1/10 and then generate one draw from \(N(m_k, I/5)\), leading to a mixture of Gaussian clusters for each class. The contestants will then use several classification methods to separate these two classes. This setup was originally introduced in the book "The Elements of Statistical Learning" by Hastie, Tibshirani, and Friedman.
set.seed(5)
library("MASS")
library("ggplot2")
library("gridExtra")
library("class")
library("purrr")
library("readxl")
These are the 10 bivariate normal means for class BLUE.
mean_blue <- mvrnorm(n = 10, c(1, 0), diag(2))
DF_mean_blue <- as.data.frame(mean_blue)
colnames(DF_mean_blue) <- c("cx", "cy")
head(DF_mean_blue)
## cx cy
## 1 -0.2276303 -0.84085548
## 2 1.8017795 1.38435934
## 3 2.0803926 -1.25549186
## 4 1.1575344 0.07014277
## 5 2.0717600 1.71144087
## 6 1.1389861 -0.60290798
These are the 10 bivariate normal means for class ORANGE.
mean_orange <- mvrnorm(n = 10, c(0, 1), diag(2))
DF_mean_orange <- as.data.frame(mean_orange)
colnames(DF_mean_orange) <- c("cx", "cy")
head(DF_mean_orange)
## cx cy
## 1 -0.3159150 1.9005119
## 2 -1.1096942 1.9418694
## 3 -2.2154606 2.4679619
## 4 -1.2171036 1.7067611
## 5 -1.4792218 1.8190089
## 6 -0.9515738 0.7065182
p1 <- ggplot(DF_mean_blue, aes(x = cx, y = cy)) +
  geom_point(colour = "blue", shape = 23) +
  xlab('x') +
  ylab('y')
p2 <- ggplot(DF_mean_orange, aes(x = cx, y = cy)) +
  geom_point(colour = "orange", shape = 23) +
  xlab('x') +
  ylab('y')
p3 <- p1 +
  geom_point(data = DF_mean_orange, aes(x = cx, y = cy), colour = "orange", shape = 23)
grid.arrange(p1, p2, p3, nrow = 1, ncol = 3)
N <- 200
# For each BLUE observation, pick one of the 10 blue means at random and draw
# one point from N(m_k, I/5); the appended 0 is the class label.
data_blue <- replicate(n = N/2, c(mvrnorm(n = 1, mean_blue[sample(nrow(mean_blue), 1), ], diag(2)/5), 0))
data_blue <- t(data_blue)
head(data_blue)
## [,1] [,2] [,3]
## [1,] -0.1942776 -1.1997100 0
## [2,] 1.0308856 0.4202639 0
## [3,] 1.3633620 -0.3265395 0
## [4,] 1.2072968 0.2131221 0
## [5,] 1.3447169 1.4683130 0
## [6,] 0.9727523 -0.2638756 0
# Same for ORANGE, with class label 1.
data_orange <- replicate(n = N/2, c(mvrnorm(n = 1, mean_orange[sample(nrow(mean_orange), 1), ], diag(2)/5), 1))
data_orange <- t(data_orange)
head(data_orange)
## [,1] [,2] [,3]
## [1,] -0.5978233 1.60612174 1
## [2,] 0.5049599 2.35326310 1
## [3,] 2.2604100 -0.07646153 1
## [4,] -0.5130095 2.69908829 1
## [5,] -1.7120784 2.13035703 1
## [6,] 1.2857254 2.08849933 1
mix_data <- rbind(data_blue, data_orange)
DF_mix_data <- as.data.frame(mix_data)
colnames(DF_mix_data) <- c('cx', 'cy', 'Y')
# Shuffle the rows so the two classes are interleaved.
DF_mix_data <- DF_mix_data[sample(nrow(DF_mix_data)), ]
head(DF_mix_data)
## cx cy Y
## 53 1.277968461 -0.8232594 0
## 86 2.000630527 1.8234847 0
## 81 1.221200390 1.6764360 0
## 139 -0.606239240 0.4919691 1
## 197 -1.663527863 2.1660482 1
## 126 -0.007252589 2.2335074 1
p4 <- ggplot(DF_mix_data[, 1:2], aes(x = cx, y = cy)) +
  geom_point(colour = ifelse(DF_mix_data[, 3] == 1, "orange", "blue")) +
  theme_bw() + xlim(-3, 4) + ylim(-3, 3)
p4
This is how we can generate our own data set. For demonstration purposes, I am going to use an existing simulated data set like the one above.
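If you would rather reuse a particular simulated draw, one option (a sketch; the file name here is hypothetical) is to write DF_mix_data to disk first and read it back later:
# Save the simulated draw so the exact same data can be reloaded later.
write.csv(DF_mix_data, "my_training_data.csv", row.names = FALSE)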
training_data <- read_excel("~/Desktop/OrabyHW/FunPub/training_data.xlsx")
training_data <- data.frame(training_data)
head(training_data)
##            cx        cy Y
## 1 -1.04751736 2.1215257 1
## 2  0.91731081 2.0648852 1
## 3 -0.03427192 1.9459291 1
## 4  1.86865342 0.8910806 1
## 5  0.34466477 0.4875041 1
## 6  1.17014450 0.7001344 1
LR<-lm(Y~cx+cy,data=training_data)
summary(LR)
## Call:
## lm(formula = Y ~ cx + cy, data = training_data)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -0.79628 -0.20259 -0.01516  0.20987  0.78028
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  0.52390    0.03092  16.942  < 2e-16 ***
## cx          -0.20001    0.01979 -10.109  < 2e-16 ***
## cy           0.13706    0.01787   7.669 7.78e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3461 on 197 degrees of freedom
## Multiple R-squared: 0.5282, Adjusted R-squared: 0.5234
## F-statistic: 110.3 on 2 and 197 DF, p-value: < 2.2e-16
SDiffLR<-(training_data[,3]-predict(LR,training_data[,1:2]))^2
MSE<- mean(SDiffLR)
The above linear model estimates \(\beta_{1}\) and \(\beta_{2}\) with a standard error of about 0.02 for both parameters, and the training MSE is 0.12.
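Since the response is a 0/1 label, the squared-error MSE above can be complemented with a misclassification rate. A minimal sketch (LR_class is an illustrative name, not part of the original code), thresholding the fitted values at 0.5:
# Classify as ORANGE (1) when the fitted value exceeds 0.5, BLUE (0) otherwise.
LR_class <- (predict(LR, training_data[, 1:2]) > 0.5) + 0
mean(LR_class != training_data[, 3])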
beta <- coef(LR)
# Build a 100 x 100 mesh over the plotting region and classify each mesh point
# by thresholding the linear-regression prediction at 0.5.
meshpixel <- 100
x_pixel <- seq(-3, 4, length = meshpixel)
y_pixel <- seq(-3, 3, length = meshpixel)
M <- expand.grid(x_pixel, y_pixel)
DF_M <- as.data.frame(M)
colnames(DF_M) <- c('cx', 'cy')
color_M <- (predict(LR, DF_M) > .5) + 0
new_M_LR <- cbind.data.frame(DF_M, color_M)
p5 <- p4 +
  geom_point(data = new_M_LR[, 1:2], aes(x = cx, y = cy),
             colour = ifelse(new_M_LR[, 3] == 1, "orange", "blue"), alpha = .03) +
  geom_abline(aes(intercept = (.5 - beta[1])/beta[3], slope = -beta[2]/beta[3]))
p5
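For reference, the black line drawn by geom_abline is the decision boundary \(\{(x, y) : \hat{\beta}_0 + \hat{\beta}_1 x + \hat{\beta}_2 y = 0.5\}\). Solving for \(y\) gives intercept \((0.5 - \hat{\beta}_0)/\hat{\beta}_2\) and slope \(-\hat{\beta}_1/\hat{\beta}_2\), which is exactly what the intercept and slope arguments encode.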
Find_K <- function(K, M = 1000) {
  MSE <- 0
  for (i in 1:M) {
    # Hold out 40 of 200 observations (a 5-fold-style 80/20 split) for Monte Carlo CV.
    train <- sample(200, 200 - 40)
    KNN <- knn(training_data[train, 1:2], training_data[-train, 1:2],
               training_data[train, 3], k = K, prob = TRUE, use.all = FALSE)
    MSE <- MSE + mean(KNN != training_data[-train, 3])
  }
  CVK <- MSE/M
  return(CVK)
}
Find_K(15)
## [1] 0.173575
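Note that the split inside Find_K is Monte Carlo: each of the M repeats holds out a random 20%. For comparison, here is a minimal sketch of a true 5-fold CV, where each observation is held out exactly once per pass; Find_K_5fold and folds are illustrative names, not part of the original code.
Find_K_5fold <- function(K) {
  # Randomly assign each observation to one of 5 folds.
  folds <- sample(rep(1:5, length.out = nrow(training_data)))
  err <- numeric(5)
  for (f in 1:5) {
    # Train on four folds, test on the held-out fold.
    KNN <- knn(training_data[folds != f, 1:2], training_data[folds == f, 1:2],
               training_data[folds != f, 3], k = K, use.all = FALSE)
    err[f] <- mean(KNN != training_data[folds == f, 3])
  }
  mean(err)
}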
LK <- 1:30
vCVK <- LK %>%
  map_dbl(Find_K)
vCVK <- as.data.frame(vCVK)
colnames(vCVK) <- c("CVK")
We used 5-fold-style Monte Carlo cross-validation (MCCV) to find the best K, i.e., the K with the lowest CV misclassification error.
ggplot() + geom_point(data = vCVK, aes(x = LK, y = CVK)) + xlab("K") + ylab("CV")
The CV error decreases as K increases, up to a certain value, and then increases again.
LK[which.min(vCVK$CVK)]
## [1] 7
K = 7 gives the minimum CV error.
KNN7 <- knn(training_data[, 1:2], training_data[, 1:2], training_data[, 3], k = 7,
prob = TRUE, use.all = TRUE)
MSE_KNN7 <- mean(KNN7 != training_data[, 3])
Performing KNN with the seven nearest neighbors gives us a training error of 0.12.
KNN7_boundary <- knn(training_data[, 1:2], DF_M, training_data[, 3],
                     k = 7, prob = TRUE, use.all = TRUE)
# Bind "DF_M" and "KNN7_boundary" side by side into a data frame named "new_M_KNN".
# This data frame contains the 10000 mesh points (x, y) and the corresponding predictions.
new_M_KNN <- cbind.data.frame(DF_M, KNN7_boundary)
# Reuse the previous colored scatter plot "p4", add a layer to it, and store the result in "p6".
# The layer colors each mesh point according to column 3 of "new_M_KNN":
# orange if the value is 1, blue otherwise.
p6 <- p4 +
  geom_point(data = new_M_KNN[, 1:2], aes(x = cx, y = cy),
             colour = ifelse(new_M_KNN[, 3] == 1, "orange", "blue"), alpha = .05)
# Extract the proportion of neighbor votes for the winning class from
# "KNN7_boundary" and store it in "pr1".
pr1 <- attr(KNN7_boundary, "prob")
# To draw the boundary in the right place, flip the probability so it always
# refers to the same class: if KNN7_boundary == 1, use 1 - pr1; otherwise use pr1.
# Store the result in "pr2".
pr2 <- ifelse(KNN7_boundary == "1", 1 - pr1, pr1)
# Bind "DF_M" and "pr2" side by side into the data frame "DF_M_KNN".
DF_M_KNN <- cbind(DF_M, pr2)
# Add a contour layer to "p6" and save the result as "p7".
# The contour is the decision boundary between the two groups based on "pr2":
# blue where pr2 <= .5, orange where it is greater.
p7 <- p6 +
  geom_contour(data = DF_M_KNN, aes(x = cx, y = cy, z = pr2),
               binwidth = .51, lwd = .5, colour = "black")
p7
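A side note on binwidth = .51: since pr2 lies in [0, 1], that bin width leaves essentially a single contour level near .5, which is why only the decision boundary is drawn. An equivalent, arguably more explicit, sketch passes breaks = .5 directly (p7b is an illustrative name, not from the original code):
# Draw only the 0.5 probability contour, i.e. the decision boundary.
p7b <- p6 +
  geom_contour(data = DF_M_KNN, aes(x = cx, y = cy, z = pr2),
               breaks = .5, lwd = .5, colour = "black")
p7b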
KNN1 <- knn(training_data[, 1:2], training_data[, 1:2], training_data[, 3], k = 1,
prob = TRUE, use.all = TRUE)
mean(KNN1 != training_data[, 3])
## [1] 0
As expected, using K = 1 gives a training error of 0: each training point is its own nearest neighbor, so the model simply memorizes the training data and overfits. In theory, such a model should do worse during testing, because each prediction hinges on a single nearby observation, so it will generalize poorly to data it has not seen.
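To see the overfitting concretely, one option (a sketch, not part of the original analysis) is leave-one-out cross-validation via knn.cv from the class package, which classifies each point while excluding it from its own neighborhood:
# Leave-one-out CV: each point is classified by its nearest neighbor other than itself.
KNN1_loo <- knn.cv(training_data[, 1:2], training_data[, 3], k = 1)
mean(KNN1_loo != training_data[, 3])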
KNN1_boundary <- knn(training_data[, 1:2], DF_M, training_data[, 3],
                     k = 1, prob = TRUE, use.all = TRUE)
new_M_KNN1 <- cbind.data.frame(DF_M, KNN1_boundary)
p8 <- p4 +
  geom_point(data = new_M_KNN1[, 1:2], aes(x = cx, y = cy),
             colour = ifelse(new_M_KNN1[, 3] == 1, "orange", "blue"), alpha = .05)
KNN1pr1 <- attr(KNN1_boundary, "prob")
KNN1pr2 <- ifelse(KNN1_boundary == "1", 1 - KNN1pr1, KNN1pr1)
DF_M_KNN1 <- cbind(DF_M, KNN1pr2)
p9 <- p8 +
  geom_contour(data = DF_M_KNN1, aes(x = cx, y = cy, z = KNN1pr2),
               binwidth = .51, lwd = .5, colour = "black")
p9
test_data <- as.data.frame(read_excel("~/Desktop/OrabyHW/FunPub/testing_data.xlsx"))
head(test_data)
## cx cy Y
## 1 1.1893464 -0.2660198 0
## 2 -2.6182795 2.7170750 1
## 3 -0.5477458 -1.6125951 0
## 4 1.2072968 0.2131221 0
## 5 1.7328078 1.8899233 1
## 6 -1.7083489 0.9357727 1
SDiff <- (test_data[, 3] - predict(LR, test_data[, 1:2]))^2
MSELRt <- mean(SDiff)
Applying the LR model to the test data, we see a slight improvement in the MSE, which is now 0.11.
KNN7t <- knn(training_data[, 1:2], test_data[, 1:2], training_data[, 3], k = 7, prob = TRUE,
use.all = TRUE)
MSEKNN7t <- mean(KNN7t != test_data[, 3])
Applying KNN with K = 7 to the test data, the error is now 0.2.
KNN1t <- knn(training_data[, 1:2], test_data[, 1:2], training_data[, 3], k = 1, prob = TRUE,
use.all = TRUE)
MSEKNN1t <- mean(KNN1t != test_data[, 3])
Applying KNN with K = 1 to the test data, the error has now increased (from 0 on the training data) to 0.1.
Season 2: In season 2 there will be four more contestants: contestant 4 performing logistic regression, contestant 5 performing LDA, contestant 6 performing QDA, and contestant 7 performing naïve Bayes.
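As a teaser, here is a minimal sketch of what contestant 4 might run on the same training data (LogR is an illustrative name; this is not part of this season's code):
# Logistic regression on the same two predictors.
LogR <- glm(Y ~ cx + cy, family = binomial, data = training_data)
summary(LogR)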
Comments:
Both LR and KNN with K = 1 have a smaller testing error than KNN with K = 7, and the two are nearly tied with each other. However, being a parametric model, LR offers simplicity and better interpretability. So, in my opinion, contestant 1 wins!
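For the record, a small sketch that gathers the test errors computed above into one table (the data frame itself is illustrative, not from the original code):
data.frame(Contestant = 1:3,
           Method = c("Linear regression", "KNN, K = 7", "KNN, K = 1"),
           Test_error = c(MSELRt, MSEKNN7t, MSEKNN1t))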