For fun, let's open a contest! Today there will be three participants, each demonstrating their predictive modeling skills with a different approach. Later, as judges, we will evaluate who wins the contest.
We will run this contest on a simulated data set. First we generate 10 means \(m_k\) from a bivariate Gaussian distribution \(N((1, 0)^T , I)\) and label this class BLUE. Similarly, 10 more means are drawn from \(N((0, 1)^T , I)\) and labeled class ORANGE. Then, for each class, we generate 100 observations as follows: for each observation, we pick an \(m_k\) at random with probability 1/10 and then generate one draw from \(N(m_k, I/5)\), leading to a mixture of Gaussian clusters for each class. The contestants will then use several classification methods to separate these two classes. This setup was originally introduced in the book "The Elements of Statistical Learning" by Hastie, Tibshirani, and Friedman.
set.seed(5)
library("MASS")
library("ggplot2")
library("gridExtra")
library("class")
library("purrr")
library("readxl")
These are the 10 bivariate normal means for class BLUE.
mean_blue <- mvrnorm(n = 10, c(1, 0), diag(2))
DF_mean_blue <- as.data.frame(mean_blue)
colnames(DF_mean_blue) <- c("cx", "cy")
head(DF_mean_blue)
## cx cy
## 1 -0.2276303 -0.84085548
## 2 1.8017795 1.38435934
## 3 2.0803926 -1.25549186
## 4 1.1575344 0.07014277
## 5 2.0717600 1.71144087
## 6 1.1389861 -0.60290798
These are the 10 bivariate normal means for class ORANGE.
mean_orange <- mvrnorm(n = 10, c(0, 1), diag(2))
DF_mean_orange <- as.data.frame(mean_orange)
colnames(DF_mean_orange) <- c("cx", "cy")
head(DF_mean_orange)
## cx cy
## 1 -0.3159150 1.9005119
## 2 -1.1096942 1.9418694
## 3 -2.2154606 2.4679619
## 4 -1.2171036 1.7067611
## 5 -1.4792218 1.8190089
## 6 -0.9515738 0.7065182
p1 <- ggplot(DF_mean_blue, aes(x = cx, y = cy)) +
  geom_point(colour = "blue", shape = 23) +
  xlab('x') +
  ylab('y')
p2 <- ggplot(DF_mean_orange, aes(x = cx, y = cy)) +
  geom_point(colour = "orange", shape = 23) +
  xlab('x') +
  ylab('y')
p3 <- p1 +
  geom_point(data = DF_mean_orange, aes(x = cx, y = cy), colour = "orange", shape = 23)
grid.arrange(p1, p2, p3, nrow = 1, ncol = 3)
N <- 200
# For each BLUE observation, pick one of the 10 blue means at random and draw
# one point from N(m_k, I/5); the appended 0 is the class label.
data_blue <- replicate(n = N/2, c(mvrnorm(n = 1, mean_blue[sample(nrow(mean_blue), 1), ], diag(2)/5), 0))
data_blue <- t(data_blue)
head(data_blue)
## [,1] [,2] [,3]
## [1,] -0.1942776 -1.1997100 0
## [2,] 1.0308856 0.4202639 0
## [3,] 1.3633620 -0.3265395 0
## [4,] 1.2072968 0.2131221 0
## [5,] 1.3447169 1.4683130 0
## [6,] 0.9727523 -0.2638756 0
# Same for ORANGE, with class label 1.
data_orange <- replicate(n = N/2, c(mvrnorm(n = 1, mean_orange[sample(nrow(mean_orange), 1), ], diag(2)/5), 1))
data_orange <- t(data_orange)
head(data_orange)
## [,1] [,2] [,3]
## [1,] -0.5978233 1.60612174 1
## [2,] 0.5049599 2.35326310 1
## [3,] 2.2604100 -0.07646153 1
## [4,] -0.5130095 2.69908829 1
## [5,] -1.7120784 2.13035703 1
## [6,] 1.2857254 2.08849933 1
mix_data <- rbind(data_blue, data_orange)
DF_mix_data <- as.data.frame(mix_data)
colnames(DF_mix_data) <- c('cx', 'cy', 'Y')
# Shuffle the rows so the two classes are interleaved.
DF_mix_data <- DF_mix_data[sample(nrow(DF_mix_data)), ]
head(DF_mix_data)
## cx cy Y
## 53 1.277968461 -0.8232594 0
## 86 2.000630527 1.8234847 0
## 81 1.221200390 1.6764360 0
## 139 -0.606239240 0.4919691 1
## 197 -1.663527863 2.1660482 1
## 126 -0.007252589 2.2335074 1
p4 <- ggplot(DF_mix_data[, 1:2], aes(x = cx, y = cy)) +
  geom_point(colour = ifelse(DF_mix_data[, 3] == 1, "orange", "blue")) +
  theme_bw() + xlim(-3, 4) + ylim(-3, 3)
p4
This is how we can generate our own data set. For demonstration purposes, I am going to use an existing simulated data set like the one above.
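If you would rather reuse a particular simulated draw, one option (a sketch; the file name here is hypothetical) is to write DF_mix_data to disk first and read it back later:
# Save the simulated draw so the exact same data can be reloaded later.
write.csv(DF_mix_data, "my_training_data.csv", row.names = FALSE)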
training_data <- read_excel("~/Desktop/OrabyHW/FunPub/training_data.xlsx")
training_data <- data.frame(training_data)
head(training_data)
##            cx        cy Y
## 1 -1.04751736 2.1215257 1
## 2  0.91731081 2.0648852 1
## 3 -0.03427192 1.9459291 1
## 4  1.86865342 0.8910806 1
## 5  0.34466477 0.4875041 1
## 6  1.17014450 0.7001344 1
LR<-lm(Y~cx+cy,data=training_data)
summary(LR)
## Call:
## lm(formula = Y ~ cx + cy, data = training_data)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -0.79628 -0.20259 -0.01516  0.20987  0.78028
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  0.52390    0.03092  16.942  < 2e-16 ***
## cx          -0.20001    0.01979 -10.109  < 2e-16 ***
## cy           0.13706    0.01787   7.669 7.78e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3461 on 197 degrees of freedom
## Multiple R-squared: 0.5282, Adjusted R-squared: 0.5234
## F-statistic: 110.3 on 2 and 197 DF, p-value: < 2.2e-16
SDiffLR<-(training_data[,3]-predict(LR,training_data[,1:2]))^2
MSE<- mean(SDiffLR)
The above linear model estimates \(\beta_{1}\) and \(\beta_{2}\) with a standard error of about 0.02 for both parameters, and the training MSE is 0.12.
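Since the response is a 0/1 label, the squared-error MSE above can be complemented with a misclassification rate. A minimal sketch (LR_class is an illustrative name, not part of the original code), thresholding the fitted values at 0.5:
# Classify as ORANGE (1) when the fitted value exceeds 0.5, BLUE (0) otherwise.
LR_class <- (predict(LR, training_data[, 1:2]) > 0.5) + 0
mean(LR_class != training_data[, 3])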
beta <- coef(LR)
# Build a 100 x 100 mesh over the plotting region and classify each mesh point
# by thresholding the linear-regression prediction at 0.5.
meshpixel <- 100
x_pixel <- seq(-3, 4, length = meshpixel)
y_pixel <- seq(-3, 3, length = meshpixel)
M <- expand.grid(x_pixel, y_pixel)
DF_M <- as.data.frame(M)
colnames(DF_M) <- c('cx', 'cy')
color_M <- (predict(LR, DF_M) > .5) + 0
new_M_LR <- cbind.data.frame(DF_M, color_M)
p5 <- p4 +
  geom_point(data = new_M_LR[, 1:2], aes(x = cx, y = cy),
             colour = ifelse(new_M_LR[, 3] == 1, "orange", "blue"), alpha = .03) +
  geom_abline(aes(intercept = (.5 - beta[1])/beta[3], slope = -beta[2]/beta[3]))
p5
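For reference, the black line drawn by geom_abline is the decision boundary \(\{(x, y) : \hat{\beta}_0 + \hat{\beta}_1 x + \hat{\beta}_2 y = 0.5\}\). Solving for \(y\) gives intercept \((0.5 - \hat{\beta}_0)/\hat{\beta}_2\) and slope \(-\hat{\beta}_1/\hat{\beta}_2\), which is exactly what the intercept and slope arguments encode.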
Find_K <- function(K, M = 1000) {
  MSE <- 0
  for (i in 1:M) {
    # Hold out 40 of 200 observations (a 5-fold-style 80/20 split) for Monte Carlo CV.
    train <- sample(200, 200 - 40)
    KNN <- knn(training_data[train, 1:2], training_data[-train, 1:2],
               training_data[train, 3], k = K, prob = TRUE, use.all = FALSE)
    MSE <- MSE + mean(KNN != training_data[-train, 3])
  }
  CVK <- MSE/M
  return(CVK)
}
Find_K(15)
## [1] 0.173575
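Note that the split inside Find_K is Monte Carlo: each of the M repeats holds out a random 20%. For comparison, here is a minimal sketch of a true 5-fold CV, where each observation is held out exactly once per pass; Find_K_5fold and folds are illustrative names, not part of the original code.
Find_K_5fold <- function(K) {
  # Randomly assign each observation to one of 5 folds.
  folds <- sample(rep(1:5, length.out = nrow(training_data)))
  err <- numeric(5)
  for (f in 1:5) {
    # Train on four folds, test on the held-out fold.
    KNN <- knn(training_data[folds != f, 1:2], training_data[folds == f, 1:2],
               training_data[folds != f, 3], k = K, use.all = FALSE)
    err[f] <- mean(KNN != training_data[folds == f, 3])
  }
  mean(err)
}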
LK <- 1:30
vCVK <- LK %>%
  map_dbl(Find_K)
vCVK <- as.data.frame(vCVK)
colnames(vCVK) <- c("CVK")
We used 5-fold-style Monte Carlo cross-validation (MCCV) to find the best K, i.e., the K with the lowest CV misclassification error.
ggplot() + geom_point(data = vCVK, aes(x = LK, y = CVK)) + xlab("K") + ylab("CV")
The CV error decreases as K increases, up to a certain value, and then increases again.
LK[which.min(vCVK$CVK)]
## [1] 7
K = 7 gives the minimum CV error.
KNN7 <- knn(training_data[, 1:2], training_data[, 1:2], training_data[, 3], k = 7,
prob = TRUE, use.all = TRUE)
MSE_KNN7 <- mean(KNN7 != training_data[, 3])
Performing KNN with the seven nearest neighbors gives us a training error of 0.12.
KNN7_boundary <- knn(training_data[, 1:2], DF_M, training_data[, 3],
                     k = 7, prob = TRUE, use.all = TRUE)
# Bind "DF_M" and "KNN7_boundary" side by side into a data frame named "new_M_KNN".
# This data frame contains the 10000 mesh points (x, y) and the corresponding predictions.
new_M_KNN <- cbind.data.frame(DF_M, KNN7_boundary)
# Reuse the previous colored scatter plot "p4", add a layer to it, and store the result in "p6".
# The layer colors each mesh point according to column 3 of "new_M_KNN":
# orange if the value is 1, blue otherwise.
p6 <- p4 +
  geom_point(data = new_M_KNN[, 1:2], aes(x = cx, y = cy),
             colour = ifelse(new_M_KNN[, 3] == 1, "orange", "blue"), alpha = .05)
# Extract the proportion of neighbor votes for the winning class from
# "KNN7_boundary" and store it in "pr1".
pr1 <- attr(KNN7_boundary, "prob")
# To draw the boundary in the right place, flip the probability so it always
# refers to the same class: if KNN7_boundary == 1, use 1 - pr1; otherwise use pr1.
# Store the result in "pr2".
pr2 <- ifelse(KNN7_boundary == "1", 1 - pr1, pr1)
# Bind "DF_M" and "pr2" side by side into the data frame "DF_M_KNN".
DF_M_KNN <- cbind(DF_M, pr2)
# Add a contour layer to "p6" and save the result as "p7".
# The contour is the decision boundary between the two groups based on "pr2":
# blue where pr2 <= .5, orange where it is greater.
p7 <- p6 +
  geom_contour(data = DF_M_KNN, aes(x = cx, y = cy, z = pr2),
               binwidth = .51, lwd = .5, colour = "black")
p7
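A side note on binwidth = .51: since pr2 lies in [0, 1], that bin width leaves essentially a single contour level near .5, which is why only the decision boundary is drawn. An equivalent, arguably more explicit, sketch passes breaks = .5 directly (p7b is an illustrative name, not from the original code):
# Draw only the 0.5 probability contour, i.e. the decision boundary.
p7b <- p6 +
  geom_contour(data = DF_M_KNN, aes(x = cx, y = cy, z = pr2),
               breaks = .5, lwd = .5, colour = "black")
p7b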
KNN1 <- knn(training_data[, 1:2], training_data[, 1:2], training_data[, 3], k = 1,
prob = TRUE, use.all = TRUE)
mean(KNN1 != training_data[, 3])
## [1] 0
As expected, using K = 1 gives a training error of 0: each training point is its own nearest neighbor, so the model simply memorizes the training data and overfits. In theory, such a model should do worse during testing, because each prediction hinges on a single nearby observation, so it will generalize poorly to data it has not seen.
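To see the overfitting concretely, one option (a sketch, not part of the original analysis) is leave-one-out cross-validation via knn.cv from the class package, which classifies each point while excluding it from its own neighborhood:
# Leave-one-out CV: each point is classified by its nearest neighbor other than itself.
KNN1_loo <- knn.cv(training_data[, 1:2], training_data[, 3], k = 1)
mean(KNN1_loo != training_data[, 3])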
KNN1_boundary <- knn(training_data[, 1:2], DF_M, training_data[, 3],
                     k = 1, prob = TRUE, use.all = TRUE)
new_M_KNN1 <- cbind.data.frame(DF_M, KNN1_boundary)
p8 <- p4 +
  geom_point(data = new_M_KNN1[, 1:2], aes(x = cx, y = cy),
             colour = ifelse(new_M_KNN1[, 3] == 1, "orange", "blue"), alpha = .05)
KNN1pr1 <- attr(KNN1_boundary, "prob")
KNN1pr2 <- ifelse(KNN1_boundary == "1", 1 - KNN1pr1, KNN1pr1)
DF_M_KNN1 <- cbind(DF_M, KNN1pr2)
p9 <- p8 +
  geom_contour(data = DF_M_KNN1, aes(x = cx, y = cy, z = KNN1pr2),
               binwidth = .51, lwd = .5, colour = "black")
p9
test_data <- as.data.frame(read_excel("~/Desktop/OrabyHW/FunPub/testing_data.xlsx"))
head(test_data)
## cx cy Y
## 1 1.1893464 -0.2660198 0
## 2 -2.6182795 2.7170750 1
## 3 -0.5477458 -1.6125951 0
## 4 1.2072968 0.2131221 0
## 5 1.7328078 1.8899233 1
## 6 -1.7083489 0.9357727 1
SDiff <- (test_data[, 3] - predict(LR, test_data[, 1:2]))^2
MSELRt <- mean(SDiff)
Applying the LR model to the test data, we see a slight improvement in the MSE, which is now 0.11.
KNN7t <- knn(training_data[, 1:2], test_data[, 1:2], training_data[, 3], k = 7, prob = TRUE,
use.all = TRUE)
MSEKNN7t <- mean(KNN7t != test_data[, 3])
Applying KNN with K = 7 to the test data, the error is now 0.2.
KNN1t <- knn(training_data[, 1:2], test_data[, 1:2], training_data[, 3], k = 1, prob = TRUE,
use.all = TRUE)
MSEKNN1t <- mean(KNN1t != test_data[, 3])
Applying KNN with K = 1 to the test data, the error has now increased (from 0 on the training data) to 0.1.
Season 2: In season 2 there will be four more contestants: contestant 4 performing logistic regression, contestant 5 performing LDA, contestant 6 performing QDA, and contestant 7 performing naïve Bayes.
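As a teaser, here is a minimal sketch of what contestant 4 might run on the same training data (LogR is an illustrative name; this is not part of this season's code):
# Logistic regression on the same two predictors.
LogR <- glm(Y ~ cx + cy, family = binomial, data = training_data)
summary(LogR)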
Comments:
Both LR and KNN with K = 1 have a smaller testing error than KNN with K = 7, and the two are nearly tied with each other. However, being a parametric model, LR offers simplicity and better interpretability. So, in my opinion, contestant 1 wins!
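For the record, a small sketch that gathers the test errors computed above into one table (the data frame itself is illustrative, not from the original code):
data.frame(Contestant = 1:3,
           Method = c("Linear regression", "KNN, K = 7", "KNN, K = 1"),
           Test_error = c(MSELRt, MSEKNN7t, MSEKNN1t))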