Theme Song
# install.packages("remotes")
library('BBmisc', 'rmsfuns')
#remotes::install_github("rstudio/sass")
lib('sass')
## sass
## TRUE
/* https://stackoverflow.com/a/66029010/3806250 */
h1 { color: #002C54; }
h2 { color: #2F496E; }
h3 { color: #375E97; }
h4 { color: #556DAC; }
h5 { color: #92AAC7; }
/* ----------------------------------------------------------------- */
/* https://gist.github.com/himynameisdave/c7a7ed14500d29e58149#file-broken-gradient-animation-less */
.hover01 {
/* color: #FFD64D; */
background: linear-gradient(155deg, #EDAE01 0%, #FFEB94 100%);
transition: all 0.45s;
&:hover{
background: linear-gradient(155deg, #EDAE01 20%, #FFEB94 80%);
}
}
.hover02 {
color: #FFD64D;
background: linear-gradient(155deg, #002C54 0%, #4CB5F5 100%);
transition: all 0.45s;
&:hover{
background: linear-gradient(155deg, #002C54 20%, #4CB5F5 80%);
}
}
.hover03 {
color: #FFD64D;
background: linear-gradient(155deg, #A10115 0%, #FF3C5C 100%);
transition: all 0.45s;
&:hover{
background: linear-gradient(155deg, #A10115 20%, #FF3C5C 80%);
}
}
## https://stackoverflow.com/a/36846793/3806250
options(width = 999)
knitr::opts_chunk$set(class.source = 'hover01', class.output = 'hover02', class.error = 'hover03')
if(!suppressPackageStartupMessages(require('BBmisc'))) {
install.packages('BBmisc', dependencies = TRUE, INSTALL_opts = '--no-lock')
}
suppressPackageStartupMessages(require('BBmisc'))
# suppressPackageStartupMessages(require('rmsfuns'))
pkgs <- c('devtools', 'knitr', 'kableExtra', 'tidyr',
'readr', 'lubridate', 'reprex', 'magrittr',
'timetk', 'plyr', 'dplyr', 'stringr',
'tdplyr', 'tidyverse', 'formattable',
'echarts4r', 'paletteer')
suppressAll(lib(pkgs))
# load_pkg(pkgs)
## Set the timezone but not change the datetime
Sys.setenv(TZ = 'Asia/Tokyo')
## options(knitr.table.format = 'html') will set all kableExtra tables to be 'html', otherwise need to set the parameter on every single table.
options(warn = -1, knitr.table.format = 'html')#, digits.secs = 6)
## https://stackoverflow.com/questions/39417003/long-vectors-not-supported-yet-abnor-in-rmd-but-not-in-r-script
knitr::opts_chunk$set(message = FALSE, warning = FALSE)#,
#cache = TRUE, cache.lazy = FALSE)
rm(pkgs)
課題をすぐに提出してください
課題の提出期限は、6月14日 15:59 JST
ですが、可能であれば1日か2日早く提出してください。 早い段階で提出すると、他の受講生のレビューを時間内に得る可能性が高くなります。
The data set banknote contains six measurements (length of bill, width of left edge, width of right edge, bottom margin width, top margin width, and length of diagonal, all in mm) made on 100 genuine and 100 counterfeit old-Swiss 1000-franc bank notes:
To load the dataset in R, use the command load("banknoteclassification.Rdata")
, but first make sure that your working directory is set to the directory containing the file. You should see four objects:
banknote.training
contains the characteristics for 30 notes (15 genuine and 15 counterfeit) in the training set.banknote.training.labels
contains the labels (“genuine” or “counterfeit”) for the 30 notes in the training setbanknote.test
contains the characteristics for 170 notes (85 genuine and 85 counterfeit) in the test set.banknote.test.labels
contains the labels (“genuine” or “counterfeit”) for the 170 notes in the test set. These are provided only for validation purposes.qda
function in R.As your priors, use distributions in the same families as in “Sample code for MCMC example 2”. In particular, use a uniform distribution for the weights, multivariate normal distributions for the means of the components, and inverse Wishart priors for the variance-covariance matrices of the components. The parameters of the priors should be set using the same empirical Bayes approach used in that example.
Reviewers will check that the code has been properly adapted, and whether the classification results you provide are correct.
Provide an MCMC algorithm to fit a semisupervised Bayesian quadratic discriminant model to the banknote data.
## load packages
# lib('mvtnorm', 'dae', 'Rfast', 'splus2R')
suppressAll(lib('EDISON', 'extraDistr', 'MCMCpack', 'invgamma'))
lib('mvtnorm')
## mvtnorm
## TRUE
## load data
# load('data/banknoteclassification.RData')
data = load("data/banknoteclassification.Rdata")
KK = length(unique(banknote.training.labels))
n = dim(banknote.training)[1] # Size of the training set
m = dim(banknote.test)[1] # Size of the test set
x = rbind(banknote.training, banknote.test) # Create dataset of observations, first n belong to the training set, and the rest belong to the test set
p = dim(x)[2] # Number of features
## Initialize the parameters
w = rep(1,KK)/KK #Assign equal weight to each component to start with
mu = rmvnorm(KK, apply(x,2,mean), var(x)) #RandomCluster centers randomly spread over the support of the data
Sigma = array(0, dim=c(KK,p,p)) #Initial variances are assumed to be the same
Sigma[1,,] = var(x)/KK
Sigma[2,,] = var(x)/KK
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
# Priors
aa = rep(1, KK)
dd = apply(x,2,mean)
DD = 10*var(x)
nu = p+1
SS = var(x)/3
# Number of iteration of the sampler
rrr = 11000
burn = 1000
# Storing the samples
cc.out = array(0, dim=c(rrr, n+m))
w.out = array(0, dim=c(rrr, KK))
mu.out = array(0, dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0, rrr)
for(s in 1:rrr){
# Sample the indicators
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
w = as.vector(rdirichlet(1, aa + tabulate(cc)))
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:n+m){
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:n){
logpost[s] = logpost[s] + log(w[cc[i]]) + dmvnorm(x[i,], mu[cc[i],], Sigma[cc[i],,], log=TRUE)
}
logpost[s] = logpost[s] + ddirichlet(w, aa)
for(k in 1:KK){
logpost[s] = logpost[s] + dmvnorm(mu[k,], dd, DD)
logpost[s] = logpost[s] + diwish(Sigma[k,,], nu, SS)
}
if(s/250==floor(s/250)){
print(paste("s = ", s))
}
}
## [1] "s = 250"
## [1] "s = 500"
## [1] "s = 750"
## [1] "s = 1000"
## [1] "s = 1250"
## [1] "s = 1500"
## [1] "s = 1750"
## [1] "s = 2000"
## [1] "s = 2250"
## [1] "s = 2500"
## [1] "s = 2750"
## [1] "s = 3000"
## [1] "s = 3250"
## [1] "s = 3500"
## [1] "s = 3750"
## [1] "s = 4000"
## [1] "s = 4250"
## [1] "s = 4500"
## [1] "s = 4750"
## [1] "s = 5000"
## [1] "s = 5250"
## [1] "s = 5500"
## [1] "s = 5750"
## [1] "s = 6000"
## [1] "s = 6250"
## [1] "s = 6500"
## [1] "s = 6750"
## [1] "s = 7000"
## [1] "s = 7250"
## [1] "s = 7500"
## [1] "s = 7750"
## [1] "s = 8000"
## [1] "s = 8250"
## [1] "s = 8500"
## [1] "s = 8750"
## [1] "s = 9000"
## [1] "s = 9250"
## [1] "s = 9500"
## [1] "s = 9750"
## [1] "s = 10000"
## [1] "s = 10250"
## [1] "s = 10500"
## [1] "s = 10750"
## [1] "s = 11000"
What is the classification error for the test set?
# 0 observations are misclassified
error.genuine = 0
error.count = 0
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
probcounterfeit = rep(NA, m)
for(i in 1:m){
probcounterfeit[i] = sum(cc.out[-seq(1,burn),n+i]==1)/(rrr-burn)
}
error.genuine = 1-sum(probgenuine[banknote.test.labels=="genuine"]>=probcounterfeit[banknote.test.labels=="genuine"])/length(banknote.test.labels[banknote.test.labels=="genuine"])
error.count = 1-sum(probgenuine[banknote.test.labels=="counterfeit"]<probcounterfeit[banknote.test.labels=="counterfeit"])/length(banknote.test.labels[banknote.test.labels=="counterfeit"])
Is the R function qda
(which implements classical quadratic discriminant analysis) is used to classify the observations in the test set, what is the classification error?
# 3 observations are misclassified
# 0 for genuine
# 3 for counterfeit
# Using the qda and lda functions in R
# qda
modqda = qda(grouping=banknote.training.labels, x=banknote.training, method="mle")
ccpredqda = predict(modqda,newdata=banknote.test)
ccpredqda$class
## [1] genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine
## [83] genuine genuine genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## [165] counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## Levels: counterfeit genuine
sum(!(ccpredqda$class == banknote.test.labels)) # Three error
## [1] 3
sum(!ccpredqda$class[banknote.test.labels=="genuine"] == "genuine")
## [1] 0
sum(!ccpredqda$class[banknote.test.labels=="counterfeit"] == "counterfeit")
## [1] 3
Is the setup of the algorithm correct?
Recall that the semisupervised version of the algorithm uses all observations but treats the labels for he training set as known. There are many ways to set this up, but one that requires minimal changes to the algorithm defines the
## All observations used for calculation
x = rbind(banknote.training,banknote.test)
## Size of the training and test sets
n = dim(banknote.training)[1]
m = dim(unique(banknote.test))[1]
## Number of components and dimensionality of the components
KK = length(unique(banknote.training.labels))
p = dim(banknote.training)[2]
## Starting value for the indicators
## Note that for the training set we use the true values, while for the
## test set the values are initialized randomly
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
However, be open minded in how you evaluate this item. The most important thing is that the setup is consistent with the rest of the implementation, and that it enables the use of both the training and test sets for the estimation of the means and variance of the components.
Is the sampler for
The full conditional for the indicators is identical to the one in the original code. The only difference is that the labels for the training set are considered known, and therefore not sampled. In practice, this means a simple change to the values over which the
# Sample the indicators
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
Because the code never changes the values of the first
Is the sampler for the
Again, the sampler is identical to the one used before. The only difficulty here is that, because
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
Is the sampler for the
Again, the sampler is identical to the one used before. The only difficulty is that, because
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){ ## Need to loop over all (n+m) observations, not just the first n
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
Is the code used to classify observations in the test set correct?
The classification is based on the probabilities that each observation is classified as “genuine” and “counterfeit”. Those probabilities can be estimated using the frequencies for which each
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
A sample of the full code for this problem is provided below:
#### Semisupervised classification for the banknote dataset
rm(list=ls())
library(mvtnorm)
library(MCMCpack)
## Load data
#load("banknoteclassification.Rdata")
load("data/banknoteclassification.RData")
x = rbind(banknote.training,banknote.test)
## Generate data from a mixture with 3 components
KK = length(unique(banknote.training.labels))
p = dim(banknote.training)[2]
n = dim(banknote.training)[1]
m = dim(unique(banknote.test))[1]
## Initialize the parameters
w = rep(1,KK)/KK #Assign equal weight to each component to start with
mu = rmvnorm(KK, apply(x,2,mean), var(x)) #RandomCluster centers randomly spread over the support of the data
Sigma = array(0, dim=c(KK,p,p)) #Initial variances are assumed to be the same
Sigma[1,,] = var(x)/KK
Sigma[2,,] = var(x)/KK
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
# Priors
aa = rep(1, KK)
dd = apply(x,2,mean)
DD = 10*var(x)
nu = p+1
SS = var(x)/3
# Number of iteration of the sampler
rrr = 11000
burn = 1000
# Storing the samples
cc.out = array(0, dim=c(rrr, n+m))
w.out = array(0, dim=c(rrr, KK))
mu.out = array(0, dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0, rrr)
for(s in 1:rrr){
# Sample the indicators
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
w = as.vector(rdirichlet(1, aa + tabulate(cc)))
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:n){
logpost[s] = logpost[s] + log(w[cc[i]]) + dmvnorm(x[i,], mu[cc[i],], Sigma[cc[i],,], log=TRUE)
}
logpost[s] = logpost[s] + ddirichlet(w, aa)
for(k in 1:KK){
logpost[s] = logpost[s] + dmvnorm(mu[k,], dd, DD)
logpost[s] = logpost[s] + diwish(Sigma[k,,], nu, SS)
}
if(s/250==floor(s/250)){
print(paste("s = ", s))
}
}
## [1] "s = 250"
## [1] "s = 500"
## [1] "s = 750"
## [1] "s = 1000"
## [1] "s = 1250"
## [1] "s = 1500"
## [1] "s = 1750"
## [1] "s = 2000"
## [1] "s = 2250"
## [1] "s = 2500"
## [1] "s = 2750"
## [1] "s = 3000"
## [1] "s = 3250"
## [1] "s = 3500"
## [1] "s = 3750"
## [1] "s = 4000"
## [1] "s = 4250"
## [1] "s = 4500"
## [1] "s = 4750"
## [1] "s = 5000"
## [1] "s = 5250"
## [1] "s = 5500"
## [1] "s = 5750"
## [1] "s = 6000"
## [1] "s = 6250"
## [1] "s = 6500"
## [1] "s = 6750"
## [1] "s = 7000"
## [1] "s = 7250"
## [1] "s = 7500"
## [1] "s = 7750"
## [1] "s = 8000"
## [1] "s = 8250"
## [1] "s = 8500"
## [1] "s = 8750"
## [1] "s = 9000"
## [1] "s = 9250"
## [1] "s = 9500"
## [1] "s = 9750"
## [1] "s = 10000"
## [1] "s = 10250"
## [1] "s = 10500"
## [1] "s = 10750"
## [1] "s = 11000"
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
Is the classification error for the “genuine” class generated by the algorithm correct?
The value should be 0% (the algorithm perfectly classifies genuine banknotes in the test set).
Is the classification error for the “counterfeit” class generated by the algorithm correct?
Is the code used to classify observations in the test set correct?
This is a simple call to the qda
function
banknote.qda = qda(x=banknote.training, grouping=banknote.training.labels)
predict(banknote.qda, banknote.test)$class
## [1] genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine
## [83] genuine genuine genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## [165] counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## Levels: counterfeit genuine
Is the classification error for the “genuine” class generated by the algorithm correct?
The value should be 0% (the algorithm perfectly classifies genuine banknotes in the test set). This is the same as the semisupervised Bayesian QDA algorithm you just implement
Is the classification error for the “counterfeit” class generated by the algorithm correct?
The value should also be 3.52% (3 errors out of 85 observations). In this case, the function
Provide an MCMC algorithm to fit a semisupervised Bayesian quadratic discriminant model to the banknote data.
load('data/banknoteclassification.RData')
n = dim(banknote.training)[1] # Size of the training set
m = dim(banknote.test)[1] # Size of the test set
x = rbind(banknote.training, banknote.test) # Create dataset of observations, first n belong to the training set, and the rest belong to the test set
p = dim(banknote.training)[2] # Number of features
KK = 2
## Initialize the parameters
w = rep(1,KK)/KK #Assign equal weight to each component to start with
mu = rmvnorm(KK, apply(x,2,mean), var(x)) #RandomCluster centers randomly spread over the support of the data
Sigma = array(0, dim=c(KK,p,p)) #Initial variances are assumed to be the same
Sigma[1,,] = var(x)/KK
Sigma[2,,] = var(x)/KK
cc = rep(NA, n+m)
cc[1:n] = as.numeric(banknote.training.labels)
# Priors
aa = rep(1, KK)
dd = apply(x,2,mean)
DD = 10*var(x)
nu = p+1
SS = var(x)/2
# Number of iteration of the sampler
rrr = 1000
# Storing the samples
cc.out = array(0, dim=c(rrr, (n+m)))
w.out = array(0, dim=c(rrr, KK))
mu.out = array(0, dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0, rrr)
for(s in 1:rrr){
# Sample the indicators
for(i in 1:(n+m)){
v = rep(0,KK)
for(k in 1:KK){
#Compute the log of the weights
v[k] = log(w[k]) + mvtnorm::dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE)
}
v = exp(v - max(v))/sum(exp(v - max(v)))
if (i>n) cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
w = as.vector(rdirichlet(1, aa + tabulate(cc)))
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:(n+m)){
logpost[s] = logpost[s] + log(w[cc[i]]) + mvtnorm::dmvnorm(x[i,], mu[cc[i],], Sigma[cc[i],,], log=TRUE)
}
logpost[s] = logpost[s] + ddirichlet(w, aa)
for(k in 1:KK){
logpost[s] = logpost[s] + mvtnorm::dmvnorm(mu[k,], dd, DD, log=TRUE)
logpost[s] = logpost[s] + log(diwish(Sigma[k,,], nu, SS))
}
if(s/250==floor(s/250)){
print(paste("s = ", s))
}
}
## [1] "s = 250"
## [1] "s = 500"
## [1] "s = 750"
## [1] "s = 1000"
What is the classification error for the test set?
0
## [1] 0
Is the R function qda
(which implements classical quadratic discriminant analysis) is used to classify the observations in the test set, what is the classification error?
3
## [1] 3
Is the setup of the algorithm correct?
Recall that the semisupervised version of the algorithm uses all observations but treats the labels for he training set as known. There are many ways to set this up, but one that requires minimal changes to the algorithm defines the
## All observations used for calculation
x = rbind(banknote.training,banknote.test)
## Size of the training and test sets
n = dim(banknote.training)[1]
m = dim(unique(banknote.test))[1]
## Number of components and dimensionality of the components
KK = length(unique(banknote.training.labels))
p = dim(banknote.training)[2]
## Starting value for the indicators
## Note that for the training set we use the true values, while for the
## test set the values are initialized randomly
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
However, be open minded in how you evaluate this item. The most important thing is that the setup is consistent with the rest of the implementation, and that it enables the use of both the training and test sets for the estimation of the means and variance of the components.
Is the sampler for
The full conditional for the indicators is identical to the one in the original code. The only difference is that the labels for the training set are considered known, and therefore not sampled. In practice, this means a simple change to the values over which the
# Sample the indicators
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
Because the code never changes the values of the first
Is the sampler for the
Again, the sampler is identical to the one used before. The only difficulty here is that, because
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
Is the sampler for the
Again, the sampler is identical to the one used before. The only difficulty is that, because
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){ ## Need to loop over all (n+m) observations, not just the first n
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
Is the code used to classify observations in the test set correct?
The classification is based on the probabilities that each observation is classified as “genuine” and “counterfeit”. Those probabilities can be estimated using the frequencies for which each
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
A sample of the full code for this problem is provided below:
#### Semisupervised classification for the banknote dataset
rm(list=ls())
library(mvtnorm)
library(MCMCpack)
## Load data
#load("banknoteclassification.Rdata")
load("data/banknoteclassification.RData")
x = rbind(banknote.training,banknote.test)
## Generate data from a mixture with 3 components
KK = length(unique(banknote.training.labels))
p = dim(banknote.training)[2]
n = dim(banknote.training)[1]
m = dim(unique(banknote.test))[1]
## Initialize the parameters
w = rep(1,KK)/KK #Assign equal weight to each component to start with
mu = rmvnorm(KK, apply(x,2,mean), var(x)) #RandomCluster centers randomly spread over the support of the data
Sigma = array(0, dim=c(KK,p,p)) #Initial variances are assumed to be the same
Sigma[1,,] = var(x)/KK
Sigma[2,,] = var(x)/KK
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
# Priors
aa = rep(1, KK)
dd = apply(x,2,mean)
DD = 10*var(x)
nu = p+1
SS = var(x)/3
# Number of iteration of the sampler
rrr = 11000
burn = 1000
# Storing the samples
cc.out = array(0, dim=c(rrr, n+m))
w.out = array(0, dim=c(rrr, KK))
mu.out = array(0, dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0, rrr)
for(s in 1:rrr){
# Sample the indicators
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
w = as.vector(rdirichlet(1, aa + tabulate(cc)))
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:n){
logpost[s] = logpost[s] + log(w[cc[i]]) + dmvnorm(x[i,], mu[cc[i],], Sigma[cc[i],,], log=TRUE)
}
logpost[s] = logpost[s] + ddirichlet(w, aa)
for(k in 1:KK){
logpost[s] = logpost[s] + dmvnorm(mu[k,], dd, DD)
logpost[s] = logpost[s] + diwish(Sigma[k,,], nu, SS)
}
if(s/250==floor(s/250)){
print(paste("s = ", s))
}
}
## [1] "s = 250"
## [1] "s = 500"
## [1] "s = 750"
## [1] "s = 1000"
## [1] "s = 1250"
## [1] "s = 1500"
## [1] "s = 1750"
## [1] "s = 2000"
## [1] "s = 2250"
## [1] "s = 2500"
## [1] "s = 2750"
## [1] "s = 3000"
## [1] "s = 3250"
## [1] "s = 3500"
## [1] "s = 3750"
## [1] "s = 4000"
## [1] "s = 4250"
## [1] "s = 4500"
## [1] "s = 4750"
## [1] "s = 5000"
## [1] "s = 5250"
## [1] "s = 5500"
## [1] "s = 5750"
## [1] "s = 6000"
## [1] "s = 6250"
## [1] "s = 6500"
## [1] "s = 6750"
## [1] "s = 7000"
## [1] "s = 7250"
## [1] "s = 7500"
## [1] "s = 7750"
## [1] "s = 8000"
## [1] "s = 8250"
## [1] "s = 8500"
## [1] "s = 8750"
## [1] "s = 9000"
## [1] "s = 9250"
## [1] "s = 9500"
## [1] "s = 9750"
## [1] "s = 10000"
## [1] "s = 10250"
## [1] "s = 10500"
## [1] "s = 10750"
## [1] "s = 11000"
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
Is the classification error for the “genuine” class generated by the algorithm correct?
The value should be 0% (the algorithm perfectly classifies genuine banknotes in the test set).
Is the classification error for the “counterfeit” class generated by the algorithm correct?
Is the code used to classify observations in the test set correct?
This is a simple call to the qda
function
banknote.qda = qda(x=banknote.training, grouping=banknote.training.labels)
predict(banknote.qda, banknote.test)$class
## [1] genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine
## [83] genuine genuine genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## [165] counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## Levels: counterfeit genuine
Is the classification error for the “genuine” class generated by the algorithm correct?
The value should be 0% (the algorithm perfectly classifies genuine banknotes in the test set). This is the same as the semisupervised Bayesian QDA algorithm you just implement
Is the classification error for the “counterfeit” class generated by the algorithm correct?
The value should also be 3.52% (3 errors out of 85 observations). In this case, the function
Provide an MCMC algorithm to fit a semisupervised Bayesian quadratic discriminant model to the banknote data.
load('data/banknoteclassification.RData')
n = dim(banknote.training)[1] # Size of the training set
m = dim(banknote.test)[1] # Size of the test set
x = rbind(banknote.training, banknote.test) # Create dataset of observations, first n belong to the training set, and the rest belong to the test set
p = dim(banknote.training)[2] # Number of features
KK = 2
## Initialize the parameters
w = rep(1,KK)/KK #Assign equal weight to each component to start with
mu = rmvnorm(KK, apply(x,2,mean), var(x)) #RandomCluster centers randomly spread over the support of the data
Sigma = array(0, dim=c(KK,p,p)) #Initial variances are assumed to be the same
Sigma[1,,] = var(x)/KK
Sigma[2,,] = var(x)/KK
cc = rep(NA, n+m)
cc[1:n] = as.numeric(banknote.training.labels)
# Priors
aa = rep(1, KK)
dd = apply(x,2,mean)
DD = 10*var(x)
nu = p+1
SS = var(x)/2
# Number of iteration of the sampler
rrr = 1000
# Storing the samples
cc.out = array(0, dim=c(rrr, (n+m)))
w.out = array(0, dim=c(rrr, KK))
mu.out = array(0, dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0, rrr)
for(s in 1:rrr){
# Sample the indicators
for(i in 1:(n+m)){
v = rep(0,KK)
for(k in 1:KK){
#Compute the log of the weights
v[k] = log(w[k]) + mvtnorm::dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE)
}
v = exp(v - max(v))/sum(exp(v - max(v)))
if (i>n) cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
w = as.vector(rdirichlet(1, aa + tabulate(cc)))
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:(n+m)){
logpost[s] = logpost[s] + log(w[cc[i]]) + mvtnorm::dmvnorm(x[i,], mu[cc[i],], Sigma[cc[i],,], log=TRUE)
}
logpost[s] = logpost[s] + ddirichlet(w, aa)
for(k in 1:KK){
logpost[s] = logpost[s] + mvtnorm::dmvnorm(mu[k,], dd, DD, log=TRUE)
logpost[s] = logpost[s] + log(diwish(Sigma[k,,], nu, SS))
}
if(s/250==floor(s/250)){
print(paste("s = ", s))
}
}
## [1] "s = 250"
## [1] "s = 500"
## [1] "s = 750"
## [1] "s = 1000"
What is the classification error for the test set?
0
## [1] 0
Is the R function qda
(which implements classical quadratic discriminant analysis) is used to classify the observations in the test set, what is the classification error?
3
## [1] 3
Is the setup of the algorithm correct?
Recall that the semisupervised version of the algorithm uses all observations but treats the labels for he training set as known. There are many ways to set this up, but one that requires minimal changes to the algorithm defines the
## All observations used for calculation
x = rbind(banknote.training,banknote.test)
## Size of the training and test sets
n = dim(banknote.training)[1]
m = dim(unique(banknote.test))[1]
## Number of components and dimensionality of the components
KK = length(unique(banknote.training.labels))
p = dim(banknote.training)[2]
## Starting value for the indicators
## Note that for the training set we use the true values, while for the
## test set the values are initialized randomly
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
However, be open minded in how you evaluate this item. The most important thing is that the setup is consistent with the rest of the implementation, and that it enables the use of both the training and test sets for the estimation of the means and variance of the components.
Is the sampler for
The full conditional for the indicators is identical to the one in the original code. The only difference is that the labels for the training set are considered known, and therefore not sampled. In practice, this means a simple change to the values over which the
# Sample the indicators
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
Because the code never changes the values of the first
Is the sampler for the
Again, the sampler is identical to the one used before. The only difficulty here is that, because
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
Is the sampler for the
Again, the sampler is identical to the one used before. The only difficulty is that, because
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){ ## Need to loop over all (n+m) observations, not just the first n
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
Is the code used to classify observations in the test set correct?
The classification is based on the probabilities that each observation is classified as “genuine” and “counterfeit”. Those probabilities can be estimated using the frequencies for which each
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
A sample of the full code for this problem is provided below:
#### Semisupervised classification for the banknote dataset
rm(list=ls())
library(mvtnorm)
library(MCMCpack)
## Load data
#load("banknoteclassification.Rdata")
load("data/banknoteclassification.RData")
x = rbind(banknote.training,banknote.test)
## Generate data from a mixture with 3 components
KK = length(unique(banknote.training.labels))
p = dim(banknote.training)[2]
n = dim(banknote.training)[1]
m = dim(unique(banknote.test))[1]
## Initialize the parameters
w = rep(1,KK)/KK #Assign equal weight to each component to start with
mu = rmvnorm(KK, apply(x,2,mean), var(x)) #RandomCluster centers randomly spread over the support of the data
Sigma = array(0, dim=c(KK,p,p)) #Initial variances are assumed to be the same
Sigma[1,,] = var(x)/KK
Sigma[2,,] = var(x)/KK
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
# Priors
aa = rep(1, KK)
dd = apply(x,2,mean)
DD = 10*var(x)
nu = p+1
SS = var(x)/3
# Number of iteration of the sampler
rrr = 11000
burn = 1000
# Storing the samples
cc.out = array(0, dim=c(rrr, n+m))
w.out = array(0, dim=c(rrr, KK))
mu.out = array(0, dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0, rrr)
for(s in 1:rrr){
# Sample the indicators
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
w = as.vector(rdirichlet(1, aa + tabulate(cc)))
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:n){
logpost[s] = logpost[s] + log(w[cc[i]]) + dmvnorm(x[i,], mu[cc[i],], Sigma[cc[i],,], log=TRUE)
}
logpost[s] = logpost[s] + ddirichlet(w, aa)
for(k in 1:KK){
logpost[s] = logpost[s] + dmvnorm(mu[k,], dd, DD)
logpost[s] = logpost[s] + diwish(Sigma[k,,], nu, SS)
}
if(s/250==floor(s/250)){
print(paste("s = ", s))
}
}
## [1] "s = 250"
## [1] "s = 500"
## [1] "s = 750"
## [1] "s = 1000"
## [1] "s = 1250"
## [1] "s = 1500"
## [1] "s = 1750"
## [1] "s = 2000"
## [1] "s = 2250"
## [1] "s = 2500"
## [1] "s = 2750"
## [1] "s = 3000"
## [1] "s = 3250"
## [1] "s = 3500"
## [1] "s = 3750"
## [1] "s = 4000"
## [1] "s = 4250"
## [1] "s = 4500"
## [1] "s = 4750"
## [1] "s = 5000"
## [1] "s = 5250"
## [1] "s = 5500"
## [1] "s = 5750"
## [1] "s = 6000"
## [1] "s = 6250"
## [1] "s = 6500"
## [1] "s = 6750"
## [1] "s = 7000"
## [1] "s = 7250"
## [1] "s = 7500"
## [1] "s = 7750"
## [1] "s = 8000"
## [1] "s = 8250"
## [1] "s = 8500"
## [1] "s = 8750"
## [1] "s = 9000"
## [1] "s = 9250"
## [1] "s = 9500"
## [1] "s = 9750"
## [1] "s = 10000"
## [1] "s = 10250"
## [1] "s = 10500"
## [1] "s = 10750"
## [1] "s = 11000"
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
Is the classification error for the “genuine” class generated by the algorithm correct?
The value should be 0% (the algorithm perfectly classifies genuine banknotes in the test set).
Is the classification error for the “counterfeit” class generated by the algorithm correct?
Is the code used to classify observations in the test set correct?
This is a simple call to the qda
function
banknote.qda = qda(x=banknote.training, grouping=banknote.training.labels)
predict(banknote.qda, banknote.test)$class
## [1] genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine
## [83] genuine genuine genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## [165] counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## Levels: counterfeit genuine
Is the classification error for the “genuine” class generated by the algorithm correct?
The value should be 0% (the algorithm perfectly classifies genuine banknotes in the test set). This is the same as the semisupervised Bayesian QDA algorithm you just implement
Is the classification error for the “counterfeit” class generated by the algorithm correct?
The value should also be 3.52% (3 errors out of 85 observations). In this case, the function
Provide an MCMC algorithm to fit a semisupervised Bayesian quadratic discriminant model to the banknote data.
load('data/banknoteclassification.RData')
#load("banknoteclassification.Rdata")
n = dim(banknote.training)[1] # Size of the training set
m = dim(banknote.test)[1] # Size of the test set
x = rbind(as.matrix(banknote.training), as.matrix(banknote.test)) # Create dataset of observations, first n belong to the training set, and the rest belong to the test set
labels = array(0, n+m)
for(i in 1:n) {
if(banknote.training.labels[i] == 'genuine') labels[i] = 1
else labels[i] = 2
}
for(j in 1:m) {
if(banknote.test.labels[j] == 'genuine') labels[j+n] = 1
else labels[j+n] = 2
}
p = dim(x)[2] # Number of features
KK = 2 #"genuine" or "counterfeit"
epsilon = 0.00001
# Initialize the parameters of the algorithm
set.seed(63252)
## Initialize the parameters
w = rep(1,KK)/KK #Assign equal weight to each component to start with
mu = rmvnorm(KK, apply(x,2,mean), var(x)) #RandomCluster centers randomly spread over the support of the data
Sigma = array(0, dim=c(KK,p,p)) #Initial variances are assumed to be the same
Sigma[1,,] = var(x)/KK
Sigma[2,,] = var(x)/KK
cc = sample(1:KK, n+m, replace=TRUE, prob=w)
cc[1:n] = labels[1:n]
# Priors
aa = rep(1, KK)
dd = apply(x,2,mean)
DD = 10*var(x)
nu = p
SS = var(x)/3
# Number of iteration of the sampler
rrr = 1500
# Storing the samples
cc.out = array(0, dim=c(rrr, n+m))
w.out = array(0, dim=c(rrr, KK))
mu.out = array(0, dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0, rrr)
for(s in 1:rrr){
# Sample the indicators
cc[1:n] = labels[1:n]
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + mvtnorm::dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
w = as.vector(rdirichlet(1, aa + tabulate(cc)))
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:(n+m)){
logpost[s] = logpost[s] + log(w[cc[i]]) + mvtnorm::dmvnorm(x[i,], mu[cc[i],], Sigma[cc[i],,], log=TRUE)
}
logpost[s] = logpost[s] + ddirichlet(w, aa)
for(k in 1:KK){
logpost[s] = logpost[s] + mvtnorm::dmvnorm(mu[k,], dd, DD, log=TRUE)
logpost[s] = logpost[s] + log(diwish(Sigma[k,,], nu, SS))
}
if(s/250==floor(s/250)){
print(paste("s = ", s, logpost[s]))
}
}
## [1] "s = 250 -768.994581625327"
## [1] "s = 500 -780.482842676977"
## [1] "s = 750 -773.547286688692"
## [1] "s = 1000 -776.885682338432"
## [1] "s = 1250 -769.469945488137"
## [1] "s = 1500 -775.681312221997"
## Predicted labels
burn = 500
counter = 0
error_count = array(0,10)
error0 = 0
for (s in (burn+1):rrr) {
err = sum(!(cc.out[s,] == labels))
if (err==0) {
error0 = error0 + 1
} else {
error_count[err] = error_count[err] + 1
}
}
print(paste("error0", error0))
## [1] "error0 937"
print(paste("error_count", error_count))
## [1] "error_count 62" "error_count 1" "error_count 0" "error_count 0" "error_count 0" "error_count 0" "error_count 0" "error_count 0" "error_count 0" "error_count 0"
What is the classification error for the test set?
# genuine: 0/85 = 0%, counterfeit: 0/85 = 0%. Total of 0 errors.
Is the R function qda
(which implements classical quadratic discriminant analysis) is used to classify the observations in the test set, what is the classification error?
# genuine: 0/85 = 0%, counterfeit: 3/85 = 3.53%, Total of 3 errors.
Is the setup of the algorithm correct?
Recall that the semisupervised version of the algorithm uses all observations but treats the labels for he training set as known. There are many ways to set this up, but one that requires minimal changes to the algorithm defines the
## All observations used for calculation
x = rbind(banknote.training,banknote.test)
## Size of the training and test sets
n = dim(banknote.training)[1]
m = dim(unique(banknote.test))[1]
## Number of components and dimensionality of the components
KK = length(unique(banknote.training.labels))
p = dim(banknote.training)[2]
## Starting value for the indicators
## Note that for the training set we use the true values, while for the
## test set the values are initialized randomly
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
However, be open minded in how you evaluate this item. The most important thing is that the setup is consistent with the rest of the implementation, and that it enables the use of both the training and test sets for the estimation of the means and variance of the components.
Is the sampler for
The full conditional for the indicators is identical to the one in the original code. The only difference is that the labels for the training set are considered known, and therefore not sampled. In practice, this means a simple change to the values over which the
# Sample the indicators
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
Because the code never changes the values of the first
Is the sampler for the
Again, the sampler is identical to the one used before. The only difficulty here is that, because
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
Is the sampler for the
Again, the sampler is identical to the one used before. The only difficulty is that, because
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){ ## Need to loop over all (n+m) observations, not just the first n
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
Is the code used to classify observations in the test set correct?
The classification is based on the probabilities that each observation is classified as “genuine” and “counterfeit”. Those probabilities can be estimated using the frequencies for which each
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
A sample of the full code for this problem is provided below:
#### Semisupervised classification for the banknote dataset
rm(list=ls())
library(mvtnorm)
library(MCMCpack)
## Load data
#load("banknoteclassification.Rdata")
load("data/banknoteclassification.RData")
x = rbind(banknote.training,banknote.test)
## Generate data from a mixture with 3 components
KK = length(unique(banknote.training.labels))
p = dim(banknote.training)[2]
n = dim(banknote.training)[1]
m = dim(unique(banknote.test))[1]
## Initialize the parameters
w = rep(1,KK)/KK #Assign equal weight to each component to start with
mu = rmvnorm(KK, apply(x,2,mean), var(x)) #RandomCluster centers randomly spread over the support of the data
Sigma = array(0, dim=c(KK,p,p)) #Initial variances are assumed to be the same
Sigma[1,,] = var(x)/KK
Sigma[2,,] = var(x)/KK
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
# Priors
aa = rep(1, KK)
dd = apply(x,2,mean)
DD = 10*var(x)
nu = p+1
SS = var(x)/3
# Number of iteration of the sampler
rrr = 11000
burn = 1000
# Storing the samples
cc.out = array(0, dim=c(rrr, n+m))
w.out = array(0, dim=c(rrr, KK))
mu.out = array(0, dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0, rrr)
for(s in 1:rrr){
# Sample the indicators
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
w = as.vector(rdirichlet(1, aa + tabulate(cc)))
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:n){
logpost[s] = logpost[s] + log(w[cc[i]]) + dmvnorm(x[i,], mu[cc[i],], Sigma[cc[i],,], log=TRUE)
}
logpost[s] = logpost[s] + ddirichlet(w, aa)
for(k in 1:KK){
logpost[s] = logpost[s] + dmvnorm(mu[k,], dd, DD)
logpost[s] = logpost[s] + diwish(Sigma[k,,], nu, SS)
}
if(s/250==floor(s/250)){
print(paste("s = ", s))
}
}
## [1] "s = 250"
## [1] "s = 500"
## [1] "s = 750"
## [1] "s = 1000"
## [1] "s = 1250"
## [1] "s = 1500"
## [1] "s = 1750"
## [1] "s = 2000"
## [1] "s = 2250"
## [1] "s = 2500"
## [1] "s = 2750"
## [1] "s = 3000"
## [1] "s = 3250"
## [1] "s = 3500"
## [1] "s = 3750"
## [1] "s = 4000"
## [1] "s = 4250"
## [1] "s = 4500"
## [1] "s = 4750"
## [1] "s = 5000"
## [1] "s = 5250"
## [1] "s = 5500"
## [1] "s = 5750"
## [1] "s = 6000"
## [1] "s = 6250"
## [1] "s = 6500"
## [1] "s = 6750"
## [1] "s = 7000"
## [1] "s = 7250"
## [1] "s = 7500"
## [1] "s = 7750"
## [1] "s = 8000"
## [1] "s = 8250"
## [1] "s = 8500"
## [1] "s = 8750"
## [1] "s = 9000"
## [1] "s = 9250"
## [1] "s = 9500"
## [1] "s = 9750"
## [1] "s = 10000"
## [1] "s = 10250"
## [1] "s = 10500"
## [1] "s = 10750"
## [1] "s = 11000"
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
Is the classification error for the “genuine” class generated by the algorithm correct?
The value should be 0% (the algorithm perfectly classifies genuine banknotes in the test set).
Is the classification error for the “counterfeit” class generated by the algorithm correct?
Is the code used to classify observations in the test set correct?
This is a simple call to the qda
function
banknote.qda = qda(x=banknote.training, grouping=banknote.training.labels)
predict(banknote.qda, banknote.test)$class
## [1] genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine
## [83] genuine genuine genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## [165] counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## Levels: counterfeit genuine
Is the classification error for the “genuine” class generated by the algorithm correct?
The value should be 0% (the algorithm perfectly classifies genuine banknotes in the test set). This is the same as the semisupervised Bayesian QDA algorithm you just implement
Is the classification error for the “counterfeit” class generated by the algorithm correct?
The value should also be 3.52% (3 errors out of 85 observations). In this case, the function
Provide an MCMC algorithm to fit a semisupervised Bayesian quadratic discriminant model to the banknote data.
rm(list=ls())
library(mvtnorm)
library(ellipse)
library(MCMCpack)
#load("banknoteclassification.Rdata")
load('data/banknoteclassification.RData')
x = rbind(banknote.training,banknote.test)
KK = 2
p = 6
n = nrow(banknote.training)
m = nrow(banknote.test)
## Initialize the parameters
w = rep(1,KK)/KK
mu = rmvnorm(KK, apply(x,2,mean), var(x))
Sigma = array(0, dim=c(KK,p,p)) #Initial variances are
Sigma[1,,] = var(x)/KK
Sigma[2,,] = var(x)/KK
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
#We add the training labels and sample the unknown ones from the test set
# Priors
aa = rep(1, KK)
dd = apply(x,2,mean)
DD = 10*var(x)
nu = p+1
SS = var(x)/3
# Number of iteration of the sampler
rrr = 11000
burn = 1000
# Storing the samples
cc.out = array(0, dim=c(rrr, (n+m)))
w.out = array(0, dim=c(rrr, KK))
mu.out = array(0, dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0, rrr)
for(s in 1:rrr){
# Sample the indicators (only those from the test set)
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
w = as.vector(rdirichlet(1, aa + tabulate(cc)))
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:n){
logpost[s] = logpost[s] + log(w[cc[i]]) + dmvnorm(x[i,], mu[cc[i],], Sigma[cc[i],,], log=TRUE)
}
logpost[s] = logpost[s] + ddirichlet(w, aa)
for(k in 1:KK){
logpost[s] = logpost[s] + dmvnorm(mu[k,], dd, DD)
logpost[s] = logpost[s] + diwish(Sigma[k,,], nu, SS)
}
if(s/250==floor(s/250)){
print(paste("s = ", s))
}
}
## Error: <text>:93:0: unexpected end of input
## 91: }
## 92: }
## ^
What is the classification error for the test set?
#Calculate the posterior probability of being classified with label "1".
prob_label1 = rep(NA, m)
for(i in 1:m){
prob_label1[i] = sum(cc.out[-seq(1,burn),n+i]==1)/(rrr-burn)
}
#R 3.6.1> prob_label1
prob_label1
## [1] 0.0002 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0003 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000 1.0000 0.9588 0.9996 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 0.9999 0.9999 1.0000 0.9996 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 0.9629 1.0000 0.9998 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000
## [143] 1.0000 0.9999 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000
# The observations 1-84 have a posterior probability of being classified as label "1" of almost 0. Observations 85-170 have a probability of being classified with label "1" of almost 1.
#Comparing these values with the true values, the algorithm correctly classifies all the observations.
Is the R function qda
(which implements classical quadratic discriminant analysis) is used to classify the observations in the test set, what is the classification error?
modqda = qda(grouping=as.numeric(banknote.training.labels), x=banknote.training, method="mle")
ccpredqda = predict(modqda,newdata=banknote.test)
sum(!(ccpredqda$class == as.numeric(banknote.test.labels)))
#3errors overall
which(!(ccpredqda$class == as.numeric(banknote.test.labels))) as.numeric(banknote.test.labels)[which(!(ccpredqda$class == as.numeric(banknote.test.labels)))]
#These 3errors belong to the label "1" (counterfeit). Therefore, 0/85 (0%) error in category genuine and 3/85 (3.53%) errors in category counterfeit.
## Error: <text>:7:63: unexpected symbol
## 6: #3errors overall
## 7: which(!(ccpredqda$class == as.numeric(banknote.test.labels))) as.numeric
## ^
Is the setup of the algorithm correct?
Recall that the semisupervised version of the algorithm uses all observations but treats the labels for he training set as known. There are many ways to set this up, but one that requires minimal changes to the algorithm defines the
## All observations used for calculation
x = rbind(banknote.training,banknote.test)
## Size of the training and test sets
n = dim(banknote.training)[1]
m = dim(unique(banknote.test))[1]
## Number of components and dimensionality of the components
KK = length(unique(banknote.training.labels))
p = dim(banknote.training)[2]
## Starting value for the indicators
## Note that for the training set we use the true values, while for the
## test set the values are initialized randomly
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
However, be open minded in how you evaluate this item. The most important thing is that the setup is consistent with the rest of the implementation, and that it enables the use of both the training and test sets for the estimation of the means and variance of the components.
Is the sampler for
The full conditional for the indicators is identical to the one in the original code. The only difference is that the labels for the training set are considered known, and therefore not sampled. In practice, this means a simple change to the values over which the
# Sample the indicators
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
Because the code never changes the values of the first
Is the sampler for the
Again, the sampler is identical to the one used before. The only difficulty here is that, because
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
Is the sampler for the
Again, the sampler is identical to the one used before. The only difficulty is that, because
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){ ## Need to loop over all (n+m) observations, not just the first n
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
Is the code used to classify observations in the test set correct?
The classification is based on the probabilities that each observation is classified as “genuine” and “counterfeit”. Those probabilities can be estimated using the frequencies for which each
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
A sample of the full code for this problem is provided below:
#### Semisupervised classification for the banknote dataset
rm(list=ls())
library(mvtnorm)
library(MCMCpack)
## Load data
#load("banknoteclassification.Rdata")
load("data/banknoteclassification.RData")
x = rbind(banknote.training,banknote.test)
## Generate data from a mixture with 3 components
KK = length(unique(banknote.training.labels))
p = dim(banknote.training)[2]
n = dim(banknote.training)[1]
m = dim(unique(banknote.test))[1]
## Initialize the parameters
w = rep(1,KK)/KK #Assign equal weight to each component to start with
mu = rmvnorm(KK, apply(x,2,mean), var(x)) #RandomCluster centers randomly spread over the support of the data
Sigma = array(0, dim=c(KK,p,p)) #Initial variances are assumed to be the same
Sigma[1,,] = var(x)/KK
Sigma[2,,] = var(x)/KK
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
# Priors
aa = rep(1, KK)
dd = apply(x,2,mean)
DD = 10*var(x)
nu = p+1
SS = var(x)/3
# Number of iteration of the sampler
rrr = 11000
burn = 1000
# Storing the samples
cc.out = array(0, dim=c(rrr, n+m))
w.out = array(0, dim=c(rrr, KK))
mu.out = array(0, dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0, rrr)
for(s in 1:rrr){
# Sample the indicators
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
w = as.vector(rdirichlet(1, aa + tabulate(cc)))
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:n){
logpost[s] = logpost[s] + log(w[cc[i]]) + dmvnorm(x[i,], mu[cc[i],], Sigma[cc[i],,], log=TRUE)
}
logpost[s] = logpost[s] + ddirichlet(w, aa)
for(k in 1:KK){
logpost[s] = logpost[s] + dmvnorm(mu[k,], dd, DD)
logpost[s] = logpost[s] + diwish(Sigma[k,,], nu, SS)
}
if(s/250==floor(s/250)){
print(paste("s = ", s))
}
}
## [1] "s = 250"
## [1] "s = 500"
## [1] "s = 750"
## [1] "s = 1000"
## [1] "s = 1250"
## [1] "s = 1500"
## [1] "s = 1750"
## [1] "s = 2000"
## [1] "s = 2250"
## [1] "s = 2500"
## [1] "s = 2750"
## [1] "s = 3000"
## [1] "s = 3250"
## [1] "s = 3500"
## [1] "s = 3750"
## [1] "s = 4000"
## [1] "s = 4250"
## [1] "s = 4500"
## [1] "s = 4750"
## [1] "s = 5000"
## [1] "s = 5250"
## [1] "s = 5500"
## [1] "s = 5750"
## [1] "s = 6000"
## [1] "s = 6250"
## [1] "s = 6500"
## [1] "s = 6750"
## [1] "s = 7000"
## [1] "s = 7250"
## [1] "s = 7500"
## [1] "s = 7750"
## [1] "s = 8000"
## [1] "s = 8250"
## [1] "s = 8500"
## [1] "s = 8750"
## [1] "s = 9000"
## [1] "s = 9250"
## [1] "s = 9500"
## [1] "s = 9750"
## [1] "s = 10000"
## [1] "s = 10250"
## [1] "s = 10500"
## [1] "s = 10750"
## [1] "s = 11000"
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
Is the classification error for the “genuine” class generated by the algorithm correct?
The value should be 0% (the algorithm perfectly classifies genuine banknotes in the test set).
Is the classification error for the “counterfeit” class generated by the algorithm correct?
Is the code used to classify observations in the test set correct?
This is a simple call to the qda
function
banknote.qda = qda(x=banknote.training, grouping=banknote.training.labels)
predict(banknote.qda, banknote.test)$class
## [1] genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine
## [83] genuine genuine genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## [165] counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## Levels: counterfeit genuine
Is the classification error for the “genuine” class generated by the algorithm correct?
The value should be 0% (the algorithm perfectly classifies genuine banknotes in the test set). This is the same as the semisupervised Bayesian QDA algorithm you just implement
Is the classification error for the “counterfeit” class generated by the algorithm correct?
The value should also be 3.52% (3 errors out of 85 observations). In this case, the function
Provide an MCMC algorithm to fit a semisupervised Bayesian quadratic discriminant model to the banknote data.
#load('data/banknoteclassification.RData')
#data = load("W4_banknoteclassification.Rdata")
data = load("data/banknoteclassification.Rdata")
KK = length(unique(banknote.training.labels))
n = dim(banknote.training)[1] # Size of the training set
m = dim(banknote.test)[1] # Size of the test set
x = rbind(banknote.training, banknote.test) # Create dataset of observations, first n belong to the training set, and the rest belong to the test set
p = dim(x)[2] # Number of features
## Initialize the parameters
w = rep(1,KK)/KK #Assign equal weight to each component to start with
mu = rmvnorm(KK, apply(x,2,mean), var(x)) #RandomCluster centers randomly spread over the support of the data
Sigma = array(0, dim=c(KK,p,p)) #Initial variances are assumed to be the same
Sigma[1,,] = var(x)/KK
Sigma[2,,] = var(x)/KK
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
# Priors
aa = rep(1, KK)
dd = apply(x,2,mean)
DD = 10*var(x)
nu = p+1
SS = var(x)/3
# Number of iteration of the sampler
rrr = 11000
burn = 1000
# Storing the samples
cc.out = array(0, dim=c(rrr, n+m))
w.out = array(0, dim=c(rrr, KK))
mu.out = array(0, dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0, rrr)
for(s in 1:rrr){
# Sample the indicators
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
w = as.vector(rdirichlet(1, aa + tabulate(cc)))
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:n+m){
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:n){
logpost[s] = logpost[s] + log(w[cc[i]]) + dmvnorm(x[i,], mu[cc[i],], Sigma[cc[i],,], log=TRUE)
}
logpost[s] = logpost[s] + ddirichlet(w, aa)
for(k in 1:KK){
logpost[s] = logpost[s] + dmvnorm(mu[k,], dd, DD)
logpost[s] = logpost[s] + diwish(Sigma[k,,], nu, SS)
}
if(s/250==floor(s/250)){
print(paste("s = ", s))
}
}
## [1] "s = 250"
## [1] "s = 500"
## [1] "s = 750"
## [1] "s = 1000"
## [1] "s = 1250"
## [1] "s = 1500"
## [1] "s = 1750"
## [1] "s = 2000"
## [1] "s = 2250"
## [1] "s = 2500"
## [1] "s = 2750"
## [1] "s = 3000"
## [1] "s = 3250"
## [1] "s = 3500"
## [1] "s = 3750"
## [1] "s = 4000"
## [1] "s = 4250"
## [1] "s = 4500"
## [1] "s = 4750"
## [1] "s = 5000"
## [1] "s = 5250"
## [1] "s = 5500"
## [1] "s = 5750"
## [1] "s = 6000"
## [1] "s = 6250"
## [1] "s = 6500"
## [1] "s = 6750"
## [1] "s = 7000"
## [1] "s = 7250"
## [1] "s = 7500"
## [1] "s = 7750"
## [1] "s = 8000"
## [1] "s = 8250"
## [1] "s = 8500"
## [1] "s = 8750"
## [1] "s = 9000"
## [1] "s = 9250"
## [1] "s = 9500"
## [1] "s = 9750"
## [1] "s = 10000"
## [1] "s = 10250"
## [1] "s = 10500"
## [1] "s = 10750"
## [1] "s = 11000"
What is the classification error for the test set?
# 0 observations are misclassified
error.genuine = 0
error.count = 0
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
probcounterfeit = rep(NA, m)
for(i in 1:m){
probcounterfeit[i] = sum(cc.out[-seq(1,burn),n+i]==1)/(rrr-burn)
}
error.genuine = 1-sum(probgenuine[banknote.test.labels=="genuine"]>=probcounterfeit[banknote.test.labels=="genuine"])/length(banknote.test.labels[banknote.test.labels=="genuine"])
error.count = 1-sum(probgenuine[banknote.test.labels=="counterfeit"]<probcounterfeit[banknote.test.labels=="counterfeit"])/length(banknote.test.labels[banknote.test.labels=="counterfeit"])
Is the R function qda
(which implements classical quadratic discriminant analysis) is used to classify the observations in the test set, what is the classification error?
# 3 observations are misclassified
# 0 for genuine
# 3 for counterfeit
# Using the qda and lda functions in R
# qda
modqda = qda(grouping=banknote.training.labels, x=banknote.training, method="mle")
ccpredqda = predict(modqda,newdata=banknote.test)
ccpredqda$class
## [1] genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine
## [83] genuine genuine genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## [165] counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## Levels: counterfeit genuine
sum(!(ccpredqda$class == banknote.test.labels)) # Three error
## [1] 3
sum(!ccpredqda$class[banknote.test.labels=="genuine"] == "genuine")
## [1] 0
sum(!ccpredqda$class[banknote.test.labels=="counterfeit"] == "counterfeit")
## [1] 3
Is the setup of the algorithm correct?
Recall that the semisupervised version of the algorithm uses all observations but treats the labels for he training set as known. There are many ways to set this up, but one that requires minimal changes to the algorithm defines the
## All observations used for calculation
x = rbind(banknote.training,banknote.test)
## Size of the training and test sets
n = dim(banknote.training)[1]
m = dim(unique(banknote.test))[1]
## Number of components and dimensionality of the components
KK = length(unique(banknote.training.labels))
p = dim(banknote.training)[2]
## Starting value for the indicators
## Note that for the training set we use the true values, while for the
## test set the values are initialized randomly
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
However, be open minded in how you evaluate this item. The most important thing is that the setup is consistent with the rest of the implementation, and that it enables the use of both the training and test sets for the estimation of the means and variance of the components.
Is the sampler for
The full conditional for the indicators is identical to the one in the original code. The only difference is that the labels for the training set are considered known, and therefore not sampled. In practice, this means a simple change to the values over which the
# Sample the indicators
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
Because the code never changes the values of the first
Is the sampler for the
Again, the sampler is identical to the one used before. The only difficulty here is that, because
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
Is the sampler for the
Again, the sampler is identical to the one used before. The only difficulty is that, because
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){ ## Need to loop over all (n+m) observations, not just the first n
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
Is the code used to classify observations in the test set correct?
The classification is based on the probabilities that each observation is classified as “genuine” and “counterfeit”. Those probabilities can be estimated using the frequencies for which each
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
A sample of the full code for this problem is provided below:
#### Semisupervised classification for the banknote dataset
rm(list=ls())
library(mvtnorm)
library(MCMCpack)
## Load data
#load("banknoteclassification.Rdata")
load("data/banknoteclassification.RData")
x = rbind(banknote.training,banknote.test)
## Generate data from a mixture with 3 components
KK = length(unique(banknote.training.labels))
p = dim(banknote.training)[2]
n = dim(banknote.training)[1]
m = dim(unique(banknote.test))[1]
## Initialize the parameters
w = rep(1,KK)/KK #Assign equal weight to each component to start with
mu = rmvnorm(KK, apply(x,2,mean), var(x)) #RandomCluster centers randomly spread over the support of the data
Sigma = array(0, dim=c(KK,p,p)) #Initial variances are assumed to be the same
Sigma[1,,] = var(x)/KK
Sigma[2,,] = var(x)/KK
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
# Priors
aa = rep(1, KK)
dd = apply(x,2,mean)
DD = 10*var(x)
nu = p+1
SS = var(x)/3
# Number of iteration of the sampler
rrr = 11000
burn = 1000
# Storing the samples
cc.out = array(0, dim=c(rrr, n+m))
w.out = array(0, dim=c(rrr, KK))
mu.out = array(0, dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0, rrr)
for(s in 1:rrr){
# Sample the indicators
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
w = as.vector(rdirichlet(1, aa + tabulate(cc)))
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:n){
logpost[s] = logpost[s] + log(w[cc[i]]) + dmvnorm(x[i,], mu[cc[i],], Sigma[cc[i],,], log=TRUE)
}
logpost[s] = logpost[s] + ddirichlet(w, aa)
for(k in 1:KK){
logpost[s] = logpost[s] + dmvnorm(mu[k,], dd, DD)
logpost[s] = logpost[s] + diwish(Sigma[k,,], nu, SS)
}
if(s/250==floor(s/250)){
print(paste("s = ", s))
}
}
## [1] "s = 250"
## [1] "s = 500"
## [1] "s = 750"
## [1] "s = 1000"
## [1] "s = 1250"
## [1] "s = 1500"
## [1] "s = 1750"
## [1] "s = 2000"
## [1] "s = 2250"
## [1] "s = 2500"
## [1] "s = 2750"
## [1] "s = 3000"
## [1] "s = 3250"
## [1] "s = 3500"
## [1] "s = 3750"
## [1] "s = 4000"
## [1] "s = 4250"
## [1] "s = 4500"
## [1] "s = 4750"
## [1] "s = 5000"
## [1] "s = 5250"
## [1] "s = 5500"
## [1] "s = 5750"
## [1] "s = 6000"
## [1] "s = 6250"
## [1] "s = 6500"
## [1] "s = 6750"
## [1] "s = 7000"
## [1] "s = 7250"
## [1] "s = 7500"
## [1] "s = 7750"
## [1] "s = 8000"
## [1] "s = 8250"
## [1] "s = 8500"
## [1] "s = 8750"
## [1] "s = 9000"
## [1] "s = 9250"
## [1] "s = 9500"
## [1] "s = 9750"
## [1] "s = 10000"
## [1] "s = 10250"
## [1] "s = 10500"
## [1] "s = 10750"
## [1] "s = 11000"
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
Is the classification error for the “genuine” class generated by the algorithm correct?
The value should be 0% (the algorithm perfectly classifies genuine banknotes in the test set).
Is the classification error for the “counterfeit” class generated by the algorithm correct?
Is the code used to classify observations in the test set correct?
This is a simple call to the qda
function
banknote.qda = qda(x=banknote.training, grouping=banknote.training.labels)
predict(banknote.qda, banknote.test)$class
## [1] genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine
## [83] genuine genuine genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## [165] counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## Levels: counterfeit genuine
Is the classification error for the “genuine” class generated by the algorithm correct?
The value should be 0% (the algorithm perfectly classifies genuine banknotes in the test set). This is the same as the semisupervised Bayesian QDA algorithm you just implement
Is the classification error for the “counterfeit” class generated by the algorithm correct?
The value should also be 3.52% (3 errors out of 85 observations). In this case, the function
Provide an MCMC algorithm to fit a semisupervised Bayesian quadratic discriminant model to the banknote data.
load('data/banknoteclassification.RData')
library(mvtnorm)
library(MCMCpack)
#load("banknoteclassification.Rdata")
n = nrow(banknote.training) # Size of the training set
m = nrow(banknote.test) # Size of the test set
x = rbind(as.matrix(banknote.training), as.matrix(banknote.test)) # Create dataset of observations, first n belong to the training set, and the rest belong to the test set
p = dim(x)[2] # Number of features
KK = 2
## Initialize the parameters
w = c(mean(banknote.training.labels=="counterfeit"),mean(banknote.training.labels=="genuine"))
mu = rbind(apply(banknote.training[banknote.training.labels=="counterfeit",],2,mean),apply(banknote.training[banknote.training.labels=="genuine",],2,mean))
Sigma = array(0, dim=c(KK,p,p)) Sigma[1,,] = var(x[banknote.training.labels=="counterfeit",]) Sigma[2,,] = var(x[banknote.training.labels=="genuine",])
cc = array(0,n+m)
cc[1:n] = as.integer(banknote.training.labels)
cc[(n+1):(n+m)] = sample(1:2,m,replace=TRUE,prob = w)
# Priors
aa = rep(1, KK)
dd = apply(x,2,mean)
DD = 10*var(x)
nu = p
SS = var(x)/2
# Number of iteration of the sampler
rrr = 1000
# Storing the samples
cc.out = array(0, dim=c(rrr, n+m))
w.out = array(0, dim=c(rrr, KK))
mu.out = array(0, dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0, rrr) for(s in 1:rrr){
# Sample the indicators c
c[1:n] = as.integer(banknote.training.labels) # Training set
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + mvtnorm::dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights } v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
w = as.vector(rdirichlet(1, aa + tabulate(cc)))
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:n){ xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:(n+m)){
logpost[s] = logpost[s] + log(w[cc[i]]) + mvtnorm::dmvnorm(x[i,], mu[cc[i],], Sigma[cc[i],,], log=TRUE)
}
logpost[s] = logpost[s] + ddirichlet(w, aa)
for(k in 1:KK){
logpost[s] = logpost[s] + mvtnorm::dmvnorm(mu[k,], dd, DD)
logpost[s] = logpost[s] + diwish(Sigma[k,,], nu, SS)
}
if(s/250==floor(s/250)){
print(paste("s = ", s))
}
}
}
## Error: <text>:15:37: unexpected symbol
## 14: mu = rbind(apply(banknote.training[banknote.training.labels=="counterfeit",],2,mean),apply(banknote.training[banknote.training.labels=="genuine",],2,mean))
## 15: Sigma = array(0, dim=c(KK,p,p)) Sigma
## ^
What is the classification error for the test set?
# classification probability of test data
prob_1 = apply(cc.out[,(n+1):(n+m)],2,function(x){mean(x==1)}) ## probability of test data being counterfeit
prob_2 = 1-prob_1 ## probability of test data being genuine
prob = cbind(prob_1,prob_2)
# classification of test data using mcmc probs
pred = ifelse(prob[,1]>prob[,2],1,2)
n_err = sum(!pred==as.integer(banknote.test.labels)) # number of errors= 1 error
classification_err=n_err/length(pred)
#number of errors=1, classification error mcmc= 0.005882353
Is the R function qda
(which implements classical quadratic discriminant analysis) is used to classify the observations in the test set, what is the classification error?
#classification of test data using qda
modqda = qda(grouping=banknote.training.labels, x=banknote.training, method="mle")
ccpredqda = predict(modqda,newdata=banknote.test)
n_qdaerr = sum(!(ccpredqda$class == banknote.test.labels)) # number of errors= 3 error qda_err=n_qdaerr/length(banknote.test.labels)
#number of errors=3, classification error qda= 0.01764706
Is the setup of the algorithm correct?
Recall that the semisupervised version of the algorithm uses all observations but treats the labels for he training set as known. There are many ways to set this up, but one that requires minimal changes to the algorithm defines the
## All observations used for calculation
x = rbind(banknote.training,banknote.test)
## Size of the training and test sets
n = dim(banknote.training)[1]
m = dim(unique(banknote.test))[1]
## Number of components and dimensionality of the components
KK = length(unique(banknote.training.labels))
p = dim(banknote.training)[2]
## Starting value for the indicators
## Note that for the training set we use the true values, while for the
## test set the values are initialized randomly
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
However, be open minded in how you evaluate this item. The most important thing is that the setup is consistent with the rest of the implementation, and that it enables the use of both the training and test sets for the estimation of the means and variance of the components.
Is the sampler for
The full conditional for the indicators is identical to the one in the original code. The only difference is that the labels for the training set are considered known, and therefore not sampled. In practice, this means a simple change to the values over which the
# Sample the indicators
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
Because the code never changes the values of the first
Is the sampler for the
Again, the sampler is identical to the one used before. The only difficulty here is that, because
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
Is the sampler for the
Again, the sampler is identical to the one used before. The only difficulty is that, because
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){ ## Need to loop over all (n+m) observations, not just the first n
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
Is the code used to classify observations in the test set correct?
The classification is based on the probabilities that each observation is classified as “genuine” and “counterfeit”. Those probabilities can be estimated using the frequencies for which each
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
A sample of the full code for this problem is provided below:
#### Semisupervised classification for the banknote dataset
rm(list=ls())
library(mvtnorm)
library(MCMCpack)
## Load data
#load("banknoteclassification.Rdata")
load("data/banknoteclassification.RData")
x = rbind(banknote.training,banknote.test)
## Generate data from a mixture with 3 components
KK = length(unique(banknote.training.labels))
p = dim(banknote.training)[2]
n = dim(banknote.training)[1]
m = dim(unique(banknote.test))[1]
## Initialize the parameters
w = rep(1,KK)/KK #Assign equal weight to each component to start with
mu = rmvnorm(KK, apply(x,2,mean), var(x)) #RandomCluster centers randomly spread over the support of the data
Sigma = array(0, dim=c(KK,p,p)) #Initial variances are assumed to be the same
Sigma[1,,] = var(x)/KK
Sigma[2,,] = var(x)/KK
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
# Priors
aa = rep(1, KK)
dd = apply(x,2,mean)
DD = 10*var(x)
nu = p+1
SS = var(x)/3
# Number of iteration of the sampler
rrr = 11000
burn = 1000
# Storing the samples
cc.out = array(0, dim=c(rrr, n+m))
w.out = array(0, dim=c(rrr, KK))
mu.out = array(0, dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0, rrr)
for(s in 1:rrr){
# Sample the indicators
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
w = as.vector(rdirichlet(1, aa + tabulate(cc)))
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:n){
logpost[s] = logpost[s] + log(w[cc[i]]) + dmvnorm(x[i,], mu[cc[i],], Sigma[cc[i],,], log=TRUE)
}
logpost[s] = logpost[s] + ddirichlet(w, aa)
for(k in 1:KK){
logpost[s] = logpost[s] + dmvnorm(mu[k,], dd, DD)
logpost[s] = logpost[s] + diwish(Sigma[k,,], nu, SS)
}
if(s/250==floor(s/250)){
print(paste("s = ", s))
}
}
## [1] "s = 250"
## [1] "s = 500"
## [1] "s = 750"
## [1] "s = 1000"
## [1] "s = 1250"
## [1] "s = 1500"
## [1] "s = 1750"
## [1] "s = 2000"
## [1] "s = 2250"
## [1] "s = 2500"
## [1] "s = 2750"
## [1] "s = 3000"
## [1] "s = 3250"
## [1] "s = 3500"
## [1] "s = 3750"
## [1] "s = 4000"
## [1] "s = 4250"
## [1] "s = 4500"
## [1] "s = 4750"
## [1] "s = 5000"
## [1] "s = 5250"
## [1] "s = 5500"
## [1] "s = 5750"
## [1] "s = 6000"
## [1] "s = 6250"
## [1] "s = 6500"
## [1] "s = 6750"
## [1] "s = 7000"
## [1] "s = 7250"
## [1] "s = 7500"
## [1] "s = 7750"
## [1] "s = 8000"
## [1] "s = 8250"
## [1] "s = 8500"
## [1] "s = 8750"
## [1] "s = 9000"
## [1] "s = 9250"
## [1] "s = 9500"
## [1] "s = 9750"
## [1] "s = 10000"
## [1] "s = 10250"
## [1] "s = 10500"
## [1] "s = 10750"
## [1] "s = 11000"
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
Is the classification error for the “genuine” class generated by the algorithm correct?
The value should be 0% (the algorithm perfectly classifies genuine banknotes in the test set).
Is the classification error for the “counterfeit” class generated by the algorithm correct?
Is the code used to classify observations in the test set correct?
This is a simple call to the qda
function
banknote.qda = qda(x=banknote.training, grouping=banknote.training.labels)
predict(banknote.qda, banknote.test)$class
## [1] genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine
## [83] genuine genuine genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## [165] counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## Levels: counterfeit genuine
Is the classification error for the “genuine” class generated by the algorithm correct?
The value should be 0% (the algorithm perfectly classifies genuine banknotes in the test set). This is the same as the semisupervised Bayesian QDA algorithm you just implement
Is the classification error for the “counterfeit” class generated by the algorithm correct?
The value should also be 3.52% (3 errors out of 85 observations). In this case, the function
Provide an MCMC algorithm to fit a semisupervised Bayesian quadratic discriminant model to the banknote data.
load('data/banknoteclassification.RData')
library(MCMCpack)
library(mvtnorm)
set.seed(21196)
x.train = banknote.training
x.test = banknote.test
n.train = nrow(x.train)
n.test = nrow(x.test)
KK = 2
p = ncol(x.train)
##### Initialize the parameters
w = 0.5
mu = rmvnorm(KK, mean = apply(x.train,2,mean), sigma = diag(p))
Sigma = array(0, dim=c(KK,p,p))
Sigma[1,,] = diag(p) * apply(x.train, 2, var)
Sigma[2,,] = diag(p) * apply(x.train, 2, var)
cc = sample(1:KK, n.train, replace=TRUE, prob=c(w, 1-w))
##### The parameters of the priors shoult be set using empirical bayes
# uniform distribution for the weights
a = 0
b = 1
# multivariate normal for the means
dd = apply(x.train,2,mean)# prior for the mean of the multivariate normal
DD = var(x.train)
# inverse-wishart for the covariance matrix
# this is the conjugate prior for the cov matrix of a multivariate normal
nu = p # scalar degrees of freedom
SS = var(x.train) # symmetric, positive-semidefinite k x k scale matrix
# Storing the samples
cc.out = array(0, dim=c(rrr, n.train))
w.out = array(0, dim=c(rrr, KK))
mu.out = array(0, dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0, rrr)
rrr = 500
# burn = 1000
for(s in 1:rrr){
# Sample the indicators
for(i in 1:n.train){
v = rep(0,KK)
v[1] = log(w) + mvtnorm::dmvnorm(x.train[i,], mu[1,], Sigma[1,,], log=TRUE) #Compute the log of the weights
v[2] = log(1-w) + mvtnorm::dmvnorm(x.train[i,], mu[2,], Sigma[2,,], log=TRUE)
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
w = rbeta(1, 1+sum(cc==1), 1+n.train-sum(cc==1))
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x.train[cc==k,],2,sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,]) %*% xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:n.train){
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x.train[i,] - mu[cc[i],]) %*% t(x.train[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:n.train){
logpost[s] = logpost[s] + log(w[cc[i]]) + mvtnorm::dmvnorm(x.train[i,], mu[cc[i],], Sigma[cc[i],,], log=TRUE)
}
logpost[s] = logpost[s] + dunif(w, min = 0, max = 1)
for(k in 1:KK){
logpost[s] = logpost[s] + mvtnorm::dmvnorm(mu[k,], dd, DD, log=TRUE)
logpost[s] = logpost[s] + log(diwish(Sigma[k,,], nu, SS))
}
if(s/250==floor(s/250)){
print(paste("s = ", s))
}
}
## Error in apply(x.train[cc == k, ], 2, sum): dim(X) must have a positive length
What is the classification error for the test set?
0
## [1] 0
Is the R function qda
(which implements classical quadratic discriminant analysis) is used to classify the observations in the test set, what is the classification error?
modqda = qda(grouping=classes.train, x=x.train, method="mle")
## Error in qda.default(x, grouping, ...): object 'classes.train' not found
ccpredqda = predict(modqda,newdata=x.test)
## Error in predict(modqda, newdata = x.test): object 'modqda' not found
pred.classes = as.integer(ccpredqda$class)
## Error in eval(expr, envir, enclos): object 'ccpredqda' not found
sum(pred.classes - classes.test != 0)
## Error in eval(expr, envir, enclos): object 'pred.classes' not found
Is the setup of the algorithm correct?
Recall that the semisupervised version of the algorithm uses all observations but treats the labels for he training set as known. There are many ways to set this up, but one that requires minimal changes to the algorithm defines the
## All observations used for calculation
x = rbind(banknote.training,banknote.test)
## Size of the training and test sets
n = dim(banknote.training)[1]
m = dim(unique(banknote.test))[1]
## Number of components and dimensionality of the components
KK = length(unique(banknote.training.labels))
p = dim(banknote.training)[2]
## Starting value for the indicators
## Note that for the training set we use the true values, while for the
## test set the values are initialized randomly
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
## Error in sample.int(length(x), size, replace, prob): incorrect number of probabilities
However, be open minded in how you evaluate this item. The most important thing is that the setup is consistent with the rest of the implementation, and that it enables the use of both the training and test sets for the estimation of the means and variance of the components.
Is the sampler for
The full conditional for the indicators is identical to the one in the original code. The only difference is that the labels for the training set are considered known, and therefore not sampled. In practice, this means a simple change to the values over which the
# Sample the indicators
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
## Error in sample.int(length(x), size, replace, prob): NA in probability vector
Because the code never changes the values of the first
Is the sampler for the
Again, the sampler is identical to the one used before. The only difficulty here is that, because
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
Is the sampler for the
Again, the sampler is identical to the one used before. The only difficulty is that, because
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){ ## Need to loop over all (n+m) observations, not just the first n
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
## Error in xcensumk[cc[i], , ] <- xcensumk[cc[i], , ] + (x[i, ] - mu[cc[i], : NAs are not allowed in subscripted assignments
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
Is the code used to classify observations in the test set correct?
The classification is based on the probabilities that each observation is classified as “genuine” and “counterfeit”. Those probabilities can be estimated using the frequencies for which each
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
## Error in cc.out[-seq(1, burn), n + i]: subscript out of bounds
A sample of the full code for this problem is provided below:
#### Semisupervised classification for the banknote dataset
rm(list=ls())
library(mvtnorm)
library(MCMCpack)
## Load data
#load("banknoteclassification.Rdata")
load("data/banknoteclassification.RData")
x = rbind(banknote.training,banknote.test)
## Generate data from a mixture with 3 components
KK = length(unique(banknote.training.labels))
p = dim(banknote.training)[2]
n = dim(banknote.training)[1]
m = dim(unique(banknote.test))[1]
## Initialize the parameters
w = rep(1,KK)/KK #Assign equal weight to each component to start with
mu = rmvnorm(KK, apply(x,2,mean), var(x)) #RandomCluster centers randomly spread over the support of the data
Sigma = array(0, dim=c(KK,p,p)) #Initial variances are assumed to be the same
Sigma[1,,] = var(x)/KK
Sigma[2,,] = var(x)/KK
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
# Priors
aa = rep(1, KK)
dd = apply(x,2,mean)
DD = 10*var(x)
nu = p+1
SS = var(x)/3
# Number of iteration of the sampler
rrr = 11000
burn = 1000
# Storing the samples
cc.out = array(0, dim=c(rrr, n+m))
w.out = array(0, dim=c(rrr, KK))
mu.out = array(0, dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0, rrr)
for(s in 1:rrr){
# Sample the indicators
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
w = as.vector(rdirichlet(1, aa + tabulate(cc)))
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:n){
logpost[s] = logpost[s] + log(w[cc[i]]) + dmvnorm(x[i,], mu[cc[i],], Sigma[cc[i],,], log=TRUE)
}
logpost[s] = logpost[s] + ddirichlet(w, aa)
for(k in 1:KK){
logpost[s] = logpost[s] + dmvnorm(mu[k,], dd, DD)
logpost[s] = logpost[s] + diwish(Sigma[k,,], nu, SS)
}
if(s/250==floor(s/250)){
print(paste("s = ", s))
}
}
## [1] "s = 250"
## [1] "s = 500"
## [1] "s = 750"
## [1] "s = 1000"
## [1] "s = 1250"
## [1] "s = 1500"
## [1] "s = 1750"
## [1] "s = 2000"
## [1] "s = 2250"
## [1] "s = 2500"
## [1] "s = 2750"
## [1] "s = 3000"
## [1] "s = 3250"
## [1] "s = 3500"
## [1] "s = 3750"
## [1] "s = 4000"
## [1] "s = 4250"
## [1] "s = 4500"
## [1] "s = 4750"
## [1] "s = 5000"
## [1] "s = 5250"
## [1] "s = 5500"
## [1] "s = 5750"
## [1] "s = 6000"
## [1] "s = 6250"
## [1] "s = 6500"
## [1] "s = 6750"
## [1] "s = 7000"
## [1] "s = 7250"
## [1] "s = 7500"
## [1] "s = 7750"
## [1] "s = 8000"
## [1] "s = 8250"
## [1] "s = 8500"
## [1] "s = 8750"
## [1] "s = 9000"
## [1] "s = 9250"
## [1] "s = 9500"
## [1] "s = 9750"
## [1] "s = 10000"
## [1] "s = 10250"
## [1] "s = 10500"
## [1] "s = 10750"
## [1] "s = 11000"
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
Is the classification error for the “genuine” class generated by the algorithm correct?
The value should be 0% (the algorithm perfectly classifies genuine banknotes in the test set).
Is the classification error for the “counterfeit” class generated by the algorithm correct?
Is the code used to classify observations in the test set correct?
This is a simple call to the qda
function
banknote.qda = qda(x=banknote.training, grouping=banknote.training.labels)
predict(banknote.qda, banknote.test)$class
## [1] genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine
## [83] genuine genuine genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## [165] counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## Levels: counterfeit genuine
Is the classification error for the “genuine” class generated by the algorithm correct?
The value should be 0% (the algorithm perfectly classifies genuine banknotes in the test set). This is the same as the semisupervised Bayesian QDA algorithm you just implement
Is the classification error for the “counterfeit” class generated by the algorithm correct?
The value should also be 3.52% (3 errors out of 85 observations). In this case, the function
Provide an MCMC algorithm to fit a semisupervised Bayesian quadratic discriminant model to the banknote data.
load('data/banknoteclassification.RData')
n = dim(banknote.training)[1] # Size of the training set
m = dim(banknote.test)[1] # Size of the test set
x = rbind(as.matrix(banknote.training),as.matrix(banknote.test)) # Create dataset of observations
p = dim(x)[2] # Number offeatures
KK = 2
training.label = as.numeric(banknote.training.labels)
test.label = as.numeric(banknote.test.labels)
par(mfrow = c(1,1))
par(mar = c(2,2,2,2)+0.1)
colscale = c("black","red") # Black =counterfeit; Red = genuine
## Initialize the parameters
mu =rmvnorm(KK, apply(x,2,mean), var(x)) #RandomCluster centers randomly spread over the support of the data
Sigma = array(0,dim=c(KK,p,p)) #Initial variances are assumed to be the same
Sigma[1,,] = var(x)/KK
# Priors
aa = rep(1, KK)
dd = apply(x,2,mean)
DD = 10*var(x)
nu = p
SS = var(x)/2
# Number of iteration of the sampler
rrr = 10000
burn = 1000
# Storing the samples
cc.out = array(0,dim=c(rrr, n+m))
w.out = array(0,dim=c(rrr, KK))
mu.out = array(0,dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0,rrr)
for(s in 1:rrr){
for(i in 1:m){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k])+ dmvnorm(x[n+i,], mu[k,], Sigma[k,,], log=TRUE)
}
v = exp(v -max(v))/sum(exp(v - max(v)))
v[k] = log(w[k])+ dmvnorm(x[n+i,], mu[k,], Sigma[k,,], log=TRUE)
}
v = exp(v -max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
## Error in solve.default(Sigma[k, , ]): Lapack routine dgesv: system is exactly singular: U[1,1] = 0
# Sample the variances
xcensumk = array(0,dim=c(KK,p,p))
for(i in 1:n+m){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
## Error in solve.default(Sigma[k, , ]): Lapack routine dgesv: system is exactly singular: U[1,1] = 0
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:n+m){
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:n+m){
logpost[s] = logpost[s] + log(w[cc[i]]) + mvtnorm::dmvnorm(x[i,], mu[cc[i],],Sigma[cc[i],,], log=TRUE)
}
logpost[s] = logpost[s] + log(w[cc[i]]) + mvtnorm::dmvnorm(x[i,], mu[cc[i],],Sigma[cc[i],,], log=TRUE)
for(k in 1:KK){
logpost[s] = logpost[s] + mvtnorm::dmvnorm(mu[k,], dd, DD)
for(k in 1:KK){
logpost[s] = logpost[s] + mvtnorm::dmvnorm(mu[k,], dd, DD)
logpost[s] = logpost[s]+ diwish(Sigma[k,,], nu, SS)
}
if(s/250==floor(s/250)){
print(paste("s = ", s))
}
}
## [1] "s = 10000"
## [1] "s = 10000"
What is the classification error for the test set?
## True labels
as.numeric(banknote.test.labels)
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
probcounterf = rep(NA, m)
for(i in 1:m){
probcaounterf[i] =sum(cc.out[-seq(1,burn),n+i]==1)/(rrr-burn)
}
## Error in eval(expr, envir, enclos): object 'probcaounterf' not found
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] =sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
#Ther classification was completely correct!
Is the R function qda
(which implements classical quadratic discriminant analysis) is used to classify the observations in the test set, what is the classification error?
# Using the qda and lda functions in R
# qda
modqda = qda(grouping=as.numeric(banknote.training.labels),x=banknote.training, method="mle")
ccpredqda = predict(modqda,newdata=banknote.test)
sum(!(ccpredqda$class == as.numeric(banknote.test.labels)))
## [1] 3
#answer = 3
Is the setup of the algorithm correct?
Recall that the semisupervised version of the algorithm uses all observations but treats the labels for he training set as known. There are many ways to set this up, but one that requires minimal changes to the algorithm defines the
## All observations used for calculation
x = rbind(banknote.training,banknote.test)
## Size of the training and test sets
n = dim(banknote.training)[1]
m = dim(unique(banknote.test))[1]
## Number of components and dimensionality of the components
KK = length(unique(banknote.training.labels))
p = dim(banknote.training)[2]
## Starting value for the indicators
## Note that for the training set we use the true values, while for the
## test set the values are initialized randomly
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
However, be open minded in how you evaluate this item. The most important thing is that the setup is consistent with the rest of the implementation, and that it enables the use of both the training and test sets for the estimation of the means and variance of the components.
Is the sampler for
The full conditional for the indicators is identical to the one in the original code. The only difference is that the labels for the training set are considered known, and therefore not sampled. In practice, this means a simple change to the values over which the
# Sample the indicators
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
Because the code never changes the values of the first
Is the sampler for the
Again, the sampler is identical to the one used before. The only difficulty here is that, because
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
Is the sampler for the
Again, the sampler is identical to the one used before. The only difficulty is that, because
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){ ## Need to loop over all (n+m) observations, not just the first n
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
Is the code used to classify observations in the test set correct?
The classification is based on the probabilities that each observation is classified as “genuine” and “counterfeit”. Those probabilities can be estimated using the frequencies for which each
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
A sample of the full code for this problem is provided below:
#### Semisupervised classification for the banknote dataset
rm(list=ls())
library(mvtnorm)
library(MCMCpack)
## Load data
#load("banknoteclassification.Rdata")
load("data/banknoteclassification.RData")
x = rbind(banknote.training,banknote.test)
## Generate data from a mixture with 3 components
KK = length(unique(banknote.training.labels))
p = dim(banknote.training)[2]
n = dim(banknote.training)[1]
m = dim(unique(banknote.test))[1]
## Initialize the parameters
w = rep(1,KK)/KK #Assign equal weight to each component to start with
mu = rmvnorm(KK, apply(x,2,mean), var(x)) #RandomCluster centers randomly spread over the support of the data
Sigma = array(0, dim=c(KK,p,p)) #Initial variances are assumed to be the same
Sigma[1,,] = var(x)/KK
Sigma[2,,] = var(x)/KK
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
# Priors
aa = rep(1, KK)
dd = apply(x,2,mean)
DD = 10*var(x)
nu = p+1
SS = var(x)/3
# Number of iteration of the sampler
rrr = 11000
burn = 1000
# Storing the samples
cc.out = array(0, dim=c(rrr, n+m))
w.out = array(0, dim=c(rrr, KK))
mu.out = array(0, dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0, rrr)
for(s in 1:rrr){
# Sample the indicators
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
w = as.vector(rdirichlet(1, aa + tabulate(cc)))
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:n){
logpost[s] = logpost[s] + log(w[cc[i]]) + dmvnorm(x[i,], mu[cc[i],], Sigma[cc[i],,], log=TRUE)
}
logpost[s] = logpost[s] + ddirichlet(w, aa)
for(k in 1:KK){
logpost[s] = logpost[s] + dmvnorm(mu[k,], dd, DD)
logpost[s] = logpost[s] + diwish(Sigma[k,,], nu, SS)
}
if(s/250==floor(s/250)){
print(paste("s = ", s))
}
}
## [1] "s = 250"
## [1] "s = 500"
## [1] "s = 750"
## [1] "s = 1000"
## [1] "s = 1250"
## [1] "s = 1500"
## [1] "s = 1750"
## [1] "s = 2000"
## [1] "s = 2250"
## [1] "s = 2500"
## [1] "s = 2750"
## [1] "s = 3000"
## [1] "s = 3250"
## [1] "s = 3500"
## [1] "s = 3750"
## [1] "s = 4000"
## [1] "s = 4250"
## [1] "s = 4500"
## [1] "s = 4750"
## [1] "s = 5000"
## [1] "s = 5250"
## [1] "s = 5500"
## [1] "s = 5750"
## [1] "s = 6000"
## [1] "s = 6250"
## [1] "s = 6500"
## [1] "s = 6750"
## [1] "s = 7000"
## [1] "s = 7250"
## [1] "s = 7500"
## [1] "s = 7750"
## [1] "s = 8000"
## [1] "s = 8250"
## [1] "s = 8500"
## [1] "s = 8750"
## [1] "s = 9000"
## [1] "s = 9250"
## [1] "s = 9500"
## [1] "s = 9750"
## [1] "s = 10000"
## [1] "s = 10250"
## [1] "s = 10500"
## [1] "s = 10750"
## [1] "s = 11000"
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
Is the classification error for the “genuine” class generated by the algorithm correct?
The value should be 0% (the algorithm perfectly classifies genuine banknotes in the test set).
Is the classification error for the “counterfeit” class generated by the algorithm correct?
Is the code used to classify observations in the test set correct?
This is a simple call to the qda
function
banknote.qda = qda(x=banknote.training, grouping=banknote.training.labels)
predict(banknote.qda, banknote.test)$class
## [1] genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine
## [83] genuine genuine genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## [165] counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## Levels: counterfeit genuine
Is the classification error for the “genuine” class generated by the algorithm correct?
The value should be 0% (the algorithm perfectly classifies genuine banknotes in the test set). This is the same as the semisupervised Bayesian QDA algorithm you just implement
Is the classification error for the “counterfeit” class generated by the algorithm correct?
The value should also be 3.52% (3 errors out of 85 observations). In this case, the function
Provide an MCMC algorithm to fit a semisupervised Bayesian quadratic discriminant model to the banknote data.
load('data/banknoteclassification.RData')
n = dim(banknote.training)[1] # Size of the training set
m = dim(banknote.test)[1] # Size of the test set
x = rbind(as.matrix(banknote.training),as.matrix(banknote.test)) # Create dataset of observations
p = dim(x)[2] # Number offeatures
KK = 2
training.label = as.numeric(banknote.training.labels)
test.label = as.numeric(banknote.test.labels)
par(mfrow=c(1,1))
par(mar=c(2,2,2,2)+0.1)
colscale = c("black","red") # Black =counterfeit; Red = genuine
pairs(banknote.training, col=colscale[banknote.training.labels],pch=as.numeric(banknote.training.labels))
## Initialize the parameters
w = rep(1,KK)/KK #Assign equal weight toeach component to start with
mu = rmvnorm(KK, apply(x,2,mean), var(x)) #RandomCluster centers randomly spread over the support of the data
Sigma = array(0,dim=c(KK,p,p)) #Initial variances are assumed to be the same
Sigma[1,,] = var(x)/KK
Sigma[1,,] = var(x)/KK
cc = c(training.label, sample(1:KK, m, replace=TRUE, prob=w))
# Priors
aa = rep(1, KK)
dd = apply(x,2,mean)
DD = 10*var(x)
nu = p
SS = var(x)/2
# Number of iteration of the sampler
rrr = 10000
burn = 1000
# Storing the samples
cc.out = array(0,dim=c(rrr, n+m))
w.out = array(0,dim=c(rrr, KK))
mu.out = array(0,dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0,rrr)
for(s in 1:rrr){
for(i in 1:m){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k])+ dmvnorm(x[n+i,], mu[k,], Sigma[k,,], log=TRUE)
}
v = exp(v -max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
w = as.vector(rdirichlet(1, aa + tabulate(cc)))
# Sample the means
DD.st = matrix(0,nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
# Sample the variances
xcensumk = array(0,dim=c(KK,p,p))
for(i in 1:n+m){
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:n+m){
logpost[s] = logpost[s] + log(w[cc[i]]) + mvtnorm::dmvnorm(x[i,], mu[cc[i],],Sigma[cc[i],,], log=TRUE)
}
logpost[s] =logpost[s] + ddirichlet(w, aa)
for(k in 1:KK){
logpost[s] =logpost[s] + mvtnorm::dmvnorm(mu[k,], dd, DD)
logpost[s] = logpost[s]+ diwish(Sigma[k,,], nu, SS)
}
if(s/250==floor(s/250)){
print(paste("s = ", s))
}
}
## Error in solve.default(Sigma[k, , ]): Lapack routine dgesv: system is exactly singular: U[1,1] = 0
What is the classification error for the test set?
## True labels
as.numeric(banknote.test.labels)
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
probcounterf = rep(NA, m)
probcounterf = rep(NA, m)
for(i in 1:m){
probcaounterf[i] = sum(cc.out[-seq(1,burn),n+i]==1)/(rrr-burn)
}
## Error in eval(expr, envir, enclos): object 'probcaounterf' not found
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
#
#Ther classification was completely correct!
Is the R function qda
(which implements classical quadratic discriminant analysis) is used to classify the observations in the test set, what is the classification error?
# Using the qda and lda functions in R
# qda
modqda = qda(grouping=as.numeric(banknote.training.labels),x=banknote.training, method="mle")
ccpredqda = predict(modqda,newdata=banknote.test)
sum(!(ccpredqda$class == as.numeric(banknote.test.labels)))
## [1] 3
#answer = 3
Is the setup of the algorithm correct?
Recall that the semisupervised version of the algorithm uses all observations but treats the labels for he training set as known. There are many ways to set this up, but one that requires minimal changes to the algorithm defines the
## All observations used for calculation
x = rbind(banknote.training,banknote.test)
## Size of the training and test sets
n = dim(banknote.training)[1]
m = dim(unique(banknote.test))[1]
## Number of components and dimensionality of the components
KK = length(unique(banknote.training.labels))
p = dim(banknote.training)[2]
## Starting value for the indicators
## Note that for the training set we use the true values, while for the
## test set the values are initialized randomly
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
However, be open minded in how you evaluate this item. The most important thing is that the setup is consistent with the rest of the implementation, and that it enables the use of both the training and test sets for the estimation of the means and variance of the components.
Is the sampler for
The full conditional for the indicators is identical to the one in the original code. The only difference is that the labels for the training set are considered known, and therefore not sampled. In practice, this means a simple change to the values over which the
# Sample the indicators
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
Because the code never changes the values of the first
Is the sampler for the
Again, the sampler is identical to the one used before. The only difficulty here is that, because
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
## Error in solve.default(Sigma[k, , ]): Lapack routine dgesv: system is exactly singular: U[1,1] = 0
Is the sampler for the
Again, the sampler is identical to the one used before. The only difficulty is that, because
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){ ## Need to loop over all (n+m) observations, not just the first n
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
Is the code used to classify observations in the test set correct?
The classification is based on the probabilities that each observation is classified as “genuine” and “counterfeit”. Those probabilities can be estimated using the frequencies for which each
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
A sample of the full code for this problem is provided below:
#### Semisupervised classification for the banknote dataset
rm(list=ls())
library(mvtnorm)
library(MCMCpack)
## Load data
#load("banknoteclassification.Rdata")
load("data/banknoteclassification.RData")
x = rbind(banknote.training,banknote.test)
## Generate data from a mixture with 3 components
KK = length(unique(banknote.training.labels))
p = dim(banknote.training)[2]
n = dim(banknote.training)[1]
m = dim(unique(banknote.test))[1]
## Initialize the parameters
w = rep(1,KK)/KK #Assign equal weight to each component to start with
mu = rmvnorm(KK, apply(x,2,mean), var(x)) #RandomCluster centers randomly spread over the support of the data
Sigma = array(0, dim=c(KK,p,p)) #Initial variances are assumed to be the same
Sigma[1,,] = var(x)/KK
Sigma[2,,] = var(x)/KK
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
# Priors
aa = rep(1, KK)
dd = apply(x,2,mean)
DD = 10*var(x)
nu = p+1
SS = var(x)/3
# Number of iteration of the sampler
rrr = 11000
burn = 1000
# Storing the samples
cc.out = array(0, dim=c(rrr, n+m))
w.out = array(0, dim=c(rrr, KK))
mu.out = array(0, dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0, rrr)
for(s in 1:rrr){
# Sample the indicators
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
w = as.vector(rdirichlet(1, aa + tabulate(cc)))
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:n){
logpost[s] = logpost[s] + log(w[cc[i]]) + dmvnorm(x[i,], mu[cc[i],], Sigma[cc[i],,], log=TRUE)
}
logpost[s] = logpost[s] + ddirichlet(w, aa)
for(k in 1:KK){
logpost[s] = logpost[s] + dmvnorm(mu[k,], dd, DD)
logpost[s] = logpost[s] + diwish(Sigma[k,,], nu, SS)
}
if(s/250==floor(s/250)){
print(paste("s = ", s))
}
}
## [1] "s = 250"
## [1] "s = 500"
## [1] "s = 750"
## [1] "s = 1000"
## [1] "s = 1250"
## [1] "s = 1500"
## [1] "s = 1750"
## [1] "s = 2000"
## [1] "s = 2250"
## [1] "s = 2500"
## [1] "s = 2750"
## [1] "s = 3000"
## [1] "s = 3250"
## [1] "s = 3500"
## [1] "s = 3750"
## [1] "s = 4000"
## [1] "s = 4250"
## [1] "s = 4500"
## [1] "s = 4750"
## [1] "s = 5000"
## [1] "s = 5250"
## [1] "s = 5500"
## [1] "s = 5750"
## [1] "s = 6000"
## [1] "s = 6250"
## [1] "s = 6500"
## [1] "s = 6750"
## [1] "s = 7000"
## [1] "s = 7250"
## [1] "s = 7500"
## [1] "s = 7750"
## [1] "s = 8000"
## [1] "s = 8250"
## [1] "s = 8500"
## [1] "s = 8750"
## [1] "s = 9000"
## [1] "s = 9250"
## [1] "s = 9500"
## [1] "s = 9750"
## [1] "s = 10000"
## [1] "s = 10250"
## [1] "s = 10500"
## [1] "s = 10750"
## [1] "s = 11000"
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
Is the classification error for the “genuine” class generated by the algorithm correct?
The value should be 0% (the algorithm perfectly classifies genuine banknotes in the test set).
Is the classification error for the “counterfeit” class generated by the algorithm correct?
Is the code used to classify observations in the test set correct?
This is a simple call to the qda
function
banknote.qda = qda(x=banknote.training, grouping=banknote.training.labels)
predict(banknote.qda, banknote.test)$class
## [1] genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine
## [83] genuine genuine genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## [165] counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## Levels: counterfeit genuine
Is the classification error for the “genuine” class generated by the algorithm correct?
The value should be 0% (the algorithm perfectly classifies genuine banknotes in the test set). This is the same as the semisupervised Bayesian QDA algorithm you just implement
Is the classification error for the “counterfeit” class generated by the algorithm correct?
The value should also be 3.52% (3 errors out of 85 observations). In this case, the function
Provide an MCMC algorithm to fit a semisupervised Bayesian quadratic discriminant model to the banknote data.
load('data/banknoteclassification.RData')
What is the classification error for the test set?
Is the R function qda
(which implements classical quadratic discriminant analysis) is used to classify the observations in the test set, what is the classification error?
Is the setup of the algorithm correct?
Recall that the semisupervised version of the algorithm uses all observations but treats the labels for he training set as known. There are many ways to set this up, but one that requires minimal changes to the algorithm defines the
## All observations used for calculation
x = rbind(banknote.training,banknote.test)
## Size of the training and test sets
n = dim(banknote.training)[1]
m = dim(unique(banknote.test))[1]
## Number of components and dimensionality of the components
KK = length(unique(banknote.training.labels))
p = dim(banknote.training)[2]
## Starting value for the indicators
## Note that for the training set we use the true values, while for the
## test set the values are initialized randomly
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
However, be open minded in how you evaluate this item. The most important thing is that the setup is consistent with the rest of the implementation, and that it enables the use of both the training and test sets for the estimation of the means and variance of the components.
Is the sampler for
The full conditional for the indicators is identical to the one in the original code. The only difference is that the labels for the training set are considered known, and therefore not sampled. In practice, this means a simple change to the values over which the
# Sample the indicators
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
Because the code never changes the values of the first
Is the sampler for the
Again, the sampler is identical to the one used before. The only difficulty here is that, because
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
Is the sampler for the
Again, the sampler is identical to the one used before. The only difficulty is that, because
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){ ## Need to loop over all (n+m) observations, not just the first n
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
Is the code used to classify observations in the test set correct?
The classification is based on the probabilities that each observation is classified as “genuine” and “counterfeit”. Those probabilities can be estimated using the frequencies for which each
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
A sample of the full code for this problem is provided below:
#### Semisupervised classification for the banknote dataset
rm(list=ls())
library(mvtnorm)
library(MCMCpack)
## Load data
#load("banknoteclassification.Rdata")
load("data/banknoteclassification.RData")
x = rbind(banknote.training,banknote.test)
## Generate data from a mixture with 3 components
KK = length(unique(banknote.training.labels))
p = dim(banknote.training)[2]
n = dim(banknote.training)[1]
m = dim(unique(banknote.test))[1]
## Initialize the parameters
w = rep(1,KK)/KK #Assign equal weight to each component to start with
mu = rmvnorm(KK, apply(x,2,mean), var(x)) #RandomCluster centers randomly spread over the support of the data
Sigma = array(0, dim=c(KK,p,p)) #Initial variances are assumed to be the same
Sigma[1,,] = var(x)/KK
Sigma[2,,] = var(x)/KK
cc = c(as.numeric(banknote.training.labels), sample(1:KK, m, replace=TRUE, prob=w))
# Priors
aa = rep(1, KK)
dd = apply(x,2,mean)
DD = 10*var(x)
nu = p+1
SS = var(x)/3
# Number of iteration of the sampler
rrr = 11000
burn = 1000
# Storing the samples
cc.out = array(0, dim=c(rrr, n+m))
w.out = array(0, dim=c(rrr, KK))
mu.out = array(0, dim=c(rrr, KK, p))
Sigma.out = array(0, dim=c(rrr, KK, p, p))
logpost = rep(0, rrr)
for(s in 1:rrr){
# Sample the indicators
for(i in (n+1):(n+m)){
v = rep(0,KK)
for(k in 1:KK){
v[k] = log(w[k]) + dmvnorm(x[i,], mu[k,], Sigma[k,,], log=TRUE) #Compute the log of the weights
}
v = exp(v - max(v))/sum(exp(v - max(v)))
cc[i] = sample(1:KK, 1, replace=TRUE, prob=v)
}
# Sample the weights
w = as.vector(rdirichlet(1, aa + tabulate(cc)))
# Sample the means
DD.st = matrix(0, nrow=p, ncol=p)
for(k in 1:KK){
mk = sum(cc==k)
xsumk = apply(x[cc==k,], 2, sum)
DD.st = solve(mk*solve(Sigma[k,,]) + solve(DD))
dd.st = DD.st%*%(solve(Sigma[k,,])%*%xsumk + solve(DD)%*%dd)
mu[k,] = as.vector(rmvnorm(1,dd.st,DD.st))
}
# Sample the variances
xcensumk = array(0, dim=c(KK,p,p))
for(i in 1:(n+m)){
xcensumk[cc[i],,] = xcensumk[cc[i],,] + (x[i,] - mu[cc[i],])%*%t(x[i,] - mu[cc[i],])
}
for(k in 1:KK){
Sigma[k,,] = riwish(nu + sum(cc==k), SS + xcensumk[k,,])
}
# Store samples
cc.out[s,] = cc
w.out[s,] = w
mu.out[s,,] = mu
Sigma.out[s,,,] = Sigma
for(i in 1:n){
logpost[s] = logpost[s] + log(w[cc[i]]) + dmvnorm(x[i,], mu[cc[i],], Sigma[cc[i],,], log=TRUE)
}
logpost[s] = logpost[s] + ddirichlet(w, aa)
for(k in 1:KK){
logpost[s] = logpost[s] + dmvnorm(mu[k,], dd, DD)
logpost[s] = logpost[s] + diwish(Sigma[k,,], nu, SS)
}
if(s/250==floor(s/250)){
print(paste("s = ", s))
}
}
## [1] "s = 250"
## [1] "s = 500"
## [1] "s = 750"
## [1] "s = 1000"
## [1] "s = 1250"
## [1] "s = 1500"
## [1] "s = 1750"
## [1] "s = 2000"
## [1] "s = 2250"
## [1] "s = 2500"
## [1] "s = 2750"
## [1] "s = 3000"
## [1] "s = 3250"
## [1] "s = 3500"
## [1] "s = 3750"
## [1] "s = 4000"
## [1] "s = 4250"
## [1] "s = 4500"
## [1] "s = 4750"
## [1] "s = 5000"
## [1] "s = 5250"
## [1] "s = 5500"
## [1] "s = 5750"
## [1] "s = 6000"
## [1] "s = 6250"
## [1] "s = 6500"
## [1] "s = 6750"
## [1] "s = 7000"
## [1] "s = 7250"
## [1] "s = 7500"
## [1] "s = 7750"
## [1] "s = 8000"
## [1] "s = 8250"
## [1] "s = 8500"
## [1] "s = 8750"
## [1] "s = 9000"
## [1] "s = 9250"
## [1] "s = 9500"
## [1] "s = 9750"
## [1] "s = 10000"
## [1] "s = 10250"
## [1] "s = 10500"
## [1] "s = 10750"
## [1] "s = 11000"
probgenuine = rep(NA, m)
for(i in 1:m){
probgenuine[i] = sum(cc.out[-seq(1,burn),n+i]==2)/(rrr-burn)
}
Is the classification error for the “genuine” class generated by the algorithm correct?
The value should be 0% (the algorithm perfectly classifies genuine banknotes in the test set).
Is the classification error for the “counterfeit” class generated by the algorithm correct?
Is the code used to classify observations in the test set correct?
This is a simple call to the qda
function
banknote.qda = qda(x=banknote.training, grouping=banknote.training.labels)
predict(banknote.qda, banknote.test)$class
## [1] genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine genuine
## [83] genuine genuine genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit genuine counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## [165] counterfeit counterfeit counterfeit counterfeit counterfeit counterfeit
## Levels: counterfeit genuine
Is the classification error for the “genuine” class generated by the algorithm correct?
The value should be 0% (the algorithm perfectly classifies genuine banknotes in the test set). This is the same as the semisupervised Bayesian QDA algorithm you just implement
Is the classification error for the “counterfeit” class generated by the algorithm correct?
The value should also be 3.52% (3 errors out of 85 observations). In this case, the function
It’s useful to record some information about how your file was created.
suppressMessages(require('dplyr', quietly = TRUE))
suppressMessages(require('magrittr', quietly = TRUE))
suppressMessages(require('formattable', quietly = TRUE))
suppressMessages(require('knitr', quietly = TRUE))
suppressMessages(require('kableExtra', quietly = TRUE))
sys1 <- devtools::session_info()$platform %>%
unlist %>% data.frame(Category = names(.), session_info = .)
rownames(sys1) <- NULL
sys2 <- data.frame(Sys.info()) %>%
dplyr::mutate(Category = rownames(.)) %>% .[2:1]
names(sys2)[2] <- c('Sys.info')
rownames(sys2) <- NULL
if (nrow(sys1) == 9 & nrow(sys2) == 8) {
sys2 %<>% rbind(., data.frame(
Category = 'Current time',
Sys.info = paste(as.character(lubridate::now('Asia/Tokyo')), 'JST🗾')))
} else {
sys1 %<>% rbind(., data.frame(
Category = 'Current time',
session_info = paste(as.character(lubridate::now('Asia/Tokyo')), 'JST🗾')))
}
sys <- cbind(sys1, sys2) %>%
kbl(caption = 'Additional session information:') %>%
kable_styling(bootstrap_options = c('striped', 'hover', 'condensed', 'responsive')) %>%
row_spec(0, background = 'DimGrey', color = 'yellow') %>%
column_spec(1, background = 'CornflowerBlue', color = 'red') %>%
column_spec(2, background = 'grey', color = 'black') %>%
column_spec(3, background = 'CornflowerBlue', color = 'blue') %>%
column_spec(4, background = 'grey', color = 'white') %>%
row_spec(9, bold = T, color = 'yellow', background = '#D7261E')
rm(sys1, sys2)
sys
Category | session_info | Category | Sys.info |
---|---|---|---|
version | R version 4.1.0 (2021-05-18) | sysname | Linux |
os | Ubuntu 20.04.2 LTS | release | 5.8.0-54-generic |
system | x86_64, linux-gnu | version | #61~20.04.1-Ubuntu SMP Thu May 13 00:05:49 UTC 2021 |
ui | X11 | nodename | Scibrokes-Trading |
language | en | machine | x86_64 |
collate | en_US.UTF-8 | login | englianhu |
ctype | en_US.UTF-8 | user | englianhu |
tz | Asia/Tokyo | effective_user | englianhu |
date | 2021-06-02 | Current time | 2021-06-02 04:55:11 JST🗾 |