LIBRARY DECLARATION
library(factoextra)
## Loading required package: ggplot2
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(cluster)
library(NbClust)
library(class)
library(MASS)
library(corrplot)
nsim=1000 # Number of simulation runs
N=10000 # Population size
n=160 # Sample size
bias=c(0,0) # bias accumulator
mse=c(0,0) # MSE accumulator
cp=matrix(0,2,3) # coverage counts: true mean below / inside / above the CI
len=c(0,0) # CI length accumulator
set.seed(123)
#-----------------------------
#x0,x3,
Sigma034 <- matrix(c(1600,300,40,300,100,10,40,10,4),3,3)
Sigma034
## [,1] [,2] [,3]
## [1,] 1600 300 40
## [2,] 300 100 10
## [3,] 40 10 4
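A quick sanity check on the dependence this covariance matrix encodes (a sketch): cov2cor() from base stats rescales a covariance matrix to correlations, giving cor(X0,X3) = 300/(40*10) = 0.75 and cor(X0,X4) = cor(X3,X4) = 0.5 here.
cov2cor(Sigma034) # implied correlation matrix of (X0, X3, X4)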
x034=mvrnorm(N, c(200,50,100), Sigma034)
colnames(x034) <- c("X0","X3","X4")
x0=x034[,1]
x3=x034[,2]
head(x034)
## X0 X3 X4
## [1,] 180.5458 30.64825 97.14238
## [2,] 190.5958 49.31401 99.45744
## [3,] 263.5142 56.30706 97.69202
## [4,] 202.1538 54.33802 97.47423
## [5,] 205.4728 49.67262 98.19836
## [6,] 270.0093 56.15851 98.51154
#-----------------------------
#x1
Sigma15<- matrix(c(4,200,200,10000),2,2)
Sigma15
## [,1] [,2]
## [1,] 4 200
## [2,] 200 10000
x15=mvrnorm(N, c(20,500), Sigma15)
colnames(x15) <- c("X1","X5")
head(x15)
## X1 X5
## [1,] 19.61279 480.6394
## [2,] 20.51629 525.8147
## [3,] 18.92337 446.1687
## [4,] 17.64187 382.0937
## [5,] 21.80129 590.0647
## [6,] 19.96750 498.3750
x1=x15[,1]
#-----------------------------
#x2
x2=rchisq(N, 3, ncp = 20)
#-----------------------------
#noise
e=rnorm(N,mean =40,sd = 4) # note: e is generated but unused below; y adds the constant 2 instead, so y is deterministic given the covariates
y=50+log(3)*x0+log(4)*x1+log(0.5)*x2+log(0.25)*x3+2
#-----------------------------
#Final data: one response and four covariates
data=cbind(y,x0,x1,x2,x3)
summary(data)
## y x0 x1 x2
## Min. : 85.39 Min. : 46.55 Min. :11.74 Min. : 1.018
## 1st Qu.:190.19 1st Qu.:173.13 1st Qu.:18.64 1st Qu.:16.514
## Median :213.92 Median :199.60 Median :20.01 Median :22.239
## Mean :213.91 Mean :199.89 Mean :19.99 Mean :23.142
## 3rd Qu.:238.10 3rd Qu.:227.00 3rd Qu.:21.33 3rd Qu.:28.679
## Max. :354.85 Max. :355.40 Max. :27.97 Max. :74.239
## x3
## Min. :13.63
## 1st Qu.:43.33
## Median :50.14
## Mean :50.04
## 3rd Qu.:56.81
## Max. :85.42
#-----------------------------
# Population Parameters
#-----------------------------
muy=mean(data[,1]) # population mean
muy
## [1] 213.9058
vary=var(data[,1]) # population variance
vary
## [1] 1260.328
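A rough analytic cross-check (a sketch): the noncentral chi-square has mean df + ncp = 3 + 20 = 23, so the population mean of y should be near 50 + log(3)*200 + log(4)*20 + log(0.5)*23 + log(0.25)*50 + 2, which is about 214.2, consistent with muy above.
50 + log(3)*200 + log(4)*20 + log(0.5)*23 + log(0.25)*50 + 2 # theoretical E[y]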
Correlation between variables.
M <- cor(data)
corrplot(M, method="circle")
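To read the same information numerically (a sketch), the covariates can be ranked by their absolute correlation with y; this ranking is what later motivates the choice of auxiliary variable for the imputation.
round(sort(abs(M["y",-1]), decreasing=TRUE), 3) # covariates ranked by |cor| with y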
Cluster Examination
set.seed(123)
# Compute and plot the within-groups sum of squares for k = 1 to k = 15
k.max <- 15 # maximal number of clusters
wss <- (nrow(data)-1)*sum(apply(data,2,var)) # k = 1: total sum of squares
for (i in 2:k.max) wss[i] <- sum(kmeans(data,
centers=i)$withinss)
plot(1:k.max, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares")
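Since factoextra is already loaded, fviz_nbclust() produces the same elbow diagnostic in one call (a sketch; its k.max = 10 covers fewer values of k than the manual loop above):
fviz_nbclust(data, kmeans, method = "wss", k.max = 10)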
# Ward Hierarchical Clustering
d <- dist(data, method = "euclidean") # distance matrix
fit <- hclust(d, method="ward.D") # "ward" was renamed to "ward.D"; see also "ward.D2"
plot(fit) # display dendrogram
groups <- cutree(fit, k=4) # cut tree into 4 clusters
# draw dendrogram with red borders around the 4 clusters
rect.hclust(fit, k=4, border="red")
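As a further cross-check on the choice of k = 4, the NbClust package (loaded above) aggregates many validity indices and reports the majority vote; a sketch on a random subsample, since running all indices on 10,000 rows is slow (the subsample size of 500 and max.nc = 8 are arbitrary choices here):
set.seed(123)
idx <- sample(nrow(data), 500) # subsample for speed
nb <- NbClust(data[idx,], distance="euclidean", min.nc=2, max.nc=8, method="ward.D2")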
Cluster plots
data=as.data.frame(data)
results<-kmeans(data,4) # run the k-means algorithm with 4 clusters on the full data
plot(data[c("y","x1")],col=results$cluster) #Cluster plots
#Create data with cluster labels
data.c=data
data.c$cluster=results$cluster
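For a complementary view that uses all variables at once rather than the (y, x1) plane, factoextra's fviz_cluster() projects the k-means solution onto the first two principal components (a sketch):
fviz_cluster(results, data = data, geom = "point") # clusters on the first two PCs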
Two additional variables, used to generate the MAR mechanism, were then added to the population: X6 is drawn from a Bernoulli distribution with p = 0.5, and X7 is drawn from a uniform distribution on [30, 60].
set.seed(123)
x6=rbinom(N, 1, 0.5)
x6=as.matrix(x6)
x7=runif(N,30,60)
x7=as.matrix(x7)
data=cbind(data,x6,x7)
head(data,5)
## y x0 x1 x2 x3 x6 x7
## 1 217.1038 180.5458 19.61279 25.893064 30.64825 0 39.31775
## 2 206.5513 190.5958 20.51629 21.521414 49.31401 1 39.73560
## 3 282.7941 263.5142 18.92337 9.927225 56.30706 0 56.10763
## 4 201.4465 202.1538 17.64187 31.408245 54.33802 1 39.86021
## 5 229.9131 205.4728 21.80129 13.249704 49.67262 1 33.77104
Missingness for Y was introduced according to a MAR scheme, so that it depends on X6 (Bernoulli with equal probability) and X7 (uniform on [30, 60]). Four dummy categories were created: A: X6=0 and X7 < the 60th percentile of [30, 60]; B: X6=1 and X7 < the 60th percentile; C: X6=0 and X7 ≥ the 60th percentile; D: X6=1 and X7 ≥ the 60th percentile.
risk=NULL
for (i in 1:N) {
  if (data$x6[i]==0 & data$x7[i]<qunif(0.6, min = 30, max = 60)) {
    risk=c(risk,"A")
  } else if (data$x6[i]==1 & data$x7[i]<qunif(0.6, min = 30, max = 60)) {
    risk=c(risk,"B")
  } else if (data$x6[i]==0 & data$x7[i]>=qunif(0.6, min = 30, max = 60)) {
    risk=c(risk,"C")
  } else {
    risk=c(risk,"D")
  }
}
risk=as.data.frame(risk)
data=cbind(data,risk)
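For reference, the same categorization can be written in vectorized form (a sketch that should reproduce the loop exactly; qunif(0.6, min = 30, max = 60) = 48 is the 60th percentile of U[30, 60], and q60/risk.v are scratch names):
q60 <- qunif(0.6, min = 30, max = 60) # = 48
risk.v <- ifelse(data$x6==0 & data$x7<q60, "A",
          ifelse(data$x6==1 & data$x7<q60, "B",
          ifelse(data$x6==0 & data$x7>=q60, "C", "D")))
all(risk.v == data$risk) # should be TRUE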
Assign missingness probabilities to the risk categories. Each category was given a different risk of having missing cases: Risk(A) = 0.02, Risk(B) = 1.5 × Risk(A) = 0.03, Risk(C) = 1.5 × Risk(B) = 0.045, and Risk(D) = 1.5 × Risk(C) = 0.0675, so that around 16% of the data are missing.
set.seed(123)
nA=sum(data$risk=="A")
nB=sum(data$risk=="B")
nC=sum(data$risk=="C")
nD=sum(data$risk=="D")
# Per-unit rates are the target share of N divided by the category size
pA=rbinom(nA,1,round(0.02*N/nA,2))
pB=rbinom(nB,1,round(0.03*N/nB,2))
pC=rbinom(nC,1,round(0.045*N/nC,2))
pD=rbinom(nD,1,round(0.0675*N/nD,2))
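A quick check (a sketch): the category-level rates sum to 0.02 + 0.03 + 0.045 + 0.0675 = 0.1625 of N, i.e. about 1625 expected missing responses; the realized count of 1576 below is in line with this.
c(expected = 0.1625*N, realized = sum(pA)+sum(pB)+sum(pC)+sum(pD))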
#Missing
y.mar=data
y.mar$i=rep(0,N) # missingness indicator
# Fill the indicator within each risk category; the rows of a category are matched
# in order with the corresponding Bernoulli draws (equivalent to looping row by row)
y.mar$i[y.mar$risk=="A"]=pA
y.mar$i[y.mar$risk=="B"]=pB
y.mar$i[y.mar$risk=="C"]=pC
y.mar$i[y.mar$risk=="D"]=pD
sum(y.mar$i)#Number of missing data
## [1] 1576
y.mar$y=ifelse(y.mar$i==1,NA,y.mar$y) # if the indicator i=1, remove the response value, replacing it with NA
head(y.mar)
## y x0 x1 x2 x3 x6 x7 risk i
## 1 217.1038 180.5458 19.61279 25.893064 30.64825 0 39.31775 A 0
## 2 206.5513 190.5958 20.51629 21.521414 49.31401 1 39.73560 B 0
## 3 282.7941 263.5142 18.92337 9.927225 56.30706 0 56.10763 C 0
## 4 201.4465 202.1538 17.64187 31.408245 54.33802 1 39.86021 B 0
## 5 229.9131 205.4728 21.80129 13.249704 49.67262 1 33.77104 B 0
## 6 280.9229 270.0093 19.96750 25.306554 56.15851 0 40.68664 A 0
y.mar=y.mar[,c(-6,-7,-8,-9)] # drop x6, x7, risk, and the indicator i
This gives y.mar, a data set whose responses are missing at random.
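The realized missingness rate can be verified directly (a sketch):
mean(is.na(y.mar$y)) # about 0.158, matching the 1576 missing responses counted above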
y.mar=cbind(y.mar,data.c$cluster) #Attaching cluster to y.mar
head(y.mar)
## y x0 x1 x2 x3 data.c$cluster
## 1 217.1038 180.5458 19.61279 25.893064 30.64825 4
## 2 206.5513 190.5958 20.51629 21.521414 49.31401 4
## 3 282.7941 263.5142 18.92337 9.927225 56.30706 2
## 4 201.4465 202.1538 17.64187 31.408245 54.33802 4
## 5 229.9131 205.4728 21.80129 13.249704 49.67262 3
## 6 280.9229 270.0093 19.96750 25.306554 56.15851 2
#Using SRSWOR (simple random sampling without replacement) to draw a sample of size n=160
set.seed(123)
#-------------------------STEP 1------------------------------------------
s=y.mar[sample(nrow(y.mar),160,replace=FALSE),c(-6)] #the sampled data set, with the attached cluster column dropped
sfull=na.omit(s) #sfull is the subset of s with fully observed responses
r.sfull=kmeans(sfull[,-6],4) #perform clustering on sfull (the -6 is a no-op here, since sfull has only 5 columns, so y and all covariates are used)
smiss=subset(s,is.na(s$y)) #smiss is the subset of s that only contains missing responses
#-------------------------END OF STEP 1-----------------------------------
plot(sfull[c("y","x1")],col=r.sfull$cluster)
sfull$clusters=r.sfull$cluster #Attaching cluster results of the complete set sfull
Now we need to assign the units with missing responses to the appropriate clusters. We will use the KNN algorithm (supervised learning) to label the missing data.
Using KNN to classify the missing observations into "appropriate" clusters.
s1=s
r.sfull1=r.sfull
sfull1=sfull
smiss1=smiss
#-------------------------STEP 2-------------------------------------------------
cl <- factor(sfull1[,6]) # training labels: cluster memberships of the complete cases
smiss.cluster1=knn(sfull1[,c(-1,-6)], smiss1[,c(-1)], cl, k=10, l=0, prob=TRUE, use.all=TRUE)
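Before trusting the labels, it can help to inspect how the missing-response units spread across clusters and how decisive the votes were (a sketch; with prob = TRUE, knn() returns the winning vote fraction as the "prob" attribute):
table(smiss.cluster1) # assigned cluster labels for the missing-response units
summary(attr(smiss.cluster1, "prob")) # fraction of the k = 10 votes won by the label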
#-------------------------END OF STEP 2------------------------------------------
REPEATING FOR 1000 RUNS
#======================================
mu1=NULL
sigmahat1=NULL
varhat1=NULL
nsim=1000 # Number of simulation runs
N=10000 # Population size
n=160 # Sample size
bias1=c(0,0) # bias accumulator
mse1=c(0,0) # MSE accumulator
cp1=matrix(0,2,3) # coverage counts: true mean below / inside / above the CI
len1=c(0,0) # CI length accumulator
#======================================
s1=s
r.sfull1=r.sfull
sfull1=sfull
smiss1=smiss
# Repeated simulation starts from here:
#======================================
set.seed(123)
nsim=1000
for(m in 1:nsim)
{
#STEP 1
s1=y.mar[sample(nrow(y.mar),160,replace=FALSE),c(-6)] #the sampled data set
sfull1=na.omit(s1) #sfull1 is the subset of s1 with fully observed responses
r.sfull1=kmeans(sfull1[,-6],4) #perform clustering on sfull1
smiss1=subset(s1,is.na(s1$y)) #smiss1 is the subset of s1 that only contains missing responses
#STEP 2
sfull1$clusters=r.sfull1$cluster #Attaching cluster results of the complete set sfull
cl <- factor(sfull1[,6]) # training labels: cluster memberships of the complete cases
smiss.cluster1=knn(sfull1[,c(-1,-6)], smiss1[,c(-1)], cl, k=10, l=0, prob=TRUE, use.all=TRUE)
smiss.c1=data.frame(smiss1,as.data.frame(smiss.cluster1))
#STEP 3
smiss.row1=as.vector(rownames(smiss1))
smiss.compare1=cbind(y.mar[smiss.row1,],as.matrix(smiss.c1$smiss.cluster1))#
y1=data.c[smiss.row1,]
smiss.compare1=cbind(smiss.c1,y1$y)
mean1=r.sfull1$centers[,1] # cluster-wise means of y among the complete cases
smiss.compare1$yhat=ifelse(smiss.compare1[,6]==1,mean1[1],
ifelse(smiss.compare1[,6]==2,mean1[2],
ifelse(smiss.compare1[,6]==3,mean1[3],mean1[4]))) # impute the mean of the assigned cluster
imputed.y1=rbind(as.matrix(smiss.compare1$yhat,ncol=1),as.matrix(sfull1$y,ncol=1))
#STEP 4
muhat1=mean(imputed.y1)
varhat1=var(imputed.y1)
#STEP5
# Calculate 95% CI
#-----------------------
mu1=muhat1-1.96*sqrt((1-n/N)*varhat1/n) # lower bound, with finite population correction
mu2=muhat1+1.96*sqrt((1-n/N)*varhat1/n) # upper bound
#-------------------------------------------------------------------
# Calculate bias, MSE, average length, and coverage probabilities
#-------------------------------------------------------------------
bias1[1]=bias1[1]+muhat1-muy
mse1[1]=mse1[1]+(muhat1-muy)^2
len1[1]=len1[1]+mu2-mu1
cp1[1,1]=cp1[1,1]+(muy<=mu1)
cp1[1,2]=cp1[1,2]+(muy>mu1)*(muy<mu2)
cp1[1,3]=cp1[1,3]+(muy>=mu2)
}
bias1=bias1[1]/(muy*nsim) # relative bias
mse1=mse1[1]/nsim
len1=len1[1]/nsim
cp1=cp1/nsim # coverage proportions (computed for consistency with the second run)
#OUTPUT
#======================================
bias1
## [1] 0.0007764621
mse1
## [1] 8.062797
len1
## [1] 10.71104
This second KNN step imputes the missing responses directly, using the cluster information together with the covariate most strongly correlated with the response. As seen in Part 1 (data exploration), the correlation between y and x0 is the strongest among the covariates, which matches what the code below uses. Therefore, we use the cluster labels and x0 to find the nearest neighbors of the missing responses.
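This claim can be checked against the correlation matrix M computed in the data exploration above (a sketch; exact values vary with the simulated draw):
round(M["y", c("x0","x1","x2","x3")], 3) # the y-x0 correlation dominates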
#======================================
mu2=NULL
sigmahat2=NULL
varhat2=NULL
nsim=1000 # Number of simulation runs
N=10000 # Population size
n=160 # Sample size
bias2=c(0,0) # bias accumulator
mse2=c(0,0) # MSE accumulator
cp2=matrix(0,2,3) # coverage counts: true mean below / inside / above the CI
len2=c(0,0) # CI length accumulator
#======================================
s2=s
r.sfull2=r.sfull
sfull2=sfull
smiss2=smiss
# Repeated simulation starts from here:
#======================================
for(m in 1:nsim)
{
#STEP 1
s2=y.mar[sample(nrow(y.mar),160,replace=FALSE),c(-6)] #the sampled data set
sfull2=na.omit(s2) #sfull2 is the subset of s2 with fully observed responses
r.sfull2=kmeans(sfull2[,-6],4) #perform clustering on sfull2
smiss2=subset(s2,is.na(s2$y)) #smiss2 is the subset of s2 that only contains missing responses
#STEP 2
sfull2$clusters=r.sfull2$cluster #Attaching cluster results of the complete set sfull
cl <- factor(sfull2[,6]) # training labels: cluster memberships of the complete cases
smiss.cluster2=knn(sfull2[,c(-1,-6)], smiss2[,c(-1)], cl, k=100, l=0, prob=TRUE, use.all=TRUE)
smiss.c2=data.frame(smiss2,as.data.frame(smiss.cluster2))
cl <- factor(sfull2[,1]) # the "labels" are now the observed responses themselves
#Applying KNN to impute the missing values, replacing each NA with a nearest-neighbor response: x0 (column 2) and the cluster label (column 6) identify the neighborhood.
smiss.yhat2=knn(as.matrix(sfull2[,c(2,6)]), as.matrix(smiss.c2[,c(2,6)]), cl, k=100, l=0, prob=TRUE, use.all=TRUE)
smiss.c2=data.frame(smiss2,as.data.frame(smiss.cluster2),as.data.frame(smiss.yhat2))
#STEP 3
smiss.row2=as.vector(rownames(smiss2))
smiss.compare2=cbind(y.mar[smiss.row2,],as.matrix(smiss.c2$smiss.yhat2))#
y2=data.c[smiss.row2,]
smiss.compare2=cbind(smiss.c2,y2$y)
imputed.y2=rbind(as.matrix(smiss.compare2$smiss.yhat2,ncol=1),as.matrix(sfull2$y,ncol=1))
imputed.y2=as.numeric(imputed.y2)
#STEP 4
muhat2=mean(imputed.y2)
varhat2=var(imputed.y2)
#STEP5
# Calculate 95% CI
#-----------------------
mu1=muhat2-1.96*sqrt((1-n/N)*varhat2/n) # lower bound, with finite population correction
mu2=muhat2+1.96*sqrt((1-n/N)*varhat2/n) # upper bound
#-------------------------------------------------------------------
# Calculate bias, MSE, average length, and coverage probabilities
#-------------------------------------------------------------------
bias2[1]=bias2[1]+muhat2-muy
mse2[1]=mse2[1]+(muhat2-muy)^2
len2[1]=len2[1]+mu2-mu1
cp2[1,1]=cp2[1,1]+(muy<=mu1)
cp2[1,2]=cp2[1,2]+(muy>mu1)*(muy<mu2)
cp2[1,3]=cp2[1,3]+(muy>=mu2)
}
bias2=bias2[1]/(muy*nsim) # relative bias
mse2=mse2[1]/nsim
len2=len2[1]/nsim
cp2=cp2/nsim
# OUTPUT
#======================================
bias2
## [1] -4.687967e-05
mse2
## [1] 9.07625
len2
## [1] 10.50814
cp2
## [,1] [,2] [,3]
## [1,] 0.04 0.918 0.042
## [2,] 0.00 0.000 0.000
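Finally, the two estimators can be lined up side by side from the quantities computed above (a sketch; the method labels are informal names for the two procedures):
data.frame(method = c("cluster-mean imputation", "KNN response imputation"),
           rel.bias = c(bias1, bias2),
           mse = c(mse1, mse2),
           avg.CI.len = c(len1, len2))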