LIBRARY DECLARATION

library(factoextra)
## Loading required package: ggplot2
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(cluster)
library(NbClust)
library(class)
library(MASS)
library(corrplot)

PARAMETER INITIALIZATION

nsim=1000 # Number of simulation runs
N=10000   # Population size
n=160     # Sample size
bias=c(0,0)
mse=c(0,0)
cp=matrix(0,2,3)
len=c(0,0)

DATA SIMULATION

set.seed(123)
#-----------------------------
#x0, x3, x4
Sigma034 <- matrix(c(1600,300,40,300,100,10,40,10,4),3,3)
Sigma034
##      [,1] [,2] [,3]
## [1,] 1600  300   40
## [2,]  300  100   10
## [3,]   40   10    4
x034=mvrnorm(N, c(200,50,100), Sigma034)
colnames(x034) <- c("X0","X3","X4")
x0=x034[,1]
x3=x034[,2]
head(x034)
##            X0       X3       X4
## [1,] 180.5458 30.64825 97.14238
## [2,] 190.5958 49.31401 99.45744
## [3,] 263.5142 56.30706 97.69202
## [4,] 202.1538 54.33802 97.47423
## [5,] 205.4728 49.67262 98.19836
## [6,] 270.0093 56.15851 98.51154
#-----------------------------
#x1, x5
Sigma15 <- matrix(c(4,200,200,10000),2,2)
Sigma15
##      [,1]  [,2]
## [1,]    4   200
## [2,]  200 10000
x15=mvrnorm(N, c(20,500), Sigma15)
colnames(x15) <- c("X1","X5")
head(x15)
##            X1       X5
## [1,] 19.61279 480.6394
## [2,] 20.51629 525.8147
## [3,] 18.92337 446.1687
## [4,] 17.64187 382.0937
## [5,] 21.80129 590.0647
## [6,] 19.96750 498.3750
x1=x15[,1]
#-----------------------------
#x2
x2=rchisq(N, 3, ncp = 20)
#-----------------------------
#noise (note: e is generated but not added to y below; y uses a constant offset of 2 instead)
e=rnorm(N, mean=40, sd=4)
y=50+log(3)*x0+log(4)*x1+log(0.5)*x2+log(0.25)*x3+2
#-----------------------------
#Final data: one response and four covariates
data=cbind(y,x0,x1,x2,x3)
summary(data)
##        y                x0               x1              x2        
##  Min.   : 85.39   Min.   : 46.55   Min.   :11.74   Min.   : 1.018  
##  1st Qu.:190.19   1st Qu.:173.13   1st Qu.:18.64   1st Qu.:16.514  
##  Median :213.92   Median :199.60   Median :20.01   Median :22.239  
##  Mean   :213.91   Mean   :199.89   Mean   :19.99   Mean   :23.142  
##  3rd Qu.:238.10   3rd Qu.:227.00   3rd Qu.:21.33   3rd Qu.:28.679  
##  Max.   :354.85   Max.   :355.40   Max.   :27.97   Max.   :74.239  
##        x3       
##  Min.   :13.63  
##  1st Qu.:43.33  
##  Median :50.14  
##  Mean   :50.04  
##  3rd Qu.:56.81  
##  Max.   :85.42

POPULATION PARAMETERS

#-----------------------------
# Population Parameters
#-----------------------------
muy=mean(data[,1]) # population mean
muy
## [1] 213.9058
vary=var(data[,1]) # population variance
vary
## [1] 1260.328
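
As a quick check, the theoretical mean implied by the model (using E[x2] = df + ncp = 23 for the noncentral chi-square) agrees closely with the simulated value:

50+200*log(3)+20*log(4)+23*log(0.5)+50*log(0.25)+2 # theoretical E(y), about 214.19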

PART 1: DATA EXPLORATION

Correlation between variables.

M <- cor(data)
corrplot(M, method="circle")
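
Beyond the plot, printing the correlations with the response is useful later, when an auxiliary variable must be chosen for the nearest-neighbor imputation in Part 3 (an added inspection step; output omitted):

round(M[,"y"],2) # correlation of each variable with the response y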

Cluster Examination

set.seed(123)
# Compute and plot WSS for k = 1 to k = 15
k.max <- 15 # maximal number of clusters

#Determine the number of clusters via the elbow method

wss <- (nrow(data)-1)*sum(apply(data,2,var)) # WSS for k = 1 (a single cluster)
for (i in 2:k.max) wss[i] <- sum(kmeans(data, 
    centers=i)$withinss)
plot(1:k.max, wss, type="b", xlab="Number of Clusters",
  ylab="Within groups sum of squares")

# Ward Hierarchical Clustering
d <- dist(data, method = "euclidean") # distance matrix
fit <- hclust(d, method="ward.D") # "ward" was renamed "ward.D"; "ward.D2" is the squared-distance variant
plot(fit) # display dendrogram
groups <- cutree(fit, k=4) # cut tree into 4 clusters
# draw dendrogram with red borders around the 4 clusters 
rect.hclust(fit, k=4, border="red")
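
NbClust is loaded above but not otherwise used; as an optional sketch (not part of the original run), it can cross-check the choice of four clusters. Some of its indices are slow at N=10000, so a subsample is assumed to suffice here:

sub <- data[sample(nrow(data), 500), ] # subsample for speed (an assumption)
nb <- NbClust(sub, distance = "euclidean", min.nc = 2, max.nc = 8, method = "kmeans")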

Cluster plots

data=as.data.frame(data)
results<-kmeans(data,4) # run k-means with 4 centers on the data
plot(data[c("y","x1")],col=results$cluster) # cluster plot of y against x1

#Create data with cluster labels
data.c=data
data.c$cluster=results$cluster

PART 2: GENERATING THE MAR MISSINGNESS MECHANISM

Two additional variables, used to generate the MAR mechanism, were then added to the population: X6 is drawn from a Bernoulli distribution with p=0.5, and X7 is drawn from a uniform distribution on [30, 60].

set.seed(123)
x6=rbinom(N, 1, 0.5)
x6=as.matrix(x6)
x7=runif(N,30,60)
x7=as.matrix(x7)
data=cbind(data,x6,x7)
head(data,5)
##          y       x0       x1        x2       x3 x6       x7
## 1 217.1038 180.5458 19.61279 25.893064 30.64825  0 39.31775
## 2 206.5513 190.5958 20.51629 21.521414 49.31401  1 39.73560
## 3 282.7941 263.5142 18.92337  9.927225 56.30706  0 56.10763
## 4 201.4465 202.1538 17.64187 31.408245 54.33802  1 39.86021
## 5 229.9131 205.4728 21.80129 13.249704 49.67262  1 33.77104

Missingness in Y was introduced according to a MAR scheme, so that it depends on X6 (Bernoulli with p=0.5) and X7 (uniform on [30, 60]). Four dummy categories were created: A: X6=0 and X7 < the 60th percentile of [30, 60]; B: X6=1 and X7 < the 60th percentile; C: X6=0 and X7 ≥ the 60th percentile; D: X6=1 and X7 ≥ the 60th percentile. (The 60th percentile of U[30, 60] is 48.)

cut60=qunif(0.6, min = 30, max = 60) # 60th percentile of U[30,60], i.e. 48
risk=ifelse(data$x6==0 & data$x7< cut60, "A",
     ifelse(data$x6==1 & data$x7< cut60, "B",
     ifelse(data$x6==0 & data$x7>=cut60, "C", "D")))
risk=as.data.frame(risk)
data=cbind(data,risk)
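
Since P(X6=0)=0.5 and P(X7 < 48)=0.6, the expected category sizes are about 3000 for A and B and 2000 for C and D; a quick tabulation verifies this:

table(data$risk) # expected roughly A=3000, B=3000, C=2000, D=2000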

Each category was then given a different risk of having missing cases: Risk(A)=0.02, Risk(B)=1.5 times Risk(A)=0.03, Risk(C)=1.5 times Risk(B)=0.045, and Risk(D)=1.5 times Risk(C)=0.0675, so that around 16% of the data end up missing.

set.seed(123)
nA=sum(data$risk=="A")
nB=sum(data$risk=="B")
nC=sum(data$risk=="C")
nD=sum(data$risk=="D")
pA=rbinom(nA,1,round(0.02*N/nA,2))
pB=rbinom(nB,1,round(0.03*N/nB,2))
pC=rbinom(nC,1,round(0.045*N/nC,2))
pD=rbinom(nD,1,round(0.0675*N/nD,2))
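
Because each category's probability is scaled as the target count over the category size, the expected number of missing responses in A is nA*(0.02*N/nA) = 0.02*N, and likewise for the others, so overall:

(0.02+0.03+0.045+0.0675)*N # expected number of missing responses: 1625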

#Missingness indicator: i = 1 marks a response to be deleted
y.mar=data
y.mar$i=rep(0,N)
# assign each category's Bernoulli draws to its rows, in row order
y.mar$i[y.mar$risk=="A"]=pA
y.mar$i[y.mar$risk=="B"]=pB
y.mar$i[y.mar$risk=="C"]=pC
y.mar$i[y.mar$risk=="D"]=pD


sum(y.mar$i) # number of missing responses
## [1] 1576
y.mar$y=ifelse(y.mar$i==1,NA,y.mar$y) # where the indicator i=1, replace the response with NA
head(y.mar)
##          y       x0       x1        x2       x3 x6       x7 risk i
## 1 217.1038 180.5458 19.61279 25.893064 30.64825  0 39.31775    A 0
## 2 206.5513 190.5958 20.51629 21.521414 49.31401  1 39.73560    B 0
## 3 282.7941 263.5142 18.92337  9.927225 56.30706  0 56.10763    C 0
## 4 201.4465 202.1538 17.64187 31.408245 54.33802  1 39.86021    B 0
## 5 229.9131 205.4728 21.80129 13.249704 49.67262  1 33.77104    B 0
## 6 280.9229 270.0093 19.96750 25.306554 56.15851  0 40.68664    A 0
y.mar=y.mar[,c(-6,-7,-8,-9)] # drop x6, x7, risk, and the indicator i

The result is y.mar, a data set whose responses contain values missing at random.
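
A quick check of the realized missing-data rate:

mean(is.na(y.mar$y)) # 1576/10000 = 0.1576, close to the 16% target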

y.mar=cbind(y.mar,data.c$cluster) #Attaching cluster to y.mar
head(y.mar)
##          y       x0       x1        x2       x3 data.c$cluster
## 1 217.1038 180.5458 19.61279 25.893064 30.64825              4
## 2 206.5513 190.5958 20.51629 21.521414 49.31401              4
## 3 282.7941 263.5142 18.92337  9.927225 56.30706              2
## 4 201.4465 202.1538 17.64187 31.408245 54.33802              4
## 5 229.9131 205.4728 21.80129 13.249704 49.67262              3
## 6 280.9229 270.0093 19.96750 25.306554 56.15851              2

PART 3: SIMULATION

#Use SRSWOR to draw a sample of size n=160
set.seed(123)
#-------------------------STEP 1------------------------------------------
s=y.mar[sample(nrow(y.mar),160,replace=FALSE),c(-6)] # the sampled data set of size n=160 (may contain missing responses)
sfull=na.omit(s) # sfull: the subset of s with complete (non-missing) responses
r.sfull=kmeans(sfull[,-6],4) # cluster the complete cases (sfull has only 5 columns, so [,-6] keeps them all)
smiss=subset(s,is.na(s$y)) # smiss: the subset of s with missing responses
#-------------------------END OF STEP 1-----------------------------------
plot(sfull[c("y","x1")],col=r.sfull$cluster)

sfull$clusters=r.sfull$cluster #Attaching cluster results of the complete set sfull

1. IMPUTING MISSING VALUES WITH CLUSTER MEAN.

Now we need to assign the missing cases to appropriate clusters. We use the KNN algorithm (supervised learning) to label them.

Using KNN to classify the missing cases into the “appropriate” clusters:

s1=s
r.sfull1=r.sfull
sfull1=sfull
smiss1=smiss
#-------------------------STEP 2-------------------------------------------------
cl <- factor(sfull1[,6]) # class labels: cluster membership of the complete cases
smiss.cluster1=knn(sfull1[,c(-1,-6)], smiss1[,c(-1)], cl, k=10, l=0, prob=TRUE, use.all=TRUE)
#-------------------------END OF STEP 2------------------------------------------
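
To see how the missing cases are spread across the predicted clusters (an added check; counts depend on the sample):

table(smiss.cluster1) # missing cases per predicted cluster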

REPEATING FOR 1000 RUNS

#======================================
mu1=NULL
sigmahat1=NULL
varhat1=NULL
nsim=1000 # Number of simulation runs
N=10000   # Population size
n=160     # Sample size
bias1=c(0,0)
mse1=c(0,0)
cp1=matrix(0,2,3)
len1=c(0,0)
#======================================
s1=s
r.sfull1=r.sfull
sfull1=sfull
smiss1=smiss
# Repeated simulation starts from here:
#======================================
set.seed(123)
nsim=1000
for(m in 1:nsim)
  {
  #STEP 1
  s1=y.mar[sample(nrow(y.mar),160,replace=FALSE),c(-6)] # the sampled data set (may contain missing responses)
  sfull1=na.omit(s1) # complete cases of s1
  r.sfull1=kmeans(sfull1[,-6],4) # cluster the complete cases
  smiss1=subset(s1,is.na(s1$y)) # cases with missing responses
  #STEP 2
  sfull1$clusters=r.sfull1$cluster #Attaching cluster results of the complete set sfull
  cl <- factor(sfull1[,6]) # class labels: cluster membership
  smiss.cluster1=knn(sfull1[,c(-1,-6)], smiss1[,c(-1)], cl, k=10, l=0, prob=TRUE, use.all=TRUE)
  smiss.c1=data.frame(smiss1,as.data.frame(smiss.cluster1))
  #STEP 3
  smiss.row1=as.vector(rownames(smiss1))
  y1=data.c[smiss.row1,] # true values of the removed responses, for comparison
  smiss.compare1=cbind(smiss.c1,y1$y)
  mean1=r.sfull1$centers[,1] # mean of y within each cluster
  smiss.compare1$yhat=ifelse(smiss.compare1[,6]==1,mean1[1],
                          ifelse(smiss.compare1[,6]==2,mean1[2],
                                 ifelse(smiss.compare1[,6]==3,mean1[3],mean1[4]))) # impute with the cluster mean of y
  imputed.y1=rbind(as.matrix(smiss.compare1$yhat,ncol=1),as.matrix(sfull1$y,ncol=1))
  
  #STEP 4: point estimates from the imputed sample
  muhat1=mean(imputed.y1)
  varhat1=var(imputed.y1)
  
  #STEP 5
  
  # Calculate 95% CI with finite-population correction (1 - n/N)
  #-----------------------
  mu1=muhat1-1.96*sqrt((1-n/N)*varhat1/n)
  mu2=muhat1+1.96*sqrt((1-n/N)*varhat1/n)
  #-------------------------------------------------------------------
  # Accumulate bias, MSE, average CI length, and coverage counts
  #-------------------------------------------------------------------
  bias1[1]=bias1[1]+muhat1-muy
  mse1[1]=mse1[1]+(muhat1-muy)^2
  len1[1]=len1[1]+mu2-mu1
  cp1[1,1]=cp1[1,1]+(muy<=mu1)
  cp1[1,2]=cp1[1,2]+(muy>mu1)*(muy<mu2)
  cp1[1,3]=cp1[1,3]+(muy>=mu2)

  }

bias1=bias1[1]/(muy*nsim) # average relative bias
mse1=mse1[1]/nsim # mean squared error
len1=len1[1]/nsim # average CI length
#OUTPUT 
#======================================
bias1
## [1] 0.0007764621
mse1
## [1] 8.062797
len1
## [1] 10.71104

2. IMPUTING MISSING VALUES WITH NEAREST-NEIGHBOR VALUES BASED ON CLUSTERS & VARIABLES.

This KNN step imputes each missing response with a nearest neighbor's observed value, using the cluster label together with the covariate most strongly correlated with the response. As the exploration in Part 1 shows, x0 has the strongest correlation with y among the covariates. Therefore, we use the cluster label and x0 to find the nearest neighbors of the missing responses.

#======================================
mu2=NULL
sigmahat2=NULL
varhat2=NULL
nsim=1000 # Number of simulation runs
N=10000   # Population size
n=160     # Sample size
bias2=c(0,0)
mse2=c(0,0)
cp2=matrix(0,2,3)
len2=c(0,0)
#======================================
s2=s
r.sfull2=r.sfull
sfull2=sfull
smiss2=smiss
# Repeated simulation starts from here:
#======================================
for(m in 1:nsim)
  {
  #STEP 1
  s2=y.mar[sample(nrow(y.mar),160,replace=FALSE),c(-6)] # the sampled data set (may contain missing responses)
  sfull2=na.omit(s2) # complete cases of s2
  r.sfull2=kmeans(sfull2[,-6],4) # cluster the complete cases
  smiss2=subset(s2,is.na(s2$y)) # cases with missing responses
  #STEP 2
  sfull2$clusters=r.sfull2$cluster #Attaching cluster results of the complete set sfull
  cl <- factor(sfull2[,6]) # class labels: cluster membership of the complete cases
  smiss.cluster2=knn(sfull2[,c(-1,-6)], smiss2[,c(-1)], cl, k=100, l=0, prob=TRUE, use.all=TRUE)
  smiss.c2=data.frame(smiss2,as.data.frame(smiss.cluster2))
  
  cl <- factor(sfull2[,1]) # class labels are the observed responses, so knn returns a neighbor's y
  # Apply KNN to impute the missing responses: replace each NA with the nearest
  # neighbor's y, using x0 and the cluster label to measure closeness.
  smiss.yhat2=knn(as.matrix(sfull2[,c(2,6)]), as.matrix(smiss.c2[,c(2,6)]), cl, k=100, l=0, prob=TRUE, use.all=TRUE)
  smiss.c2=data.frame(smiss2,as.data.frame(smiss.cluster2),as.data.frame(smiss.yhat2))
  
  
  #STEP 3
  smiss.row2=as.vector(rownames(smiss2))
  y2=data.c[smiss.row2,] # true values of the removed responses, for comparison
  smiss.compare2=cbind(smiss.c2,y2$y)
  imputed.y2=rbind(as.matrix(smiss.compare2$smiss.yhat2,ncol=1),as.matrix(sfull2$y,ncol=1))
  imputed.y2=as.numeric(imputed.y2)
  
  #STEP 4: point estimates from the imputed sample
  
  muhat2=mean(imputed.y2)
  varhat2=var(imputed.y2)
  
  #STEP 5
  
  # Calculate 95% CI with finite-population correction (1 - n/N)
  #-----------------------
  mu1=muhat2-1.96*sqrt((1-n/N)*varhat2/n)
  mu2=muhat2+1.96*sqrt((1-n/N)*varhat2/n)
  #-------------------------------------------------------------------
  # Accumulate bias, MSE, average CI length, and coverage counts
  #-------------------------------------------------------------------
  bias2[1]=bias2[1]+muhat2-muy
  mse2[1]=mse2[1]+(muhat2-muy)^2
  len2[1]=len2[1]+mu2-mu1
  cp2[1,1]=cp2[1,1]+(muy<=mu1)
  cp2[1,2]=cp2[1,2]+(muy>mu1)*(muy<mu2)
  cp2[1,3]=cp2[1,3]+(muy>=mu2)

}
bias2=bias2[1]/(muy*nsim) # average relative bias
mse2=mse2[1]/nsim # mean squared error
len2=len2[1]/nsim # average CI length
cp2=cp2/nsim # proportions falling (below, inside, above) the CI

# OUTPUT
#======================================
bias2
## [1] -4.687967e-05
mse2
## [1] 9.07625
len2
## [1] 10.50814
cp2
##      [,1]  [,2]  [,3]
## [1,] 0.04 0.918 0.042
## [2,] 0.00 0.000 0.000
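
Comparing the two methods: nearest-neighbor imputation (method 2) achieves a smaller relative bias and a slightly shorter average interval than cluster-mean imputation (method 1), at the cost of a somewhat larger MSE, and its empirical coverage of 0.918 falls slightly below the nominal 95% level.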