Algorithm To Detect The Normal Distributions in the Sample

These are the packages that we are going to use for the analysis

library(ggplot2)
library(reshape2)
library(mixtools)
library(lattice)

Description

In order to test the Algorith we will use the following three actual datasets as well as two Normal Distributions with SD equal to 1 and Mean equal to 10 and 5 respectively.
We will apply the EM Algorithm by using the mixtools library. When the Algorith returns and Warning that did not converge it means that there is only only distribution but although it does not converge it tries to estimate two Normal Distributions. In this case we will ignore the estimated distributions and we will assume that there is only one Normal Distribution in the sample.
A problem which occur with the actual results is that strictly speaking they do not follow the Normal Distribution and this cause some problems to the EM Algorithm since it tries to fit different distributions based on the Mean and Standard Deviation, but in our case we should treat those results that are coming from the same distribution. For that reason we will apply a threshold in the differene of the mean where the user can define what difference is acceptable within the same distribution. In our case the default value is 15%

##Set a seed in order to be able to reproduce the data
set.seed(5)
#Generate 1000 values of the Normal of mean 10 and sd 1
x1<-rnorm(1000,10,1)
#Generate 1000 values of the Normal of mean 5 and sd 1
x2<-rnorm(1000,5,1)
#Create one sample by merging the two above
x<-c(x1,x2)



##Load the datasets

dataset_1112<-read.csv("data_1112.csv", col.names = FALSE)
dataset_1114<-read.csv("data_1114.csv", col.names = FALSE)
dataset_1115<-read.csv("data_1115.csv", col.names = FALSE)

Looking at the Distribution of the received Dataset

As we can see from the Density Plots and the Histograms the dataset does not seem to follow the Normal Distribution. However we want to be able to detect in case we receive two different datasets that there are two distinct distributions.

densityplot(as.matrix(dataset_1112), main="Dataset 1112", xlab="Values")

densityplot(as.matrix(dataset_1114), main="Dataset 1114", xlab="Values")

densityplot(as.matrix(dataset_1115), main="Dataset 1115", xlab="Values")

hist(as.matrix(dataset_1112), col="green", xlab = "Values", main="Dataset 1112")

hist(as.matrix(dataset_1114), col="green", xlab = "Values", main="Dataset 1114")

hist(as.matrix(dataset_1115), col="green", xlab = "Values", main="Dataset 1115")

Write a function which estimates the Normal Distributions in the Sample

The function below estimated the mean and the standard deviation of the estimated distributions within the sample

Check_Distributions<-function(x=x, threshold=0.15) {
  
  ### Make sure that the dataset are class "matrix"
  x<-as.matrix(x)
  
  
  ### Apply the EM Algorithm
  
  ###Mixture Model
  

  
  EM_model<-normalmixEM( x[,1],  maxit = 300, epsilon = 1e-18 )
  ##Mean of the Distributions
  means<-EM_model$mu
  ##Proportion of the Distributions
  proportions<-EM_model$lambda
  ##Standard deviation of the Distributions
  sds<-EM_model$sigma
  
  Dists<-data.frame(Variable=ifelse(EM_model$posterior[,1]>EM_model$posterior[,2],"Dist1","Dist2"), Values=EM_model$x)
  

 
  
  ### Plot the Simulated Distributions
  r1<-rnorm(1000, EM_model$mu[1], EM_model$sigma[1])
  r2<-rnorm(1000, EM_model$mu[2], EM_model$sigma[2])
  
  
  densities<-data.frame(Dist1=r1, Dist2=r2)
  densities.m <- melt(densities, id.vars=NULL,value.name = "Values")
  
  
  gg1<-ggplot(densities.m, aes(Values)) + geom_density(aes(col=variable))+ggtitle("Estimated Normal Distributions in the Sample")
  
  
  
  ### Second Plot of the Sample Values
  
  ## p2<-plot(Dists$Values, col=Dists$Variable, main="Sample Values", ylab = "Values")
  
  
  
  
  
  ###check if the difference is greater than threshold percent
  
  if(min(means)*(1+threshold)>max(means)) {
    
    msg="We should consider only one distribution in the sample"; print("We should consider only one distribution in the sample")
  } else { msg="We should consider more than one distribution in the sample"; print("We should consider more than one distribution in the sample")}
  
  
  output<-list(means=means, proportions=proportions, sds=sds, gg1=gg1, Dists=Dists, msg=msg)
  
  return(output)
  
}

Apply and Validate the Model

At the beginning we will start with the Random Generated Normal Distributions that we built before. We expect the algorith to suggest as that there is only one distribution when we test each sample and to suggest that there are two distributions when we test the merged sample

We start with the X1 which is the Normal(10,1) and as we can see the Model Suggests that there is only one Distribution in the Sample

fit.model<-Check_Distributions(x1)

## WARNING! NOT CONVERGENT! 
## number of iterations= 300 
## [1] "We should consider only one distribution in the sample"

fit.model$msg

## [1] "We should consider only one distribution in the sample"

fit.model$gg1

plot(fit.model$Dists$Values, col=fit.model$Dists$Variable, main="Sample Values", ylab = "Values")

We continue with the X2 which is the Normal(5,1) and as we can see the Model Suggests that there is only one Distribution in the Sample

fit.model<-Check_Distributions(x2)

## WARNING! NOT CONVERGENT! 
## number of iterations= 300 
## [1] "We should consider only one distribution in the sample"

fit.model$msg

## [1] "We should consider only one distribution in the sample"

fit.model$gg1

plot(fit.model$Dists$Values, col=fit.model$Dists$Variable, main="Sample Values", ylab = "Values")

We continue with the X which is the Mixture Normal and as we can see the Model Suggests that there are two Distributions in the Sample

fit.model<-Check_Distributions(x)

## number of iterations= 33 
## [1] "We should consider more than one distribution in the sample"

fit.model$msg

## [1] "We should consider more than one distribution in the sample"

fit.model$gg1

plot(fit.model$Dists$Values, col=fit.model$Dists$Variable, main="Sample Values", ylab = "Values")

Apply the Algorith to the Dataset

Let’s start the 1112. As we can see the Model suggests that there is only one Distribution however the EM converged by fitting two Distribution which have a great overlap and the difference in the mean values is less than 15% and for that reason we assume that there is only one Distribution in the sample

fit.model<-Check_Distributions(dataset_1112)

## number of iterations= 96 
## [1] "We should consider only one distribution in the sample"

fit.model$msg

## [1] "We should consider only one distribution in the sample"

fit.model$gg1

plot(fit.model$Dists$Values, col=fit.model$Dists$Variable, main="Sample Values", ylab = "Values")

Let’s continue with the 1114. As we can see the Model suggests that there is only one Distribution. Also the EM did not converge which means that there is only one distribution in the sample

fit.model<-Check_Distributions(dataset_1114)

## WARNING! NOT CONVERGENT! 
## number of iterations= 300 
## [1] "We should consider only one distribution in the sample"

fit.model$msg

## [1] "We should consider only one distribution in the sample"

fit.model$gg1

plot(fit.model$Dists$Values, col=fit.model$Dists$Variable, main="Sample Values", ylab = "Values")

Let’s continue with the 1115. As we can see the Model suggests that there is only one Distribution and there is a great overlap between the two estimated distributions which indicated that in our case we should consider one Distribution in the sample

fit.model<-Check_Distributions(dataset_1115)

## number of iterations= 20 
## [1] "We should consider only one distribution in the sample"

fit.model$msg

## [1] "We should consider only one distribution in the sample"

fit.model$gg1

plot(fit.model$Dists$Values, col=fit.model$Dists$Variable, main="Sample Values", ylab = "Values")

Let’s continue with the merged 1114 and 1115. As you can see here the model suggests that there are more than one Distributions and the difference is clear by looking at the plots

fit.model<-Check_Distributions(rbind(dataset_1115, dataset_1114 ))

## number of iterations= 4 
## [1] "We should consider more than one distribution in the sample"

fit.model$msg

## [1] "We should consider more than one distribution in the sample"

fit.model$gg1

plot(fit.model$Dists$Values, col=fit.model$Dists$Variable, main="Sample Values", ylab = "Values")

Let’s continue with the merged 1114 and 1112. As you can see here the model suggests that there are more than one Distributions and the difference is clear by looking at the plots

fit.model<-Check_Distributions(rbind(dataset_1112, dataset_1114 ))

## number of iterations= 2 
## [1] "We should consider more than one distribution in the sample"

fit.model$msg

## [1] "We should consider more than one distribution in the sample"

fit.model$gg1

plot(fit.model$Dists$Values, col=fit.model$Dists$Variable, main="Sample Values", ylab = "Values")

Let’s continue with the merged 1115 and 1112. As you can see here the model suggests that there are more than one Distributions and the difference is clear by looking at the plots

fit.model<-Check_Distributions(rbind(dataset_1112, dataset_1115 ))

## number of iterations= 2 
## [1] "We should consider more than one distribution in the sample"

fit.model$msg

## [1] "We should consider more than one distribution in the sample"

fit.model$gg1

plot(fit.model$Dists$Values, col=fit.model$Dists$Variable, main="Sample Values", ylab = "Values")