Objectives: EM algorithm to distinguish non-normal distributions

  1. Identify students with missing gender
  2. Estimate female and male students’ underlying height distributions

PART A: Histogram

Read Data: (‘data.txt’)

2000 height data: 500 female (denote as 1), 500 male (denote as 2), 1000 unknown (denote as 0)..

Observe the two histograms: two modes are observed for both female and male group

file = 'data.txt'
df <- read.table(file, header = TRUE, sep = " ", dec = ".")
x <- df$height
y <- df$gender
y_female <- df[df$gender==1,]
y_male   <- df[df$gender==2,]

par(mfrow=c(1,2))
hist(y_female$height,breaks=30, xlab = "labeled female height",main = NULL)
hist(y_male$height,  breaks=30, xlab = "labeled male height",  main = NULL)

PART B:

(a) - (d) See the scanned notes in the appendix

(e) EM Algorithm

Parameter setting:

mu = seq(140,200,0.5)
K  = length(mu)
sig= 0.7
p1 = 0.5
pi_1 = rep( 1/K,K)
pi_2 = rep( 1/K,K)

Calculate \(f_k(x_i|\mu_k,\sigma)\), where \(k=1,\cdots,K\) \(i=1,\cdots, 2000\)

# input xi of R1, output fx_i of R^K
fx_i <- function(xi,mu,sig){
  result = rep(K,0)
  for (i in 1:K){
    result[i] = dnorm(xi,mu[i], sig)
  }
  result
}

E-Step:

# input pi_1 of R^K, xi of R^n, output W_ni1/2 of R^K
W_ni1 <- function(pi_1,x){
  result = 0
  for (xi in x){
    result = result + pi_1*fx_i(xi,mu,sig)/ (t(pi_1)%*% fx_i(xi,mu,sig))
  }
  return(result)
}

W_ni2 <- function(pi_2,x){
  result = 0
  for (xi in x){
    result = result + pi_2*fx_i(xi,mu,sig)/ (t(pi_2)%*% fx_i(xi,mu,sig))
  }
  return(result)
}

# input pi_1 of R^K, p1 & xi of R^n, output W_Gi of R1
W_Gi <- function (p1, pi_1, pi_2, x){
  result = 0
  for (xi in x){
    t1 = p1 * (t(pi_1)%*% fx_i(xi,mu,sig))
    t2 = (1-p1) * (t(pi_2)%*% fx_i(xi,mu,sig))
    result = result +  2 - t1/(t1+t2)
  }
  return(result)
}


# input pi_1 of R^K, p1 & xi of R^n, output W_Gi1/2 of R^K
W_Gi1 <- function(p1,pi_1, pi_2,x){
  result = 0
  for (xi in x){
    t0 = p1 * pi_1 * fx_i(xi,mu,sig)
    t1 = p1 * (t(pi_1)%*% fx_i(xi,mu,sig))
    t2 = (1-p1) * (t(pi_2)%*% fx_i(xi,mu,sig))
    result = result +   t0/(t1+t2)
  }
  return(result)
}

W_Gi2 <- function(p1,pi_1, pi_2,x){
  result = 0
  for (xi in x){
  t0 = (1-p1) * pi_2 * fx_i(xi,mu,sig)
  t1 = p1 * (t(pi_1)%*% fx_i(xi,mu,sig))
  t2 = (1-p1) * (t(pi_2)%*% fx_i(xi,mu,sig))
  result =result + t0/(t1+t2)
  }
  return(result)
}

M-Step:

EM <- function (pi_1, pi_2, p1){
  p1a = (2500-W_Gi(p1, pi_1, pi_2, x[1001:2000]))/2000
  pi_1a = (W_ni1(pi_1,x[1:500])   +W_Gi1(p1,pi_1,pi_2,x[1001:2000]))/(500+sum(W_Gi1(p1,pi_1,pi_2,x[1001:2000])))
  pi_2a = (W_ni2(pi_2,x[501:1000])+W_Gi2(p1,pi_1,pi_2,x[1001:2000]))/(500+sum(W_Gi2(p1,pi_1,pi_2,x[1001:2000])))
  return(list(pi_1=pi_1a, pi_2=pi_2a, p1 =p1a))
}

Main Program:

Main <- function(pi_1,pi_2,p1, tol, Tmax){
  err=10^3; Time <- 1;
  
  while(err>tol || Time == Tmax){
    iterate <- EM(pi_1,pi_2,p1)
    p1a <- iterate$p1
    pi_1a <- iterate$pi_1
    pi_2a <- iterate$pi_2
    Time <- Time+1
    err = max(max(abs(p1a-p1)), max(abs(pi_1a-pi_1)), max(abs(pi_2a-pi_2)))
    cat("Error is ", err, ", p1 estimate is ", p1a,"\n")
    p1 <- p1a
    pi_1 <- pi_1a
    pi_2 <- pi_2a
  }
  return(list(pi_1=pi_1, pi_2=pi_2, p1 =p1))
}

main <- Main(pi_1,pi_2,p1, tol=10^-3, Tmax = 10^3)
Error is  0.08837692 , p1 estimate is  0.5 
Error is  0.03559439 , p1 estimate is  0.471716 
Error is  0.02735029 , p1 estimate is  0.4443657 
Error is  0.02005084 , p1 estimate is  0.4243148 
Error is  0.01336047 , p1 estimate is  0.4109544 
Error is  0.008535887 , p1 estimate is  0.4024185 
Error is  0.005342398 , p1 estimate is  0.3970761 
Error is  0.003640151 , p1 estimate is  0.3937686 
Error is  0.002879467 , p1 estimate is  0.3917339 
Error is  0.002474126 , p1 estimate is  0.3904878 
Error is  0.002176012 , p1 estimate is  0.389728 
Error is  0.001951005 , p1 estimate is  0.3892672 
Error is  0.001777061 , p1 estimate is  0.3889901 
Error is  0.001639553 , p1 estimate is  0.3888257 
Error is  0.001528566 , p1 estimate is  0.3887301 
Error is  0.001437257 , p1 estimate is  0.3886765 
Error is  0.001360825 , p1 estimate is  0.3886484 
Error is  0.00129584 , p1 estimate is  0.3886356 
Error is  0.001239817 , p1 estimate is  0.3886319 
Error is  0.001190922 , p1 estimate is  0.3886333 
Error is  0.001147784 , p1 estimate is  0.3886376 
Error is  0.001109357 , p1 estimate is  0.3886434 
Error is  0.001074837 , p1 estimate is  0.3886498 
Error is  0.001043593 , p1 estimate is  0.3886562 
Error is  0.001015124 , p1 estimate is  0.3886625 
Error is  0.0009890277 , p1 estimate is  0.3886685 
p1 <- main$p1
pi_1 <- main$pi_1
pi_2 <- main$pi_2

(f) Estimated \(p_1\) is around 0.389 \(\pm 0.001\)

(g) Draw the Estimated pdf curves for \(g1\) and \(g2\).

x <-seq(140,200,1)

g1<-rep(0,length(x))

for (i in 1:length(x)){
        g1[i]=fx_i(x[i],mu,sig)%*% pi_1
}


g2<-rep(0,length(x))
for (i in 1:length(x)){
        g2[i]=fx_i(x[i],mu,sig)%*% pi_2
}

par(mfrow=c(1,1))
plot(x,g1,type="l",lwd=2,xlim=c(140,200),ylim=c(0,0.28),col="pink",xlab="height",ylab="p.d.f.",main="Approximated p.d.f. curves for g1 and g2")
lines(x,g2,type="l",lwd=2,col="lightblue")
legend(190,0.25,c("female ","male "),col=c("pink","lightblue"),pch=20)

(h) Estimate the missing gender data and output csv.

y<-df[1001:2000,]

# Probability of gender to be 1
P_Gi1 <- function (p1, pi_1, pi_2, xi){
    t1 = p1 * (t(pi_1)%*% fx_i(xi,mu,sig))
    t2 = (1-p1) * (t(pi_2)%*% fx_i(xi,mu,sig))
    result = t1/(t1+t2)
    return(result)
}
for (i in 1:1000){
    PrG1=P_Gi1(p1,pi_1,pi_2,y[i,1])
    if (PrG1>0.5) {
        y[i,2]=1
    }else{
        y[i,2]=2
    }
}
write.table(y,file="Gender_Indicator_Table_Zhiling GU.csv",row.names=F)

p1_approx = dim(y[y$gender==1,])[1]/1000
cat("estimated gender: female proportion is", p1_approx)
estimated gender: female proportion is 0.278