Read Data: (‘data.txt’)
2000 height data: 500 female (denote as 1), 500 male (denote as 2), 1000 unknown (denote as 0)..
Observe the two histograms: two modes are observed for both female and male group
file = 'data.txt'
df <- read.table(file, header = TRUE, sep = " ", dec = ".")
x <- df$height
y <- df$gender
y_female <- df[df$gender==1,]
y_male <- df[df$gender==2,]
par(mfrow=c(1,2))
hist(y_female$height,breaks=30, xlab = "labeled female height",main = NULL)
hist(y_male$height, breaks=30, xlab = "labeled male height", main = NULL)
Parameter setting:
mu = seq(140,200,0.5)
K = length(mu)
sig= 0.7
p1 = 0.5
pi_1 = rep( 1/K,K)
pi_2 = rep( 1/K,K)
Calculate \(f_k(x_i|\mu_k,\sigma)\), where \(k=1,\cdots,K\) \(i=1,\cdots, 2000\)
# input xi of R1, output fx_i of R^K
fx_i <- function(xi,mu,sig){
result = rep(K,0)
for (i in 1:K){
result[i] = dnorm(xi,mu[i], sig)
}
result
}
E-Step:
# input pi_1 of R^K, xi of R^n, output W_ni1/2 of R^K
W_ni1 <- function(pi_1,x){
result = 0
for (xi in x){
result = result + pi_1*fx_i(xi,mu,sig)/ (t(pi_1)%*% fx_i(xi,mu,sig))
}
return(result)
}
W_ni2 <- function(pi_2,x){
result = 0
for (xi in x){
result = result + pi_2*fx_i(xi,mu,sig)/ (t(pi_2)%*% fx_i(xi,mu,sig))
}
return(result)
}
# input pi_1 of R^K, p1 & xi of R^n, output W_Gi of R1
W_Gi <- function (p1, pi_1, pi_2, x){
result = 0
for (xi in x){
t1 = p1 * (t(pi_1)%*% fx_i(xi,mu,sig))
t2 = (1-p1) * (t(pi_2)%*% fx_i(xi,mu,sig))
result = result + 2 - t1/(t1+t2)
}
return(result)
}
# input pi_1 of R^K, p1 & xi of R^n, output W_Gi1/2 of R^K
W_Gi1 <- function(p1,pi_1, pi_2,x){
result = 0
for (xi in x){
t0 = p1 * pi_1 * fx_i(xi,mu,sig)
t1 = p1 * (t(pi_1)%*% fx_i(xi,mu,sig))
t2 = (1-p1) * (t(pi_2)%*% fx_i(xi,mu,sig))
result = result + t0/(t1+t2)
}
return(result)
}
W_Gi2 <- function(p1,pi_1, pi_2,x){
result = 0
for (xi in x){
t0 = (1-p1) * pi_2 * fx_i(xi,mu,sig)
t1 = p1 * (t(pi_1)%*% fx_i(xi,mu,sig))
t2 = (1-p1) * (t(pi_2)%*% fx_i(xi,mu,sig))
result =result + t0/(t1+t2)
}
return(result)
}
M-Step:
EM <- function (pi_1, pi_2, p1){
p1a = (2500-W_Gi(p1, pi_1, pi_2, x[1001:2000]))/2000
pi_1a = (W_ni1(pi_1,x[1:500]) +W_Gi1(p1,pi_1,pi_2,x[1001:2000]))/(500+sum(W_Gi1(p1,pi_1,pi_2,x[1001:2000])))
pi_2a = (W_ni2(pi_2,x[501:1000])+W_Gi2(p1,pi_1,pi_2,x[1001:2000]))/(500+sum(W_Gi2(p1,pi_1,pi_2,x[1001:2000])))
return(list(pi_1=pi_1a, pi_2=pi_2a, p1 =p1a))
}
Main Program:
Main <- function(pi_1,pi_2,p1, tol, Tmax){
err=10^3; Time <- 1;
while(err>tol || Time == Tmax){
iterate <- EM(pi_1,pi_2,p1)
p1a <- iterate$p1
pi_1a <- iterate$pi_1
pi_2a <- iterate$pi_2
Time <- Time+1
err = max(max(abs(p1a-p1)), max(abs(pi_1a-pi_1)), max(abs(pi_2a-pi_2)))
cat("Error is ", err, ", p1 estimate is ", p1a,"\n")
p1 <- p1a
pi_1 <- pi_1a
pi_2 <- pi_2a
}
return(list(pi_1=pi_1, pi_2=pi_2, p1 =p1))
}
main <- Main(pi_1,pi_2,p1, tol=10^-3, Tmax = 10^3)
Error is 0.08837692 , p1 estimate is 0.5
Error is 0.03559439 , p1 estimate is 0.471716
Error is 0.02735029 , p1 estimate is 0.4443657
Error is 0.02005084 , p1 estimate is 0.4243148
Error is 0.01336047 , p1 estimate is 0.4109544
Error is 0.008535887 , p1 estimate is 0.4024185
Error is 0.005342398 , p1 estimate is 0.3970761
Error is 0.003640151 , p1 estimate is 0.3937686
Error is 0.002879467 , p1 estimate is 0.3917339
Error is 0.002474126 , p1 estimate is 0.3904878
Error is 0.002176012 , p1 estimate is 0.389728
Error is 0.001951005 , p1 estimate is 0.3892672
Error is 0.001777061 , p1 estimate is 0.3889901
Error is 0.001639553 , p1 estimate is 0.3888257
Error is 0.001528566 , p1 estimate is 0.3887301
Error is 0.001437257 , p1 estimate is 0.3886765
Error is 0.001360825 , p1 estimate is 0.3886484
Error is 0.00129584 , p1 estimate is 0.3886356
Error is 0.001239817 , p1 estimate is 0.3886319
Error is 0.001190922 , p1 estimate is 0.3886333
Error is 0.001147784 , p1 estimate is 0.3886376
Error is 0.001109357 , p1 estimate is 0.3886434
Error is 0.001074837 , p1 estimate is 0.3886498
Error is 0.001043593 , p1 estimate is 0.3886562
Error is 0.001015124 , p1 estimate is 0.3886625
Error is 0.0009890277 , p1 estimate is 0.3886685
p1 <- main$p1
pi_1 <- main$pi_1
pi_2 <- main$pi_2
x <-seq(140,200,1)
g1<-rep(0,length(x))
for (i in 1:length(x)){
g1[i]=fx_i(x[i],mu,sig)%*% pi_1
}
g2<-rep(0,length(x))
for (i in 1:length(x)){
g2[i]=fx_i(x[i],mu,sig)%*% pi_2
}
par(mfrow=c(1,1))
plot(x,g1,type="l",lwd=2,xlim=c(140,200),ylim=c(0,0.28),col="pink",xlab="height",ylab="p.d.f.",main="Approximated p.d.f. curves for g1 and g2")
lines(x,g2,type="l",lwd=2,col="lightblue")
legend(190,0.25,c("female ","male "),col=c("pink","lightblue"),pch=20)
y<-df[1001:2000,]
# Probability of gender to be 1
P_Gi1 <- function (p1, pi_1, pi_2, xi){
t1 = p1 * (t(pi_1)%*% fx_i(xi,mu,sig))
t2 = (1-p1) * (t(pi_2)%*% fx_i(xi,mu,sig))
result = t1/(t1+t2)
return(result)
}
for (i in 1:1000){
PrG1=P_Gi1(p1,pi_1,pi_2,y[i,1])
if (PrG1>0.5) {
y[i,2]=1
}else{
y[i,2]=2
}
}
write.table(y,file="Gender_Indicator_Table_Zhiling GU.csv",row.names=F)
p1_approx = dim(y[y$gender==1,])[1]/1000
cat("estimated gender: female proportion is", p1_approx)
estimated gender: female proportion is 0.278