Galton Heighet Data Anaylsis

This document present the process of Galton Heighet data analysis.

1.Data Description

library(HistData)
data("GaltonFamilies")
head(GaltonFamilies)

##   family father mother midparentHeight children childNum gender
## 1    001   78.5   67.0           75.43        4        1   male
## 2    001   78.5   67.0           75.43        4        2 female
## 3    001   78.5   67.0           75.43        4        3 female
## 4    001   78.5   67.0           75.43        4        4 female
## 5    002   75.5   66.5           73.66        4        1   male
## 6    002   75.5   66.5           73.66        4        2   male
##   childHeight
## 1        73.2
## 2        69.2
## 3        69.0
## 4        69.0
## 5        73.5
## 6        72.5

summary(GaltonFamilies)

##      family        father         mother      midparentHeight
##  185    : 15   Min.   :62.0   Min.   :58.00   Min.   :64.40  
##  066    : 11   1st Qu.:68.0   1st Qu.:63.00   1st Qu.:68.14  
##  120    : 11   Median :69.0   Median :64.00   Median :69.25  
##  130    : 11   Mean   :69.2   Mean   :64.09   Mean   :69.21  
##  166    : 11   3rd Qu.:71.0   3rd Qu.:65.88   3rd Qu.:70.14  
##  097    : 10   Max.   :78.5   Max.   :70.50   Max.   :75.43  
##  (Other):865                                                 
##     children         childNum         gender     childHeight   
##  Min.   : 1.000   Min.   : 1.000   female:453   Min.   :56.00  
##  1st Qu.: 4.000   1st Qu.: 2.000   male  :481   1st Qu.:64.00  
##  Median : 6.000   Median : 3.000                Median :66.50  
##  Mean   : 6.171   Mean   : 3.586                Mean   :66.75  
##  3rd Qu.: 8.000   3rd Qu.: 5.000                3rd Qu.:69.70  
##  Max.   :15.000   Max.   :15.000                Max.   :79.00  
##

g <- GaltonFamilies
par(mfrow=c(2,3))
hist(g$father, main = "Father", breaks = 30)
abline(v=mean(g$father), col=2, lwd=2)
hist(g$childHeight[g$gender=="male"], main = "boys", breaks = 30)
abline(v=mean(g$childHeight[g$gender=="male"]), col=2, lwd=2)
hist(c(g$childHeight[g$gender=="male"],g$father), main = "boyes and father", breaks = 30)
abline(v=mean(c(g$childHeight[g$gender=="male"],g$father)), col=2, lwd=2)

hist(g$mother, main = "Mother", breaks = 30)
abline(v=mean(g$mother), col=2, lwd=2)
hist(g$childHeight[g$gender=="female"], main = "girls", breaks = 30)
abline(v=mean(g$childHeight[g$gender=="female"]), col=2,lwd=2)
hist(c(g$childHeight[g$gender=="female"],g$mother), main = "girls and mothre", breaks = 30)
abline(v=mean(c(g$childHeight[g$gender=="female"],g$mother)), col=2, lwd=2)

library(ggplot2)
ggplot(data=GaltonFamilies,aes(x=midparentHeight,y=childHeight,color=gender)) +
  geom_point()

As Galton shows, that the gender has the 5.2 inch difference or 1.08 times effect. The number 1.08 can proved use the mean level difference or do regression to take gender as covarite, the second has been done by ZiJian. Different groups has almost the same results as both methods show. So on the next analysis, we can take gender as one covarite or adjust it by times 1.08 for each male or mother.

g <- GaltonFamilies
mean(g$father)/mean(g$mother) # father/mother

## [1] 1.079698

# children male/female
mean(g$childHeight[g$gender == "male"])/mean(g$childHeight[g$gender == "female"])

## [1] 1.080028

# all children.
mean(c(g$father, g$childHeight[g$gender == "male"]))/
  mean(c(g$mother, g$childHeight[g$gender == "female"]))

## [1] 1.079814

g <- GaltonFamilies
g$childHeight <- with(GaltonFamilies,ifelse(gender=="female",1.08,1)*childHeight)
ggplot(data=g,aes(x=midparentHeight,y=childHeight,color=gender)) + geom_point()

After the gender adjustment, the order of each family children seems has no difference.

g <- GaltonFamilies
g$mother <- g$mother*1.08
g$childHeight <- with(g, ifelse(gender=="female", 1.08, 1)*childHeight)

g.child <- aggregate(g$childHeight, list(g$family), cbind)$x
gpa <- g[which(g$childNum==1),2:4]
#par(bg="lightgrey")
plot(c(as.numeric(gpa[1,]), c(g.child[[1]])), ylim = c(60, 80), xaxt='n',
     main = "One line one family", xlim = c(0.5, 18.5), type = "l", ylab = "Heighe(inch)")
points(c(as.numeric(gpa[1, ]), c(g.child[[1]])), pch= 20)
axis(1, at = 1:18, labels = c("father", "Mother", "Mid-pm", paste0("C", 1:15)), las=2)
abline(v=1:18, h=c(55, 60, 65, 70, 75, 80), col="white")
abline(v=1:4, col=1)

for(i in 1:205) {
  lines(c(as.numeric(gpa[i, ]), c(g.child[[i]])), col=2)
  points(c(as.numeric(gpa[i,]), c(g.child[[i]])), pch=20, col=2)
  #Sys.sleep(0.2)
  lines(c(as.numeric(gpa[i, ]), c(g.child[[i]])), col="grey")
  points(c(as.numeric(gpa[i,]), c(g.child[[i]])), pch=20, col="grey")
  #Sys.sleep(0.1)
}

child.orm <- aggregate(g$childHeight, list(g$childNum), mean)$x
child.orv <- aggregate(g$childHeight, list(g$childNum), sd)$x
paret.orm <- apply(gpa, 2, mean)
paret.orv <- apply(gpa, 2, sd)
gpa.mean  <- c(paret.orm, child.orm)
gpa.var   <- c(paret.orv, child.orv)

for(i in 1:14){
  y=c(gpa.mean[i]-2*gpa.var[i], gpa.mean[i], gpa.mean[i]+2*gpa.var[i])
  lines(x=c(i,i,i), y=y, lwd=3)
  points(x=c(i,i,i), y=y,pch=20, cex=1.5)
}
lines(c(paret.orm, child.orm), col=1, lwd=3)

g <- GaltonFamilies
varf <- aggregate(g$childHeight, list(g$family), sd)
par(mfrow = c(2, 1))
plot(varf[, 2], main = "Standard Error of each family children", pch=20)
abline(h=1:8, col="grey")
plot(gpa.var, main = "Standard Error of each ith children", xaxt="n", pch=20)
axis(1, at = 1:18, labels = c("father", "Mother", "Mid-pm", paste0("C", 1:15)), las=2)

g <- GaltonFamilies
g$mother<-g$mother*1.08
gpa <- g[which(g$childNum==1),2:3]
#par(bg="lightgrey")
plot(1:2, gpa[1, ], type="l", main = "Father and Mother", ylim = c(60, 78),
     xaxt="n", ylab="height", xlab=" ")
axis(1, at=1:2, labels = c("Husband", "Wife"))
for(i in 1:205) {
  lines(as.numeric(gpa[i, ]), col=2)
  points(as.numeric(gpa[i,]), pch=20, col=2)
  #Sys.sleep(0.2)
  lines(as.numeric(gpa[i, ]), col="grey")
  points(as.numeric(gpa[i,]), pch=20, col="grey")
  #Sys.sleep(0.1)
}
fm <- gpa[order(gpa[, 1]),]
fm1 <- apply(fm[1:40,], 2, mean)
fm2 <- apply(fm[41:80,], 2, mean)
fm3 <- apply(fm[81:121, ], 2, mean)
fm4 <- apply(fm[122:163, ], 2, mean)
fm5 <- apply(fm[164:205, ], 2, mean)
lines(fm1); lines(fm2); lines(fm3); lines(fm4); lines(fm5);

#lines(fmmid); lines(fmhig)
#lines(fm1,fm2,fm3,fm4, fm5, col=3, lwd=2)

2.Heritability with OLS

Using the narrow sense heritability difinition, here we show that it has some relationship with the coefficient of ordinary least square.

2.1 Father or Mother with offspring

Follow $P$ is phentype and $G$ is genotype, $S_{off}$ is the offspring’s sample variance, $S_{father/mother}$ means variance one of them,$S_{off, father/mother}$ means the covariance of the offspring and father or mother.

First we have:

\[cov(P_{off}, P_{mother}) = cov(P_{off}, P_{father}) \\ cov(G_{off}, G_{mother}) = cov(G_{off}, G_{fatehr}) =\frac{\sigma_g^2}{2}\]

Then we do simple linear regression and calcualte the coefficient $\beta_1$.

\[P_{off} = \beta_1 P_{father/mother}\\ \hat{\beta}_1 = \frac{S_{off, father/mother}}{S_{father/mother}^2}\]

Last we calculate the correlation of this pairs $r_1$:

\[r_1 \equiv \frac{cov(P_{off}, P_{father/mother})}{\sqrt{Var(P_{off})Var(P_{father/mother})}} = \frac{\frac{\sigma_g^2}{2}}{\sigma_g^2+\sigma_e^2}= \frac{h^2}{2}\\ \hat{r}_1 = \frac{S_{off, father/mother}}{S_{off}S_{father/mother}}=\frac{\hat{h}^2}{2}\]

When $S_{off} = S_{father/mother}$, we can have:

\[{\hat{h}}^2 = 2\hat{\beta}_1\]

2.2 Midparents with offspring

With the same way as above, here we take the midparent’s height as predictor:

\[cov(P_{off}, P_{mid}) = cov(G_{off}, G_{mid}) = \frac{\sigma_g^2}{2}\]

\[P_{off} = \beta_2 P_{mid}\\ \hat{\beta}_2 = \frac{S_{off, mid}}{S_{off}^2}\]

\[r_2 \equiv \frac{cov(P_{off}, P_{mid})}{\sqrt{Var(P_{off})Var(P_{mid})}} = \frac{\frac{\sigma_g^2}{2}}{\sqrt{\frac{1}{2}} \sigma_g^2+\sigma_e^2}= \sqrt{\frac{1}{2}}h^2\\ \hat{r}_2 = \frac{S_{off, mid}}{S_{off}S_{mid}}=\sqrt{\frac{1}{2}}\hat{h}^2\]

When $S_{off} = \sqrt{2}S_{mid}$, we can have: \[\hat{h}^2 = \hat{\beta}_2\]

That’s all, there are four methods we can use to estimate the heritability. The first one is the two times of the coefficient of OLS between father or mother and children; the second one is the coefficient of OLS between midparents and children; the next to is the corresponding correlation of this pairs.

2.3 Heritability estimation.

Taking gender as one covariate to do two variable regression, the heritability estimation is 0.687. This is not correct because the children are not independent for the family structure. The variance of the coefficient of midparents has lower estimated 0.039 which should bigger. The difference between male and female is 5.215 inch.

library(HistData)
data("GaltonFamilies")
fit1 <- lm(childHeight~midparentHeight+gender,data=GaltonFamilies)
summary(fit1)$coef

##                   Estimate Std. Error   t value      Pr(>|t|)
## (Intercept)     16.5141022 2.73391750  6.040454  2.219169e-09
## midparentHeight  0.6870152 0.03943983 17.419326  4.881957e-59
## gendermale       5.2151054 0.14215811 36.685247 5.519341e-183

Taking father or mother to do regression with child’s height. The heritability should twice of the coffeicient. This has the same problem as above fit, but it seems that the father has much more heritability then mother for their children on the average level. The gender has almost the same effect all around 5.2.

g<- GaltonFamilies
# father
fit2 <- lm(childHeight~father+gender, data = g)
summary(fit2)$coef

##               Estimate Std. Error  t value      Pr(>|t|)
## (Intercept) 35.6790984 2.09313335 17.04578  6.533613e-57
## father       0.4104067 0.03018159 13.59792  1.518043e-38
## gendermale   5.1804523 0.14947528 34.65759 1.054827e-169

# mother
fit3 <- lm(childHeight~mother+gender, data = g)
summary(fit3)$coef

##               Estimate Std. Error  t value      Pr(>|t|)
## (Intercept) 42.1011532  2.1778494 19.33153  2.899497e-70
## mother       0.3429967  0.0339055 10.11626  6.783142e-23
## gendermale   5.1697457  0.1553339 33.28150 1.246403e-160

As above describe, the child is not independent for family structure. So we take one child of each family or the mean height of each family’s children to do regression with the father, mother or midparents. Here we adjust the gender by times 1.08. The heritability estimation almost the same with the above estimation.While the variance of this estimation is much bigger than it. It also finds that taking mean of child’s height will have smaller variance then take only one child’s. The heritability of father (0.441/0.409) is higher than mother’s (0.342/0.345)on the mean level.

g<- GaltonFamilies
g[g$gender == "female", "childHeight"] <- g[g$gender == "female", "childHeight"]*1.08
ch.mean <- aggregate(g$childHeight, list(g$family), mean)[, 2]
ch.one <- g$childHeight[which(g$childNum==1)]   # a least one child
ph.mid <- g$midparentHeight[which(g$childNum==1)]
ph.fat <- g$father[which(g$childNum==1)]
ph.mot <- g$mother[which(g$childNum==1)]

# one child
fit4 <- lm(ch.one ~ ph.mid)
summary(fit4)$coef

##               Estimate Std. Error  t value     Pr(>|t|)
## (Intercept) 22.2776781 5.58995661 3.985304 9.394965e-05
## ph.mid       0.6964473 0.08072673 8.627220 1.814303e-15

# one child with father
fit5 <- lm(ch.one~ ph.fat)
summary(fit5)$coef

##               Estimate Std. Error  t value     Pr(>|t|)
## (Intercept) 39.9082918 4.21704539 9.463567 7.766010e-18
## ph.fat       0.4411218 0.06079379 7.256033 8.274813e-12

# one child with mother
fit6 <- lm(ch.one~ ph.mot)
summary(fit6)$coef

##              Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 48.602098 4.71345067 10.311362 2.567025e-20
## ph.mot       0.341912 0.07359678  4.645746 6.086119e-06

# child's average with midparent
fit7 <- lm(ch.mean ~ ph.mid)
summary(fit7)$coef

##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 23.0570930 3.98119010  5.791508 2.624595e-08
## ph.mid       0.6681181 0.05749391 11.620676 2.913084e-24

# child's average with father
fit8 <- lm(ch.mean ~ ph.fat)
summary(fit8)$coef

##               Estimate Std. Error  t value     Pr(>|t|)
## (Intercept) 40.9691832 3.14343040 13.03327 1.300479e-28
## ph.fat       0.4087715 0.04531634  9.02040 1.433013e-16

# child's average with mother
fit9 <- lm(ch.mean ~ ph.mot)
summary(fit9)$coef

##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 47.2115396 3.57540281 13.204537 3.828509e-29
## ph.mot       0.3451784 0.05582707  6.182993 3.398177e-09

Like boostrap, here we take one child of each family 1000 times to calculate the heritablity.

# chose one child many times
g<- GaltonFamilies
fam.num <- g$family
g[g$gender == "female", "childHeight"] <- g[g$gender == "female", "childHeight"]*1.08
g$mother <- g$mother*1.08

ch.rand <- function(f, data) {
  f.u <- unique(f)
  l <- length(f.u)
  ind.sam <- rep(0, l)
  for(i in 1:l) {
    ind <- which(f == f.u[i])
    if(length(ind) == 1) {
      ind.sam[i] <- ind
    }else{
      ind.sam[i] <- sample(ind, 1)
    }
  }
  data.rand <- data[ind.sam, ]
  return(data.rand)
}

B <- 1
Bres <- matrix(0, B, 6)
for(i in 1:B){
  galton.rand <- ch.rand(fam.num, g)
  head(galton.rand)
  fit <- lm(childHeight~midparentHeight, data = galton.rand)
  fit1 <- lm(childHeight~father, data = galton.rand)
  fit2 <- lm(childHeight~mother, data = galton.rand)
  Bres[i, 1:2] <- summary(fit)$coef[2, 1:2]
  Bres[i, 3:4] <- summary(fit1)$coef[2, 1:2]
  Bres[i, 5:6] <- summary(fit2)$coef[2, 1:2]
}

Results show that it has normal distribution, the mean level is 0.667 and the variance mean level is 0.08 which proves that if not consider the family structure the variance will lower estimate. Results also show that the mean level of father(0.408) is higher than mother’s(0.318)

load("/Users/zhouchao/Documents/R/Galton/Bres.rda")
head(galton.rand)

##    family father mother midparentHeight children childNum gender
## 3     001   78.5  72.36           75.43        4        3 female
## 6     002   75.5  71.82           73.66        4        2   male
## 10    003   75.0  69.12           72.06        2        2 female
## 11    004   75.0  69.12           72.06        5        1   male
## 18    005   75.0  63.18           69.09        6        3   male
## 22    006   74.0  73.44           73.72        1        1 female
##    childHeight
## 3        74.52
## 6        72.50
## 10       73.44
## 11       70.50
## 18       68.00
## 22       75.06

summary(fit)$coef

##                   Estimate Std. Error  t value     Pr(>|t|)
## (Intercept)     24.4434184 5.69569271 4.291562 2.745595e-05
## midparentHeight  0.6463626 0.08225371 7.858158 2.238396e-13

head(Bres)

##           [,1]       [,2]      [,3]       [,4]      [,5]       [,6]
## [1,] 0.6866913 0.08018497 0.4133752 0.06109323 0.3359559 0.06702884
## [2,] 0.6721370 0.08040848 0.4249455 0.06043547 0.3063937 0.06746048
## [3,] 0.6620967 0.08200132 0.3724345 0.06292389 0.3527708 0.06716758
## [4,] 0.7022348 0.07960241 0.3995739 0.06160019 0.3691220 0.06622175
## [5,] 0.6279158 0.07948939 0.3681620 0.06041099 0.3180524 0.06532163
## [6,] 0.6890986 0.07764888 0.3690742 0.06084795 0.3876317 0.06393105

colMeans(Bres)

## [1] 0.66663884 0.08214050 0.40869321 0.06201990 0.31798940 0.06822192

par(mfrow=c(2,3))
hist(Bres[, 1], breaks = 100, main = "Midparent")
hist(Bres[, 3], breaks = 100, main = "Father")
hist(Bres[, 5], breaks = 100, main = "Mother")

hist(Bres[, 2], breaks = 100, main = "Variance for midparent")
hist(Bres[, 4], breaks = 100, main = "Variance for father")
hist(Bres[, 6], breaks = 100, main = "Variance for mother")

Using the correlation as the estimate of the heritability. Using father, mother and mean height of them to do correlation and regression, result shows that regression has higher heritability (0.854, 0.656, 0.713) then correlation calculation (0.818, 0.629, 0.703). The reason maybe that the variance for father or mother is not equally which has used in the derivation for the relationship between heritability and coefficient.

g<- GaltonFamilies
g[g$gender == "female", "childHeight"] <- g[g$gender == "female", "childHeight"]*1.08
g$mother <- g$mother*1.08

r1 <- cov(g$childHeight, g$father)/sqrt(var(g$childHeight)*var(g$father))
r2 <- cov(g$childHeight, g$mother)/sqrt(var(g$childHeight)*var(g$mother))
2*r1;2*r2

## [1] 0.818477

## [1] 0.6290058

fit10 <- lm(childHeight~father, data = g)
summary(fit10)$coef

##              Estimate Std. Error  t value     Pr(>|t|)
## (Intercept) 39.684187 2.15940881 18.37734 1.325172e-64
## father       0.427027 0.03118669 13.69260 5.101723e-39

fit11 <- lm(childHeight~mother, data = g)
summary(fit11)$coef

##               Estimate Std. Error  t value     Pr(>|t|)
## (Intercept) 46.4969005 2.24930400 20.67168 1.938720e-78
## mother       0.3284815 0.03247596 10.11461 6.868425e-23

r4 <-cov(g$childHeight,g$midparentHeight)/sqrt(var(g$childHeight)*var(g$midparentHeight))
r4*sqrt(2)

## [1] 0.7028815

fit12 <- lm(childHeight~midparentHeight, data = g)
summary(fit12)$coef

##                  Estimate Std. Error   t value     Pr(>|t|)
## (Intercept)     19.917513 2.82129612  7.059703 3.258458e-12
## midparentHeight  0.712585 0.04075238 17.485726 1.995762e-59

3.Moment methods

Consider the model: \[y = X\beta + e, \\ \beta \sim N(0, \sigma_g^2/M)\\ e \sim N(0, \sigma_e^2)\]

The moment estimator: \[(\hat{\sigma}_g^2, \hat{\sigma}_e^2) = argmin||yy^T - (\sigma_g^2K+\sigma_e^2I)||\] The solution is equal to after do devariation:

\[ \begin{bmatrix} tr(KK^T) & tr(K)\\ tr(K) & N \end{bmatrix} \begin{bmatrix} \sigma_g^2\\ \sigma_e^2 \end{bmatrix} =\begin{bmatrix} y^TKy\\ y^Ty \end{bmatrix} \] Here, $y$ is centered, $K$ is the matrix as above calculate. And here $tr(K) = N = 1344$, so we can get the solution: \[\hat{\sigma}_g^2 = \frac{y^TKy - yTy}{tr(KK^T) - tr(K)}\\ \hat{\sigma}_e^2 = \frac{y^Ty - N\hat{\sigma}_g^2}{N}\]

Data transform to get the all height and K matrix.

g <- GaltonFamilies
fat.h <- g$father
mot.h <- g$mother
chi.h <- g$childHeight
gend  <- rep(0, length(fat.h))
gend[g$gender == "male"]   <- 1
chi.h1.08 <- chi.h
chi.h1.08[gend == 0] <- chi.h[gend == 0]*1.08

fam.num <- unique(g$family)
N <- nrow(g) + 2*length(fam.num)
yheight <-  yheight1.08 <- gender <- family.index <- c()
Kgrm <- matrix(0, N, N)
# Y1 <- Y2 <- tK <- tKK <-0
temp <- 0
for(i in 1:length(fam.num)) {
  id      <- fam.num[i]
  index.i <- which(g$family == id)
  Ni <- length(index.i)
  ge      <- c(1,0, gend[index.i])
  gender  <- append(gender,  ge)
  
  fam <- rep(id, 2+length(index.i))
  family.index <- append(family.index, fam)
  
  yi      <- c(fat.h[index.i][1], mot.h[index.i][1], chi.h[index.i])
  yi1.08  <- c(fat.h[index.i][1], mot.h[index.i][1]*1.08, chi.h1.08[index.i])
  yheight <- append(yheight, yi)
  yheight1.08 <- append(yheight1.08, yi1.08)
  
  Ki <- diag(1, length(yi))
  Ki[Ki==0] <- 0.5; Ki[1,2] <- 0; Ki[2,1] <- 0
  ly  <- length(yheight) - length(yi) + 1
  ki.row <- ki.col <- ly:length(yheight)
  
  Kgrm[ki.row, ki.col] <- Ki
}

length(yheight) == N

## [1] TRUE

dim(Kgrm)

## [1] 1344 1344

3.1 moment estimation adjust gender by female all multiple 1.08

mom <- function(y, K, I){
  y <- as.matrix(y)
  yc<- y - mean(y)
  yTKy  <- t(yc) %*% K %*% (yc)
  yTy   <- t(yc) %*% yc
  trKKT <- sum(K^2)
  trK   <- sum(diag(K))
  N     <- sum(diag(I))
  
  sigma_g2 <- (yTKy*N - yTy*trK)/(trKKT*N - trK^2)
  sigma_a2 <- (yTy - trK * sigma_g2)/N
  her.mom <- sigma_g2/(sigma_g2 + sigma_a2)
  res <- data.frame(her.mom = her.mom, sigma_g2 = sigma_g2, sigam_a2 = sigma_a2)
  return(res)
  
}
res <- mom(yheight1.08, Kgrm, I = diag(N))
res

##     her.mom sigma_g2 sigam_a2
## 1 0.7580243 5.052884 1.612976

3.2 moment estimation adjust gender by setting it as one covariate (add projection)

X <- cbind(rep(1, N), gender)
V <- diag(N) -   X %*% solve(t(X) %*% X) %*% t(X)
Vyheight <- V %*% as.matrix(yheight)
VK <- V %*% Kgrm 

mom(Vyheight, VK, I = V)

##     her.mom sigma_g2 sigam_a2
## 1 0.7585287 4.731165 1.506127

3.3 variance of heritability

http://thestatsgeek.com/2013/10/12/the-robust-sandwich-variance-estimator-for-linear-regression/

consider the linear regression model $E(Y|X) = X^T\beta$, so the variance of $\beta$ can be estimated by: \[N^{-1}A(\beta)^{-1}B(\beta){A(\beta)^{-1}}^T \] define equation $D = X(Y-X^T) $ so we have $A(\beta) = E(-\frac{\partial }{\partial \beta}D) = E(XX^T)$, $B(\beta) = Var(D)$, here, we can get match to the moment methods model with the matched estimation function: \[X = \begin{bmatrix} tr(KK^T) & tr(K)\\ tr(K) & N \end{bmatrix}, \quad A = X^{-1}\]

\[B=var(D) = cov(\begin{bmatrix} y^TKy\\ y^Ty \end{bmatrix}) = \begin{bmatrix} Var(y^TKy) & cov(y^TKy, y^Ty)\\ cov(y^TKy, y^Ty) & Var(y^Ty) \end{bmatrix}\]

where the element can calculate by $Var(y^TKy) = 2tr([K\Sigma]^2)$, $Var(y^Ty)=2tr(\Sigma^2)$, $cov(y^TKy, y^Ty) = 2tr(K\Sigma^2)$, and $\Sigma = K\sigma_g^2 + I\sigma_e^2$.

Then use the Delta method \[Var(h(\theta)) \approx \triangledown h^T\Sigma \triangledown h\\ h = \frac{\sigma_g^2}{\sigma_g^2 + \sigma_e^2} ,\quad \triangledown h = \begin{bmatrix} \frac{\sigma_e^2}{(\sigma_g^2 + \sigma_e^2)^2}\\ \frac{-\sigma_g^2}{(\sigma_g^2 + \sigma_e^2)^2} \end{bmatrix}\]

trK2 <- sum(Kgrm^2)
trK  <- sum(diag(Kgrm))
sg<- as.numeric(res[2])
se<- as.numeric(res[3])
X <- matrix(c(trK2, trK, trK, N), 2)
A <- solve(X)

S <- sg*Kgrm + se*diag(N)
covB <- matrix(0,2,2)
covB[1,1] <- 2*sum((Kgrm%*%S)^2)
covB[1,2] <- covB[2,1] <- 2*sum(diag(Kgrm%*%S%*%S))
covB[2,2] <- 2*sum(S^2)
covSig <- A%*%covB%*%A

gh <- c(se/(sg+se)^2, -sg/(sg+se)^2)
sqrt(t(gh) %*% covSig %*% gh)

##            [,1]
## [1,] 0.08765207

4.MM and EM methods

\[y=\mu+g+e\] where, \[g\sim\mathcal{N}(0,K_{grm}\sigma_g),\] \[e\sim\mathcal{N}(0,I\sigma_e),\] and \[h^2=\frac{\sigma_g}{\sigma_g+\sigma_e}.\] Because the prior of latent $g$ has correlation, So we will make some changes. after $y$ is centering, because \[cov(y)=K_{grm}\sigma_g+I\sigma_e,\] we assume that \[y=Xw+e,\] where, \[w\sim\mathcal{N}(0,I\sigma_g).\] Above is the classical form of random effects model. When y is centering, we know \[\\ cov(y)=(Xww^TX^T)+I\sigma_e\\ =(XX^T)\sigma_g+I\sigma_e\\\] so \[(XX^T)=K_{grm}\] and \[X=U\sqrt{\lambda}.\] So, the classical form of lmm is \[y=U\sqrt{\lambda}w+e\] where, \[w\sim\mathcal{N}(0,I\sigma_g),\]

\[e\sim\mathcal{N}(0,I\sigma_e).\] Use glmnet-ridge regression, we can easily get the $\frac{\sigma_e}{\sigma_g}=369$, and this is a poor answer. Use EM algorithn, we can get $\sigma_e=2.0272$, $\sigma_g=4.4988$, and $h^2=0.6894$.

4.1 EM use the Glmnet

library(glmnet)

## Loading required package: Matrix

## Loading required package: foreach

## Loaded glmnet 2.0-10

eig <- eigen(Kgrm)
w=eig$vectors%*%sqrt(diag(eig$values))
fit=cv.glmnet(x=w, y=yheight1.08-mean(yheight1.08), alpha=0)
fit$lambda.min

## [1] 368.822

4.2 MM

time_start <- proc.time()
y <- yheight1.08-mean(yheight1.08)
## Fixed effect
# x <- u%*%diag(sqrt(lambda))
# inv_x <- t(u%*%diag(sqrt(1/lambda)))
x=0

eig <- eigen(Kgrm)
u <- eig$vectors
lambda <- eig$values # Eigen value of kgrm, this is a vector

# Initial the sigma and lower bound
sigma_g <- 10
sigma_e <- 10
maxite <- 50
low_b <- rep(0,maxite)
I <- diag(rep(1,length(Kgrm[1,]))) #n*n

#
for (i in 1:maxite) {
  omega <- Kgrm*sigma_g+I*sigma_e
  invomega <- u%*%diag(1/(sigma_g*lambda+sigma_e))%*%t(u)
  # If fixed effects exist
  if (length(which(x!=0))!=0){
    beta <- (inv_x%*%omega%*%t(inv_x))%*%t(x)%*%invomega%*%y
  }else{
    beta=0
  }
  #
  sigma_g_old <- sigma_g
  sigma_e_old <- sigma_e
  
  sigma_g <- sigma_g*sqrt(t((y-c(x%*%beta)))%*%invomega%*%
                            Kgrm%*%invomega%*%(y-c(x%*%beta))/sum(invomega*Kgrm))
  sigma_e <- sigma_e*sqrt(t((y-c(x%*%beta)))%*%invomega%*%
                            I%*%invomega%*%(y-c(x%*%beta))/sum(diag(invomega)))
  sigma_g <- sigma_g[1,1]
  sigma_e <- sigma_e[1,1]
  low_b[i] <- (-0.5*sigma_e*sum(diag(invomega))-0.5*sigma_g*sum(invomega*Kgrm)
           -0.5*sigma_e_old^2/sigma_e*
             t((y-c(x%*%beta)))%*%(invomega%*%(invomega%*%(y-c(x%*%beta))))
           -0.5*sigma_g_old^2/sigma_g*
             t((y-c(x%*%beta)))%*%(invomega%*%(Kgrm%*%(invomega%*%(y-c(x%*%beta)))))
           -0.5*sum(log(sigma_g_old*lambda+sigma_e_old)))
  #print(i)
}
#
time_end <- proc.time()
time_end-time_start

##    user  system elapsed 
##  28.875   0.726   8.480

sigma_g/(sigma_g+sigma_e)

## [1] 0.6839411

5.Prediction

Question: predict the next child’s height of the famliy.

5.1 Methods with average

The most simple method is taking the average of the famliy’s children to predict the next child’s. Here we first adjust the gender by multiple 1.08, so if the next child is female, then her height have to divide by 1.08.

# sample 1/2 family take the half last children as the test data
# here to compare different methods, we can also random take half childs as test

train.test <- function(g, seed=2, cv=2, rand=F) {
  set.seed(seed)
  index.f <- sample(1:cv, 205, replace = T)
  fam.num <- unique(g$family)
  results <- list()
  for(i in 1:cv){
    fam.test <- fam.num[index.f==i]
    n <- length(fam.test)
    index.c <- c()
    for(j in 1:n){
      fam.n <- fam.test[j]
      temp <- g[g$family==fam.n, ]
      chi.num <- temp$children[1]
      if(rand){
        chi.temp <- as.numeric(sample(rownames(temp), (chi.num/2+0.5)))
      }else{
        chi.temp <- as.numeric(rownames(temp)[1:(chi.num/2+ 0.5)])
      }
      index.c <- append(index.c, chi.temp)
    }
    train <- g[-index.c, ]
    test  <- g[index.c,  ]
    results[[i]] <- list(train=train, test=test)
  }
  results
}
tt <- train.test(g, seed=1, rand=F)
head(tt[[1]]$test)

##    family father mother midparentHeight children childNum gender
## 1     001   78.5   67.0           75.43        4        1   male
## 2     001   78.5   67.0           75.43        4        2 female
## 5     002   75.5   66.5           73.66        4        1   male
## 6     002   75.5   66.5           73.66        4        2   male
## 16    005   75.0   58.5           69.09        6        1   male
## 17    005   75.0   58.5           69.09        6        2   male
##    childHeight
## 1         73.2
## 2         69.2
## 5         73.5
## 6         72.5
## 16        72.0
## 17        69.0

g <- GaltonFamilies
g$childHeight <- with(g, ifelse(g$gender=="female", 1.08, 1)*childHeight)
ch.mean <- aggregate(g$childHeight, list(g$family), mean)$x
hist(ch.mean, breaks = 30)

Do not consider the gender effect by multiple 1.08. Each child was predicted by his brother’s mean level.

g <- GaltonFamilies
g$childHeight <- with(g, ifelse(g$gender=="female", 1.08, 1)*childHeight)
cv = 2

ave.err4 <- rep(0, 100*cv)
for(s in 1:100){
  tt <- train.test(g, seed=s, cv, rand = T)
  err <- c()
  for(v in 1:cv) {
    train <- tt[[v]]$train
    test  <- tt[[v]]$test
    ch.mean <- aggregate(train$childHeight, list(train$family), mean)
    
    fam.test <- test$family
    pred <-c()
    for(i in 1:length(fam.test)) {
      index <- which(ch.mean$Group.1==fam.test[i])
      if(length(index)==0){
        pred[i] <- mean(ch.mean$x)
      }else{
        pred[i] <- ch.mean$x[index]
      }
    }
    err[v] <- sqrt(mean((pred-test$childHeight)^2))
  }
  ave.err4[(s*2-1):(s*2)] <- err
}

Consider the gender. Each child was predicted by male or female overall mean level.

g <- GaltonFamilies
cv = 2
ave.err5 <- rep(0, 100*cv)
for(s in 1:100){
  tt <- train.test(g, seed=s, cv, rand = T)
  err <- c()
  for(v in 1:cv) {
    train <- tt[[v]]$train
    test  <- tt[[v]]$test
    ch.mean <- aggregate(train$childHeight, list(train$gender), mean)
    
    fam.test <- test$family
    pred <-c()
    for(i in 1:length(fam.test)) {
      gender.i <- test$gender[i]
      if(gender.i == "male"){
        pred[i] <- ch.mean$x[2]
      }else{
        pred[i] <- ch.mean$x[1]
      }
    }
    err[v] <- sqrt(mean((pred-test$childHeight)^2, na.rm = T))
  }
  ave.err5[(s*cv-1):(s*cv)] <- err
}

Consider the gender. Each child was predicted by his famliy’s male or female mean level.

g <- GaltonFamilies
cv = 2

ave.err6 <- rep(0, 100*cv)
for(s in 1:100){
  tt <- train.test(g, seed=s, cv, rand = T)
  err <- c()
  for(v in 1:cv) {
    train <- tt[[v]]$train
    test  <- tt[[v]]$test
    train.fam <- unique(train$family)
    ch.mgend <- data.frame(family=train.fam, male=rep(0, length(train.fam)), 
                           female = rep(0,length(train.fam)))
    for(j in 1:length(train.fam)) {
      fam.num <- train.fam[j]
      temp <- train[fam.num == train$family, ]
      mal.ind <- which(temp$gender=="male")
      fem.ind <- which(temp$gender=="female")
      mal.mean <- mean(temp$childHeight[mal.ind])
      fem.mean <- mean(temp$childHeight[fem.ind])
      if(length(mal.ind) == 0) {
        mal.mean <- mean(train$childHeight[which(train$gender=="male")])
      }
      if(length(fem.ind) == 0) {
        fem.mean <- mean(train$childHeight[which(train$gender=="female")])
      }
      ch.mgend$male[j] <- mal.mean
      ch.mgend$female[j] <- fem.mean
    }
  
    fam.test <- test$family
    pred <-c()
    for(i in 1:length(fam.test)) {
      fam.tn <- fam.test[i]
      fam.tg <- test[i, 7]
      index  <- which(ch.mgend[, 1] == fam.tn)
      if(length(index) == 0) {
        if(fam.tg == "male"){
          pred[i] <- mean(train$childHeight[train$gender=="male"])
        }else{
          pred[i] <- mean(train$childHeight[train$gender=="female"])
        }
      }else{
         if(fam.tg == "male"){
            pred[i] <- ch.mgend$male[index]
          }else{
            pred[i] <- ch.mgend$female[index]
          }
       }
    }
    err[v] <- sqrt(mean((pred-test$childHeight)^2))
  }
  ave.err6[(s*cv-1):(s*cv)] <- err
}

ave.err <- cbind(ave.err1, ave.err2, ave.err3, ave.err4, ave.err5, ave.err6)
save(ave.err, file="/Users/zhouchao/Documents/R/Galton/ave.err.rda")

load("/Users/zhouchao/Documents/R/Galton/ave.err.rda")
boxplot(ave.err, col = rainbow(3))
abline(h=seq(2.2, 3.2, 0.2), col="lightgrey")

colMeans(ave.err)

## ave.err1 ave.err2 ave.err3 ave.err4 ave.err5 ave.err6 
## 2.601692 2.663082 2.888484 2.414207 2.489960 2.517108

5.2 Methods with OLS`

Do not consider the famliy structure and do OLS to predict.

g <- GaltonFamilies
fit1 <- lm(childHeight~father+mother+gender, data = g)
summary(fit1)$coef

cv = 2
lm.err5 <- rep(0, 100*cv)
for(s in 1:100){
  tt <- train.test(g, seed=s, cv, rand = T)
  err <- c()
  for(v in 1:cv) {
    train <- tt[[v]]$train
    test  <- tt[[v]]$test
    temp.fit <- lm(childHeight~father+mother+gender, data = train)
    temp.pred<- predict(temp.fit, test)
    err[v] <- sqrt(mean((temp.pred-test$childHeight)^2))
  }
  lm.err5[(s*cv-1):(s*cv)] <- err
}

Taking the child’s mean level to do OLS with gender adjusted by multiple 1.08.

g <- GaltonFamilies
g$childHeight <- with(g, ifelse(g$gender=="female", 1.08, 1)*childHeight)
cv = 2
lm.err6 <- rep(0, 100*cv)
for(s in 1:100){
  tt <- train.test(g, seed=s, cv, rand = T)
  err <- c()
  for(v in 1:cv) {
    train <- tt[[v]]$train
    test  <- tt[[v]]$test
    
    ch.mean<- aggregate(train$childHeight, list(train$family), mean)[, 2]
    ph.fat <- aggregate(train$father, list(train$family), mean)[, 2]
    ph.mot <- aggregate(train$mother, list(train$family), mean)[, 2]
    newdata <- data.frame(y=ch.mean, x1=ph.fat, x2=ph.mot)
    temp.fit<- lm(y~x1+x2, data=newdata)
    temp.pred <- c()
    for(i in 1:nrow(test)) {
      newtest <- data.frame(x1=test$father[i], x2=test$mother[i])
      temp.pred[i] <- predict(temp.fit, newtest)
    }
    err[v] <- sqrt(mean((temp.pred-test$childHeight)^2))
  }
  lm.err6[(s*cv-1):(s*cv)] <- err
}

Taking the first one child to do OLS.

g <- GaltonFamilies
cv = 2
lm.err7 <- rep(0, 100*cv)
for(s in 1:100){
  tt <- train.test(g, seed=s, cv, rand = T)
  err <- c()
  for(v in 1:cv) {
    train <- tt[[v]]$train
    test  <- tt[[v]]$test
    ch.one <- train$childHeight[train$childNum==1]
    ch.gen <- train$gender[train$childNum==1]
    ph.fat <- train$father[train$childNum==1]
    ph.mot <- train$mother[train$childNum==1]
    
    newdata <- data.frame(y=ch.one, x1=ph.fat, x2=ph.mot,x3=ch.gen)
    temp.fit<- lm(y~x1+x2+x3, data=newdata)
    temp.pred <- c()
    for(i in 1:nrow(test)) {
      newtest <- data.frame(x1=test$father[i], x2=test$mother[i], x3=test$gender[i])
      temp.pred[i] <- predict(temp.fit, newtest)
    }
    err[v] <- sqrt(mean((temp.pred-test$childHeight)^2))
  }
  lm.err7[(s*cv-1):(s*cv)] <- err
}

Taking one child randomly to do OLS

g <- GaltonFamilies
cv = 2
lm.err8 <- rep(0, 100*cv)
for(s in 1:100){
  tt <- train.test(g, seed=s, cv, rand = T)
  err <- c()
  for(v in 1:cv) {
    train <- tt[[v]]$train
    test  <- tt[[v]]$test
    
    newtrain <- ch.rand(train$family, train)
    temp.fit<- lm(childHeight~father+mother+gender, data = newtrain)
    temp.pred <- c()
    for(i in 1:nrow(test)) {
      newtest <- test[i, ]
      temp.pred[i] <- predict(temp.fit, newtest)
    }
    err[v] <- sqrt(mean((temp.pred-test$childHeight)^2))
  }
  lm.err8[(s*cv-1):(s*cv)] <- err
}

lm.err <- cbind(lm.err1, lm.err2, lm.err3, lm.err4, lm.err5, lm.err6, lm.err7, lm.err8)
save(lm.err, file="/Users/zhouchao/Documents/R/Galton/lm.err.rda")

load("/Users/zhouchao/Documents/R/Galton/lm.err.rda")
boxplot(lm.err, col = rainbow(4))
abline(h=seq(2,2.8,0.2), col="lightgrey")

colMeans(lm.err)

##  lm.err1  lm.err2  lm.err3  lm.err4  lm.err5  lm.err6  lm.err7  lm.err8 
## 2.311023 2.365793 2.399640 2.340505 2.160662 2.233701 2.469846 2.172661

5.3 Methods with the conditional distribution (By JingSi and Can)

This methods is conditioned on the observed data.

## y family.index, K are matching 
cond.dist <- function(fam.mis, mu=NULL, gender=1,
                      sigma_g, simga_a, y, family.index, K=K, KI) { 
  # one miss data
  n <- length(y)
  f <- length(unique(family.index))
  
  if(is.null(mu)) {
    temp <- 0
    mu <- c()
    for(i in 1:f){
      fam.n <- unique(family.index)[i]
      fn <- sum(fam.n==family.index)
      mu[temp + (1:fn)] <- rep(mean(y[which(fam.n == family.index)]), fn)
      temp <- temp + fn
    }
    mis.ind <- which(fam.mis == family.index)
    if(length(mis.ind) == 0){
      mu.mis <- mean(y) + 5.2*gender
    }else{
      mu.mis <- mean(y[mis.ind]) + 5.2*gender
    }
  }
  else{
    mu.mis <- mean(y) + 5.2*gender
  }
  if(is.null(K))  K  <- diag(1, n)
 
  B <- rep(0, n)
  fam.index <- which(fam.mis == family.index)
  B[fam.index] <- 1/2
  if(is.null(KI)) KI <- solve(K*sigma_g + diag(n)*sigma_e)
  
  mu.obs <- mu 
  # mu.mis <- mu.obs[fam.index] + 5.2*gender # each family's mean
  BSigma <- sigma_g * colSums(B * KI)
  mu.star<- mu.mis + sum(BSigma *(y-mu.obs))
  sigma.star <- sigma_g + sigma_e - sum(BSigma*B)*sigma_g
  res <- c(mu.star, sigma.star)
  return(res)
}

data.trans <- function(train) {
  fam.num <- unique(train$family)
  N <- nrow(train) + 2*length(fam.num)
  yheight <- gender <- family.index <- c()
  Kgrm <- matrix(0, N, N)

  temp <- 0
  for(i in 1:length(fam.num)) {
    id      <- fam.num[i]
    index.i <- which(train$family == id)
    Ni <- length(index.i)
    
    ge      <- c(1,0, gend[index.i])
    gender  <- append(gender,  ge)
  
    fam <- rep(id, 2+length(index.i))
    family.index[temp + (1:(Ni+2))] <- fam
  
    yi<- c(train$father[index.i][1], train$mother[index.i][1], train$childHeight[index.i])
    yheight <- append(yheight, yi)
    
    Ki <- diag(1, length(yi))
    Ki[Ki==0] <- 0.5; Ki[1,2] <- 0; Ki[2,1] <- 0
    ly  <- length(yheight) - length(yi) + 1
    ki.row <- ki.col <- ly:length(yheight)
    Kgrm[ki.row, ki.col] <- Ki
    temp <- temp+Ni+2
  }
  results <- list(K=Kgrm, y=yheight, gender=gender, family.index=family.index)
}

All familiy with the same interspects with gender adjusted.

g  <- GaltonFamilies
g$family <- as.numeric(g$family)

sigma_g <- 4.4988
sigma_e <- 2.0272
cv = 2

cdm.err3 <- rep(0, 100*cv)
cdm.err4 <- rep(0, 100*cv)
cdm.std2  <- c()
for(s in 1:100){
  tt <- train.test(g, seed=s, cv, rand = T)
  err1 <- err2 <- mstd <- c()
  
  for(v in 1:cv) {
    train <- tt[[v]]$train
    test  <- tt[[v]]$test
    newtrain <- data.trans(train)
    KI <- solve(newtrain$K *sigma_g + diag(length(newtrain$y))*sigma_e)
    
    pred1 <-pred2 <- c()
    std <- c()
    for(i in 1:nrow(test)) {
      fam.mis <- test$family[i]
      fam.gen <- ifelse(test$gender[i]=="male",1,0)
      
      mu1 <- mean(newtrain$y) + 5.2*newtrain$gender # all family with the same mean
      temp.pred1 <- cond.dist(fam.mis, gender=fam.gen, mu=mu1, sigma_g, simga_a, 
               y=newtrain$y, family.index=newtrain$family.index, K=newtrain$K, KI)
      pred1[i] <- temp.pred1[1]
      
      temp.pred2 <- cond.dist(fam.mis, gender=fam.gen, mu=NULL, sigma_g, simga_a, 
               y=newtrain$y, family.index=newtrain$family.index, K=newtrain$K, KI)
      pred2[i] <- temp.pred2[1]
      std[i] <- temp.pred1[2]
    }
    err1[v] <- sqrt(mean((pred1-test$childHeight)^2))
    err2[v] <- sqrt(mean((pred2-test$childHeight)^2))
    mstd <- c(mstd, std)
  }
  cdm.err3[(s*cv-1):(s*cv)] <- err1
  cdm.err4[(s*cv-1):(s*cv)] <- err2
  cdm.std2 <- c(cdm.std2, mstd)
}

cdm.err <- cbind(cdm.err1, cdm.err2, cdm.err3, cdm.err4)
save(cdm.err, file="/Users/zhouchao/Documents/R/Galton/cdm.err.rda")

1.One use the overall mean level as the interspect. 2. Each family’s mean level as his own interspect.

load("/Users/zhouchao/Documents/R/Galton/cdm.err.rda")
boxplot(cdm.err, col =rainbow(2))
abline(h=seq(2.3, 3.6, 0.2), col="grey")

colMeans(cdm.err)

## cdm.err1 cdm.err2 cdm.err3 cdm.err4 
## 2.455075 2.468594 2.391902 3.423050

# boxplot(sqrt(cdm.std), ylim=c(2.1,2.3), col=2)

5.4 Methods by XiangHong

This method take each chid’s height will by effected by the fix effect such as gender, father or mother and the random effect come from family level and children level.

\[y_{ij} = \beta_0 + \beta_1 M_{i}+ \beta_2 g_{ij} + \alpha_1 f_i + \alpha_2 g_j + e_{ij}\] which assume family’s effect and children effect follow normal distribution $f_i \sim N(0,\sigma_f^2)$ and $g_j \sim N(0, \sigma_g^2)$.

library(lme4)
g <- GaltonFamilies
R <- 1
cv <- 2
lmm.err6 <- rep(0, R*cv)
for(s in 1:R){
  tt <- train.test(g, seed=s, cv, rand = T)
  err <- c()
  for(v in 1:cv) {
    train <- tt[[v]]$train
    test  <- tt[[v]]$test
    
    temp.fit <- lmer(childHeight~father+mother+gender + (1|family) + (1|childNum), 
                    data = train)
    temp.pred = predict(temp.fit, newdata = test, allow.new.levels = TRUE)
   
    err[v] <- sqrt(mean((temp.pred-test$childHeight)^2))
  }
  lmm.err6[(s*cv-1):(s*cv)] <- err
}

plot(temp.fit)

plot(temp.pred, test$childHeight, pch=20, ylab = "actual value")
abline(a=0,b=1,col=2)

lmm.err <- cbind(lmm.err5, lmm.err6)
save(lmm.err, file="/Users/zhouchao/Documents/R/Galton/lmm.err.rda")

load("/Users/zhouchao/Documents/R/Galton/lmm.err.rda")
boxplot(lmm.err, col=rainbow(2))

colMeans(lmm.err)

## lmm.err5 lmm.err6 
## 1.986466 1.866852

5.6 Methods by PengHao

5.7 Methods by David

5.8 Other methods such as RF(random forest), NNT (neural network)

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

g <- GaltonFamilies
g$family <- as.numeric(g$family)
cv = 2
R=1
rf.err9 <- rep(0, R*cv)
for(s in 1:R){
  tt <- train.test(g, seed=s, cv, rand = F)
  err <- c()
  for(v in 1:cv) {
    train <- tt[[v]]$train
    test  <- tt[[v]]$test
    
    temp.fit <- randomForest(childHeight~father+mother+gender, data = train)
    temp.pred = predict(temp.fit, newdata = test)
    err[v] <- sqrt(mean((temp.pred-test$childHeight)^2))
  }
  rf.err9[(s*cv-1):(s*cv)] <- err
}

par(mfrow=c(1,2))
plot(temp.fit)
plot(temp.pred, test$childHeight, pch=20, ylab = "actual value")
abline(a=0,b=1,col=2)

rf.err <- cbind(rf.err1, rf.err3, rf.err5, rf.err7, rf.err9,
               rf.err2, rf.err4, rf.err6, rf.err8, rf.err10)
save(rf.err, file="/Users/zhouchao/Documents/R/Galton/rf.err.rda")

Random Forest.

1 childHeight~father+mother+gender+childNum 2. childHeight~father+mother+gender+childNum+children 3. childHeight~father+mother+gender+childNum+children+family 4. childHeight~.(all variable) 5. childHeight~father+mother+gender

load("/Users/zhouchao/Documents/R/Galton/rf.err.rda")
boxplot(rf.err, col=rainbow(5))
abline(h=seq(1.6, 2.8, 0.2), col="grey")

colMeans(rf.err)

##  rf.err1  rf.err3  rf.err5  rf.err7  rf.err9  rf.err2  rf.err4  rf.err6 
## 2.172269 2.159621 2.056058 2.064234 2.404077 2.036974 1.991147 1.869770 
##  rf.err8 rf.err10 
## 1.874016 2.178973

Neural Network

library(nnet)
g <- GaltonFamilies
g$family <- as.numeric(g$family)
R<-1
cv = 2
nnt.err10 <- rep(0, R*cv)
for(s in 1:R){
  tt <- train.test(g, seed=s, cv, rand = T)
  err <- c()
  for(v in 1:cv) {
    train <- tt[[v]]$train
    test  <- tt[[v]]$test
    
    temp.fit <- nnet(childHeight~father+mother+gender,
                     data = train, size =10, linout=TRUE,
                     rang = 0.1, decay = 5e-4, maxit = 200)
    temp.pred = predict(temp.fit, newdata = test) #, type = "raw")
    err[v] <- sqrt(mean((temp.pred-test$childHeight)^2))
  }
  nnt.err10[(s*cv-1):(s*cv)] <- err
}

## # weights:  51
## initial  value 2983509.158760 
## final  value 8353.782886 
## converged
## # weights:  51
## initial  value 3022453.990113 
## iter  10 value 9198.941696
## iter  20 value 9135.011180
## iter  30 value 9099.172748
## iter  40 value 4125.037095
## iter  50 value 3585.301056
## iter  60 value 3126.582494
## iter  70 value 3095.751293
## iter  80 value 3092.311968
## iter  90 value 3088.776384
## iter 100 value 3088.539483
## iter 110 value 3088.409944
## iter 120 value 3088.303619
## iter 130 value 3087.457557
## iter 140 value 3080.730838
## iter 150 value 3071.067831
## iter 160 value 3054.190359
## iter 170 value 3033.119049
## iter 180 value 3027.390180
## iter 190 value 3024.949447
## iter 200 value 3024.890180
## final  value 3024.890180 
## stopped after 200 iterations

nnt.err <- cbind(nnt.err1, nnt.err3, nnt.err5, nnt.err7, nnt.err9,
                 nnt.err2, nnt.err4, nnt.err6, nnt.err8, nnt.err10)
save(nnt.err, file="/Users/zhouchao/Documents/R/Galton/nnt.err.rda")

NNT.

load("/Users/zhouchao/Documents/R/Galton/nnt.err.rda")
boxplot(nnt.err, col=rainbow(5))

#abline(h=seq(1.6, 2.8, 0.2), col="grey")
colMeans(nnt.err)

##  nnt.err1  nnt.err3  nnt.err5  nnt.err7  nnt.err9  nnt.err2  nnt.err4 
##  2.804182  2.836102  2.830070  2.802699  3.005308  2.599410  2.571843 
##  nnt.err6  nnt.err8 nnt.err10 
##  2.694780  2.571749  2.738264