This document present the process of Galton Heighet data analysis.
library(HistData)
data("GaltonFamilies")
head(GaltonFamilies)
## family father mother midparentHeight children childNum gender
## 1 001 78.5 67.0 75.43 4 1 male
## 2 001 78.5 67.0 75.43 4 2 female
## 3 001 78.5 67.0 75.43 4 3 female
## 4 001 78.5 67.0 75.43 4 4 female
## 5 002 75.5 66.5 73.66 4 1 male
## 6 002 75.5 66.5 73.66 4 2 male
## childHeight
## 1 73.2
## 2 69.2
## 3 69.0
## 4 69.0
## 5 73.5
## 6 72.5
summary(GaltonFamilies)
## family father mother midparentHeight
## 185 : 15 Min. :62.0 Min. :58.00 Min. :64.40
## 066 : 11 1st Qu.:68.0 1st Qu.:63.00 1st Qu.:68.14
## 120 : 11 Median :69.0 Median :64.00 Median :69.25
## 130 : 11 Mean :69.2 Mean :64.09 Mean :69.21
## 166 : 11 3rd Qu.:71.0 3rd Qu.:65.88 3rd Qu.:70.14
## 097 : 10 Max. :78.5 Max. :70.50 Max. :75.43
## (Other):865
## children childNum gender childHeight
## Min. : 1.000 Min. : 1.000 female:453 Min. :56.00
## 1st Qu.: 4.000 1st Qu.: 2.000 male :481 1st Qu.:64.00
## Median : 6.000 Median : 3.000 Median :66.50
## Mean : 6.171 Mean : 3.586 Mean :66.75
## 3rd Qu.: 8.000 3rd Qu.: 5.000 3rd Qu.:69.70
## Max. :15.000 Max. :15.000 Max. :79.00
##
g <- GaltonFamilies
par(mfrow=c(2,3))
hist(g$father, main = "Father", breaks = 30)
abline(v=mean(g$father), col=2, lwd=2)
hist(g$childHeight[g$gender=="male"], main = "boys", breaks = 30)
abline(v=mean(g$childHeight[g$gender=="male"]), col=2, lwd=2)
hist(c(g$childHeight[g$gender=="male"],g$father), main = "boyes and father", breaks = 30)
abline(v=mean(c(g$childHeight[g$gender=="male"],g$father)), col=2, lwd=2)
hist(g$mother, main = "Mother", breaks = 30)
abline(v=mean(g$mother), col=2, lwd=2)
hist(g$childHeight[g$gender=="female"], main = "girls", breaks = 30)
abline(v=mean(g$childHeight[g$gender=="female"]), col=2,lwd=2)
hist(c(g$childHeight[g$gender=="female"],g$mother), main = "girls and mothre", breaks = 30)
abline(v=mean(c(g$childHeight[g$gender=="female"],g$mother)), col=2, lwd=2)
library(ggplot2)
ggplot(data=GaltonFamilies,aes(x=midparentHeight,y=childHeight,color=gender)) +
geom_point()
As Galton shows, that the gender has the 5.2 inch difference or 1.08 times effect. The number 1.08 can proved use the mean level difference or do regression to take gender as covarite, the second has been done by ZiJian. Different groups has almost the same results as both methods show. So on the next analysis, we can take gender as one covarite or adjust it by times 1.08 for each male or mother.
g <- GaltonFamilies
mean(g$father)/mean(g$mother) # father/mother
## [1] 1.079698
# children male/female
mean(g$childHeight[g$gender == "male"])/mean(g$childHeight[g$gender == "female"])
## [1] 1.080028
# all children.
mean(c(g$father, g$childHeight[g$gender == "male"]))/
mean(c(g$mother, g$childHeight[g$gender == "female"]))
## [1] 1.079814
g <- GaltonFamilies
g$childHeight <- with(GaltonFamilies,ifelse(gender=="female",1.08,1)*childHeight)
ggplot(data=g,aes(x=midparentHeight,y=childHeight,color=gender)) + geom_point()
After the gender adjustment, the order of each family children seems has no difference.
g <- GaltonFamilies
g$mother <- g$mother*1.08
g$childHeight <- with(g, ifelse(gender=="female", 1.08, 1)*childHeight)
g.child <- aggregate(g$childHeight, list(g$family), cbind)$x
gpa <- g[which(g$childNum==1),2:4]
#par(bg="lightgrey")
plot(c(as.numeric(gpa[1,]), c(g.child[[1]])), ylim = c(60, 80), xaxt='n',
main = "One line one family", xlim = c(0.5, 18.5), type = "l", ylab = "Heighe(inch)")
points(c(as.numeric(gpa[1, ]), c(g.child[[1]])), pch= 20)
axis(1, at = 1:18, labels = c("father", "Mother", "Mid-pm", paste0("C", 1:15)), las=2)
abline(v=1:18, h=c(55, 60, 65, 70, 75, 80), col="white")
abline(v=1:4, col=1)
for(i in 1:205) {
lines(c(as.numeric(gpa[i, ]), c(g.child[[i]])), col=2)
points(c(as.numeric(gpa[i,]), c(g.child[[i]])), pch=20, col=2)
#Sys.sleep(0.2)
lines(c(as.numeric(gpa[i, ]), c(g.child[[i]])), col="grey")
points(c(as.numeric(gpa[i,]), c(g.child[[i]])), pch=20, col="grey")
#Sys.sleep(0.1)
}
child.orm <- aggregate(g$childHeight, list(g$childNum), mean)$x
child.orv <- aggregate(g$childHeight, list(g$childNum), sd)$x
paret.orm <- apply(gpa, 2, mean)
paret.orv <- apply(gpa, 2, sd)
gpa.mean <- c(paret.orm, child.orm)
gpa.var <- c(paret.orv, child.orv)
for(i in 1:14){
y=c(gpa.mean[i]-2*gpa.var[i], gpa.mean[i], gpa.mean[i]+2*gpa.var[i])
lines(x=c(i,i,i), y=y, lwd=3)
points(x=c(i,i,i), y=y,pch=20, cex=1.5)
}
lines(c(paret.orm, child.orm), col=1, lwd=3)
g <- GaltonFamilies
varf <- aggregate(g$childHeight, list(g$family), sd)
par(mfrow = c(2, 1))
plot(varf[, 2], main = "Standard Error of each family children", pch=20)
abline(h=1:8, col="grey")
plot(gpa.var, main = "Standard Error of each ith children", xaxt="n", pch=20)
axis(1, at = 1:18, labels = c("father", "Mother", "Mid-pm", paste0("C", 1:15)), las=2)
g <- GaltonFamilies
g$mother<-g$mother*1.08
gpa <- g[which(g$childNum==1),2:3]
#par(bg="lightgrey")
plot(1:2, gpa[1, ], type="l", main = "Father and Mother", ylim = c(60, 78),
xaxt="n", ylab="height", xlab=" ")
axis(1, at=1:2, labels = c("Husband", "Wife"))
for(i in 1:205) {
lines(as.numeric(gpa[i, ]), col=2)
points(as.numeric(gpa[i,]), pch=20, col=2)
#Sys.sleep(0.2)
lines(as.numeric(gpa[i, ]), col="grey")
points(as.numeric(gpa[i,]), pch=20, col="grey")
#Sys.sleep(0.1)
}
fm <- gpa[order(gpa[, 1]),]
fm1 <- apply(fm[1:40,], 2, mean)
fm2 <- apply(fm[41:80,], 2, mean)
fm3 <- apply(fm[81:121, ], 2, mean)
fm4 <- apply(fm[122:163, ], 2, mean)
fm5 <- apply(fm[164:205, ], 2, mean)
lines(fm1); lines(fm2); lines(fm3); lines(fm4); lines(fm5);
#lines(fmmid); lines(fmhig)
#lines(fm1,fm2,fm3,fm4, fm5, col=3, lwd=2)
Using the narrow sense heritability difinition, here we show that it has some relationship with the coefficient of ordinary least square.
Follow \(P\) is phentype and \(G\) is genotype, \(S_{off}\) is the offspring’s sample variance, \(S_{father/mother}\) means variance one of them,\(S_{off, father/mother}\) means the covariance of the offspring and father or mother.
First we have:
\[cov(P_{off}, P_{mother}) = cov(P_{off}, P_{father}) \\ cov(G_{off}, G_{mother}) = cov(G_{off}, G_{fatehr}) =\frac{\sigma_g^2}{2}\]
Then we do simple linear regression and calcualte the coefficient \(\beta_1\).
\[P_{off} = \beta_1 P_{father/mother}\\ \hat{\beta}_1 = \frac{S_{off, father/mother}}{S_{father/mother}^2}\]
Last we calculate the correlation of this pairs \(r_1\):
\[r_1 \equiv \frac{cov(P_{off}, P_{father/mother})}{\sqrt{Var(P_{off})Var(P_{father/mother})}} = \frac{\frac{\sigma_g^2}{2}}{\sigma_g^2+\sigma_e^2}= \frac{h^2}{2}\\ \hat{r}_1 = \frac{S_{off, father/mother}}{S_{off}S_{father/mother}}=\frac{\hat{h}^2}{2}\]
When \(S_{off} = S_{father/mother}\), we can have:
\[{\hat{h}}^2 = 2\hat{\beta}_1\]
With the same way as above, here we take the midparent’s height as predictor:
\[cov(P_{off}, P_{mid}) = cov(G_{off}, G_{mid}) = \frac{\sigma_g^2}{2}\]
\[P_{off} = \beta_2 P_{mid}\\ \hat{\beta}_2 = \frac{S_{off, mid}}{S_{off}^2}\]
\[r_2 \equiv \frac{cov(P_{off}, P_{mid})}{\sqrt{Var(P_{off})Var(P_{mid})}} = \frac{\frac{\sigma_g^2}{2}}{\sqrt{\frac{1}{2}} \sigma_g^2+\sigma_e^2}= \sqrt{\frac{1}{2}}h^2\\ \hat{r}_2 = \frac{S_{off, mid}}{S_{off}S_{mid}}=\sqrt{\frac{1}{2}}\hat{h}^2\]
When \(S_{off} = \sqrt{2}S_{mid}\), we can have: \[\hat{h}^2 = \hat{\beta}_2\]
That’s all, there are four methods we can use to estimate the heritability. The first one is the two times of the coefficient of OLS between father or mother and children; the second one is the coefficient of OLS between midparents and children; the next to is the corresponding correlation of this pairs.
Taking gender as one covariate to do two variable regression, the heritability estimation is 0.687. This is not correct because the children are not independent for the family structure. The variance of the coefficient of midparents has lower estimated 0.039 which should bigger. The difference between male and female is 5.215 inch.
library(HistData)
data("GaltonFamilies")
fit1 <- lm(childHeight~midparentHeight+gender,data=GaltonFamilies)
summary(fit1)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.5141022 2.73391750 6.040454 2.219169e-09
## midparentHeight 0.6870152 0.03943983 17.419326 4.881957e-59
## gendermale 5.2151054 0.14215811 36.685247 5.519341e-183
Taking father or mother to do regression with child’s height. The heritability should twice of the coffeicient. This has the same problem as above fit, but it seems that the father has much more heritability then mother for their children on the average level. The gender has almost the same effect all around 5.2.
g<- GaltonFamilies
# father
fit2 <- lm(childHeight~father+gender, data = g)
summary(fit2)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.6790984 2.09313335 17.04578 6.533613e-57
## father 0.4104067 0.03018159 13.59792 1.518043e-38
## gendermale 5.1804523 0.14947528 34.65759 1.054827e-169
# mother
fit3 <- lm(childHeight~mother+gender, data = g)
summary(fit3)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.1011532 2.1778494 19.33153 2.899497e-70
## mother 0.3429967 0.0339055 10.11626 6.783142e-23
## gendermale 5.1697457 0.1553339 33.28150 1.246403e-160
As above describe, the child is not independent for family structure. So we take one child of each family or the mean height of each family’s children to do regression with the father, mother or midparents. Here we adjust the gender by times 1.08. The heritability estimation almost the same with the above estimation.While the variance of this estimation is much bigger than it. It also finds that taking mean of child’s height will have smaller variance then take only one child’s. The heritability of father (0.441/0.409) is higher than mother’s (0.342/0.345)on the mean level.
g<- GaltonFamilies
g[g$gender == "female", "childHeight"] <- g[g$gender == "female", "childHeight"]*1.08
ch.mean <- aggregate(g$childHeight, list(g$family), mean)[, 2]
ch.one <- g$childHeight[which(g$childNum==1)] # a least one child
ph.mid <- g$midparentHeight[which(g$childNum==1)]
ph.fat <- g$father[which(g$childNum==1)]
ph.mot <- g$mother[which(g$childNum==1)]
# one child
fit4 <- lm(ch.one ~ ph.mid)
summary(fit4)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.2776781 5.58995661 3.985304 9.394965e-05
## ph.mid 0.6964473 0.08072673 8.627220 1.814303e-15
# one child with father
fit5 <- lm(ch.one~ ph.fat)
summary(fit5)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.9082918 4.21704539 9.463567 7.766010e-18
## ph.fat 0.4411218 0.06079379 7.256033 8.274813e-12
# one child with mother
fit6 <- lm(ch.one~ ph.mot)
summary(fit6)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.602098 4.71345067 10.311362 2.567025e-20
## ph.mot 0.341912 0.07359678 4.645746 6.086119e-06
# child's average with midparent
fit7 <- lm(ch.mean ~ ph.mid)
summary(fit7)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.0570930 3.98119010 5.791508 2.624595e-08
## ph.mid 0.6681181 0.05749391 11.620676 2.913084e-24
# child's average with father
fit8 <- lm(ch.mean ~ ph.fat)
summary(fit8)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.9691832 3.14343040 13.03327 1.300479e-28
## ph.fat 0.4087715 0.04531634 9.02040 1.433013e-16
# child's average with mother
fit9 <- lm(ch.mean ~ ph.mot)
summary(fit9)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.2115396 3.57540281 13.204537 3.828509e-29
## ph.mot 0.3451784 0.05582707 6.182993 3.398177e-09
Like boostrap, here we take one child of each family 1000 times to calculate the heritablity.
# chose one child many times
g<- GaltonFamilies
fam.num <- g$family
g[g$gender == "female", "childHeight"] <- g[g$gender == "female", "childHeight"]*1.08
g$mother <- g$mother*1.08
ch.rand <- function(f, data) {
f.u <- unique(f)
l <- length(f.u)
ind.sam <- rep(0, l)
for(i in 1:l) {
ind <- which(f == f.u[i])
if(length(ind) == 1) {
ind.sam[i] <- ind
}else{
ind.sam[i] <- sample(ind, 1)
}
}
data.rand <- data[ind.sam, ]
return(data.rand)
}
B <- 1
Bres <- matrix(0, B, 6)
for(i in 1:B){
galton.rand <- ch.rand(fam.num, g)
head(galton.rand)
fit <- lm(childHeight~midparentHeight, data = galton.rand)
fit1 <- lm(childHeight~father, data = galton.rand)
fit2 <- lm(childHeight~mother, data = galton.rand)
Bres[i, 1:2] <- summary(fit)$coef[2, 1:2]
Bres[i, 3:4] <- summary(fit1)$coef[2, 1:2]
Bres[i, 5:6] <- summary(fit2)$coef[2, 1:2]
}
Results show that it has normal distribution, the mean level is 0.667 and the variance mean level is 0.08 which proves that if not consider the family structure the variance will lower estimate. Results also show that the mean level of father(0.408) is higher than mother’s(0.318)
load("/Users/zhouchao/Documents/R/Galton/Bres.rda")
head(galton.rand)
## family father mother midparentHeight children childNum gender
## 3 001 78.5 72.36 75.43 4 3 female
## 6 002 75.5 71.82 73.66 4 2 male
## 10 003 75.0 69.12 72.06 2 2 female
## 11 004 75.0 69.12 72.06 5 1 male
## 18 005 75.0 63.18 69.09 6 3 male
## 22 006 74.0 73.44 73.72 1 1 female
## childHeight
## 3 74.52
## 6 72.50
## 10 73.44
## 11 70.50
## 18 68.00
## 22 75.06
summary(fit)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.4434184 5.69569271 4.291562 2.745595e-05
## midparentHeight 0.6463626 0.08225371 7.858158 2.238396e-13
head(Bres)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.6866913 0.08018497 0.4133752 0.06109323 0.3359559 0.06702884
## [2,] 0.6721370 0.08040848 0.4249455 0.06043547 0.3063937 0.06746048
## [3,] 0.6620967 0.08200132 0.3724345 0.06292389 0.3527708 0.06716758
## [4,] 0.7022348 0.07960241 0.3995739 0.06160019 0.3691220 0.06622175
## [5,] 0.6279158 0.07948939 0.3681620 0.06041099 0.3180524 0.06532163
## [6,] 0.6890986 0.07764888 0.3690742 0.06084795 0.3876317 0.06393105
colMeans(Bres)
## [1] 0.66663884 0.08214050 0.40869321 0.06201990 0.31798940 0.06822192
par(mfrow=c(2,3))
hist(Bres[, 1], breaks = 100, main = "Midparent")
hist(Bres[, 3], breaks = 100, main = "Father")
hist(Bres[, 5], breaks = 100, main = "Mother")
hist(Bres[, 2], breaks = 100, main = "Variance for midparent")
hist(Bres[, 4], breaks = 100, main = "Variance for father")
hist(Bres[, 6], breaks = 100, main = "Variance for mother")
Using the correlation as the estimate of the heritability. Using father, mother and mean height of them to do correlation and regression, result shows that regression has higher heritability (0.854, 0.656, 0.713) then correlation calculation (0.818, 0.629, 0.703). The reason maybe that the variance for father or mother is not equally which has used in the derivation for the relationship between heritability and coefficient.
g<- GaltonFamilies
g[g$gender == "female", "childHeight"] <- g[g$gender == "female", "childHeight"]*1.08
g$mother <- g$mother*1.08
r1 <- cov(g$childHeight, g$father)/sqrt(var(g$childHeight)*var(g$father))
r2 <- cov(g$childHeight, g$mother)/sqrt(var(g$childHeight)*var(g$mother))
2*r1;2*r2
## [1] 0.818477
## [1] 0.6290058
fit10 <- lm(childHeight~father, data = g)
summary(fit10)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.684187 2.15940881 18.37734 1.325172e-64
## father 0.427027 0.03118669 13.69260 5.101723e-39
fit11 <- lm(childHeight~mother, data = g)
summary(fit11)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46.4969005 2.24930400 20.67168 1.938720e-78
## mother 0.3284815 0.03247596 10.11461 6.868425e-23
r4 <-cov(g$childHeight,g$midparentHeight)/sqrt(var(g$childHeight)*var(g$midparentHeight))
r4*sqrt(2)
## [1] 0.7028815
fit12 <- lm(childHeight~midparentHeight, data = g)
summary(fit12)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.917513 2.82129612 7.059703 3.258458e-12
## midparentHeight 0.712585 0.04075238 17.485726 1.995762e-59
Consider the model: \[y = X\beta + e, \\ \beta \sim N(0, \sigma_g^2/M)\\ e \sim N(0, \sigma_e^2)\]
The moment estimator: \[(\hat{\sigma}_g^2, \hat{\sigma}_e^2) = argmin||yy^T - (\sigma_g^2K+\sigma_e^2I)||\] The solution is equal to after do devariation:
\[ \begin{bmatrix} tr(KK^T) & tr(K)\\ tr(K) & N \end{bmatrix} \begin{bmatrix} \sigma_g^2\\ \sigma_e^2 \end{bmatrix} =\begin{bmatrix} y^TKy\\ y^Ty \end{bmatrix} \] Here, \(y\) is centered, \(K\) is the matrix as above calculate. And here \(tr(K) = N = 1344\), so we can get the solution: \[\hat{\sigma}_g^2 = \frac{y^TKy - yTy}{tr(KK^T) - tr(K)}\\ \hat{\sigma}_e^2 = \frac{y^Ty - N\hat{\sigma}_g^2}{N}\]
Data transform to get the all height and K matrix.
g <- GaltonFamilies
fat.h <- g$father
mot.h <- g$mother
chi.h <- g$childHeight
gend <- rep(0, length(fat.h))
gend[g$gender == "male"] <- 1
chi.h1.08 <- chi.h
chi.h1.08[gend == 0] <- chi.h[gend == 0]*1.08
fam.num <- unique(g$family)
N <- nrow(g) + 2*length(fam.num)
yheight <- yheight1.08 <- gender <- family.index <- c()
Kgrm <- matrix(0, N, N)
# Y1 <- Y2 <- tK <- tKK <-0
temp <- 0
for(i in 1:length(fam.num)) {
id <- fam.num[i]
index.i <- which(g$family == id)
Ni <- length(index.i)
ge <- c(1,0, gend[index.i])
gender <- append(gender, ge)
fam <- rep(id, 2+length(index.i))
family.index <- append(family.index, fam)
yi <- c(fat.h[index.i][1], mot.h[index.i][1], chi.h[index.i])
yi1.08 <- c(fat.h[index.i][1], mot.h[index.i][1]*1.08, chi.h1.08[index.i])
yheight <- append(yheight, yi)
yheight1.08 <- append(yheight1.08, yi1.08)
Ki <- diag(1, length(yi))
Ki[Ki==0] <- 0.5; Ki[1,2] <- 0; Ki[2,1] <- 0
ly <- length(yheight) - length(yi) + 1
ki.row <- ki.col <- ly:length(yheight)
Kgrm[ki.row, ki.col] <- Ki
}
length(yheight) == N
## [1] TRUE
dim(Kgrm)
## [1] 1344 1344
mom <- function(y, K, I){
y <- as.matrix(y)
yc<- y - mean(y)
yTKy <- t(yc) %*% K %*% (yc)
yTy <- t(yc) %*% yc
trKKT <- sum(K^2)
trK <- sum(diag(K))
N <- sum(diag(I))
sigma_g2 <- (yTKy*N - yTy*trK)/(trKKT*N - trK^2)
sigma_a2 <- (yTy - trK * sigma_g2)/N
her.mom <- sigma_g2/(sigma_g2 + sigma_a2)
res <- data.frame(her.mom = her.mom, sigma_g2 = sigma_g2, sigam_a2 = sigma_a2)
return(res)
}
res <- mom(yheight1.08, Kgrm, I = diag(N))
res
## her.mom sigma_g2 sigam_a2
## 1 0.7580243 5.052884 1.612976
X <- cbind(rep(1, N), gender)
V <- diag(N) - X %*% solve(t(X) %*% X) %*% t(X)
Vyheight <- V %*% as.matrix(yheight)
VK <- V %*% Kgrm
mom(Vyheight, VK, I = V)
## her.mom sigma_g2 sigam_a2
## 1 0.7585287 4.731165 1.506127
http://thestatsgeek.com/2013/10/12/the-robust-sandwich-variance-estimator-for-linear-regression/
consider the linear regression model \(E(Y|X) = X^T\beta\), so the variance of \(\beta\) can be estimated by: \[N^{-1}A(\beta)^{-1}B(\beta){A(\beta)^{-1}}^T \] define equation $D = X(Y-X^T) $ so we have \(A(\beta) = E(-\frac{\partial }{\partial \beta}D) = E(XX^T)\), \(B(\beta) = Var(D)\), here, we can get match to the moment methods model with the matched estimation function: \[X = \begin{bmatrix} tr(KK^T) & tr(K)\\ tr(K) & N \end{bmatrix}, \quad A = X^{-1}\]
\[B=var(D) = cov(\begin{bmatrix} y^TKy\\ y^Ty \end{bmatrix}) = \begin{bmatrix} Var(y^TKy) & cov(y^TKy, y^Ty)\\ cov(y^TKy, y^Ty) & Var(y^Ty) \end{bmatrix}\]
where the element can calculate by \(Var(y^TKy) = 2tr([K\Sigma]^2)\), \(Var(y^Ty)=2tr(\Sigma^2)\), \(cov(y^TKy, y^Ty) = 2tr(K\Sigma^2)\), and \(\Sigma = K\sigma_g^2 + I\sigma_e^2\).
Then use the Delta method \[Var(h(\theta)) \approx \triangledown h^T\Sigma \triangledown h\\ h = \frac{\sigma_g^2}{\sigma_g^2 + \sigma_e^2} ,\quad \triangledown h = \begin{bmatrix} \frac{\sigma_e^2}{(\sigma_g^2 + \sigma_e^2)^2}\\ \frac{-\sigma_g^2}{(\sigma_g^2 + \sigma_e^2)^2} \end{bmatrix}\]
trK2 <- sum(Kgrm^2)
trK <- sum(diag(Kgrm))
sg<- as.numeric(res[2])
se<- as.numeric(res[3])
X <- matrix(c(trK2, trK, trK, N), 2)
A <- solve(X)
S <- sg*Kgrm + se*diag(N)
covB <- matrix(0,2,2)
covB[1,1] <- 2*sum((Kgrm%*%S)^2)
covB[1,2] <- covB[2,1] <- 2*sum(diag(Kgrm%*%S%*%S))
covB[2,2] <- 2*sum(S^2)
covSig <- A%*%covB%*%A
gh <- c(se/(sg+se)^2, -sg/(sg+se)^2)
sqrt(t(gh) %*% covSig %*% gh)
## [,1]
## [1,] 0.08765207
\[y=\mu+g+e\] where, \[g\sim\mathcal{N}(0,K_{grm}\sigma_g),\] \[e\sim\mathcal{N}(0,I\sigma_e),\] and \[h^2=\frac{\sigma_g}{\sigma_g+\sigma_e}.\] Because the prior of latent \(g\) has correlation, So we will make some changes. after \(y\) is centering, because \[cov(y)=K_{grm}\sigma_g+I\sigma_e,\] we assume that \[y=Xw+e,\] where, \[w\sim\mathcal{N}(0,I\sigma_g).\] Above is the classical form of random effects model. When y is centering, we know \[\\ cov(y)=(Xww^TX^T)+I\sigma_e\\ =(XX^T)\sigma_g+I\sigma_e\\\] so \[(XX^T)=K_{grm}\] and \[X=U\sqrt{\lambda}.\] So, the classical form of lmm is \[y=U\sqrt{\lambda}w+e\] where, \[w\sim\mathcal{N}(0,I\sigma_g),\]
\[e\sim\mathcal{N}(0,I\sigma_e).\] Use glmnet-ridge regression, we can easily get the \(\frac{\sigma_e}{\sigma_g}=369\), and this is a poor answer. Use EM algorithn, we can get \(\sigma_e=2.0272\), \(\sigma_g=4.4988\), and \(h^2=0.6894\).
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-10
eig <- eigen(Kgrm)
w=eig$vectors%*%sqrt(diag(eig$values))
fit=cv.glmnet(x=w, y=yheight1.08-mean(yheight1.08), alpha=0)
fit$lambda.min
## [1] 368.822
time_start <- proc.time()
y <- yheight1.08-mean(yheight1.08)
## Fixed effect
# x <- u%*%diag(sqrt(lambda))
# inv_x <- t(u%*%diag(sqrt(1/lambda)))
x=0
eig <- eigen(Kgrm)
u <- eig$vectors
lambda <- eig$values # Eigen value of kgrm, this is a vector
# Initial the sigma and lower bound
sigma_g <- 10
sigma_e <- 10
maxite <- 50
low_b <- rep(0,maxite)
I <- diag(rep(1,length(Kgrm[1,]))) #n*n
#
for (i in 1:maxite) {
omega <- Kgrm*sigma_g+I*sigma_e
invomega <- u%*%diag(1/(sigma_g*lambda+sigma_e))%*%t(u)
# If fixed effects exist
if (length(which(x!=0))!=0){
beta <- (inv_x%*%omega%*%t(inv_x))%*%t(x)%*%invomega%*%y
}else{
beta=0
}
#
sigma_g_old <- sigma_g
sigma_e_old <- sigma_e
sigma_g <- sigma_g*sqrt(t((y-c(x%*%beta)))%*%invomega%*%
Kgrm%*%invomega%*%(y-c(x%*%beta))/sum(invomega*Kgrm))
sigma_e <- sigma_e*sqrt(t((y-c(x%*%beta)))%*%invomega%*%
I%*%invomega%*%(y-c(x%*%beta))/sum(diag(invomega)))
sigma_g <- sigma_g[1,1]
sigma_e <- sigma_e[1,1]
low_b[i] <- (-0.5*sigma_e*sum(diag(invomega))-0.5*sigma_g*sum(invomega*Kgrm)
-0.5*sigma_e_old^2/sigma_e*
t((y-c(x%*%beta)))%*%(invomega%*%(invomega%*%(y-c(x%*%beta))))
-0.5*sigma_g_old^2/sigma_g*
t((y-c(x%*%beta)))%*%(invomega%*%(Kgrm%*%(invomega%*%(y-c(x%*%beta)))))
-0.5*sum(log(sigma_g_old*lambda+sigma_e_old)))
#print(i)
}
#
time_end <- proc.time()
time_end-time_start
## user system elapsed
## 28.875 0.726 8.480
sigma_g/(sigma_g+sigma_e)
## [1] 0.6839411
Question: predict the next child’s height of the famliy.
The most simple method is taking the average of the famliy’s children to predict the next child’s. Here we first adjust the gender by multiple 1.08, so if the next child is female, then her height have to divide by 1.08.
# sample 1/2 family take the half last children as the test data
# here to compare different methods, we can also random take half childs as test
train.test <- function(g, seed=2, cv=2, rand=F) {
set.seed(seed)
index.f <- sample(1:cv, 205, replace = T)
fam.num <- unique(g$family)
results <- list()
for(i in 1:cv){
fam.test <- fam.num[index.f==i]
n <- length(fam.test)
index.c <- c()
for(j in 1:n){
fam.n <- fam.test[j]
temp <- g[g$family==fam.n, ]
chi.num <- temp$children[1]
if(rand){
chi.temp <- as.numeric(sample(rownames(temp), (chi.num/2+0.5)))
}else{
chi.temp <- as.numeric(rownames(temp)[1:(chi.num/2+ 0.5)])
}
index.c <- append(index.c, chi.temp)
}
train <- g[-index.c, ]
test <- g[index.c, ]
results[[i]] <- list(train=train, test=test)
}
results
}
tt <- train.test(g, seed=1, rand=F)
head(tt[[1]]$test)
## family father mother midparentHeight children childNum gender
## 1 001 78.5 67.0 75.43 4 1 male
## 2 001 78.5 67.0 75.43 4 2 female
## 5 002 75.5 66.5 73.66 4 1 male
## 6 002 75.5 66.5 73.66 4 2 male
## 16 005 75.0 58.5 69.09 6 1 male
## 17 005 75.0 58.5 69.09 6 2 male
## childHeight
## 1 73.2
## 2 69.2
## 5 73.5
## 6 72.5
## 16 72.0
## 17 69.0
g <- GaltonFamilies
g$childHeight <- with(g, ifelse(g$gender=="female", 1.08, 1)*childHeight)
ch.mean <- aggregate(g$childHeight, list(g$family), mean)$x
hist(ch.mean, breaks = 30)
g <- GaltonFamilies
g$childHeight <- with(g, ifelse(g$gender=="female", 1.08, 1)*childHeight)
cv = 2
ave.err4 <- rep(0, 100*cv)
for(s in 1:100){
tt <- train.test(g, seed=s, cv, rand = T)
err <- c()
for(v in 1:cv) {
train <- tt[[v]]$train
test <- tt[[v]]$test
ch.mean <- aggregate(train$childHeight, list(train$family), mean)
fam.test <- test$family
pred <-c()
for(i in 1:length(fam.test)) {
index <- which(ch.mean$Group.1==fam.test[i])
if(length(index)==0){
pred[i] <- mean(ch.mean$x)
}else{
pred[i] <- ch.mean$x[index]
}
}
err[v] <- sqrt(mean((pred-test$childHeight)^2))
}
ave.err4[(s*2-1):(s*2)] <- err
}
g <- GaltonFamilies
cv = 2
ave.err5 <- rep(0, 100*cv)
for(s in 1:100){
tt <- train.test(g, seed=s, cv, rand = T)
err <- c()
for(v in 1:cv) {
train <- tt[[v]]$train
test <- tt[[v]]$test
ch.mean <- aggregate(train$childHeight, list(train$gender), mean)
fam.test <- test$family
pred <-c()
for(i in 1:length(fam.test)) {
gender.i <- test$gender[i]
if(gender.i == "male"){
pred[i] <- ch.mean$x[2]
}else{
pred[i] <- ch.mean$x[1]
}
}
err[v] <- sqrt(mean((pred-test$childHeight)^2, na.rm = T))
}
ave.err5[(s*cv-1):(s*cv)] <- err
}
g <- GaltonFamilies
cv = 2
ave.err6 <- rep(0, 100*cv)
for(s in 1:100){
tt <- train.test(g, seed=s, cv, rand = T)
err <- c()
for(v in 1:cv) {
train <- tt[[v]]$train
test <- tt[[v]]$test
train.fam <- unique(train$family)
ch.mgend <- data.frame(family=train.fam, male=rep(0, length(train.fam)),
female = rep(0,length(train.fam)))
for(j in 1:length(train.fam)) {
fam.num <- train.fam[j]
temp <- train[fam.num == train$family, ]
mal.ind <- which(temp$gender=="male")
fem.ind <- which(temp$gender=="female")
mal.mean <- mean(temp$childHeight[mal.ind])
fem.mean <- mean(temp$childHeight[fem.ind])
if(length(mal.ind) == 0) {
mal.mean <- mean(train$childHeight[which(train$gender=="male")])
}
if(length(fem.ind) == 0) {
fem.mean <- mean(train$childHeight[which(train$gender=="female")])
}
ch.mgend$male[j] <- mal.mean
ch.mgend$female[j] <- fem.mean
}
fam.test <- test$family
pred <-c()
for(i in 1:length(fam.test)) {
fam.tn <- fam.test[i]
fam.tg <- test[i, 7]
index <- which(ch.mgend[, 1] == fam.tn)
if(length(index) == 0) {
if(fam.tg == "male"){
pred[i] <- mean(train$childHeight[train$gender=="male"])
}else{
pred[i] <- mean(train$childHeight[train$gender=="female"])
}
}else{
if(fam.tg == "male"){
pred[i] <- ch.mgend$male[index]
}else{
pred[i] <- ch.mgend$female[index]
}
}
}
err[v] <- sqrt(mean((pred-test$childHeight)^2))
}
ave.err6[(s*cv-1):(s*cv)] <- err
}
ave.err <- cbind(ave.err1, ave.err2, ave.err3, ave.err4, ave.err5, ave.err6)
save(ave.err, file="/Users/zhouchao/Documents/R/Galton/ave.err.rda")
load("/Users/zhouchao/Documents/R/Galton/ave.err.rda")
boxplot(ave.err, col = rainbow(3))
abline(h=seq(2.2, 3.2, 0.2), col="lightgrey")
colMeans(ave.err)
## ave.err1 ave.err2 ave.err3 ave.err4 ave.err5 ave.err6
## 2.601692 2.663082 2.888484 2.414207 2.489960 2.517108
g <- GaltonFamilies
fit1 <- lm(childHeight~father+mother+gender, data = g)
summary(fit1)$coef
cv = 2
lm.err5 <- rep(0, 100*cv)
for(s in 1:100){
tt <- train.test(g, seed=s, cv, rand = T)
err <- c()
for(v in 1:cv) {
train <- tt[[v]]$train
test <- tt[[v]]$test
temp.fit <- lm(childHeight~father+mother+gender, data = train)
temp.pred<- predict(temp.fit, test)
err[v] <- sqrt(mean((temp.pred-test$childHeight)^2))
}
lm.err5[(s*cv-1):(s*cv)] <- err
}
g <- GaltonFamilies
g$childHeight <- with(g, ifelse(g$gender=="female", 1.08, 1)*childHeight)
cv = 2
lm.err6 <- rep(0, 100*cv)
for(s in 1:100){
tt <- train.test(g, seed=s, cv, rand = T)
err <- c()
for(v in 1:cv) {
train <- tt[[v]]$train
test <- tt[[v]]$test
ch.mean<- aggregate(train$childHeight, list(train$family), mean)[, 2]
ph.fat <- aggregate(train$father, list(train$family), mean)[, 2]
ph.mot <- aggregate(train$mother, list(train$family), mean)[, 2]
newdata <- data.frame(y=ch.mean, x1=ph.fat, x2=ph.mot)
temp.fit<- lm(y~x1+x2, data=newdata)
temp.pred <- c()
for(i in 1:nrow(test)) {
newtest <- data.frame(x1=test$father[i], x2=test$mother[i])
temp.pred[i] <- predict(temp.fit, newtest)
}
err[v] <- sqrt(mean((temp.pred-test$childHeight)^2))
}
lm.err6[(s*cv-1):(s*cv)] <- err
}
g <- GaltonFamilies
cv = 2
lm.err7 <- rep(0, 100*cv)
for(s in 1:100){
tt <- train.test(g, seed=s, cv, rand = T)
err <- c()
for(v in 1:cv) {
train <- tt[[v]]$train
test <- tt[[v]]$test
ch.one <- train$childHeight[train$childNum==1]
ch.gen <- train$gender[train$childNum==1]
ph.fat <- train$father[train$childNum==1]
ph.mot <- train$mother[train$childNum==1]
newdata <- data.frame(y=ch.one, x1=ph.fat, x2=ph.mot,x3=ch.gen)
temp.fit<- lm(y~x1+x2+x3, data=newdata)
temp.pred <- c()
for(i in 1:nrow(test)) {
newtest <- data.frame(x1=test$father[i], x2=test$mother[i], x3=test$gender[i])
temp.pred[i] <- predict(temp.fit, newtest)
}
err[v] <- sqrt(mean((temp.pred-test$childHeight)^2))
}
lm.err7[(s*cv-1):(s*cv)] <- err
}
g <- GaltonFamilies
cv = 2
lm.err8 <- rep(0, 100*cv)
for(s in 1:100){
tt <- train.test(g, seed=s, cv, rand = T)
err <- c()
for(v in 1:cv) {
train <- tt[[v]]$train
test <- tt[[v]]$test
newtrain <- ch.rand(train$family, train)
temp.fit<- lm(childHeight~father+mother+gender, data = newtrain)
temp.pred <- c()
for(i in 1:nrow(test)) {
newtest <- test[i, ]
temp.pred[i] <- predict(temp.fit, newtest)
}
err[v] <- sqrt(mean((temp.pred-test$childHeight)^2))
}
lm.err8[(s*cv-1):(s*cv)] <- err
}
lm.err <- cbind(lm.err1, lm.err2, lm.err3, lm.err4, lm.err5, lm.err6, lm.err7, lm.err8)
save(lm.err, file="/Users/zhouchao/Documents/R/Galton/lm.err.rda")
load("/Users/zhouchao/Documents/R/Galton/lm.err.rda")
boxplot(lm.err, col = rainbow(4))
abline(h=seq(2,2.8,0.2), col="lightgrey")
colMeans(lm.err)
## lm.err1 lm.err2 lm.err3 lm.err4 lm.err5 lm.err6 lm.err7 lm.err8
## 2.311023 2.365793 2.399640 2.340505 2.160662 2.233701 2.469846 2.172661
This methods is conditioned on the observed data.
## y family.index, K are matching
cond.dist <- function(fam.mis, mu=NULL, gender=1,
sigma_g, simga_a, y, family.index, K=K, KI) {
# one miss data
n <- length(y)
f <- length(unique(family.index))
if(is.null(mu)) {
temp <- 0
mu <- c()
for(i in 1:f){
fam.n <- unique(family.index)[i]
fn <- sum(fam.n==family.index)
mu[temp + (1:fn)] <- rep(mean(y[which(fam.n == family.index)]), fn)
temp <- temp + fn
}
mis.ind <- which(fam.mis == family.index)
if(length(mis.ind) == 0){
mu.mis <- mean(y) + 5.2*gender
}else{
mu.mis <- mean(y[mis.ind]) + 5.2*gender
}
}
else{
mu.mis <- mean(y) + 5.2*gender
}
if(is.null(K)) K <- diag(1, n)
B <- rep(0, n)
fam.index <- which(fam.mis == family.index)
B[fam.index] <- 1/2
if(is.null(KI)) KI <- solve(K*sigma_g + diag(n)*sigma_e)
mu.obs <- mu
# mu.mis <- mu.obs[fam.index] + 5.2*gender # each family's mean
BSigma <- sigma_g * colSums(B * KI)
mu.star<- mu.mis + sum(BSigma *(y-mu.obs))
sigma.star <- sigma_g + sigma_e - sum(BSigma*B)*sigma_g
res <- c(mu.star, sigma.star)
return(res)
}
data.trans <- function(train) {
fam.num <- unique(train$family)
N <- nrow(train) + 2*length(fam.num)
yheight <- gender <- family.index <- c()
Kgrm <- matrix(0, N, N)
temp <- 0
for(i in 1:length(fam.num)) {
id <- fam.num[i]
index.i <- which(train$family == id)
Ni <- length(index.i)
ge <- c(1,0, gend[index.i])
gender <- append(gender, ge)
fam <- rep(id, 2+length(index.i))
family.index[temp + (1:(Ni+2))] <- fam
yi<- c(train$father[index.i][1], train$mother[index.i][1], train$childHeight[index.i])
yheight <- append(yheight, yi)
Ki <- diag(1, length(yi))
Ki[Ki==0] <- 0.5; Ki[1,2] <- 0; Ki[2,1] <- 0
ly <- length(yheight) - length(yi) + 1
ki.row <- ki.col <- ly:length(yheight)
Kgrm[ki.row, ki.col] <- Ki
temp <- temp+Ni+2
}
results <- list(K=Kgrm, y=yheight, gender=gender, family.index=family.index)
}
All familiy with the same interspects with gender adjusted.
g <- GaltonFamilies
g$family <- as.numeric(g$family)
sigma_g <- 4.4988
sigma_e <- 2.0272
cv = 2
cdm.err3 <- rep(0, 100*cv)
cdm.err4 <- rep(0, 100*cv)
cdm.std2 <- c()
for(s in 1:100){
tt <- train.test(g, seed=s, cv, rand = T)
err1 <- err2 <- mstd <- c()
for(v in 1:cv) {
train <- tt[[v]]$train
test <- tt[[v]]$test
newtrain <- data.trans(train)
KI <- solve(newtrain$K *sigma_g + diag(length(newtrain$y))*sigma_e)
pred1 <-pred2 <- c()
std <- c()
for(i in 1:nrow(test)) {
fam.mis <- test$family[i]
fam.gen <- ifelse(test$gender[i]=="male",1,0)
mu1 <- mean(newtrain$y) + 5.2*newtrain$gender # all family with the same mean
temp.pred1 <- cond.dist(fam.mis, gender=fam.gen, mu=mu1, sigma_g, simga_a,
y=newtrain$y, family.index=newtrain$family.index, K=newtrain$K, KI)
pred1[i] <- temp.pred1[1]
temp.pred2 <- cond.dist(fam.mis, gender=fam.gen, mu=NULL, sigma_g, simga_a,
y=newtrain$y, family.index=newtrain$family.index, K=newtrain$K, KI)
pred2[i] <- temp.pred2[1]
std[i] <- temp.pred1[2]
}
err1[v] <- sqrt(mean((pred1-test$childHeight)^2))
err2[v] <- sqrt(mean((pred2-test$childHeight)^2))
mstd <- c(mstd, std)
}
cdm.err3[(s*cv-1):(s*cv)] <- err1
cdm.err4[(s*cv-1):(s*cv)] <- err2
cdm.std2 <- c(cdm.std2, mstd)
}
cdm.err <- cbind(cdm.err1, cdm.err2, cdm.err3, cdm.err4)
save(cdm.err, file="/Users/zhouchao/Documents/R/Galton/cdm.err.rda")
1.One use the overall mean level as the interspect. 2. Each family’s mean level as his own interspect.
load("/Users/zhouchao/Documents/R/Galton/cdm.err.rda")
boxplot(cdm.err, col =rainbow(2))
abline(h=seq(2.3, 3.6, 0.2), col="grey")
colMeans(cdm.err)
## cdm.err1 cdm.err2 cdm.err3 cdm.err4
## 2.455075 2.468594 2.391902 3.423050
# boxplot(sqrt(cdm.std), ylim=c(2.1,2.3), col=2)
This method take each chid’s height will by effected by the fix effect such as gender, father or mother and the random effect come from family level and children level.
\[y_{ij} = \beta_0 + \beta_1 M_{i}+ \beta_2 g_{ij} + \alpha_1 f_i + \alpha_2 g_j + e_{ij}\] which assume family’s effect and children effect follow normal distribution \(f_i \sim N(0,\sigma_f^2)\) and \(g_j \sim N(0, \sigma_g^2)\).
library(lme4)
g <- GaltonFamilies
R <- 1
cv <- 2
lmm.err6 <- rep(0, R*cv)
for(s in 1:R){
tt <- train.test(g, seed=s, cv, rand = T)
err <- c()
for(v in 1:cv) {
train <- tt[[v]]$train
test <- tt[[v]]$test
temp.fit <- lmer(childHeight~father+mother+gender + (1|family) + (1|childNum),
data = train)
temp.pred = predict(temp.fit, newdata = test, allow.new.levels = TRUE)
err[v] <- sqrt(mean((temp.pred-test$childHeight)^2))
}
lmm.err6[(s*cv-1):(s*cv)] <- err
}
plot(temp.fit)
plot(temp.pred, test$childHeight, pch=20, ylab = "actual value")
abline(a=0,b=1,col=2)
lmm.err <- cbind(lmm.err5, lmm.err6)
save(lmm.err, file="/Users/zhouchao/Documents/R/Galton/lmm.err.rda")
load("/Users/zhouchao/Documents/R/Galton/lmm.err.rda")
boxplot(lmm.err, col=rainbow(2))
colMeans(lmm.err)
## lmm.err5 lmm.err6
## 1.986466 1.866852
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
g <- GaltonFamilies
g$family <- as.numeric(g$family)
cv = 2
R=1
rf.err9 <- rep(0, R*cv)
for(s in 1:R){
tt <- train.test(g, seed=s, cv, rand = F)
err <- c()
for(v in 1:cv) {
train <- tt[[v]]$train
test <- tt[[v]]$test
temp.fit <- randomForest(childHeight~father+mother+gender, data = train)
temp.pred = predict(temp.fit, newdata = test)
err[v] <- sqrt(mean((temp.pred-test$childHeight)^2))
}
rf.err9[(s*cv-1):(s*cv)] <- err
}
par(mfrow=c(1,2))
plot(temp.fit)
plot(temp.pred, test$childHeight, pch=20, ylab = "actual value")
abline(a=0,b=1,col=2)
rf.err <- cbind(rf.err1, rf.err3, rf.err5, rf.err7, rf.err9,
rf.err2, rf.err4, rf.err6, rf.err8, rf.err10)
save(rf.err, file="/Users/zhouchao/Documents/R/Galton/rf.err.rda")
Random Forest.
1 childHeight~father+mother+gender+childNum 2. childHeight~father+mother+gender+childNum+children 3. childHeight~father+mother+gender+childNum+children+family 4. childHeight~.(all variable) 5. childHeight~father+mother+gender
load("/Users/zhouchao/Documents/R/Galton/rf.err.rda")
boxplot(rf.err, col=rainbow(5))
abline(h=seq(1.6, 2.8, 0.2), col="grey")
colMeans(rf.err)
## rf.err1 rf.err3 rf.err5 rf.err7 rf.err9 rf.err2 rf.err4 rf.err6
## 2.172269 2.159621 2.056058 2.064234 2.404077 2.036974 1.991147 1.869770
## rf.err8 rf.err10
## 1.874016 2.178973
Neural Network
library(nnet)
g <- GaltonFamilies
g$family <- as.numeric(g$family)
R<-1
cv = 2
nnt.err10 <- rep(0, R*cv)
for(s in 1:R){
tt <- train.test(g, seed=s, cv, rand = T)
err <- c()
for(v in 1:cv) {
train <- tt[[v]]$train
test <- tt[[v]]$test
temp.fit <- nnet(childHeight~father+mother+gender,
data = train, size =10, linout=TRUE,
rang = 0.1, decay = 5e-4, maxit = 200)
temp.pred = predict(temp.fit, newdata = test) #, type = "raw")
err[v] <- sqrt(mean((temp.pred-test$childHeight)^2))
}
nnt.err10[(s*cv-1):(s*cv)] <- err
}
## # weights: 51
## initial value 2983509.158760
## final value 8353.782886
## converged
## # weights: 51
## initial value 3022453.990113
## iter 10 value 9198.941696
## iter 20 value 9135.011180
## iter 30 value 9099.172748
## iter 40 value 4125.037095
## iter 50 value 3585.301056
## iter 60 value 3126.582494
## iter 70 value 3095.751293
## iter 80 value 3092.311968
## iter 90 value 3088.776384
## iter 100 value 3088.539483
## iter 110 value 3088.409944
## iter 120 value 3088.303619
## iter 130 value 3087.457557
## iter 140 value 3080.730838
## iter 150 value 3071.067831
## iter 160 value 3054.190359
## iter 170 value 3033.119049
## iter 180 value 3027.390180
## iter 190 value 3024.949447
## iter 200 value 3024.890180
## final value 3024.890180
## stopped after 200 iterations
nnt.err <- cbind(nnt.err1, nnt.err3, nnt.err5, nnt.err7, nnt.err9,
nnt.err2, nnt.err4, nnt.err6, nnt.err8, nnt.err10)
save(nnt.err, file="/Users/zhouchao/Documents/R/Galton/nnt.err.rda")
NNT.
1 childHeight~father+mother+gender+childNum 2. childHeight~father+mother+gender+childNum+children 3. childHeight~father+mother+gender+childNum+children+family 4. childHeight~.(all variable) 5. childHeight~father+mother+gender
load("/Users/zhouchao/Documents/R/Galton/nnt.err.rda")
boxplot(nnt.err, col=rainbow(5))
#abline(h=seq(1.6, 2.8, 0.2), col="grey")
colMeans(nnt.err)
## nnt.err1 nnt.err3 nnt.err5 nnt.err7 nnt.err9 nnt.err2 nnt.err4
## 2.804182 2.836102 2.830070 2.802699 3.005308 2.599410 2.571843
## nnt.err6 nnt.err8 nnt.err10
## 2.694780 2.571749 2.738264