sA = c(48, 36, 20, 29, 42, 42, 20, 42, 22, 41, 45, 14, 6, 0,
33, 28, 34, 4, 32, 24, 47, 41, 24, 26, 30, 41)
sB = c(42, 33, 16, 39, 38, 36, 15, 33, 20, 43, 34, 22, 7, 15,
34, 29, 41, 13, 38, 25, 27, 41, 28, 14, 28, 40)
neurological_data <- data.frame(sA,sB)
names(neurological_data) <- c("A","B")
sample_A <- neurological_data[,1]
sample_B <- neurological_data[,2]
We are interested in the variance \(\theta = \mathrm{var}(A)\) of the first sample A.
Assuming that our sample \(x_i = A_i\) was drawn from a distribution \(F(x,\theta)\), the plug-in estimator of a parameter \(\theta\) is defined as \(\hat{\theta }=\theta(\hat{F}(x))\) and is obtained by replacing the distribution function with the empirical distribution function \(\hat{F}\).
Our plug-in estimator of variance can be expressed by the following:
\(\hat{\theta}=\hat{\sigma }^2=\frac{1}{N} \sum_{i=1}^ N (x_ i-\hat{\mu })^2\)
Where
\(\hat{\mu }=\int x d \hat{F}(x)=\frac{1}{N} \sum _{i=1}^ N x_i\)
We account for the bias of the variance estimator, and our \(\hat{\theta}\) becomes the sample variance:
\(\hat{\theta}=\hat{\sigma }^2=\frac{1}{N-1} \sum_{i=1}^ N (x_ i-\hat{\mu })^2\)
theta.hat <- var(sample_A)
print(theta.hat)
## [1] 178.3954
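For comparison, we can also compute the uncorrected plug-in estimate (dividing by \(N\) instead of \(N-1\)) directly; a small illustration (theta.plugin is just a throwaway name), showing that var() already applies the correction:
n <- length(sample_A)
#plug-in (biased) variance: divide by N
theta.plugin <- mean((sample_A - mean(sample_A))^2)
#var() divides by N-1, so theta.hat = N/(N-1) * theta.plugin
c(plugin = theta.plugin, corrected = n/(n - 1)*theta.plugin)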
Let's use \(B = 999\) nonparametric bootstrap samples and store the replicates \(\hat{\theta}^*\):
B <- 999
brepl_NP <- numeric(B)
for (j in 1:B){
  bsample <- sample(sA, replace = TRUE)
  brepl_NP[j] <- var(bsample)
}
95% basic bootstrap confidence interval:
Using the ordered replicates, define \(L^* = \hat{\theta}_{\lbrack (B+1)\frac{\alpha}{2}\rbrack}^* - \hat{\theta}\) and \(U^* = \hat{\theta}_{\lbrack (B+1)(1-\frac{\alpha}{2})\rbrack}^* - \hat{\theta}\). The basic bootstrap interval is then \(\left(\hat{\theta} - U^*,\ \hat{\theta} - L^*\right) = \left(2\hat{\theta} - \hat{\theta}_{\lbrack (B+1)(1-\frac{\alpha}{2})\rbrack}^*,\ 2\hat{\theta} - \hat{\theta}_{\lbrack (B+1)\frac{\alpha}{2}\rbrack}^*\right)\)
bCi <- sort(brepl_NP)
hiInd = (B+1) - 0.025 * (B+1)
loInd = 0.025 * (B+1)
ci_low_np<-2*theta.hat-bCi[hiInd]
ci_hi_np<-2*theta.hat-bCi[loInd]
hist(brepl_NP, main = "Nonparametric bootstrap distribution")
segments(ci_low_np,1,ci_hi_np,1, col=3,lwd=3)
text(ci_low_np,1,"[")
text(ci_hi_np,1,"]")
points(theta.hat,1,lwd=3)
Confidence interval (Basic bootstrap confidence interval, nonparametric bootstrap):
ci_low_np
## [1] 101.6892
ci_hi_np
## [1] 268.4646
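As a cross-check on the manual procedure, the same basic interval could also be obtained with the boot package; a minimal sketch (var_stat and boot_np are illustrative names), with the statistic written in boot's (data, indices) form:
library(boot)
var_stat <- function(data, idx) var(data[idx])
boot_np <- boot(sA, var_stat, R = B)
boot.ci(boot_np, conf = 0.95, type = "basic")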
We now assume that the original population data come from a normal distribution with mean equal to the sample mean \(\hat{\mu}\) and variance \(\hat{\theta}\):
mju <- mean(sA)
brepl_PAR <- numeric(B)
for (j in 1:B){
  bsample <- rnorm(length(sA), mju, sqrt(theta.hat))
  brepl_PAR[j] <- var(bsample)
}
hist(brepl_PAR,main = "Parametric bootstrap distribution")
bCiPAR <- sort(brepl_PAR)
ci_low_par<-2*theta.hat-bCiPAR[hiInd]
ci_hi_par<-2*theta.hat-bCiPAR[loInd]
segments(ci_low_par,1,ci_hi_par,1, col=3,lwd=3)
text(ci_low_par,1,"[")
text(ci_hi_par,1,"]")
points(theta.hat,1,lwd=3)
Confidence interval (Basic bootstrap confidence interval, parametric bootstrap):
ci_low_par
## [1] 57.99125
ci_hi_par
## [1] 265.0636
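The parametric version can be expressed through boot as well, using sim = "parametric" with a generator for the assumed normal model; again only a sketch (ran_norm and boot_par are illustrative names):
library(boot)
ran_norm <- function(data, mle) rnorm(length(data), mle[1], mle[2])
boot_par <- boot(sA, statistic = function(d) var(d), R = B,
                 sim = "parametric", ran.gen = ran_norm, mle = c(mju, sqrt(theta.hat)))
boot.ci(boot_par, conf = 0.95, type = "basic")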
We draw \(B\) nonparametric bootstrap samples and ‘Studentize’ each of them to calculate our 95% confidence interval.
bootstud <- function(B, data){
  #theta.hat
  th <- var(data)
  #bootstrap samples
  TB <- rep(NA, B)
  breplST <- rep(NA, B)
  for (i in 1:B){
    bsample <- sample(data, replace = TRUE)
    breplST[i] <- var(bsample)
    TB[i] <- studentize(bsample, th)
  }
  newList <- list("npb" = breplST, "tsb" = TB)
  return(newList)
}
In order to calculate the confidence interval, we need the statistic
\(Z=\frac{\hat{\theta}-\theta}{\sqrt{\hat{V}}}\)
The variance of our estimator can be expressed as
\(Var(\hat{\theta})=\frac{n-1}{n^3}{\sigma}^4((n-1)K-(n-3))\)
where \(K\) is the kurtosis \(\mu_4/\sigma^4\)
stderr <- function(np_sample){
  #plug-in estimate of sd(theta.hat) based on the kurtosis formula above
  tb <- var(np_sample)
  mb <- mean(np_sample)
  n <- length(np_sample)
  kurt <- mean((np_sample - mb)^4)/tb^2
  return(sqrt(((n - 1)/n^3)*tb^2*((n - 1)*kurt - n + 3)))
}
studentize <- function(np_sample, th){
  #theta.hat.star
  tb <- var(np_sample)
  #studentized replicate (theta.hat.star - theta.hat)/se(theta.hat.star)
  studb <- ((tb - th)/stderr(np_sample))
  return(studb)
}
We generate 999 bootstrap samples and their studentized replicates:
se <-stderr(sA)
bst <- bootstud(B,sA)
TSB <- sort(bst$tsb)
t_lo <- TSB[loInd]
t_hi <- TSB[hiInd]
ci_low_ts <- theta.hat - se*t_hi
ci_hi_ts <- theta.hat - se*t_lo
hist(bst$npb,main = "T-student nonparametric bootstrap distribution")
segments(ci_low_ts,1,ci_hi_ts,1, col=3,lwd=3)
text(ci_low_ts,1,"[")
text(ci_hi_ts,1,"]")
points(theta.hat,1,lwd=3)
Confidence interval (Studentized confidence interval, nonparametric bootstrap):
ci_low_ts
## [1] 104.6557
ci_hi_ts
## [1] 387.4497
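The boot package can produce this type of interval too, via type = "stud", provided the statistic also returns an estimate of its own variance; a sketch reusing the stderr() helper defined above (var_stat_se and boot_stud are illustrative names):
library(boot)
var_stat_se <- function(data, idx){
  d <- data[idx]
  #replicate and the estimated variance of that replicate
  c(var(d), stderr(d)^2)
}
boot_stud <- boot(sA, var_stat_se, R = B)
boot.ci(boot_stud, conf = 0.95, type = "stud")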
The basic bootstrap confidence intervals appear to be symmetric around \(\hat{\theta}\) and are highly similar to each other.
This suggests that our assumption of normality was valid, as both methods return similar results. The studentized confidence interval, on the other hand, is asymmetric with respect to the original sample variance. Since the studentized interval does not depend on nuisance parameters, this suggests that the underlying distribution is skewed to the right.
plot(sB,sA)
#fit the linear model
lm_naive <- lm(sA~sB)
abline(lm_naive,col=2,lwd=3)
coef(lm_naive)[2]
## sB
## 1.02788
The standard SIMEX procedure requires us to know the true measurement-error variance. Unfortunately, in this case it is unknown. We can either perform symmetric SIMEX (SYMEX) or assume an error variance of 1.
#add measurement error with variance lambda and average the naive slope over several runs
perturb <- function(lambda, dataX, dataY, count = 10){
  avgCoef <- 0
  for (i in 1:count){
    pert <- dataX + sqrt(lambda)*rnorm(length(dataX), 0, 1)
    lmod <- lm(dataY ~ pert)
    avgCoef <- avgCoef + coef(lmod)[2]
  }
  return(avgCoef/count)
}
Simulate several lambdas:
lmb<-seq(0.1,10,0.5)
k<-length(lmb)
betas<-rep(NA,k)
for (i in 1:k){
  betas[i] <- perturb(lmb[i], sB, sA)
}
Extrapolate
plot(lmb,betas,lwd=3,xlim=c(-2,10), ylim=c(0.9,1.1),xlab="lambda",ylab="beta", main= "SIMEX")
ex_ml<-lm(betas~lmb);
abline(ex_ml,lwd=2)
#assuming error variance = 1, the estimate is at lambda = -1
extrapolatedB <- coef(ex_ml)[1]-coef(ex_ml)[2]
extrapolatedB
## (Intercept)
## 1.038082
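SIMEX is often run with a quadratic extrapolant instead of a linear one; a quick variant on the same betas (ex_quad is just an illustrative name), still extrapolating to \(\lambda = -1\), i.e. assuming error variance 1:
ex_quad <- lm(betas ~ lmb + I(lmb^2))
predict(ex_quad, newdata = data.frame(lmb = -1))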
Now let’s try SYMEX
#reuse betas from A~B regression
betasAB<-betas
betasBA<-rep(NA,k)
#perturb the reverse regression B~A and invert its slope
for (i in 1:k){
  betasBA[i] <- 1/perturb(lmb[i], sA, sB)
}
plot(lmb,betasAB,col=4, lwd=3,xlim=c(-40,10), ylim=c(0.7,1.7),xlab="lambda",ylab="beta", main= "SYMEX")
points(lmb,betasBA,col=6, lwd=3)
ex_ml<-lm(betasAB~lmb);
ex_ml_rev<-lm(betasBA~lmb);
abline(ex_ml,lwd=2, col=4)
abline(ex_ml_rev,lwd=2, col=6)
Our slope estimate is the value at the intersection of the two SYMEX extrapolation lines:
cm <- rbind(coef(ex_ml),coef(ex_ml_rev)) # Coefficient matrix
res_sym <-c(-solve(cbind(cm[,2],-1)) %*% cm[,1])
res_sym[2]
## [1] 1.248342
Final plot - naive fit vs the SYMEX-corrected slope
plot(sB,sA)
#fit the linear model
lm_naive <- lm(sA~sB)
abline(lm_naive,col=2,lwd=3)
abline(coef(lm_naive)[1],res_sym[2],col=3,lwd=3)
It seems that the measurement-error variance in the model is quite high: assuming an error variance of 1 changes little relative to the naive estimate, whereas the SYMEX result, which does not require the error variance to be known, moves the slope substantially and should be taken as the estimate for our linear model.
The Hardy-Weinberg Expectation-Maximization (EM) code works by noting that, if the sum of the homozygote counts \(SS+II+FF\) is fixed, then \(E(SS\vert SS + II + FF, \theta^{(m)}) = \frac{\theta^2_{SS}}{\theta^2_{SS} + \theta^2_{II} + \theta^2_{FF}}Y_1\), where \(\theta^{(m)}\) is the current parameter estimate and \(Y_1\) is the measured count of the sum of the genotypes \(SS+II+FF\).
Our observed data \(Y\) consists of the homozygotes \(H\), where \(H=\{SS,II,FF\}\), and the heterozygotes
\(Y=\{H,SI,SF,IF\}=\{y_1,y_2,y_3,y_4\}\)
Complete model data \(X\) contains all the genotypes
\(X=\{SS,II,FF,SI,SF,IF\}=\{x_1,x_2,x_3,x_4,x_5,x_6\}\)
The likelihood function :
\(f(\theta_1,\theta_2,\theta_3)=(\theta_1^2)^{|x_1|}(\theta_2^2)^{|x_2|}(\theta_3^2)^{|x_3|}(2\theta_1\theta_2)^{|x_4|}(2\theta_1\theta_3)^{|x_5|}(2\theta_2\theta_3)^{|x_6|}\)
Maximizing this likelihood subject to \(\theta_1+\theta_2+\theta_3=1\) (setting the derivative of the log-likelihood to zero) we get:
\(\theta_1=\frac{2|x_1|+|x_4|+|x_5|}{2(|x_1|+|x_2|+|x_3|+|x_4|+|x_5|+|x_6|)}\) \(\theta_2=\frac{2|x_2|+|x_4|+|x_6|}{2(|x_1|+|x_2|+|x_3|+|x_4|+|x_5|+|x_6|)}\) \(\theta_3=\frac{2|x_3|+|x_5|+|x_6|}{2(|x_1|+|x_2|+|x_3|+|x_4|+|x_5|+|x_6|)}\)
emstep <- function(Y, theta){
  #Expectation step
  #SS expectation
  x1 <- Y[1] * (theta[1]^2)/(theta[1]^2 + theta[2]^2 + theta[3]^2)
  #II expectation
  x2 <- Y[1] * (theta[2]^2)/(theta[1]^2 + theta[2]^2 + theta[3]^2)
  #FF expectation
  x3 <- Y[1] * (theta[3]^2)/(theta[1]^2 + theta[2]^2 + theta[3]^2)
  #Maximization step
  #mle estimates
  theta[1] <- (2*x1 + Y[2] + Y[3])/(2*sum(Y))
  theta[2] <- (2*x2 + Y[2] + Y[4])/(2*sum(Y))
  theta[3] <- (2*x3 + Y[3] + Y[4])/(2*sum(Y))
  theta
}
First we need to define the \(\epsilon\) parameter that controls the convergence tolerance of our algorithm:
epsilon = 1e-6
Y <- c(164,41,35,160)
#start with equal probs
theta.new <- c(0,0,0)
theta.old <- c(1/3,1/3,1/3)
dif<-1
while(max(abs(dif)) > epsilon){
  theta.new <- emstep(Y, theta.old)
  dif <- theta.old - theta.new
  theta.old <- theta.new
  print(theta.new)
}
## [1] 0.2316667 0.3879167 0.3804167
## [1] 0.1580743 0.3143243 0.3068243
## [1] 0.1420103 0.2982603 0.2907603
## [1] 0.1376939 0.2939439 0.2864439
## [1] 0.1364777 0.2927277 0.2852277
## [1] 0.1361305 0.2923805 0.2848805
## [1] 0.1360311 0.2922811 0.2847811
## [1] 0.1360026 0.2922526 0.2847526
## [1] 0.1359944 0.2922444 0.2847444
## [1] 0.135992 0.292242 0.284742
## [1] 0.1359914 0.2922414 0.2847414
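As an independent sanity check on the EM routine, the observed-data log-likelihood can also be maximized directly, e.g. with optim() over \((\theta_1, \theta_2)\) and \(\theta_3 = 1 - \theta_1 - \theta_2\); a minimal sketch (negloglik is an illustrative name) whose maximizer should agree with the EM fixed point up to numerical tolerance:
negloglik <- function(p){
  th <- c(p, 1 - sum(p))
  if (any(th <= 0)) return(Inf)
  #observed-data log-likelihood: Y = (homozygotes, SI, SF, IF)
  -(Y[1]*log(th[1]^2 + th[2]^2 + th[3]^2) +
      Y[2]*log(2*th[1]*th[2]) +
      Y[3]*log(2*th[1]*th[3]) +
      Y[4]*log(2*th[2]*th[3]))
}
optim(c(1/3, 1/3), negloglik)$par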