On considère une série d’observations \(y_i,\,i=1,\ldots,n\) sur un échantillon composé de \(K\) groupes. Notre objectif est de montrer s’il existe des différences entre les groupes.
Le modèle suivant est appelé le modèle ANOVA à effets fixes :
\[\left\{ \begin{array}{l} y_i=a_{j(i)}+\epsilon_i \\ \epsilon_i\sim\mathcal{N}(0,\sigma^2)\end{array}\right.\] où \(a_{j(i)}\) est la moyenne de la \(i-\)ième observation du \(j-\)ième groupe.
Voici un code R qui permet de simuler des données selon le modèle précedent.
> set.seed(1235)
> K <- 5
> nsample <- 10
> a <- c(50, 40, 45, 55, 60)
> sig <- 3
> n <- nsample*K
> epsilon <- rnorm(n,0,sig)
x qui donne les indicateurs des groupes> x <- rep(1:K, rep(nsample, K))
> x
[1] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4
[36] 4 4 4 4 4 5 5 5 5 5 5 5 5 5 5
> X <- model.matrix(~-1+as.factor(x))
> X<-as.matrix(X)
> head(X)
as.factor(x)1 as.factor(x)2 as.factor(x)3 as.factor(x)4 as.factor(x)5
1 1 0 0 0 0
2 1 0 0 0 0
3 1 0 0 0 0
4 1 0 0 0 0
5 1 0 0 0 0
6 1 0 0 0 0
y> y <- as.numeric(X%*%as.matrix(a)+epsilon)
> summary(y)
Min. 1st Qu. Median Mean 3rd Qu. Max.
37.18 43.45 50.34 50.20 56.86 70.04
violin plot de y> library(ggplot2)
> dt=data.frame(x,y)
> gr<-ggplot(dt,aes(x=as.factor(x),y=y,fill=as.factor(x)))+geom_violin(alpha=.7)
> gr<-gr+theme_classic() +xlab("")+ylab("")+
+ theme(legend.title = element_blank(),legend.position="non",text = element_text(size=14))
> gr
> dt_fixe=data.frame(x,y)
> save(dt_fixe,file = "anova_data_fixe.RData")
> m_fixe=lm(y~as.factor(x),data=dt_fixe)
> summary(m_fixe)
Call:
lm(formula = y ~ as.factor(x), data = dt_fixe)
Residuals:
Min 1Q Median 3Q Max
-8.0342 -1.7873 -0.2587 1.6867 9.3338
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 51.021 1.059 48.171 < 2e-16 ***
as.factor(x)2 -11.509 1.498 -7.684 9.99e-10 ***
as.factor(x)3 -6.791 1.498 -4.534 4.26e-05 ***
as.factor(x)4 4.528 1.498 3.023 0.00412 **
as.factor(x)5 9.684 1.498 6.465 6.35e-08 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.349 on 45 degrees of freedom
Multiple R-squared: 0.8515, Adjusted R-squared: 0.8383
F-statistic: 64.52 on 4 and 45 DF, p-value: < 2.2e-16
> summary(m_fixe)$sigma
[1] 3.349313
\[\left\{ \begin{array}{l} y_i=a_{j(i)}+\epsilon_i \\ \epsilon_i\sim\mathcal{N}(0,\sigma^2)\end{array}\right.\]
OpenBUGS du modèlemodel{
# Priors
for(j in 1:K){
a[j] ~ dnorm(0,.001)
}
sigma~ dunif(0,100)
# Likelihood
for(i in 1:n){
y[i] ~ dnorm(mu[i],tau)
mu[i] <-a[x[i]]
}
# Parameters
tau <- 1/(sigma*sigma)
}
R.Rsink('exempleANOVA_fixe.txt')
cat("
model{
# Priors
for(j in 1:K){
a[j] ~ dnorm(0,.001)
}
sigma~ dunif(0,100)
# Likelihood
for(i in 1:n){
y[i] ~ dnorm(mu[i],tau)
mu[i] <-a[x[i]]
}
# Parameters
tau <- 1/(sigma*sigma)
}
",fill=T)
sink()
##
filename<-"exempleANOVA_fixe.txt"
# data
donnees=list(y=dt_fixe$y,x=dt_fixe$x,K=length(unique(dt_fixe$x)),n=nrow(dt_fixe))
# Inits
inits<-function(){
inits=list(a=rnorm(5,0,10),sigma=runif(1,0,100))
}
# parametres
params <- c('a','sigma')
# BUGS
library(R2OpenBUGS)
outBinom_fixe <-bugs(donnees,inits,params,filename,codaPkg=F,n.thin =1,
n.iter=10000,debug=F,n.chains = 3,
working.directory=getwd())
Le résumé des MCMC générant les lois a posteriori
> library(R2OpenBUGS)
> outBinom_fixe
Inference for Bugs model at "exempleANOVA_fixe.txt",
Current: 3 chains, each with 10000 iterations (first 5000 discarded)
Cumulative: n.sims = 15000 iterations saved
mean sd 2.5% 25% 50% 75% 97.5% Rhat n.eff
a[1] 51.0 1.1 48.8 50.2 51.0 51.7 53.1 1 15000
a[2] 39.5 1.1 37.3 38.8 39.5 40.2 41.6 1 15000
a[3] 44.2 1.1 42.0 43.5 44.2 44.9 46.3 1 15000
a[4] 55.5 1.1 53.3 54.8 55.5 56.2 57.6 1 3600
a[5] 60.6 1.1 58.5 59.9 60.6 61.4 62.8 1 15000
sigma 3.4 0.4 2.8 3.2 3.4 3.7 4.3 1 3500
deviance 264.1 3.8 258.9 261.3 263.3 266.0 273.3 1 3400
For each parameter, n.eff is a crude measure of effective sample size,
and Rhat is the potential scale reduction factor (at convergence, Rhat=1).
DIC info (using the rule, pD = Dbar-Dhat)
pD = 5.9 and DIC = 270.0
DIC is an estimate of expected predictive error (lower deviance is better).
traceplot> dim(outBinom_fixe$sims.array)
[1] 5000 3 7
> library(coda)
> a.mcmc=vector('list',K)
> for(k in 1:K) a.mcmc[[k]]=mcmc(outBinom_fixe$sims.array[,,k])
> for(k in 1:K)
+ matplot(a.mcmc[[k]],col=c("red","blue","green"),type="l",ylab=expression(alpha),xlab="Iterations",main=paste("a[",k,"]",sep=""))
> sigma.mcmc=mcmc(outBinom_fixe$sims.array[,,6])
> matplot(sigma.mcmc,col=c("red","blue","green"),type="l",ylab=expression(alpha),xlab="Iterations",main=expression(sigma))
> for(k in 1:K) print(summary(a.mcmc[[k]]))
Iterations = 1:5000
Thinning interval = 1
Number of chains = 1
Sample size per chain = 5000
1. Empirical mean and standard deviation for each variable,
plus standard error of the mean:
Mean SD Naive SE Time-series SE
[1,] 50.96 1.099 0.01554 0.01554
[2,] 50.96 1.109 0.01568 0.01568
[3,] 50.97 1.078 0.01524 0.01524
2. Quantiles for each variable:
2.5% 25% 50% 75% 97.5%
var1 48.74 50.23 50.95 51.68 53.14
var2 48.75 50.24 50.95 51.68 53.16
var3 48.84 50.25 51.00 51.70 53.05
Iterations = 1:5000
Thinning interval = 1
Number of chains = 1
Sample size per chain = 5000
1. Empirical mean and standard deviation for each variable,
plus standard error of the mean:
Mean SD Naive SE Time-series SE
[1,] 39.46 1.087 0.01538 0.01538
[2,] 39.46 1.098 0.01553 0.01553
[3,] 39.48 1.088 0.01539 0.01539
2. Quantiles for each variable:
2.5% 25% 50% 75% 97.5%
var1 37.26 38.76 39.47 40.19 41.52
var2 37.28 38.74 39.45 40.18 41.65
var3 37.33 38.75 39.48 40.18 41.64
Iterations = 1:5000
Thinning interval = 1
Number of chains = 1
Sample size per chain = 5000
1. Empirical mean and standard deviation for each variable,
plus standard error of the mean:
Mean SD Naive SE Time-series SE
[1,] 44.19 1.097 0.01552 0.01552
[2,] 44.19 1.100 0.01555 0.01555
[3,] 44.17 1.097 0.01551 0.01551
2. Quantiles for each variable:
2.5% 25% 50% 75% 97.5%
var1 42.02 43.44 44.19 44.92 46.30
var2 42.04 43.48 44.18 44.92 46.38
var3 41.98 43.46 44.19 44.89 46.31
Iterations = 1:5000
Thinning interval = 1
Number of chains = 1
Sample size per chain = 5000
1. Empirical mean and standard deviation for each variable,
plus standard error of the mean:
Mean SD Naive SE Time-series SE
[1,] 55.45 1.084 0.01533 0.01533
[2,] 55.51 1.110 0.01569 0.01569
[3,] 55.48 1.086 0.01536 0.01536
2. Quantiles for each variable:
2.5% 25% 50% 75% 97.5%
var1 53.27 54.73 55.45 56.18 57.54
var2 53.34 54.77 55.51 56.24 57.71
var3 53.29 54.76 55.47 56.20 57.62
Iterations = 1:5000
Thinning interval = 1
Number of chains = 1
Sample size per chain = 5000
1. Empirical mean and standard deviation for each variable,
plus standard error of the mean:
Mean SD Naive SE Time-series SE
[1,] 60.63 1.121 0.01586 0.01655
[2,] 60.60 1.112 0.01572 0.01605
[3,] 60.62 1.088 0.01539 0.01582
2. Quantiles for each variable:
2.5% 25% 50% 75% 97.5%
var1 58.44 59.89 60.62 61.38 62.82
var2 58.43 59.86 60.59 61.35 62.79
var3 58.50 59.89 60.62 61.35 62.75
> summary(sigma.mcmc)
Iterations = 1:5000
Thinning interval = 1
Number of chains = 1
Sample size per chain = 5000
1. Empirical mean and standard deviation for each variable,
plus standard error of the mean:
Mean SD Naive SE Time-series SE
[1,] 3.437 0.3700 0.005233 0.005363
[2,] 3.459 0.3873 0.005477 0.006819
[3,] 3.444 0.3752 0.005306 0.006273
2. Quantiles for each variable:
2.5% 25% 50% 75% 97.5%
var1 2.785 3.172 3.409 3.666 4.228
var2 2.806 3.183 3.424 3.695 4.318
var3 2.810 3.180 3.417 3.662 4.306
> for(k in 1:K) print(autocorr.diag(a.mcmc[[k]]))
[,1] [,2] [,3]
Lag 0 1.000000000 1.0000000000 1.000000000
Lag 1 0.007994218 0.0140252890 0.002546215
Lag 5 0.005930643 0.0006537639 -0.007223642
Lag 10 -0.006595497 0.0127327311 -0.005693247
Lag 50 0.002653034 -0.0073911415 -0.005464868
[,1] [,2] [,3]
Lag 0 1.000000000 1.000000000 1.000000000
Lag 1 -0.003162746 -0.007613435 0.008230883
Lag 5 0.003612877 0.002206722 0.001141674
Lag 10 -0.012076041 0.018299429 0.008614816
Lag 50 -0.001874770 -0.021580145 0.006978214
[,1] [,2] [,3]
Lag 0 1.0000000000 1.0000000000 1.000000000
Lag 1 0.0007250380 -0.0004366058 -0.007325057
Lag 5 0.0079324981 -0.0179475162 -0.014099298
Lag 10 -0.0163628404 0.0044267491 0.001527793
Lag 50 0.0003124323 0.0189581017 0.007985882
[,1] [,2] [,3]
Lag 0 1.000000000 1.000000000 1.000000000
Lag 1 -0.017706700 0.012604158 -0.018250997
Lag 5 -0.001541642 -0.010595838 -0.005266936
Lag 10 -0.010371840 0.008593231 -0.004815082
Lag 50 0.012598203 -0.008446777 -0.015796745
[,1] [,2] [,3]
Lag 0 1.000000000 1.000000000 1.000000000
Lag 1 -0.009695259 0.020415918 0.027573221
Lag 5 0.019074995 0.006361828 -0.001328159
Lag 10 -0.008222234 -0.004330259 0.013428285
Lag 50 0.008635679 0.015717587 -0.022973231
> for(k in 1:K) autocorr.plot(a.mcmc[[k]],main=paste("a[",k,"]",sep=""))
> autocorr.diag(sigma.mcmc)
[,1] [,2] [,3]
Lag 0 1.000000000 1.000000000 1.00000000
Lag 1 0.150321378 0.184077015 0.16583785
Lag 5 -0.022279834 0.003671227 -0.00226964
Lag 10 -0.014280231 0.017361671 0.01228902
Lag 50 0.004982969 0.017233716 -0.01426869
> autocorr.plot(sigma.mcmc,main=expression(sigma))
Rhat des \(a_{j(i)}\)> for(k in 1:K) print(gelman.diag(list(a.mcmc[[k]][,1],a.mcmc[[k]][,2],a.mcmc[[k]][,3])))
Potential scale reduction factors:
Point est. Upper C.I.
[1,] 1 1
Potential scale reduction factors:
Point est. Upper C.I.
[1,] 1 1
Potential scale reduction factors:
Point est. Upper C.I.
[1,] 1 1
Potential scale reduction factors:
Point est. Upper C.I.
[1,] 1 1
Potential scale reduction factors:
Point est. Upper C.I.
[1,] 1 1
> for(k in 1:K) gelman.plot(list(a.mcmc[[k]][,1],a.mcmc[[k]][,2],a.mcmc[[k]][,3]),main=paste("a[",k,"]",sep=""))
Rhat des \(\sigma\)> gelman.diag(list(sigma.mcmc[,1],sigma.mcmc[,2],sigma.mcmc[,3]))
Potential scale reduction factors:
Point est. Upper C.I.
[1,] 1 1.01
> gelman.plot(list(sigma.mcmc[,1],sigma.mcmc[,2],sigma.mcmc[,3]),main=expression(sigma))
> for(k in 1:K) print(geweke.diag(a.mcmc[[k]]))
Fraction in 1st window = 0.1
Fraction in 2nd window = 0.5
var1 var2 var3
0.4802 0.1900 -0.2434
Fraction in 1st window = 0.1
Fraction in 2nd window = 0.5
var1 var2 var3
0.7506 -0.1380 0.9204
Fraction in 1st window = 0.1
Fraction in 2nd window = 0.5
var1 var2 var3
-0.2404 -0.5395 -0.7554
Fraction in 1st window = 0.1
Fraction in 2nd window = 0.5
var1 var2 var3
-0.7680 1.5046 0.8474
Fraction in 1st window = 0.1
Fraction in 2nd window = 0.5
var1 var2 var3
-1.8398 -1.0663 -0.7389
> for(k in 1:K) geweke.plot(a.mcmc[[k]])
> geweke.diag(sigma.mcmc)
Fraction in 1st window = 0.1
Fraction in 2nd window = 0.5
var1 var2 var3
0.7092 -0.8460 -0.5300
> geweke.plot(sigma.mcmc,main=expression(sigma))
> difference=c()
> nx=c()
> for(j in 1:(K-1)){
+ for(i in (j+1):K){
+ x=as.vector(a.mcmc[[i]]-a.mcmc[[j]])
+ difference=cbind(difference,x)
+ nx=c(nx,paste(i,"-",j))
+ }
+ }
> colnames(difference)=nx
> library(reshape2)
> dt=melt(difference)
> library(doBy)
Loading required package: survival
> tdc=summaryBy(formula = value~Var2,FUN = function(x)c(mean(x),quantile(x,probs = c(.025,.975))),data = dt)
> colnames(tdc)=c("Var2","Mean","perc2.5","perc97.5")
> tdc
Var2 Mean perc2.5 perc97.5
1 2 - 1 -11.496777 -14.56000 -8.46000
2 3 - 1 -6.780200 -9.80000 -3.75975
3 4 - 1 4.514166 1.45000 7.54000
4 5 - 1 9.652424 6.58000 12.70000
5 3 - 2 4.716577 1.71000 7.78000
6 4 - 2 16.010943 13.00000 19.05000
7 5 - 2 21.149201 18.10000 24.26025
8 4 - 3 11.294366 8.31000 14.35000
9 5 - 3 16.432624 13.39000 19.50025
10 5 - 4 5.138258 2.07975 8.23000
> gr<-ggplot(tdc, aes(x=Var2, y=Mean,group=Var2)) + geom_hline(yintercept=0)+
+ geom_errorbar(aes(ymin=perc2.5, ymax=perc97.5), width=.2,size=2) +
+ geom_line(siz=5) +
+ geom_point(size=5)+coord_flip() +theme_classic() +xlab("")+ylab("")+
+ theme(legend.title = element_blank(),legend.position="none",text = element_text(size=17))
> gr
geom_path: Each group consist of only one observation. Do you need to adjust the group aesthetic?