Korelacija je odvisna od ostankov

alpha1 <- 0
beta1 <- 1
alpha2 <- 3
beta2 <- beta1

Model1 ima koeficiente \[y_1=0+1 * x\]

Model 2 pa \[y_2=3+1 * x\] Empirični koeficienti so rahlo drugačni

n <- 10
x <- 1:n
eps <- scale(rnorm(n))
y1 <- alpha1 + beta1*x+eps
(fit1 <- lm(y1~x))
## 
## Call:
## lm(formula = y1 ~ x)
## 
## Coefficients:
## (Intercept)            x  
##      0.4197       0.9237
y2 <- alpha2 + beta2*x+eps
(fit2 <- lm(y2~x))
## 
## Call:
## lm(formula = y2 ~ x)
## 
## Coefficients:
## (Intercept)            x  
##      3.4197       0.9237

Slike

Kljub različnim koeficientom je korelacija enaka.

par(mfrow=c(1,2))
plot(x,y1,ylim=c(0,15))
abline(fit1)
segments(x,predict(fit1),x,y1,col="blue")
title(round(cor(x,y1),4))
plot(x,y2,ylim=c(0,15))
abline(fit2)
segments(x,predict(fit2),x,y2,col="blue")
title(round(cor(x,y2),4))

Podobno je, če so napake pripisane drugim enotam:

eps3 <- sample(eps,replace = FALSE)
y3 <- alpha1 + beta1*x+eps3
(fit3 <- lm(y3~x))
## 
## Call:
## lm(formula = y3 ~ x)
## 
## Coefficients:
## (Intercept)            x  
##      0.9821       0.8214
par(mfrow=c(1,2))
plot(x,y1,ylim=c(0,15))
abline(fit1)
segments(x,predict(fit1),x,y1,col="blue")
title(round(cor(x,y1),4))
plot(x,y3,ylim=c(0,15))
abline(fit3)
segments(x,predict(fit3),x,y3,col="blue")
title(round(cor(x,y3),4))

par(mfrow=c(1,3))
# priprava odatkov
x <- rep.int(1:5,times=3:7)
y <- rep.int(5:1,times=3:7)
n <- rep.int(3:7,times=3:7)
n
##  [1] 3 3 3 4 4 4 4 5 5 5 5 5 6 6 6 6 6 6 7 7 7 7 7 7 7
tbl <- table(x,y)
tbl <- as.data.frame(tbl)
tbl <- tbl[tbl[,3]>0,]
tbl$x <- as.numeric(tbl$x)
tbl$y <- as.numeric(tbl$y)
tbl
##    x y Freq
## 5  5 1    7
## 9  4 2    6
## 13 3 3    5
## 17 2 4    4
## 21 1 5    3
# risanje
plot(x,y,col="red",cex=2,pch=16) 
# jitter, da se vidijo posamezne pike
plot(jitter(x),jitter(y),pch=21,bg="red",cex=2)
# sqrt, da so ploščine pik sorazmerne frekvenci
# faktor (2*), da pike niso premajhne/prevelike
plot(tbl$x,tbl$y, pch=21, bg="red", cex= 2* sqrt(tbl$Freq)) 

Simpsonov paradox

n <- 20
delta <-10
x1 <- 5+rnorm(n)
y1 <- 10*x1+rnorm(n,0,5)
x2 <- x1+delta
y2 <- 100-5*x1+delta+rnorm(n,0,5)
x <- c(x1,x2)
y <- c(y1,y2)
plot(x,y,pch=16,col=rep(c(2,4),each=n))
abline(lm(y~x))
text(mean(x),mean(y)+5,paste("r=",round(cor(y,x),4)))
# posamezne korelacije
abline(lm(y1~x1),col=2)
text(mean(x1),mean(y1)+30,paste("r=",round(cor(y1,x1),4)),col=2)
abline(lm(y2~x2),col=4)
text(mean(x2),mean(y2)-20,paste("r=",round(cor(y2,x2),4)),col=4)