Source: Jeremy Miles & Mark Shevlin, Applying Regression and Correlation: A Guide for Students and Researchers
DATA = MODEL + ERROR
library(dplyr)   # if not already attached: pipe, select(), mutate()
data1_1 <- read.delim("data1_1.dat")
tab1_2 <- data1_1 %>%
select(BOOKS) %>%
mutate(
Mean = mean(BOOKS),
Error = BOOKS - Mean
)
head(tab1_2)
## BOOKS Mean Error
## 1 0 2 -2
## 2 1 2 -1
## 3 0 2 -2
## 4 2 2 0
## 5 4 2 2
## 6 4 2 2
sum(tab1_2$Error)
## [1] 0
error_sq <- sum(tab1_2$Error^2)/length(tab1_2$Error)
error_sq
## [1] 2
error <- sqrt(error_sq)
error
## [1] 1.414214
means <- seq(0, 5, .1)
s <- numeric(length(means))
for (i in seq_along(means)) {
  e <- tab1_2$BOOKS - means[i]          # errors for this candidate "mean"
  s[i] <- sqrt(sum(e^2)) / length(e)    # scaled root sum of squared errors
}
plot(means, s, pch = 21, col = "red", bg = "orange", ylim = c(0, .6),
     xlab = "Candidate means 0 to 5",
     ylab = "Error",
     main = "The mean is a least squares estimator")
abline(h = min(s), col = "blue")
abline(v = mean(tab1_2$BOOKS), col = "green")
# simulated data
rdata <- rnorm(10000)
par(mfrow = c(2, 3))
for (i in 1:6){
samp <- sample(rdata, 100, replace = FALSE)
hist(samp, col = "skyblue", main = paste("Mean = ", round(mean(samp), 4)))
abline(v = mean(samp), col = "red")
}
par(mfrow = c(1, 1))
# for BOOK-Data
n <- length(data1_1$BOOKS)
se <- sqrt(var(data1_1$BOOKS)/n)
se
## [1] 0.2264554
ci <- mean(data1_1$BOOKS) + c(-1.96, 1.96) * se
ci
## [1] 1.556147 2.443853
If we say that the mean number of books read by students in the population from which we took our sample lies between 1.56 and 2.44, we have only a 5% chance of being wrong.
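The same interval can be cross-checked with the exact t quantile rather than 1.96; with n = 40 the two should be nearly identical:
# one-sample t-based 95% confidence interval for the mean of BOOKS
t.test(data1_1$BOOKS)$conf.int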
plot(data1_1$BOOKS, data1_1$GRADE,
pch = 3, col = "blue", main = "Fig 1.3")
m <- lm(GRADE ~ BOOKS, data = data1_1)
abline(m, col = "purple")
x <- seq(1, 6, 1)
b0 <- 2
b1 <- 1
y <- b0 + b1*x
plot(x, y, col = "black", main = "Fig. 1.5: Graph of y = (1 * x) + 2")
abline(a = 2, b = 1, col = "purple")
Number <- seq(1, 5, 1)
Books <- c(3, 1, 0, 2, 4)
Marks <- c(56, 57, 45, 51, 65)
dataset <- data.frame(Number = Number, Books = Books, Marks = Marks)
dataset <- dataset %>%
mutate(
PredScore = mean(Marks),
Residual = Marks - PredScore,
ResidualSquared = Residual^2
)
dataset
## Number Books Marks PredScore Residual ResidualSquared
## 1 1 3 56 54.8 1.2 1.44
## 2 2 1 57 54.8 2.2 4.84
## 3 3 0 45 54.8 -9.8 96.04
## 4 4 2 51 54.8 -3.8 14.44
## 5 5 4 65 54.8 10.2 104.04
error <- sqrt(sum(dataset$ResidualSquared)/5)
error
## [1] 6.645299
plot(dataset$Books, dataset$Marks, main = "Fig. 1.6")
abline(h = mean(dataset$Marks), col = "red")
segments(dataset$Books, mean(dataset$Marks), dataset$Books, dataset$Marks, col = "blue")
m <- lm(Marks ~ Books, data = dataset)
summary(m)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.0 3.706751 12.679569 0.001058076
## Books 3.9 1.513275 2.577193 0.081979302
abline(m, col = "green")
dataset_lm <- dataset
dataset_lm$PredScore <- predict(m)
dataset_lm$Residual <- dataset_lm$Marks - dataset_lm$PredScore
dataset_lm$ResidualSquared <- dataset_lm$Residual^2
dataset_lm
## Number Books Marks PredScore Residual ResidualSquared
## 1 1 3 56 58.7 -2.7 7.29
## 2 2 1 57 50.9 6.1 37.21
## 3 3 0 45 47.0 -2.0 4.00
## 4 4 2 51 54.8 -3.8 14.44
## 5 5 4 65 62.6 2.4 5.76
error_lm <- sqrt(sum(dataset_lm$ResidualSquared)/5)
error_lm
## [1] 3.706751
error_lm - error # the error of the regression model is much smaller than the error of the mean-only model
## [1] -2.938548
plot(dataset_lm$Books, dataset_lm$Marks, main = "Fig. 1.7 (New model)")
abline(m, col = "green")
segments(dataset_lm$Books, dataset_lm$PredScore, dataset_lm$Books, dataset_lm$Marks, col = "blue")
new_m <- lm(GRADE ~ BOOKS, data = data1_1)
summary(new_m)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.0750 4.035302 12.90486 1.830901e-15
## BOOKS 5.7375 1.647405 3.48275 1.265259e-03
library(stargazer)   # if not already attached
stargazer(new_m, type = "text")
##
## ===============================================
## Dependent variable:
## ---------------------------
## GRADE
## -----------------------------------------------
## BOOKS 5.738***
## (1.647)
##
## Constant 52.075***
## (4.035)
##
## -----------------------------------------------
## Observations 40
## R2 0.242
## Adjusted R2 0.222
## Residual Std. Error 14.735 (df = 38)
## F Statistic 12.130*** (df = 1; 38)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
The average student who reads no books will achieve a mark of 52.08 in the exam, and each additional book they read will increase their grade by 5.74 percentage points.
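For example, the predicted mark for a student who reads three books:
# 52.075 + 5.7375 * 3 = 69.2875
predict(new_m, newdata = data.frame(BOOKS = 3))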
# summary()$coef is a 2 x 4 matrix indexed column-wise:
# [1] intercept estimate, [2] slope estimate, [3] SE(intercept), [4] SE(slope)
b0 <- summary(new_m)$coef[1]
b1 <- summary(new_m)$coef[2]
se_b0 <- summary(new_m)$coef[3]
se_b1 <- summary(new_m)$coef[4]
ci_b0 <- b0 + c(-1.96, 1.96) * se_b0
ci_b1 <- b1 + c(-1.96, 1.96) * se_b1
ci_b0
## [1] 44.16581 59.98419
ci_b1
## [1] 2.508586 8.966414
The standard error of the slope can also be used for something more useful: determining whether the figure given by the slope is statistically significant. … “Does reading more books really have any effect on grades?”
zValue <- (b1 - 0)/se_b1
pnormValue <- 2 * pnorm(zValue, lower.tail = FALSE)
zValue
## [1] 3.48275
pnormValue
## [1] 0.0004962914
# use t-distribution
df_b1 <- summary(new_m)$df[2]
ptValue <- 2 * pt(zValue, df = df_b1, lower.tail = FALSE)
ptValue
## [1] 0.001265259
sc_data <- data1_1 %>%
select(BOOKS, GRADE) %>%
mutate(
scBooks = scale(BOOKS),
scMarks = scale(GRADE)
)
sc_m <- lm(scMarks ~ scBooks, data = sc_data)
plot(sc_data$scBooks, sc_data$scMarks, pch = 20, col = "darkblue",
main = "Fig. 1.9: Scattergraph of z-scores of books and marks")
abline(h = 0, col = "orange")
abline(v = 0, col = "orange")
abline(sc_m, col = "red", lwd = 2)
summary(sc_m)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.410022e-16 0.1394618 1.011045e-15 1.000000000
## scBooks 4.918984e-01 0.1412385 3.482750e+00 0.001265259
Where there is a single independent variable, the standardised slope coefficient \(B_1\) is equal to the Pearson correlation coefficient.
Formula for the standardized coefficient: the slope is
\[B = r\frac{SD_y}{SD_x}\]
and for standardized variables
\[SD_y = SD_x = 1,\]
so
\[\beta = r\]
This holds true only for simple regression!
cor(sc_data$scBooks, sc_data$scMarks)
## [,1]
## [1,] 0.4918984
x <- seq(0, 10, 1)
y1 <- 1 * x
y2 <- -1 * x
par(mfrow = c(1, 2))
plot(x, y1, pch = 21, col = "red", bg = "orange", main = "Fig. 1.10: r = +1")
plot(x, y2, pch = 21, col = "red", bg = "orange", main = "Fig. 1.11: r = -1")
par(mfrow = c(1, 1))
c(cor(x, y1), cor(x, y2))
## [1] 1 -1
set.seed(333)
x3 <- rnorm(100)
y3 <- rnorm(100)
plot(x3, y3, pch = 21, col = "red", bg = "orange", main = "Fig 1.13d: r ~ 0.02")
cor(x3, y3)
## [1] 0.02012909
plot(data1_1$BOOKS, data1_1$GRADE, pch = 3, main = "Fig. 1.14: line = mean of GRADE")
abline(h = mean(data1_1$GRADE), col = "red")
varGrade <- sum((data1_1$GRADE - mean(data1_1$GRADE))^2)/length(data1_1$GRADE)
varGrade
## [1] 272.0975
The regression equation (find the OLS-line)
\[GRADE = B_0+B_1(BOOKS)+e\]
Formula for the slope:
\[B_1=r\frac{sd(GRADE)}{sd(BOOKS)}\]
Formula for the intercept:
\[B_0 = mean(GRADE) - B_1 \cdot mean(BOOKS)\]
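A quick by-hand check of these formulas (the n versus n - 1 choice cancels in the sd ratio); it should reproduce the coefficients that lm() estimates below:
r <- cor(data1_1$BOOKS, data1_1$GRADE)
B1 <- r * sd(data1_1$GRADE) / sd(data1_1$BOOKS)
B0 <- mean(data1_1$GRADE) - B1 * mean(data1_1$BOOKS)
c(B0 = B0, B1 = B1)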
model <- lm(GRADE ~ BOOKS, data = data1_1)
predValues <- predict(model)
predGrade <- summary(model)$coef[1] + summary(model)$coef[2] * data1_1$BOOKS
datasetM <- data1_1[, -2]
datasetM$predGrade <- predGrade
datasetM$residual <- datasetM$GRADE - datasetM$predGrade
datasetM$residualSq <- datasetM$residual^2
head(datasetM, 8)
## BOOKS GRADE predGrade residual residualSq
## 1 0 45 52.0750 -7.0750 50.0556250
## 2 1 57 57.8125 -0.8125 0.6601562
## 3 0 45 52.0750 -7.0750 50.0556250
## 4 2 51 63.5500 -12.5500 157.5025000
## 5 4 65 75.0250 -10.0250 100.5006250
## 6 4 88 75.0250 12.9750 168.3506250
## 7 1 44 57.8125 -13.8125 190.7851562
## 8 4 87 75.0250 11.9750 143.4006250
PredScores <- round(c(mean(predValues), sd(predValues), var(predValues)), 2)
PredScores1 <- round(c(mean(datasetM$predGrade), sd(datasetM$predGrade), var(datasetM$predGrade)), 2)
ResidScores <- round(c(mean(datasetM$residual), sd(datasetM$residual), var(datasetM$residual)), 2)
rbind(PredScores, PredScores1, ResidScores)
## [,1] [,2] [,3]
## PredScores 63.55 8.22 67.53
## PredScores1 63.55 8.22 67.53
## ResidScores 0.00 14.54 211.55
Variance of grades is equal to the variance of the predicted grades plus the variance of the residuals.
var(datasetM$GRADE)
## [1] 279.0744
var(datasetM$predGrade) + var(datasetM$residual)
## [1] 279.0744
Proportion of the variance in GRADE that has been accounted for by BOOKS:
prop <- var(datasetM$predGrade)/var(datasetM$GRADE)
sqrtProp <- sqrt(prop)
r <- cor(datasetM$BOOKS, datasetM$GRADE)
cbind(prop, sqrtProp, r)
## prop sqrtProp r
## [1,] 0.241964 0.4918984 0.4918984
If you square a correlation, the resulting figure is equal to the proportion of variance that the two variables share.
cor.test(datasetM$BOOKS, datasetM$GRADE)
##
## Pearson's product-moment correlation
##
## data: datasetM$BOOKS and datasetM$GRADE
## t = 3.4828, df = 38, p-value = 0.001265
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2130322 0.6966582
## sample estimates:
## cor
## 0.4918984
Guidance provided by Cohen (1988): correlations of about r = .10 count as small, r = .30 as medium, and r = .50 as large effects.
Multiple regression
Multiple regression equation
\[\hat Y = B_0 + B_1X_1 + B_2X_2 + B_3X_3 + … + B_kX_k\]
\[\hat Y = B_0 + \sum(B_kX_k)\]
\(\hat Y\) = predicted value on the outcome variable Y
\(B_0\) = predicted value on Y when all X = 0
\(X_k\) = predictor variables
\(B_k\) = unstandardized regression coefficients
\(Y - \hat Y\) = residual (prediction error)
\(k\) = the number of predictor variables
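A minimal sketch of the equation in R, fitting the two predictors available in data1_1 (the same model appears again below as model3): the fitted values are just the intercept plus the weighted sum of the predictors.
fit <- lm(GRADE ~ BOOKS + ATTEND, data = data1_1)
b <- coef(fit)
yhat <- b["(Intercept)"] + b["BOOKS"] * data1_1$BOOKS + b["ATTEND"] * data1_1$ATTEND
all.equal(as.numeric(yhat), as.numeric(predict(fit)))   # should be TRUE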
model2 <- lm(GRADE ~ ATTEND, data = data1_1)
summary(model2)
##
## Call:
## lm(formula = GRADE ~ ATTEND, data = data1_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.777 -10.904 2.021 12.430 31.755
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.998 8.169 4.529 5.71e-05 ***
## ATTEND 1.883 0.555 3.393 0.00163 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.83 on 38 degrees of freedom
## Multiple R-squared: 0.2325, Adjusted R-squared: 0.2123
## F-statistic: 11.51 on 1 and 38 DF, p-value: 0.001629
cor(data1_1$ATTEND, data1_1$GRADE)
## [1] 0.4821864
Each additional lecture attended increases a student’s grade by about 1.9 percentage points.
cor(data1_1)
## BOOKS ATTEND GRADE
## BOOKS 1.0000000 0.4436428 0.4918984
## ATTEND 0.4436428 1.0000000 0.4821864
## GRADE 0.4918984 0.4821864 1.0000000
What we are interested in is the correlation between BOOKS and GRADE, with the influence of ATTEND removed or controlled for. Similarly, we want the correlation between ATTEND and GRADE with the influence of BOOKS removed or controlled for. … So if we enter both BOOKS and ATTEND into the regression equation, we get estimates of the slope coefficients for each variable, controlling for the other variables.
model3 <- lm(GRADE ~ BOOKS + ATTEND, data = data1_1)
R.Sq <- summary(model3)$r.squared
adj.R.Sq <- summary(model3)$adj.r.squared
R <- sqrt(R.Sq)
cbind(R, R.Sq, adj.R.Sq)
## R R.Sq adj.R.Sq
## [1,] 0.5733343 0.3287122 0.2924263
\[ Adj.R^2 = 1 - (1 - R^2)\frac{N - 1}{N - k - 1}\]
As N (sample size) increases, the amount by which \(R^2\) is adjusted downward decreases, and as k (the number of independent variables) increases, the amount by which \(R^2\) is reduced increases.
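Checking the formula against the values reported above:
N <- nrow(data1_1)   # 40 students
k <- 2               # predictors in model3
1 - (1 - R.Sq) * (N - 1) / (N - k - 1)   # should equal adj.R.Sq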
anova(model3)
## Analysis of Variance Table
##
## Response: GRADE
## Df Sum Sq Mean Sq F value Pr(>F)
## BOOKS 1 2633.5 2633.51 13.3366 0.0008004 ***
## ATTEND 1 944.2 944.16 4.7814 0.0351684 *
## Residuals 37 7306.2 197.47
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The significance of the F value is the probability associated with \(R^2\): the probability of obtaining an \(R^2\) as large as this if its true value in the population were zero.
(In simple regression the p-value for the F statistic in the summary output is identical to the p-value in the anova table.)
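For model3 the overall F can be recovered from \(R^2\) alone; it should match the last line of summary(model3) shown below.
# F = (R^2 / k) / ((1 - R^2) / (N - k - 1)), with N = 40 and k = 2 as above
Fval <- (R.Sq / 2) / ((1 - R.Sq) / (40 - 2 - 1))
c(F = Fval, p = pf(Fval, 2, 37, lower.tail = FALSE))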
summary(model)
##
## Call:
## lm(formula = GRADE ~ BOOKS, data = data1_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.025 -12.616 -1.181 10.425 33.450
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.075 4.035 12.905 1.83e-15 ***
## BOOKS 5.738 1.647 3.483 0.00127 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.73 on 38 degrees of freedom
## Multiple R-squared: 0.242, Adjusted R-squared: 0.222
## F-statistic: 12.13 on 1 and 38 DF, p-value: 0.001265
anova(model)
## Analysis of Variance Table
##
## Response: GRADE
## Df Sum Sq Mean Sq F value Pr(>F)
## BOOKS 1 2633.5 2633.51 12.13 0.001265 **
## Residuals 38 8250.4 217.12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(model3)
##
## Call:
## lm(formula = GRADE ~ BOOKS + ATTEND, data = data1_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.802 -13.374 0.060 9.173 32.295
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.379 7.745 4.827 2.41e-05 ***
## BOOKS 4.037 1.753 2.303 0.0270 *
## ATTEND 1.284 0.587 2.187 0.0352 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.05 on 37 degrees of freedom
## Multiple R-squared: 0.3287, Adjusted R-squared: 0.2924
## F-statistic: 9.059 on 2 and 37 DF, p-value: 0.0006278
Standardized coefficients
data1_1sc <- data.frame(scale(data1_1))
model3sc <- lm(GRADE ~ BOOKS + ATTEND, data = data1_1sc)
summary(model3sc)
##
## Call:
## lm(formula = GRADE ~ BOOKS + ATTEND, data = data1_1sc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.24523 -0.80055 0.00359 0.54907 1.93319
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.397e-16 1.330e-01 0.000 1.0000
## BOOKS 3.461e-01 1.503e-01 2.303 0.0270 *
## ATTEND 3.286e-01 1.503e-01 2.187 0.0352 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8412 on 37 degrees of freedom
## Multiple R-squared: 0.3287, Adjusted R-squared: 0.2924
## F-statistic: 9.059 on 2 and 37 DF, p-value: 0.0006278
# alternative
library(QuantPsyc)
## Warning: package 'QuantPsyc' was built under R version 3.2.5
## Loading required package: boot
##
## Attaching package: 'boot'
## The following object is masked from 'package:psych':
##
## logit
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
##
## Attaching package: 'QuantPsyc'
## The following object is masked from 'package:base':
##
## norm
lm.beta(model3)
## BOOKS ATTEND
## 0.3460987 0.3286422
The standardized coefficients tell us, in standard-deviation units, how strongly each independent variable is related to the outcome when the other independent variables are held constant.
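The same values can be obtained by hand from the unstandardized slopes, since \(\beta_k = B_k \cdot sd(X_k)/sd(Y)\):
# standardized slopes of model3 computed from the raw slopes and standard deviations
coef(model3)[-1] * c(sd(data1_1$BOOKS), sd(data1_1$ATTEND)) / sd(data1_1$GRADE)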
data2_1 <- read.delim("data2_1.dat")
Codebook:
cor(data2_1$EXTRO, data2_1$CAR) # R
## [1] 0.6711337
cor(data2_1$EXTRO, data2_1$CAR)^2 # R squared
## [1] 0.4504204
plot(data2_1$EXTRO, data2_1$CAR)
What we want is to estimate the effect of EXTRO on CAR while controlling for the effects of SEX and AGE.
model4 <- lm(CAR ~ EXTRO + SEX + AGE, data = data2_1)
summary(model4)
##
## Call:
## lm(formula = CAR ~ EXTRO + SEX + AGE, data = data2_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.002 -9.758 -1.206 8.940 24.712
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.4260 7.3084 1.563 0.126704
## EXTRO 0.4633 0.1296 3.574 0.001023 **
## SEX 20.0377 4.6502 4.309 0.000121 ***
## AGE 0.1543 0.2062 0.749 0.458949
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.01 on 36 degrees of freedom
## Multiple R-squared: 0.6383, Adjusted R-squared: 0.6081
## F-statistic: 21.18 on 3 and 36 DF, p-value: 4.451e-08
lm.beta(model4)
## EXTRO SEX AGE
## 0.44056432 0.48813876 0.08455706
What we are really interested in is the effect that EXTRO has above and beyond the effects of AGE and SEX - that is to say, we want to assess the effects of EXTRO after the effects of AGE and SEX have been removed.
model4b <- lm(CAR ~ SEX + AGE, data = data2_1)
# alternative: eliminate EXTRO from model4
# model4b2 <- update(model4, .~.-EXTRO)
summary(model4b)
##
## Call:
## lm(formula = CAR ~ SEX + AGE, data = data2_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.606 -10.126 -4.173 10.079 42.425
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.5627 8.3830 1.499 0.1425
## SEX 27.7406 4.7309 5.864 9.61e-07 ***
## AGE 0.4921 0.2104 2.339 0.0248 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.94 on 37 degrees of freedom
## Multiple R-squared: 0.5099, Adjusted R-squared: 0.4835
## F-statistic: 19.25 on 2 and 37 DF, p-value: 1.86e-06
Rsq.SEX.AGE <- summary(model4b)$r.squared
Rsq.EXTRO.SEX.AGE <- summary(model4)$r.squared
rbind(Rsq.SEX.AGE, Rsq.EXTRO.SEX.AGE)
## [,1]
## Rsq.SEX.AGE 0.5099419
## Rsq.EXTRO.SEX.AGE 0.6382828
increaseInRsq <- Rsq.EXTRO.SEX.AGE - Rsq.SEX.AGE
increaseInRsq
## [1] 0.1283409
# compare models with anova
anova(model4b, model4)
## Analysis of Variance Table
##
## Model 1: CAR ~ SEX + AGE
## Model 2: CAR ~ EXTRO + SEX + AGE
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 37 8257.7
## 2 36 6095.1 1 2162.6 12.773 0.001023 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The increase in \(R^2\) for the larger model is significant, so EXTRO does predict CAR over and above SEX and AGE. However, the actual increase in \(R^2\) is only 0.128 (about 13 percentage points), so EXTRO accounts for less of the variance than the simple correlation (\(R^2 = 0.45\)) at first suggested.
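The F for this change in \(R^2\) can also be computed directly from the two \(R^2\) values; it should match the anova() comparison above.
# N = 40; the full model has 3 predictors, and the change adds 1 predictor
Fchange <- (increaseInRsq / 1) / ((1 - Rsq.EXTRO.SEX.AGE) / (40 - 3 - 1))
c(F = Fchange, p = pf(Fchange, 1, 36, lower.tail = FALSE))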
see book pp.38f
R packages used in these notes: dplyr, stargazer, psych, QuantPsyc (which loads boot and MASS).
data3_1 <- read.delim("data3_1.dat")
data3_1$GROUP <- factor(data3_1$GROUP, levels = c(0, 1), labels = c("con", "exp"))
data3_1 %>%
group_by(GROUP) %>%
summarise(
N = n(),
Mean = mean(SCORE),
SD = sd(SCORE),
SE = SD/sqrt(n())
)
## Source: local data frame [2 x 5]
##
## GROUP N Mean SD SE
## (fctr) (int) (dbl) (dbl) (dbl)
## 1 con 10 10.1 1.791957 0.5666667
## 2 exp 10 12.6 2.065591 0.6531973
t.test(data3_1$SCORE ~ data3_1$GROUP, paired = FALSE)
##
## Welch Two Sample t-test
##
## data: data3_1$SCORE by data3_1$GROUP
## t = -2.891, df = 17.648, p-value = 0.009874
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.3193527 -0.6806473
## sample estimates:
## mean in group con mean in group exp
## 10.1 12.6
summary(lm(SCORE ~ GROUP, data = data3_1))
##
## Call:
## lm(formula = SCORE ~ GROUP, data = data3_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.600 -1.600 -0.350 1.275 3.400
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.1000 0.6115 16.518 2.54e-12 ***
## GROUPexp 2.5000 0.8647 2.891 0.00973 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.934 on 18 degrees of freedom
## Multiple R-squared: 0.3171, Adjusted R-squared: 0.2792
## F-statistic: 8.358 on 1 and 18 DF, p-value: 0.009732
Compare the output of t.test() and lm(): the t statistics agree (2.891, apart from sign); the Welch test adjusts the degrees of freedom (17.65 rather than 18), so its p-value differs slightly from the regression's.
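Refitting the t-test with a pooled variance estimate (var.equal = TRUE) should make it agree with the regression exactly:
# the pooled two-sample t-test is equivalent to regressing SCORE on the group dummy
t.test(SCORE ~ GROUP, data = data3_1, var.equal = TRUE)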
data3_2a <- read.delim("data3_2a.dat")
data3_2a$GROUP <- factor(data3_2a$GROUP, levels = c(0, 1, 2), labels = c("con", "mnem", "aroma"))
data3_2a %>%
group_by(GROUP) %>%
summarise(
N = n(),
Mean = mean(SCORE),
SD = sd(SCORE),
SE = sd(SCORE)/sqrt(n()),
CI.lower = Mean -1.96 * SE,
CI.upper = Mean +1.96 * SE
)
## Source: local data frame [3 x 7]
##
## GROUP N Mean SD SE CI.lower CI.upper
## (fctr) (int) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1 con 10 10.1 1.791957 0.5666667 8.989333 11.21067
## 2 mnem 10 12.6 2.065591 0.6531973 11.319733 13.88027
## 3 aroma 10 8.9 2.024846 0.6403124 7.644988 10.15501
a <- aov(SCORE ~ GROUP, data = data3_2a)
summary(a)
## Df Sum Sq Mean Sq F value Pr(>F)
## GROUP 2 71.27 35.63 9.233 0.00088 ***
## Residuals 27 104.20 3.86
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pairwise.t.test(data3_2a$SCORE, data3_2a$GROUP, paired = FALSE, p.adjust.method = "none")
##
## Pairwise comparisons using t tests with pooled SD
##
## data: data3_2a$SCORE and data3_2a$GROUP
##
## con mnem
## mnem 0.00836 -
## aroma 0.18325 0.00025
##
## P value adjustment method: none
posthoc <- TukeyHSD(a, "GROUP")
posthoc
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = SCORE ~ GROUP, data = data3_2a)
##
## $GROUP
## diff lwr upr p adj
## mnem-con 2.5 0.3217051 4.6782949 0.0220484
## aroma-con -1.2 -3.3782949 0.9782949 0.3724648
## aroma-mnem -3.7 -5.8782949 -1.5217051 0.0007162
plot(posthoc, col = "blue")
Interpretation
Tukey intervals that do not include zero (mnem-con and aroma-mnem) indicate significant pairwise differences; aroma-con does not differ significantly.
The dummy-coding approach below compares each level of the categorical variable to a fixed reference level: the omitted group.
require(psych)
dummies <- dummy.code(data3_2a$GROUP)
data3_2a <- cbind(data3_2a, dummies)
head(data3_2a)
## GROUP SCORE con mnem aroma
## 1 con 10 1 0 0
## 2 con 8 1 0 0
## 3 con 13 1 0 0
## 4 con 9 1 0 0
## 5 con 10 1 0 0
## 6 con 13 1 0 0
# omit reference group dummy-variable as predictor
m3 <- lm(SCORE ~ mnem + aroma, data = data3_2a)
summary(m3)
##
## Call:
## lm(formula = SCORE ~ mnem + aroma, data = data3_2a)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.900 -1.475 -0.100 1.050 4.100
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.1000 0.6212 16.258 1.8e-15 ***
## mnem 2.5000 0.8786 2.846 0.00836 **
## aroma -1.2000 0.8786 -1.366 0.18325
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.964 on 27 degrees of freedom
## Multiple R-squared: 0.4062, Adjusted R-squared: 0.3622
## F-statistic: 9.233 on 2 and 27 DF, p-value: 0.0008802
# standardized slopes
m3sc <- lm(scale(SCORE) ~ scale(mnem) + scale(aroma), data = data3_2a)
summary(m3sc)
##
## Call:
## lm(formula = scale(SCORE) ~ scale(mnem) + scale(aroma), data = data3_2a)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.17896 -0.59964 -0.04065 0.42687 1.66681
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.261e-16 1.458e-01 0.000 1.00000
## scale(mnem) 4.873e-01 1.712e-01 2.846 0.00836 **
## scale(aroma) -2.339e-01 1.712e-01 -1.366 0.18325
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7986 on 27 degrees of freedom
## Multiple R-squared: 0.4062, Adjusted R-squared: 0.3622
## F-statistic: 9.233 on 2 and 27 DF, p-value: 0.0008802
anova(m3)
## Analysis of Variance Table
##
## Response: SCORE
## Df Sum Sq Mean Sq F value Pr(>F)
## mnem 1 64.067 64.067 16.6008 0.0003634 ***
## aroma 1 7.200 7.200 1.8656 0.1832456
## Residuals 27 104.200 3.859
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Interpretation
The intercept (10.10) is the mean of the reference group (con); the mnem (2.50) and aroma (-1.20) coefficients are the differences between those group means and the control mean, and their p-values (0.00836 and 0.18325) match the unadjusted pairwise t-tests against the control group.
Bonferroni correction for multiple testing
alpha <- .05
n.comparisons <- 2
# Probability of a type 1 error in multiple testing
type1error.inflation <- 1 - (1-alpha)^n.comparisons
type1error.inflation
## [1] 0.0975
# Bonferroni correction
alpha.bonferroni <- alpha/n.comparisons
alpha.bonferroni
## [1] 0.025
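Rather than lowering alpha by hand, the same correction can be applied to the p-values. Note that pairwise.t.test() adjusts for all three pairwise comparisons, whereas the calculation above counts only the two treatment-versus-control comparisons.
# Bonferroni-adjusted pairwise comparisons (each p-value multiplied by the number of tests)
pairwise.t.test(data3_2a$SCORE, data3_2a$GROUP, p.adjust.method = "bonferroni")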
see also deviation coding IDRE, UCLA
This coding system compares the mean of the dependent variable for a given level to the overall mean of the dependent variable.
data3_3 <- read.delim("data3_3a.dat")
data3_3$JOB <- factor(data3_3$JOB, levels = c(1:5))
head(data3_3); tail(data3_3)
## JOB STRESS
## 1 1 71
## 2 1 67
## 3 1 67
## 4 1 67
## 5 1 79
## 6 1 46
## JOB STRESS
## 45 5 71
## 46 5 34
## 47 5 70
## 48 5 59
## 49 5 46
## 50 5 68
# levels of categorical variable
levels(data3_3$JOB)
## [1] "1" "2" "3" "4" "5"
# the contrast matrix for categorical variable with five levels
contr.sum(5)
## [,1] [,2] [,3] [,4]
## 1 1 0 0 0
## 2 0 1 0 0
## 3 0 0 1 0
## 4 0 0 0 1
## 5 -1 -1 -1 -1
# assigning the deviation contrasts to JOB
contrasts(data3_3$JOB) <- contr.sum(5)
#the regression
m33 <- lm(STRESS ~ JOB, data = data3_3)
summary(m33)
##
## Call:
## lm(formula = STRESS ~ JOB, data = data3_3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.60 -11.53 -0.40 10.90 34.40
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 57.320 2.289 25.043 <2e-16 ***
## JOB1 10.080 4.578 2.202 0.0328 *
## JOB2 0.280 4.578 0.061 0.9515
## JOB3 7.280 4.578 1.590 0.1188
## JOB4 -12.620 4.578 -2.757 0.0084 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.18 on 45 degrees of freedom
## Multiple R-squared: 0.2234, Adjusted R-squared: 0.1544
## F-statistic: 3.237 on 4 and 45 DF, p-value: 0.02038
anova(m33)
## Analysis of Variance Table
##
## Response: STRESS
## Df Sum Sq Mean Sq F value Pr(>F)
## JOB 4 3391.5 847.87 3.2369 0.02038 *
## Residuals 45 11787.4 261.94
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Interpretation
With deviation (sum-to-zero) coding, the intercept (57.32) is the mean of the five group means, and each JOB coefficient is the deviation of that group's mean from this grand mean; the deviation for the omitted fifth level is minus the sum of the others (-5.02).
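The group means behind these coefficients can be recovered directly; the mean of the group means equals the intercept, and the deviations reproduce the JOB coefficients (a quick check):
# group means of STRESS by JOB
group.means <- tapply(data3_3$STRESS, data3_3$JOB, mean)
group.means
mean(group.means)                   # equals the intercept (57.32)
group.means - mean(group.means)     # equals JOB1-JOB4; the fifth value is the implied JOB5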