Source: Jeremy Miles: Applying Regression and Correlation: A Guide for Students and Researchers

Building models with regression and correlation

1.2 Least Squares Models

DATA = MODEL + ERROR

# packages used throughout this section (dplyr for %>%, select() and mutate();
# stargazer for the regression table further below)
library(dplyr)
library(stargazer)

data1_1 <- read.delim("data1_1.dat")

tab1_2 <- data1_1 %>% 
  select(BOOKS) %>%
  mutate(
    Mean = mean(BOOKS),
    Error = BOOKS - Mean
  )
head(tab1_2)
##   BOOKS Mean Error
## 1     0    2    -2
## 2     1    2    -1
## 3     0    2    -2
## 4     2    2     0
## 5     4    2     2
## 6     4    2     2
sum(tab1_2$Error)
## [1] 0
error_sq <- sum(tab1_2$Error^2)/length(tab1_2$Error)
error_sq
## [1] 2
error <- sqrt(error_sq)
error
## [1] 1.414214
means <- seq(0.1, 5, 0.1)

# root mean squared error for each candidate value of the mean
s <- sapply(means, function(m) {
  e <- tab1_2$BOOKS - m
  sqrt(sum(e^2)/length(e))
})

plot(means, s, pch = 21, col = "red", bg = "orange",
     xlab = "Candidate means (0.1 to 5)",
     ylab = "Error (sd)",
     main = "The mean is a least squares estimator")
abline(h = min(s), col = "blue")
abline(v = mean(tab1_2$BOOKS), col = "green")

The standard error of the mean

# simulated data
rdata <- rnorm(10000)

par(mfrow = c(2, 3))
for (i in 1:6){
  samp <- sample(rdata, 100, replace = FALSE)
  hist(samp, col = "skyblue", main = paste("Mean = ", round(mean(samp), 4)))
  abline(v = mean(samp),  col = "red")
}

par(mfrow = c(1, 1))

# for BOOK-Data
n <- length(data1_1$BOOKS)
se <- sqrt(var(data1_1$BOOKS)/n)
se
## [1] 0.2264554
ci <- mean(data1_1$BOOKS) + c(-1.96, 1.96) * se
ci
## [1] 1.556147 2.443853

If we say that the mean number of books read by students in the next sample taken from the same population will lie between 1.56 and 2.44, we have only a 5% chance of being wrong.
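
A minimal sketch of the same interval using the t distribution instead of the 1.96 normal approximation, reusing n and se from above (with n = 40 it is only slightly wider):

mean(data1_1$BOOKS) + qt(c(.025, .975), df = n - 1) * se
# equivalently: t.test(data1_1$BOOKS)$conf.int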

1.3 Modelling relationships

plot(data1_1$BOOKS, data1_1$GRADE,
     pch = 3, col = "blue", main = "Fig 1.3")
m <- lm(GRADE ~ BOOKS, data = data1_1)
abline(m, col = "purple")

x <- seq(1, 6, 1)
b0 <- 2
b1 <- 1
y <- b0 + b1*x
plot(x, y, col = "black", main = "Fig. 1.5: Graph of y = (1 * x) + 2")
abline(a = 2, b = 1, col = "purple")

Number <- seq(1, 5, 1)
Books <- c(3, 1, 0, 2, 4)
Marks <- c(56, 57, 45, 51, 65)
dataset <- data.frame(Number = Number, Books = Books, Marks = Marks)

dataset <- dataset %>%
  mutate(
    PredScore = mean(Marks),
    Residual = Marks - PredScore,
    ResidualSquared = Residual^2
  )
dataset
##   Number Books Marks PredScore Residual ResidualSquared
## 1      1     3    56      54.8      1.2            1.44
## 2      2     1    57      54.8      2.2            4.84
## 3      3     0    45      54.8     -9.8           96.04
## 4      4     2    51      54.8     -3.8           14.44
## 5      5     4    65      54.8     10.2          104.04
error <- sqrt(sum(dataset$ResidualSquared)/5)
error
## [1] 6.645299
plot(dataset$Books, dataset$Marks, main = "Fig. 1.6")
abline(h = mean(dataset$Marks), col = "red")
segments(Books,mean(Marks), Books, Marks, col = "blue")

m <- lm(Marks ~ Books, data = dataset)
summary(m)$coef
##             Estimate Std. Error   t value    Pr(>|t|)
## (Intercept)     47.0   3.706751 12.679569 0.001058076
## Books            3.9   1.513275  2.577193 0.081979302
abline(m, col = "green")

dataset_lm <- dataset
dataset_lm$PredScore <- predict(m)
dataset_lm$Residual <- dataset_lm$Marks - dataset_lm$PredScore
dataset_lm$ResidualSquared <- dataset_lm$Residual^2
dataset_lm
##   Number Books Marks PredScore Residual ResidualSquared
## 1      1     3    56      58.7     -2.7            7.29
## 2      2     1    57      50.9      6.1           37.21
## 3      3     0    45      47.0     -2.0            4.00
## 4      4     2    51      54.8     -3.8           14.44
## 5      5     4    65      62.6      2.4            5.76
error_lm <- sqrt(sum(dataset_lm$ResidualSquared)/5)
error_lm
## [1] 3.706751
error_lm - error # the error of the new model is much smaller than the error of the mean-only model
## [1] -2.938548
plot(dataset_lm$Books, dataset_lm$Marks, main = "Fig. 1.7 (New model)")
abline(m, col = "green", main = "Fig. 1.7")
segments(dataset_lm$Books, dataset_lm$PredScore, dataset_lm$Books, dataset_lm$Marks, col = "blue")

new_m <- lm(GRADE ~ BOOKS, data = data1_1)
summary(new_m)$coef
##             Estimate Std. Error  t value     Pr(>|t|)
## (Intercept)  52.0750   4.035302 12.90486 1.830901e-15
## BOOKS         5.7375   1.647405  3.48275 1.265259e-03
stargazer(new_m, type = "text")
## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                                GRADE           
## -----------------------------------------------
## BOOKS                        5.738***          
##                               (1.647)          
##                                                
## Constant                     52.075***         
##                               (4.035)          
##                                                
## -----------------------------------------------
## Observations                    40             
## R2                             0.242           
## Adjusted R2                    0.222           
## Residual Std. Error      14.735 (df = 38)      
## F Statistic           12.130*** (df = 1; 38)   
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

The average student who reads no books will achieve a mark of 52.08 in the exam, and each additional book read will increase the grade by 5.74 percentage points.

1.3.1. The standard error and significance of parameter estimates

b0 <- summary(new_m)$coef[1]
b1 <- summary(new_m)$coef[2]
se_b0 <- summary(new_m)$coef[3]
se_b1 <- summary(new_m)$coef[4]

ci_b0 <- b0 + c(-1.96, 1.96) * se_b0
ci_b1 <- b1 + c(-1.96, 1.96) * se_b1
ci_b0
## [1] 44.16581 59.98419
ci_b1
## [1] 2.508586 8.966414
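
As a quick cross-check, R's confint() gives the t-based intervals for the same model, which are slightly wider than the 1.96 approximation above:

confint(new_m)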

The standard error of the slope can also be used for something more useful: determining whether the figure given by the slope is statistically significant. … “Does reading more books really have any effect on grades?”

zValue <- (b1 - 0)/se_b1
pnormValue <- 2 * pnorm(zValue, lower.tail = FALSE)
zValue
## [1] 3.48275
pnormValue
## [1] 0.0004962914
# use t-distribution
df_b1 <- summary(new_m)$df[2]
ptValue <- 2 * pt(zValue, df = df_b1, lower.tail = FALSE)
ptValue
## [1] 0.001265259

Standardized estimates

sc_data <- data1_1 %>%
  select(BOOKS, GRADE) %>%
  mutate(
    scBooks = scale(BOOKS),
    scMarks = scale(GRADE)
  )

sc_m <- lm(scMarks ~ scBooks, data = sc_data)
plot(sc_data$scBooks, sc_data$scMarks, pch = 20, col = "darkblue",
     main = "Fig. 1.9: Scattergraph of z-scores of books and marks")
abline(h = 0, col = "orange")
abline(v = 0, col = "orange")
abline(sc_m, col = "red", lwd = 2)

summary(sc_m)$coef
##                 Estimate Std. Error      t value    Pr(>|t|)
## (Intercept) 1.410022e-16  0.1394618 1.011045e-15 1.000000000
## scBooks     4.918984e-01  0.1412385 3.482750e+00 0.001265259

Where there is a single independent variable, the standardised slope (the b1 coefficient) is equal to the Pearson correlation coefficient.

Formula for the standardized coefficient

  • If X and Y are standardized then

\[SD_y = SD_x = 1\]

\[B = r\frac{SD_y}{SD_x}\]

\[\beta = r\]

This holds true only for simple regression!

cor(sc_data$scBooks, sc_data$scMarks)
##           [,1]
## [1,] 0.4918984

1.4 Looking more at correlations

  • A correlation
    • … is a measure of the extent to which two variables are linearly related.
    • … is the slope of a regression line in the special case in which the intercept is zero and the variables have a mean of 0 and a standard deviation of 1.
    • … can be thought of as the extent to which the scattergraph of the relationship between two variables fits a straight line.
x <- seq(0, 10, 1)
y1 <- 1 * x
y2 <- -1 * x

par(mfrow = c(1, 2))
plot(x, y1, pch = 21, col = "red", bg = "orange", main = "Fig. 1.10: r = +1")
plot(x, y2, pch = 21, col = "red", bg = "orange", main = "Fig. 1.11: r = -1")

par(mfrow = c(1, 1))

c(cor(x, y1), cor(x, y2))
## [1]  1 -1
set.seed(333)
x3 <- rnorm(100)
y3 <- rnorm(100)
plot(x3, y3, pch = 21, col = "red", bg = "orange", main = "Fig 1.13d: r ~ 0.02")

cor(x3, y3)
## [1] 0.02012909

Correlations and variance

plot(data1_1$BOOKS, data1_1$GRADE, pch = 3, main = "Fig. 1.14: line = mean of GRADE")
abline(h = mean(data1_1$GRADE), col = "red")

varGrade <- sum((data1_1$GRADE - mean(data1_1$GRADE))^2)/length(data1_1$GRADE) # divides by n, not by n - 1 as var() does
varGrade
## [1] 272.0975

The regression equation (find the OLS-line)

\[GRADE = B_0+B_1(BOOKS)+e\]

Formula for the slope:

\[B_1=r\frac{sd(GRADE)}{sd(BOOKS)}\]

Formula for the intercept:

\[B_0=mean(GRADE)−B_1mean(BOOKS)\]
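
A minimal check of these formulas against lm(), using the data1_1 sample:

B1 <- cor(data1_1$BOOKS, data1_1$GRADE) * sd(data1_1$GRADE)/sd(data1_1$BOOKS)
B0 <- mean(data1_1$GRADE) - B1 * mean(data1_1$BOOKS)
c(B0 = B0, B1 = B1)   # should match coef(lm(GRADE ~ BOOKS, data = data1_1))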

model <- lm(GRADE ~ BOOKS, data = data1_1)
predValues <- predict(model)

predGrade <- summary(model)$coef[1] + summary(model)$coef[2] * data1_1$BOOKS

datasetM <- data1_1[, -2]   # drop the ATTEND column, keeping BOOKS and GRADE
datasetM$predGrade <- predGrade
datasetM$residual <- datasetM$GRADE - datasetM$predGrade
datasetM$residualSq <- datasetM$residual^2
head(datasetM, 8)
##   BOOKS GRADE predGrade residual  residualSq
## 1     0    45   52.0750  -7.0750  50.0556250
## 2     1    57   57.8125  -0.8125   0.6601562
## 3     0    45   52.0750  -7.0750  50.0556250
## 4     2    51   63.5500 -12.5500 157.5025000
## 5     4    65   75.0250 -10.0250 100.5006250
## 6     4    88   75.0250  12.9750 168.3506250
## 7     1    44   57.8125 -13.8125 190.7851562
## 8     4    87   75.0250  11.9750 143.4006250
PredScores <- round(c(mean(predValues), sd(predValues), var(predValues)), 2)
PredScores1 <- round(c(mean(datasetM$predGrade), sd(datasetM$predGrade), var(datasetM$predGrade)), 2)
ResidScores <- round(c(mean(datasetM$residual), sd(datasetM$residual), var(datasetM$residual)), 2)
rbind(PredScores, PredScores1, ResidScores)
##              [,1]  [,2]   [,3]
## PredScores  63.55  8.22  67.53
## PredScores1 63.55  8.22  67.53
## ResidScores  0.00 14.54 211.55

Variance of grades is equal to the variance of the predicted grades plus the variance of the residuals.

  • The variance in residuals represents variance in GRADE unaccounted for by the model.
  • The variance in predicted grades represents variance in GRADE accounted for by the model.
var(datasetM$GRADE)
## [1] 279.0744
var(datasetM$predGrade) + var(datasetM$residual)
## [1] 279.0744

Proportion of the variance in GRADE that has been accounted for by BOOKS:

prop <- var(datasetM$predGrade)/var(datasetM$GRADE)
sqrtProp <- sqrt(prop)
r <- cor(datasetM$BOOKS, datasetM$GRADE)
cbind(prop, sqrtProp, r)
##          prop  sqrtProp         r
## [1,] 0.241964 0.4918984 0.4918984

If you square a correlation, the resulting figure is equal to the proportion of variance that the two variables share.

Correlation and size

cor.test(datasetM$BOOKS, datasetM$GRADE)
## 
##  Pearson's product-moment correlation
## 
## data:  datasetM$BOOKS and datasetM$GRADE
## t = 3.4828, df = 38, p-value = 0.001265
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2130322 0.6966582
## sample estimates:
##       cor 
## 0.4918984

Guidance provided by Cohen (1988):

  • r >= 0.1: small correlation
  • r >= 0.3: medium correlation
  • r >= 0.5: large correlation

More than one independent variable - multiple regression

Multiple regression

Multiple regression equation

\[\hat Y = B_0 + B_1X_1 + B_2X_2 + B_3X_3 + … + B_kX_k\]

\[\hat Y = B_0 + \sum(B_kX_k)\]

\(\hat Y\) = predicted value on the outcome variable Y
\(B_0\) = predicted value on Y when all X = 0
\(X_k\) = predictor variables
\(B_k\) = unstandardized regression coefficients
\(Y - \hat Y\) = residual (prediction error)
\(k\) = the number of predictor variables
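
As a minimal sketch of this equation, the predicted values can be reproduced from the design matrix (a column of 1s for \(B_0\) plus the predictor columns) and the coefficient vector; here using the two-predictor model that is fitted as model3 further below:

fit  <- lm(GRADE ~ BOOKS + ATTEND, data = data1_1)
yhat <- as.vector(model.matrix(fit) %*% coef(fit))
all.equal(yhat, unname(predict(fit)))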

model2 <- lm(GRADE ~ ATTEND, data = data1_1)
summary(model2)
## 
## Call:
## lm(formula = GRADE ~ ATTEND, data = data1_1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.777 -10.904   2.021  12.430  31.755 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   36.998      8.169   4.529 5.71e-05 ***
## ATTEND         1.883      0.555   3.393  0.00163 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.83 on 38 degrees of freedom
## Multiple R-squared:  0.2325, Adjusted R-squared:  0.2123 
## F-statistic: 11.51 on 1 and 38 DF,  p-value: 0.001629
cor(data1_1$ATTEND, data1_1$GRADE) 
## [1] 0.4821864

Each additional lecture attended increases a student’s grade by about 1.9 percentage points.

cor(data1_1)
##            BOOKS    ATTEND     GRADE
## BOOKS  1.0000000 0.4436428 0.4918984
## ATTEND 0.4436428 1.0000000 0.4821864
## GRADE  0.4918984 0.4821864 1.0000000

What we are interested in is the correlation between BOOKS and GRADE, with the influence of ATTEND removed or controlled for. Similarly, we want the correlation between ATTEND and GRADE with the influence of BOOKS removed or controlled for. … So if we enter both BOOKS and ATTEND into the regression equation, we get estimates of the slope coefficients for each variable, controlling for the other variables.

model3 <- lm(GRADE ~ BOOKS + ATTEND, data = data1_1)
R.Sq <- summary(model3)$r.squared
adj.R.Sq <- summary(model3)$adj.r.squared
R <- sqrt(R.Sq)
cbind(R, R.Sq, adj.R.Sq)
##              R      R.Sq  adj.R.Sq
## [1,] 0.5733343 0.3287122 0.2924263

2.5 Adjusted R square

\[ Adj.R^2 = 1 - (1 - R^2)\frac{N - 1}{N - k - 1}\]

As N (the sample size) increases, the amount by which \(R^2\) is adjusted downwards decreases; as k (the number of independent variables) increases, the amount by which \(R^2\) is reduced increases.
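
A quick check of the formula for model3 (N = 40 cases, k = 2 predictors), which should reproduce adj.R.Sq above:

N <- nrow(data1_1); k <- 2
1 - (1 - R.Sq) * (N - 1)/(N - k - 1)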

2.6 ANOVA table

anova(model3)
## Analysis of Variance Table
## 
## Response: GRADE
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## BOOKS      1 2633.5 2633.51 13.3366 0.0008004 ***
## ATTEND     1  944.2  944.16  4.7814 0.0351684 *  
## Residuals 37 7306.2  197.47                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The significance of the F value is the probability associated with \(R^2\): the probability of obtaining a value of \(R^2\) as large as the one observed if the true value in the population is zero.

(In simple regression the p-value for \(R^2\) in the summary output is identical to the p-value in the ANOVA table.)
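
A small sketch, recovering F and its p-value from \(R^2\) for the simple model GRADE ~ BOOKS (k = 1 predictor, N = 40):

R2 <- summary(model)$r.squared
N  <- nrow(data1_1); k <- 1
F_val <- (R2/k)/((1 - R2)/(N - k - 1))
c(F = F_val, p = pf(F_val, k, N - k - 1, lower.tail = FALSE))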

summary(model)
## 
## Call:
## lm(formula = GRADE ~ BOOKS, data = data1_1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.025 -12.616  -1.181  10.425  33.450 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   52.075      4.035  12.905 1.83e-15 ***
## BOOKS          5.738      1.647   3.483  0.00127 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.73 on 38 degrees of freedom
## Multiple R-squared:  0.242,  Adjusted R-squared:  0.222 
## F-statistic: 12.13 on 1 and 38 DF,  p-value: 0.001265
anova(model)
## Analysis of Variance Table
## 
## Response: GRADE
##           Df Sum Sq Mean Sq F value   Pr(>F)   
## BOOKS      1 2633.5 2633.51   12.13 0.001265 **
## Residuals 38 8250.4  217.12                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

2.7 Coefficients

summary(model3)
## 
## Call:
## lm(formula = GRADE ~ BOOKS + ATTEND, data = data1_1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.802 -13.374   0.060   9.173  32.295 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   37.379      7.745   4.827 2.41e-05 ***
## BOOKS          4.037      1.753   2.303   0.0270 *  
## ATTEND         1.284      0.587   2.187   0.0352 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.05 on 37 degrees of freedom
## Multiple R-squared:  0.3287, Adjusted R-squared:  0.2924 
## F-statistic: 9.059 on 2 and 37 DF,  p-value: 0.0006278
  • Intercept (Constant): value of the dependent variable, when all independent variables are zero.
  • Coefficient BOOKS: How much reading each extra book increases GRADE if everyone attended the same number of lectures.
  • Coefficient ATTEND: How much attending an extra lecture increases GRADE if everyone read the same number of books.

Standardized coefficients

data1_1sc <- data.frame(scale(data1_1))
model3sc <- lm(GRADE ~ BOOKS + ATTEND, data = data1_1sc)
summary(model3sc)
## 
## Call:
## lm(formula = GRADE ~ BOOKS + ATTEND, data = data1_1sc)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.24523 -0.80055  0.00359  0.54907  1.93319 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 1.397e-16  1.330e-01   0.000   1.0000  
## BOOKS       3.461e-01  1.503e-01   2.303   0.0270 *
## ATTEND      3.286e-01  1.503e-01   2.187   0.0352 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8412 on 37 degrees of freedom
## Multiple R-squared:  0.3287, Adjusted R-squared:  0.2924 
## F-statistic: 9.059 on 2 and 37 DF,  p-value: 0.0006278
# alternative
library(QuantPsyc)
## Warning: package 'QuantPsyc' was built under R version 3.2.5
## Loading required package: boot
## 
## Attaching package: 'boot'
## The following object is masked from 'package:psych':
## 
##     logit
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Attaching package: 'QuantPsyc'
## The following object is masked from 'package:base':
## 
##     norm
lm.beta(model3)
##     BOOKS    ATTEND 
## 0.3460987 0.3286422

The standardized coefficients tell us by how many standard deviations GRADE is expected to change for a one standard deviation increase in an independent variable, with the other independent variables held constant.
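
They can also be recovered from the unstandardized model; a minimal sketch using the identity beta_k = B_k * sd(x_k)/sd(y):

coef(model3)[-1] * sapply(data1_1[c("BOOKS", "ATTEND")], sd)/sd(data1_1$GRADE)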

2.8 Variable entry

Hierarchical variable entry

data2_1 <- read.delim("data2_1.dat")

Codebook:

  • CAR: number of minutes per week a person spends looking after their car
  • EXTRO: measure of extroversion score
cor(data2_1$EXTRO, data2_1$CAR)    # R
## [1] 0.6711337
cor(data2_1$EXTRO, data2_1$CAR)^2  # R squared
## [1] 0.4504204
plot(data2_1$EXTRO, data2_1$CAR)

What we want is to estimate the effect of EXTRO on CAR while controlling for the effects of SEX and AGE.

model4 <- lm(CAR ~ EXTRO + SEX + AGE, data = data2_1)
summary(model4)
## 
## Call:
## lm(formula = CAR ~ EXTRO + SEX + AGE, data = data2_1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.002  -9.758  -1.206   8.940  24.712 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  11.4260     7.3084   1.563 0.126704    
## EXTRO         0.4633     0.1296   3.574 0.001023 ** 
## SEX          20.0377     4.6502   4.309 0.000121 ***
## AGE           0.1543     0.2062   0.749 0.458949    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.01 on 36 degrees of freedom
## Multiple R-squared:  0.6383, Adjusted R-squared:  0.6081 
## F-statistic: 21.18 on 3 and 36 DF,  p-value: 4.451e-08
lm.beta(model4)
##      EXTRO        SEX        AGE 
## 0.44056432 0.48813876 0.08455706

What we are really interested in is the effect that EXTRO has above and beyond the effects of AGE and SEX - that is to say, we want to assess the effects of EXTRO after the effects of AGE and SEX have been removed.

model4b <- lm(CAR ~ SEX + AGE, data = data2_1)
# alternative: eliminate EXTRO from model4
# model4b2 <- update(model4, .~.-EXTRO)
summary(model4b)
## 
## Call:
## lm(formula = CAR ~ SEX + AGE, data = data2_1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.606 -10.126  -4.173  10.079  42.425 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  12.5627     8.3830   1.499   0.1425    
## SEX          27.7406     4.7309   5.864 9.61e-07 ***
## AGE           0.4921     0.2104   2.339   0.0248 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.94 on 37 degrees of freedom
## Multiple R-squared:  0.5099, Adjusted R-squared:  0.4835 
## F-statistic: 19.25 on 2 and 37 DF,  p-value: 1.86e-06
Rsq.SEX.AGE <- summary(model4b)$r.squared
Rsq.EXTRO.SEX.AGE <- summary(model4)$r.squared
rbind(Rsq.SEX.AGE, Rsq.EXTRO.SEX.AGE)
##                        [,1]
## Rsq.SEX.AGE       0.5099419
## Rsq.EXTRO.SEX.AGE 0.6382828
increaseInRsq <- Rsq.EXTRO.SEX.AGE - Rsq.SEX.AGE
increaseInRsq
## [1] 0.1283409
# compare models with anova
anova(model4b, model4)
## Analysis of Variance Table
## 
## Model 1: CAR ~ SEX + AGE
## Model 2: CAR ~ EXTRO + SEX + AGE
##   Res.Df    RSS Df Sum of Sq      F   Pr(>F)   
## 1     37 8257.7                                
## 2     36 6095.1  1    2162.6 12.773 0.001023 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The increase in \(R^2\) for the larger model is significant, so EXTRO does predict CAR over and above SEX and AGE. Looking at the actual increase, however, \(R^2\) rises by only 0.128 (about 13 percentage points), so EXTRO accounts for considerably less of the variance than the simple \(R^2\) of 0.45 suggested.
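
A quick check: when a single predictor is added, the model-comparison F equals the squared t statistic for that predictor in the larger model:

c(F = anova(model4b, model4)$F[2],
  t_squared = summary(model4)$coef["EXTRO", "t value"]^2)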

2.9 Methods of variable entry

see book pp.38f

R-packages:

  • MASS::stepAIC()
  • leaps::regsubsets()
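
A purely illustrative sketch of automated entry with MASS::stepAIC(), applied to model4 from above (the book treats stepwise methods with caution):

library(MASS)
step_model <- stepAIC(model4, direction = "both", trace = FALSE)
summary(step_model)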

3. Categorical independent variables

3.1 The t-test as regression

data3_1 <- read.delim("data3_1.dat")
data3_1$GROUP <- factor(data3_1$GROUP, levels = c(0, 1), labels = c("con", "exp"))

data3_1 %>%
  group_by(GROUP) %>%
  summarise(
    N = n(),
    Mean = mean(SCORE),
    SD = sd(SCORE),
    SE = SD/sqrt(n())
  )
## Source: local data frame [2 x 5]
## 
##    GROUP     N  Mean       SD        SE
##   (fctr) (int) (dbl)    (dbl)     (dbl)
## 1    con    10  10.1 1.791957 0.5666667
## 2    exp    10  12.6 2.065591 0.6531973
t.test(data3_1$SCORE ~ data3_1$GROUP, paired = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  data3_1$SCORE by data3_1$GROUP
## t = -2.891, df = 17.648, p-value = 0.009874
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -4.3193527 -0.6806473
## sample estimates:
## mean in group con mean in group exp 
##              10.1              12.6
summary(lm(SCORE ~ GROUP, data = data3_1))
## 
## Call:
## lm(formula = SCORE ~ GROUP, data = data3_1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.600 -1.600 -0.350  1.275  3.400 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  10.1000     0.6115  16.518 2.54e-12 ***
## GROUPexp      2.5000     0.8647   2.891  0.00973 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.934 on 18 degrees of freedom
## Multiple R-squared:  0.3171, Adjusted R-squared:  0.2792 
## F-statistic: 8.358 on 1 and 18 DF,  p-value: 0.009732

Compare the output of t.test and lm:

  • The intercept is the same as the mean of the con group.
  • The slope is the same as the difference between the two group means.
  • The t statistic has the same magnitude; the p-values differ slightly only because t.test() defaults to the Welch correction (with var.equal = TRUE the match is exact, as checked below).
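
For reference, the pooled-variance version of the t-test, which reproduces the regression result exactly (df = 18):

t.test(SCORE ~ GROUP, data = data3_1, var.equal = TRUE)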

3.3 ANOVA as regression

Coding schemes for categorical data

data3_2a <- read.delim("data3_2a.dat")
data3_2a$GROUP <- factor(data3_2a$GROUP, levels = c(0, 1, 2), labels = c("con", "mnem", "aroma"))

data3_2a %>%
  group_by(GROUP) %>%
  summarise(
    N = n(),
    Mean = mean(SCORE),
    SD = sd(SCORE),
    SE = sd(SCORE)/sqrt(n()),
    CI.lower = Mean -1.96 * SE,
    CI.upper = Mean +1.96 * SE 
  )
## Source: local data frame [3 x 7]
## 
##    GROUP     N  Mean       SD        SE  CI.lower CI.upper
##   (fctr) (int) (dbl)    (dbl)     (dbl)     (dbl)    (dbl)
## 1    con    10  10.1 1.791957 0.5666667  8.989333 11.21067
## 2   mnem    10  12.6 2.065591 0.6531973 11.319733 13.88027
## 3  aroma    10   8.9 2.024846 0.6403124  7.644988 10.15501
a <- aov(SCORE ~ GROUP, data = data3_2a)
summary(a)
##             Df Sum Sq Mean Sq F value  Pr(>F)    
## GROUP        2  71.27   35.63   9.233 0.00088 ***
## Residuals   27 104.20    3.86                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pairwise.t.test(data3_2a$SCORE, data3_2a$GROUP, paired = FALSE, p.adjust.method = "none")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  data3_2a$SCORE and data3_2a$GROUP 
## 
##       con     mnem   
## mnem  0.00836 -      
## aroma 0.18325 0.00025
## 
## P value adjustment method: none
posthoc <- TukeyHSD(a, "GROUP")
posthoc
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = SCORE ~ GROUP, data = data3_2a)
## 
## $GROUP
##            diff        lwr        upr     p adj
## mnem-con    2.5  0.3217051  4.6782949 0.0220484
## aroma-con  -1.2 -3.3782949  0.9782949 0.3724648
## aroma-mnem -3.7 -5.8782949 -1.5217051 0.0007162
plot(posthoc, col = "blue")

Interpretation

  • The summary statistics indicate that the mnemonics condition (1) has the highest mean recall of the three groups, while the control (0) and aromatherapy (2) conditions are relatively similar.
  • The ANOVA confirms that the group means are not all equal.
  • The post hoc tests show significant differences between the mnemonics and control conditions and between the mnemonics and aromatherapy conditions; the mnemonics condition has the highest mean, and the difference between the control and aromatherapy conditions is not significant.

Dummy coding

It compares each level of the categorical variable to a fixed reference level.

require(psych)
dummies <- dummy.code(data3_2a$GROUP)
data3_2a <- cbind(data3_2a, dummies)
head(data3_2a)
##   GROUP SCORE con mnem aroma
## 1   con    10   1    0     0
## 2   con     8   1    0     0
## 3   con    13   1    0     0
## 4   con     9   1    0     0
## 5   con    10   1    0     0
## 6   con    13   1    0     0
# omit reference group dummy-variable as predictor
m3 <- lm(SCORE ~ mnem + aroma, data = data3_2a)
summary(m3)
## 
## Call:
## lm(formula = SCORE ~ mnem + aroma, data = data3_2a)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.900 -1.475 -0.100  1.050  4.100 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  10.1000     0.6212  16.258  1.8e-15 ***
## mnem          2.5000     0.8786   2.846  0.00836 ** 
## aroma        -1.2000     0.8786  -1.366  0.18325    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.964 on 27 degrees of freedom
## Multiple R-squared:  0.4062, Adjusted R-squared:  0.3622 
## F-statistic: 9.233 on 2 and 27 DF,  p-value: 0.0008802
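
The same fit can be obtained without building the dummies by hand: R's default treatment contrasts for a factor are dummy codes with the first level (here con) as the reference group. A quick check:

summary(lm(SCORE ~ GROUP, data = data3_2a))$coef
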
# standardized slopes
m3sc <- lm(scale(SCORE) ~ scale(mnem) + scale(aroma), data = data3_2a)
summary(m3sc)
## 
## Call:
## lm(formula = scale(SCORE) ~ scale(mnem) + scale(aroma), data = data3_2a)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.17896 -0.59964 -0.04065  0.42687  1.66681 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   1.261e-16  1.458e-01   0.000  1.00000   
## scale(mnem)   4.873e-01  1.712e-01   2.846  0.00836 **
## scale(aroma) -2.339e-01  1.712e-01  -1.366  0.18325   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7986 on 27 degrees of freedom
## Multiple R-squared:  0.4062, Adjusted R-squared:  0.3622 
## F-statistic: 9.233 on 2 and 27 DF,  p-value: 0.0008802
anova(m3)
## Analysis of Variance Table
## 
## Response: SCORE
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## mnem       1  64.067  64.067 16.6008 0.0003634 ***
## aroma      1   7.200   7.200  1.8656 0.1832456    
## Residuals 27 104.200   3.859                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Interpretation

  • The ANOVA table shows a significant result for mnem but not for aroma, meaning that the null hypothesis of no mean difference can be rejected for the mnemonics contrast but not for the aromatherapy contrast.
  • The regression coefficients (slopes) give the difference between the mean of each group and the mean of the reference group. The coefficient for mnem is statistically significant but the coefficient for aroma is not: the means of the con and mnem conditions differ significantly, whereas the means of the con and aromatherapy conditions do not.
  • The sign of the slope gives the direction of the mean difference.

Bonferroni correction for multiple testing

  • Number of comparisons: con vs. mnem, con vs. aroma -> 2 comparisons
alpha <- .05
n.comparisons <- 2

# Probability of a type 1 error in multiple testing
type1error.inflation <- 1 - (1-alpha)^n.comparisons
type1error.inflation
## [1] 0.0975
# Bonferroni correction
alpha.bonferroni <- alpha/n.comparisons
alpha.bonferroni
## [1] 0.025
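
The correction can also be applied directly to the pairwise tests (compare with the unadjusted p-values shown earlier):

pairwise.t.test(data3_2a$SCORE, data3_2a$GROUP, p.adjust.method = "bonferroni")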

Effect coding

see also deviation coding IDRE, UCLA

This coding system compares the mean of the dependent variable for a given level to the overall mean of the dependent variable.

data3_3 <- read.delim("data3_3a.dat")
data3_3$JOB <- factor(data3_3$JOB, levels = c(1:5))
head(data3_3); tail(data3_3)
##   JOB STRESS
## 1   1     71
## 2   1     67
## 3   1     67
## 4   1     67
## 5   1     79
## 6   1     46
##    JOB STRESS
## 45   5     71
## 46   5     34
## 47   5     70
## 48   5     59
## 49   5     46
## 50   5     68
# levels of categorical variable
levels(data3_3$JOB)
## [1] "1" "2" "3" "4" "5"
# the contrast matrix for categorical variable with five levels
contr.sum(5)
##   [,1] [,2] [,3] [,4]
## 1    1    0    0    0
## 2    0    1    0    0
## 3    0    0    1    0
## 4    0    0    0    1
## 5   -1   -1   -1   -1
# assigning the deviation contrasts to JOB
contrasts(data3_3$JOB) <- contr.sum(5)

#the regression
m33 <- lm(STRESS ~ JOB, data = data3_3)
summary(m33)
## 
## Call:
## lm(formula = STRESS ~ JOB, data = data3_3)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31.60 -11.53  -0.40  10.90  34.40 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   57.320      2.289  25.043   <2e-16 ***
## JOB1          10.080      4.578   2.202   0.0328 *  
## JOB2           0.280      4.578   0.061   0.9515    
## JOB3           7.280      4.578   1.590   0.1188    
## JOB4         -12.620      4.578  -2.757   0.0084 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.18 on 45 degrees of freedom
## Multiple R-squared:  0.2234, Adjusted R-squared:  0.1544 
## F-statistic: 3.237 on 4 and 45 DF,  p-value: 0.02038
anova(m33)
## Analysis of Variance Table
## 
## Response: STRESS
##           Df  Sum Sq Mean Sq F value  Pr(>F)  
## JOB        4  3391.5  847.87  3.2369 0.02038 *
## Residuals 45 11787.4  261.94                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Interpretation

  • The ANOVA table shows that the overall result is significant, so the null hypothesis of equal means can be rejected.
  • The regression coefficients show where these differences arise: the intercept (constant) is the overall (grand) mean across groups, and the slope coefficients give the amount by which each group differs from that overall mean.
  • JOB1 and JOB4 have means that differ significantly from the overall mean.
  • This analysis gives no coefficient for JOB5, because that level is coded -1 on every contrast. To test its deviation from the grand mean directly we would re-run the model with the contrasts arranged so that a different level is omitted; alternatively the deviation can be recovered as sketched below.
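
Because the effect-coded deviations sum to zero, the deviation of JOB5 from the grand mean can be recovered without refitting; a minimal sketch:

# deviation of JOB5 from the grand mean: minus the sum of the estimated
# effect-coded coefficients
-sum(coef(m33)[-1])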