5 January 2016

The data file to be used

Load data into your workspace

Use syntax

load('/User/stephanievandenberg/Dropbox/BMS R workshop/Workshop package/Data/Linear models/data_Browsing_AB')

Slides with syntax

The experiment

## Warning: package 'ape' was built under R version 3.2.3
## Warning: package 'ggplot2' was built under R version 3.2.3
## Warning: package 'GGally' was built under R version 3.2.3
Subj education gender age Design ToT predict_M3 resid_M3
23 degree m 28.41143 A 62.30250 71.05689 -8.7543906
24 degree m 28.41143 B 71.14089 71.05689 0.0839968
45 no degree f 38.93923 A 165.00313 127.98771 37.0154158
46 no degree f 38.93923 B 176.26674 127.98771 48.2790327
67 degree f 76.07241 A 105.24776 110.07658 -4.8288145
68 degree f 76.07241 B 112.53900 110.07658 2.4624247
89 no degree f 39.65661 A 150.06767 128.57150 21.4961717
90 no degree f 39.65661 B 113.33963 128.57150 -15.2318657
111 degree m 22.77343 A 159.78351 66.46883 93.3146799
112 degree m 22.77343 B 71.04381 66.46883 4.5749828

One-way ANOVA

ANOVA_Design <-
  data_Browsing_AB %>% 
  aov(ToT ~ Design, data = .)

summary(ANOVA_Design) 
##              Df Sum Sq Mean Sq F value Pr(>F)  
## Design        1   4558    4558    2.83 0.0941 .
## Residuals   198 318858    1610                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Two-factorial ANOVA (1)

data_Browsing_AB %>%
  ggplot(aes(x = education, y = ToT, fill = Design)) +
  geom_boxplot()

data_Browsing_AB %>% aggregate( ToT~education+Design, . ,  mean )
##   education Design       ToT
## 1    degree      A  91.80865
## 2 no degree      A 141.78354
## 3    degree      B  83.54079
## 4 no degree      B 130.95649

Two-factorial ANOVA (2)

ANOVA_education_design <- 
  data_Browsing_AB %>% 
  aov(ToT ~ education + Design,  data = .)
summary(ANOVA_education_design) 
##              Df Sum Sq Mean Sq F value Pr(>F)    
## education     1 118562  118562 116.610 <2e-16 ***
## Design        1   4558    4558   4.483 0.0355 *  
## Residuals   197 200297    1017                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ANOVA model coefficients

coef(ANOVA_education_design) 
##        (Intercept) educationno degree            DesignB 
##          92.448450          48.695291          -9.547452
data_Browsing_AB %>% aggregate( ToT~education+Design, . ,  mean )
##   education Design       ToT
## 1    degree      A  91.80865
## 2 no degree      A 141.78354
## 3    degree      B  83.54079
## 4 no degree      B 130.95649

Full factorial ANOVA (1)

ANOVA_fullfact <- 
  data_Browsing_AB %>% 
  aov(ToT ~  education + Design + education:Design,  data = .)

Full factorial ANOVA (2)

summary(ANOVA_fullfact) ; coef(ANOVA_fullfact)
##                   Df Sum Sq Mean Sq F value Pr(>F)    
## education          1 118562  118562 116.066 <2e-16 ***
## Design             1   4558    4558   4.462 0.0359 *  
## education:Design   1     82      82   0.080 0.7774    
## Residuals        196 200215    1022                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##                (Intercept)         educationno degree 
##                  91.808655                  49.974882 
##                    DesignB educationno degree:DesignB 
##                  -8.267861                  -2.559182

Linear regression (1)

data_Browsing_AB %>%
  ggplot(aes(x = age, y = ToT)) +  geom_point() + geom_smooth()

Linear regression (2)

LM_age <-
  data_Browsing_AB %>% 
  lm(ToT ~ 1 + age, data = .) 
summary(LM_age)
## 
## Call:
## lm(formula = ToT ~ 1 + age, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -88.737 -28.738  -2.309  27.947  89.035 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  70.8466     7.8210   9.058  < 2e-16 ***
## age           0.8397     0.1500   5.597 7.19e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37.55 on 198 degrees of freedom
## Multiple R-squared:  0.1366, Adjusted R-squared:  0.1323 
## F-statistic: 31.33 on 1 and 198 DF,  p-value: 7.193e-08

Full factorial regression analysis (1)

LM_fullfact <- 
 data_Browsing_AB %>% lm(ToT ~ 1 + education + Design + education:Design
                                , data=.)

Full factorial regression analysis (2)

summary(LM_fullfact)
## 
## Call:
## lm(formula = ToT ~ 1 + education + Design + education:Design, 
##     data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -80.389 -22.628  -0.038  23.251  80.299 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  91.809      4.520  20.312  < 2e-16 ***
## educationno degree           49.975      6.392   7.818 3.22e-13 ***
## DesignB                      -8.268      6.392  -1.293    0.197    
## educationno degree:DesignB   -2.559      9.040  -0.283    0.777    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31.96 on 196 degrees of freedom
## Multiple R-squared:  0.3809, Adjusted R-squared:  0.3715 
## F-statistic:  40.2 on 3 and 196 DF,  p-value: < 2.2e-16

Same results from aov and lm

coef(LM_fullfact)
##                (Intercept)         educationno degree 
##                  91.808655                  49.974882 
##                    DesignB educationno degree:DesignB 
##                  -8.267861                  -2.559182
coef(ANOVA_fullfact)
##                (Intercept)         educationno degree 
##                  91.808655                  49.974882 
##                    DesignB educationno degree:DesignB 
##                  -8.267861                  -2.559182

Mixing quant. and qual. predictors

M3 <- data_Browsing_AB %>% lm( ToT ~ 1 + age + education + gender,  data = .)
summary(M3)
## 
## Call:
## lm(formula = ToT ~ 1 + age + education + gender, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -70.207 -20.091   0.067  17.468  93.315 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         48.1707     6.8017   7.082 2.48e-11 ***
## age                  0.8138     0.1154   7.050 2.99e-11 ***
## educationno degree  48.1292     4.1036  11.729  < 2e-16 ***
## genderm             -0.2343     4.1072  -0.057    0.955    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28.87 on 196 degrees of freedom
## Multiple R-squared:  0.495,  Adjusted R-squared:  0.4872 
## F-statistic: 64.03 on 3 and 196 DF,  p-value: < 2.2e-16

Ridiculous model (1)

M_ridiculous <- 
data_Browsing_AB %>%
lm( ToT ~ 1 + ( age  + education + gender + Design)^4,         
    data = .)

Ridiculous model (2)

summary(M_ridiculous)$coefficients [,c(1,4)]
##                                           Estimate     Pr(>|t|)
## (Intercept)                             93.9001070 1.870167e-07
## age                                      0.0525118 8.785377e-01
## educationno degree                     -15.0197860 5.224265e-01
## genderm                                 -2.5760946 9.081278e-01
## DesignB                                -68.5707862 5.694932e-03
## age:educationno degree                   1.1445826 1.156574e-02
## age:genderm                             -0.1155961 7.896880e-01
## age:DesignB                              1.1619846 1.764558e-02
## educationno degree:genderm              46.2295763 1.565939e-01
## educationno degree:DesignB              39.4811746 2.351377e-01
## genderm:DesignB                         21.3415639 4.992897e-01
## age:educationno degree:genderm          -0.6400597 3.116495e-01
## age:educationno degree:DesignB          -0.7782987 2.216082e-01
## age:genderm:DesignB                     -0.2971787 6.278587e-01
## educationno degree:genderm:DesignB     -58.9175425 2.015040e-01
## age:educationno degree:genderm:DesignB   1.0678506 2.328924e-01

Less ridiculous model (1)

M_lessridiculous <- 
data_Browsing_AB %>%
lm( ToT ~ 1 + (age + education + gender + Design)^3,         
    data = .)

Less ridiculous model (2)

summary(M_lessridiculous)$coefficients[,c(1,4)]
##                                       Estimate     Pr(>|t|)
## (Intercept)                         86.3734250 2.679385e-07
## age                                  0.2104618 5.077195e-01
## educationno degree                  -1.7121244 9.340330e-01
## genderm                              9.5722892 6.305695e-01
## DesignB                            -53.5174222 1.187420e-02
## age:educationno degree               0.8744501 2.549851e-02
## age:genderm                         -0.3668520 3.341991e-01
## age:DesignB                          0.8460845 3.933877e-02
## educationno degree:genderm          20.3151378 4.035751e-01
## educationno degree:DesignB          12.8658513 6.017243e-01
## genderm:DesignB                     -2.9552037 9.027354e-01
## age:educationno degree:genderm      -0.1061344 8.124233e-01
## age:educationno degree:DesignB      -0.2380336 5.946687e-01
## age:genderm:DesignB                  0.2053332 6.456632e-01
## educationno degree:genderm:DesignB  -7.0886655 6.463750e-01

Reasonable model (1)

M_reasonable <- 
data_Browsing_AB %>%
lm( ToT ~ 1 + (age + education + gender + Design)^2,         
    data = .)

Reasonable model (2)

summary(M_reasonable)$coefficients[,c(1,4)] 
##                               Estimate     Pr(>|t|)
## (Intercept)                 84.0830023 7.938233e-11
## age                          0.2393769 2.966175e-01
## educationno degree           8.7321025 4.953211e-01
## genderm                      8.5360261 4.977448e-01
## DesignB                    -51.9289037 5.691257e-05
## age:educationno degree       0.7017360 1.784847e-03
## age:genderm                 -0.3141304 1.569879e-01
## age:DesignB                  0.8510494 9.911922e-05
## educationno degree:genderm  11.6194996 1.303931e-01
## educationno degree:DesignB  -2.7319742 7.198764e-01
## genderm:DesignB              3.9470652 6.047568e-01

Observed ToTs (1)

data_Browsing_AB %>%
        ggplot(aes(x = age,  y = ToT, 
        shape = education, col= Design, size=10)) +
        geom_point() + geom_smooth( method='lm', se=F)

Observed ToTs (2)

Predicted ToTs

data_Browsing_AB %>%
        mutate(   pred_Mfull2 = predict(M_reasonable )) %>%
        ggplot(aes(x = age, y = pred_Mfull2, 
    shape = education, col= Design, size=10)) + geom_point() 

Concluding remarks

  • y ~ a:b:c =
    y ~ 1 + a + b + c + a:b + a:c + a:c + a:b:c

  • Variables specified as a factor are analysed as categorical predictors, even when labelled with numbers. So be careful when you have more than two levels in a variable. Changing a numeric variable into a factor can be done by as.factor(variable_name). Always check the degrees of freedom in your output.