dplyr and Experimental Designs

Published

May 7, 2024

df0<-read.csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv")
df0[df0 == ""] <- NA
df1<-na.omit(df0)
library(dplyr)
library(ggplot2)
library(ggthemes)
library(tidyverse)
library(leaps)
library(olsrr)
library(lmtest)

Basic Data Structure

#str(df1)
for(i in c(1,2,7)){
  df1[,i]<-factor(df1[,i])
}
str(df1)
'data.frame':   333 obs. of  7 variables:
 $ species          : Factor w/ 3 levels "Adelie","Chinstrap",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ island           : Factor w/ 3 levels "Biscoe","Dream",..: 3 3 3 3 3 3 3 3 3 3 ...
 $ bill_length_mm   : num  39.1 39.5 40.3 36.7 39.3 38.9 39.2 41.1 38.6 34.6 ...
 $ bill_depth_mm    : num  18.7 17.4 18 19.3 20.6 17.8 19.6 17.6 21.2 21.1 ...
 $ flipper_length_mm: int  181 186 195 193 190 181 195 182 191 198 ...
 $ body_mass_g      : int  3750 3800 3250 3450 3650 3625 4675 3200 3800 4400 ...
 $ sex              : Factor w/ 2 levels "FEMALE","MALE": 2 1 1 1 2 1 2 1 2 2 ...
 - attr(*, "na.action")= 'omit' Named int [1:11] 4 9 10 11 12 48 247 287 325 337 ...
  ..- attr(*, "names")= chr [1:11] "4" "9" "10" "11" ...

Compute some summary statistics

df1%>%
  group_by(species)%>%
  summarise(`Mean Body Mass`=mean(body_mass_g),std=sd(body_mass_g),n=n())
# A tibble: 3 × 4
  species   `Mean Body Mass`   std     n
  <fct>                <dbl> <dbl> <int>
1 Adelie               3706.  459.   146
2 Chinstrap            3733.  384.    68
3 Gentoo               5092.  501.   119

Using Tukey’s Test to see difference

mod_anova<-aov(body_mass_g~species,df1)
summary(mod_anova)
             Df    Sum Sq  Mean Sq F value Pr(>F)    
species       2 145190219 72595110   341.9 <2e-16 ***
Residuals   330  70069447   212332                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(mod_anova)
  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = body_mass_g ~ species, data = df1)

$species
                       diff       lwr       upr    p adj
Chinstrap-Adelie   26.92385 -132.3528  186.2005 0.916431
Gentoo-Adelie    1386.27259 1252.2897 1520.2554 0.000000
Gentoo-Chinstrap 1359.34874 1194.4304 1524.2671 0.000000
plot(TukeyHSD(mod_anova))

In terms of species, the differences in mean body mass were significant among Gentoo vs Adelie and Gentoo vs Chinstrap, but the difference between Chinstrap vs Adelie was not significant.

Let’s say you would like to re-group them into 2 groups, Gentoo and others

df1%>%
  mutate(alt_species=
           case_when(species %in% c("Adelie","Chinstrap") ~"Others",
                     TRUE ~ "Gentoo"))%>%
  group_by(alt_species)%>%
  summarise(`Mean Body Mass`=mean(body_mass_g),std=sd(body_mass_g),n=n())
# A tibble: 2 × 4
  alt_species `Mean Body Mass`   std     n
  <chr>                  <dbl> <dbl> <int>
1 Gentoo                 5092.  501.   119
2 Others                 3715.  436.   214

Using t.test

df2<-df1%>%
  mutate(alt_species=
           case_when(species %in% c("Adelie","Chinstrap") ~"Others",
                     TRUE ~ "Gentoo"))%>%
  select(-species)
t.test(body_mass_g~alt_species,df2,var.equal=FALSE)

    Welch Two Sample t-test

data:  body_mass_g by alt_species
t = 25.153, df = 216.69, p-value < 2.2e-16
alternative hypothesis: true difference in means between group Gentoo and group Others is not equal to 0
95 percent confidence interval:
 1269.759 1485.676
sample estimates:
mean in group Gentoo mean in group Others 
            5092.437             3714.720 
df1%>%
  ggplot(aes(x=species,y=body_mass_g,group=sex,col=sex))+
  stat_summary(fun=mean,geom="point")+
  stat_summary(fun=mean,geom="line")+theme_stata(scheme = "s1color")+
  scale_color_stata()+labs(y="Body Mass in Grams")

Regression Model

mod0<-lm(body_mass_g~.,df1)
summary(mod0)

Call:
lm(formula = body_mass_g ~ ., data = df1)

Residuals:
    Min      1Q  Median      3Q     Max 
-779.20 -167.35   -3.16  179.37  914.27 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)       -1500.029    575.822  -2.605 0.009610 ** 
speciesChinstrap   -260.306     88.551  -2.940 0.003522 ** 
speciesGentoo       987.761    137.238   7.197 4.30e-12 ***
islandDream         -13.103     58.541  -0.224 0.823032    
islandTorgersen     -48.064     60.922  -0.789 0.430722    
bill_length_mm       18.189      7.136   2.549 0.011270 *  
bill_depth_mm        67.575     19.821   3.409 0.000734 ***
flipper_length_mm    16.239      2.939   5.524 6.80e-08 ***
sexMALE             387.224     48.138   8.044 1.66e-14 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 287.9 on 324 degrees of freedom
Multiple R-squared:  0.8752,    Adjusted R-squared:  0.8721 
F-statistic: 284.1 on 8 and 324 DF,  p-value: < 2.2e-16
AIC(mod0)
[1] 4727.242
plot(mod0)

resettest(mod0,power=2:3,type=("fitted"))

    RESET test

data:  mod0
RESET = 3.3115, df1 = 2, df2 = 322, p-value = 0.03771
ols_test_breusch_pagan(mod0)

 Breusch Pagan Test for Heteroskedasticity
 -----------------------------------------
 Ho: the variance is constant            
 Ha: the variance is not constant        

                 Data                   
 ---------------------------------------
 Response : body_mass_g 
 Variables: fitted values of body_mass_g 

         Test Summary          
 ------------------------------
 DF            =    1 
 Chi2          =    0.008620809 
 Prob > Chi2   =    0.9260241 

There might be an issue of miss-specification

mod1<-lm(body_mass_g~.+(.)^2,df1)
summary(mod1)

Call:
lm(formula = body_mass_g ~ . + (.)^2, data = df1)

Residuals:
    Min      1Q  Median      3Q     Max 
-720.44 -172.21   -9.57  172.43  843.21 

Coefficients: (4 not defined because of singularities)
                                     Estimate Std. Error t value Pr(>|t|)  
(Intercept)                        -9474.7116 14278.3880  -0.664   0.5075  
speciesChinstrap                   -5039.2580  3452.6609  -1.460   0.1455  
speciesGentoo                        766.5496  4366.4877   0.176   0.8608  
islandDream                         3819.3950  2035.8064   1.876   0.0616 .
islandTorgersen                     5185.2661  2264.3522   2.290   0.0227 *
bill_length_mm                       232.2690   254.1576   0.914   0.3615  
bill_depth_mm                         55.5615   743.6789   0.075   0.9405  
flipper_length_mm                     45.9044    81.7854   0.561   0.5750  
sexMALE                             -348.4382  2199.7259  -0.158   0.8742  
speciesChinstrap:islandDream               NA         NA      NA       NA  
speciesGentoo:islandDream                  NA         NA      NA       NA  
speciesChinstrap:islandTorgersen           NA         NA      NA       NA  
speciesGentoo:islandTorgersen              NA         NA      NA       NA  
speciesChinstrap:bill_length_mm       11.7772    26.6817   0.441   0.6592  
speciesGentoo:bill_length_mm          12.7100    59.6581   0.213   0.8314  
speciesChinstrap:bill_depth_mm        27.6850   109.6793   0.252   0.8009  
speciesGentoo:bill_depth_mm          -39.2712   132.3282  -0.297   0.7668  
speciesChinstrap:flipper_length_mm    21.6943    16.1507   1.343   0.1802  
speciesGentoo:flipper_length_mm        2.2869    20.0925   0.114   0.9095  
speciesChinstrap:sexMALE            -752.8800   309.2195  -2.435   0.0155 *
speciesGentoo:sexMALE               -498.9192   397.9249  -1.254   0.2109  
islandDream:bill_length_mm           -23.7795    31.9870  -0.743   0.4578  
islandTorgersen:bill_length_mm       -19.8607    29.1421  -0.682   0.4961  
islandDream:bill_depth_mm            -41.0968    62.6284  -0.656   0.5122  
islandTorgersen:bill_depth_mm        -52.9501    64.3628  -0.823   0.4113  
islandDream:flipper_length_mm        -11.8875     9.4066  -1.264   0.2073  
islandTorgersen:flipper_length_mm    -18.5435    10.7728  -1.721   0.0862 .
islandDream:sexMALE                  176.7459   163.5401   1.081   0.2807  
islandTorgersen:sexMALE              106.4357   187.1268   0.569   0.5699  
bill_length_mm:bill_depth_mm          -1.0975     9.0604  -0.121   0.9037  
bill_length_mm:flipper_length_mm      -0.9886     1.2604  -0.784   0.4334  
bill_length_mm:sexMALE                17.9304    25.0540   0.716   0.4747  
bill_depth_mm:flipper_length_mm        0.6729     4.0201   0.167   0.8672  
bill_depth_mm:sexMALE                -75.4073    57.1548  -1.319   0.1881  
flipper_length_mm:sexMALE              7.3974     8.9240   0.829   0.4078  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 280.3 on 302 degrees of freedom
Multiple R-squared:  0.8898,    Adjusted R-squared:  0.8788 
F-statistic: 81.25 on 30 and 302 DF,  p-value: < 2.2e-16
plot(mod1)

resettest(mod1,power=2:3,type=("fitted"))

    RESET test

data:  mod1
RESET = 1.058, df1 = 2, df2 = 296, p-value = 0.3485
ols_test_breusch_pagan(mod1)

 Breusch Pagan Test for Heteroskedasticity
 -----------------------------------------
 Ho: the variance is constant            
 Ha: the variance is not constant        

                 Data                   
 ---------------------------------------
 Response : body_mass_g 
 Variables: fitted values of body_mass_g 

        Test Summary         
 ----------------------------
 DF            =    1 
 Chi2          =    0.5942284 
 Prob > Chi2   =    0.4407887 

Let’s do Model Selection Based on AIC

Without interaction

mod0_full<-lm(body_mass_g~.,df1)
mod0_null<-lm(body_mass_g~1,df1)
step(mod0_null,direction="forward",scope=formula(mod0_full),trace=1)
Start:  AIC=4457.28
body_mass_g ~ 1

                    Df Sum of Sq       RSS    AIC
+ flipper_length_mm  1 164047703  51211963 3981.1
+ species            2 145190219  70069447 4087.5
+ island             2  83740680 131518986 4297.2
+ bill_length_mm     1  74792533 140467133 4317.1
+ bill_depth_mm      1  47959592 167300074 4375.3
+ sex                1  38878897 176380769 4392.9
<none>                           215259666 4457.3

Step:  AIC=3981.13
body_mass_g ~ flipper_length_mm

                 Df Sum of Sq      RSS    AIC
+ sex             1   9416589 41795374 3915.5
+ species         2   5368818 45843144 3948.3
+ island          2   3437527 47774435 3962.0
+ bill_depth_mm   1    338887 50873075 3980.9
<none>                        51211963 3981.1
+ bill_length_mm  1    140000 51071963 3982.2

Step:  AIC=3915.47
body_mass_g ~ flipper_length_mm + sex

                 Df Sum of Sq      RSS    AIC
+ species         2  13141806 28653568 3793.8
+ island          2   6037165 35758209 3867.5
+ bill_depth_mm   1   3667377 38127997 3886.9
<none>                        41795374 3915.5
+ bill_length_mm  1    144990 41650383 3916.3

Step:  AIC=3793.76
body_mass_g ~ flipper_length_mm + sex + species

                 Df Sum of Sq      RSS    AIC
+ bill_depth_mm   1   1196096 27457472 3781.6
+ bill_length_mm  1    780776 27872792 3786.6
<none>                        28653568 3793.8
+ island          2     61695 28591873 3797.0

Step:  AIC=3781.56
body_mass_g ~ flipper_length_mm + sex + species + bill_depth_mm

                 Df Sum of Sq      RSS    AIC
+ bill_length_mm  1    541825 26915647 3776.9
<none>                        27457472 3781.6
+ island          2     59488 27397984 3784.8

Step:  AIC=3776.93
body_mass_g ~ flipper_length_mm + sex + species + bill_depth_mm + 
    bill_length_mm

         Df Sum of Sq      RSS    AIC
<none>                26915647 3776.9
+ island  2     56214 26859432 3780.2

Call:
lm(formula = body_mass_g ~ flipper_length_mm + sex + species + 
    bill_depth_mm + bill_length_mm, data = df1)

Coefficients:
      (Intercept)  flipper_length_mm            sexMALE   speciesChinstrap  
         -1460.99              15.95             389.89            -251.48  
    speciesGentoo      bill_depth_mm     bill_length_mm  
          1014.63              67.22              18.20  
#Step:  AIC=3776.93
#body_mass_g ~ flipper_length_mm + sex + species + bill_depth_mm + 
#    bill_length_mm

With Interaction

mod1_full<-lm(body_mass_g~.+(.)^2,df1)
mod1_null<-lm(body_mass_g~1,df1)
step(mod1_null,direction="forward",scope=formula(mod1_full),trace=0)

Call:
lm(formula = body_mass_g ~ flipper_length_mm + sex + species + 
    bill_depth_mm + bill_length_mm + sex:species + sex:bill_depth_mm, 
    data = df1)

Coefficients:
             (Intercept)         flipper_length_mm                   sexMALE  
                -2357.33                     16.69                   1605.64  
        speciesChinstrap             speciesGentoo             bill_depth_mm  
                  -87.13                   1088.53                    105.46  
          bill_length_mm  sexMALE:speciesChinstrap     sexMALE:speciesGentoo  
                   19.67                   -360.63                   -173.40  
   sexMALE:bill_depth_mm  
                  -64.11  
#Step:  AIC=3754.04
#body_mass_g ~ flipper_length_mm + sex + species + bill_depth_mm + 
#    bill_length_mm + sex:species + sex:bill_depth_mm