df0<-read.csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv")
df0[df0 == ""] <- NA
df1<-na.omit(df0)
library(dplyr)
library(ggplot2)
library(ggthemes)
library(tidyverse)
library(leaps)
library(olsrr)
library(lmtest)dplyr and Experimental Designs
Basic Data Structure
#str(df1)
for(i in c(1,2,7)){
df1[,i]<-factor(df1[,i])
}
str(df1)'data.frame': 333 obs. of 7 variables:
$ species : Factor w/ 3 levels "Adelie","Chinstrap",..: 1 1 1 1 1 1 1 1 1 1 ...
$ island : Factor w/ 3 levels "Biscoe","Dream",..: 3 3 3 3 3 3 3 3 3 3 ...
$ bill_length_mm : num 39.1 39.5 40.3 36.7 39.3 38.9 39.2 41.1 38.6 34.6 ...
$ bill_depth_mm : num 18.7 17.4 18 19.3 20.6 17.8 19.6 17.6 21.2 21.1 ...
$ flipper_length_mm: int 181 186 195 193 190 181 195 182 191 198 ...
$ body_mass_g : int 3750 3800 3250 3450 3650 3625 4675 3200 3800 4400 ...
$ sex : Factor w/ 2 levels "FEMALE","MALE": 2 1 1 1 2 1 2 1 2 2 ...
- attr(*, "na.action")= 'omit' Named int [1:11] 4 9 10 11 12 48 247 287 325 337 ...
..- attr(*, "names")= chr [1:11] "4" "9" "10" "11" ...
Compute some summary statistics
df1%>%
group_by(species)%>%
summarise(`Mean Body Mass`=mean(body_mass_g),std=sd(body_mass_g),n=n())# A tibble: 3 × 4
species `Mean Body Mass` std n
<fct> <dbl> <dbl> <int>
1 Adelie 3706. 459. 146
2 Chinstrap 3733. 384. 68
3 Gentoo 5092. 501. 119
Using Tukey’s Test to see difference
mod_anova<-aov(body_mass_g~species,df1)
summary(mod_anova) Df Sum Sq Mean Sq F value Pr(>F)
species 2 145190219 72595110 341.9 <2e-16 ***
Residuals 330 70069447 212332
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(mod_anova) Tukey multiple comparisons of means
95% family-wise confidence level
Fit: aov(formula = body_mass_g ~ species, data = df1)
$species
diff lwr upr p adj
Chinstrap-Adelie 26.92385 -132.3528 186.2005 0.916431
Gentoo-Adelie 1386.27259 1252.2897 1520.2554 0.000000
Gentoo-Chinstrap 1359.34874 1194.4304 1524.2671 0.000000
plot(TukeyHSD(mod_anova))In terms of species, the differences in mean body mass were significant among Gentoo vs Adelie and Gentoo vs Chinstrap, but the difference between Chinstrap vs Adelie was not significant.
Let’s say you would like to re-group them into 2 groups, Gentoo and others
df1%>%
mutate(alt_species=
case_when(species %in% c("Adelie","Chinstrap") ~"Others",
TRUE ~ "Gentoo"))%>%
group_by(alt_species)%>%
summarise(`Mean Body Mass`=mean(body_mass_g),std=sd(body_mass_g),n=n())# A tibble: 2 × 4
alt_species `Mean Body Mass` std n
<chr> <dbl> <dbl> <int>
1 Gentoo 5092. 501. 119
2 Others 3715. 436. 214
Using t.test
df2<-df1%>%
mutate(alt_species=
case_when(species %in% c("Adelie","Chinstrap") ~"Others",
TRUE ~ "Gentoo"))%>%
select(-species)
t.test(body_mass_g~alt_species,df2,var.equal=FALSE)
Welch Two Sample t-test
data: body_mass_g by alt_species
t = 25.153, df = 216.69, p-value < 2.2e-16
alternative hypothesis: true difference in means between group Gentoo and group Others is not equal to 0
95 percent confidence interval:
1269.759 1485.676
sample estimates:
mean in group Gentoo mean in group Others
5092.437 3714.720
df1%>%
ggplot(aes(x=species,y=body_mass_g,group=sex,col=sex))+
stat_summary(fun=mean,geom="point")+
stat_summary(fun=mean,geom="line")+theme_stata(scheme = "s1color")+
scale_color_stata()+labs(y="Body Mass in Grams")Regression Model
mod0<-lm(body_mass_g~.,df1)
summary(mod0)
Call:
lm(formula = body_mass_g ~ ., data = df1)
Residuals:
Min 1Q Median 3Q Max
-779.20 -167.35 -3.16 179.37 914.27
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1500.029 575.822 -2.605 0.009610 **
speciesChinstrap -260.306 88.551 -2.940 0.003522 **
speciesGentoo 987.761 137.238 7.197 4.30e-12 ***
islandDream -13.103 58.541 -0.224 0.823032
islandTorgersen -48.064 60.922 -0.789 0.430722
bill_length_mm 18.189 7.136 2.549 0.011270 *
bill_depth_mm 67.575 19.821 3.409 0.000734 ***
flipper_length_mm 16.239 2.939 5.524 6.80e-08 ***
sexMALE 387.224 48.138 8.044 1.66e-14 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 287.9 on 324 degrees of freedom
Multiple R-squared: 0.8752, Adjusted R-squared: 0.8721
F-statistic: 284.1 on 8 and 324 DF, p-value: < 2.2e-16
AIC(mod0)[1] 4727.242
plot(mod0)resettest(mod0,power=2:3,type=("fitted"))
RESET test
data: mod0
RESET = 3.3115, df1 = 2, df2 = 322, p-value = 0.03771
ols_test_breusch_pagan(mod0)
Breusch Pagan Test for Heteroskedasticity
-----------------------------------------
Ho: the variance is constant
Ha: the variance is not constant
Data
---------------------------------------
Response : body_mass_g
Variables: fitted values of body_mass_g
Test Summary
------------------------------
DF = 1
Chi2 = 0.008620809
Prob > Chi2 = 0.9260241
There might be an issue of miss-specification
mod1<-lm(body_mass_g~.+(.)^2,df1)
summary(mod1)
Call:
lm(formula = body_mass_g ~ . + (.)^2, data = df1)
Residuals:
Min 1Q Median 3Q Max
-720.44 -172.21 -9.57 172.43 843.21
Coefficients: (4 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) -9474.7116 14278.3880 -0.664 0.5075
speciesChinstrap -5039.2580 3452.6609 -1.460 0.1455
speciesGentoo 766.5496 4366.4877 0.176 0.8608
islandDream 3819.3950 2035.8064 1.876 0.0616 .
islandTorgersen 5185.2661 2264.3522 2.290 0.0227 *
bill_length_mm 232.2690 254.1576 0.914 0.3615
bill_depth_mm 55.5615 743.6789 0.075 0.9405
flipper_length_mm 45.9044 81.7854 0.561 0.5750
sexMALE -348.4382 2199.7259 -0.158 0.8742
speciesChinstrap:islandDream NA NA NA NA
speciesGentoo:islandDream NA NA NA NA
speciesChinstrap:islandTorgersen NA NA NA NA
speciesGentoo:islandTorgersen NA NA NA NA
speciesChinstrap:bill_length_mm 11.7772 26.6817 0.441 0.6592
speciesGentoo:bill_length_mm 12.7100 59.6581 0.213 0.8314
speciesChinstrap:bill_depth_mm 27.6850 109.6793 0.252 0.8009
speciesGentoo:bill_depth_mm -39.2712 132.3282 -0.297 0.7668
speciesChinstrap:flipper_length_mm 21.6943 16.1507 1.343 0.1802
speciesGentoo:flipper_length_mm 2.2869 20.0925 0.114 0.9095
speciesChinstrap:sexMALE -752.8800 309.2195 -2.435 0.0155 *
speciesGentoo:sexMALE -498.9192 397.9249 -1.254 0.2109
islandDream:bill_length_mm -23.7795 31.9870 -0.743 0.4578
islandTorgersen:bill_length_mm -19.8607 29.1421 -0.682 0.4961
islandDream:bill_depth_mm -41.0968 62.6284 -0.656 0.5122
islandTorgersen:bill_depth_mm -52.9501 64.3628 -0.823 0.4113
islandDream:flipper_length_mm -11.8875 9.4066 -1.264 0.2073
islandTorgersen:flipper_length_mm -18.5435 10.7728 -1.721 0.0862 .
islandDream:sexMALE 176.7459 163.5401 1.081 0.2807
islandTorgersen:sexMALE 106.4357 187.1268 0.569 0.5699
bill_length_mm:bill_depth_mm -1.0975 9.0604 -0.121 0.9037
bill_length_mm:flipper_length_mm -0.9886 1.2604 -0.784 0.4334
bill_length_mm:sexMALE 17.9304 25.0540 0.716 0.4747
bill_depth_mm:flipper_length_mm 0.6729 4.0201 0.167 0.8672
bill_depth_mm:sexMALE -75.4073 57.1548 -1.319 0.1881
flipper_length_mm:sexMALE 7.3974 8.9240 0.829 0.4078
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 280.3 on 302 degrees of freedom
Multiple R-squared: 0.8898, Adjusted R-squared: 0.8788
F-statistic: 81.25 on 30 and 302 DF, p-value: < 2.2e-16
plot(mod1)resettest(mod1,power=2:3,type=("fitted"))
RESET test
data: mod1
RESET = 1.058, df1 = 2, df2 = 296, p-value = 0.3485
ols_test_breusch_pagan(mod1)
Breusch Pagan Test for Heteroskedasticity
-----------------------------------------
Ho: the variance is constant
Ha: the variance is not constant
Data
---------------------------------------
Response : body_mass_g
Variables: fitted values of body_mass_g
Test Summary
----------------------------
DF = 1
Chi2 = 0.5942284
Prob > Chi2 = 0.4407887
Let’s do Model Selection Based on AIC
Without interaction
mod0_full<-lm(body_mass_g~.,df1)
mod0_null<-lm(body_mass_g~1,df1)
step(mod0_null,direction="forward",scope=formula(mod0_full),trace=1)Start: AIC=4457.28
body_mass_g ~ 1
Df Sum of Sq RSS AIC
+ flipper_length_mm 1 164047703 51211963 3981.1
+ species 2 145190219 70069447 4087.5
+ island 2 83740680 131518986 4297.2
+ bill_length_mm 1 74792533 140467133 4317.1
+ bill_depth_mm 1 47959592 167300074 4375.3
+ sex 1 38878897 176380769 4392.9
<none> 215259666 4457.3
Step: AIC=3981.13
body_mass_g ~ flipper_length_mm
Df Sum of Sq RSS AIC
+ sex 1 9416589 41795374 3915.5
+ species 2 5368818 45843144 3948.3
+ island 2 3437527 47774435 3962.0
+ bill_depth_mm 1 338887 50873075 3980.9
<none> 51211963 3981.1
+ bill_length_mm 1 140000 51071963 3982.2
Step: AIC=3915.47
body_mass_g ~ flipper_length_mm + sex
Df Sum of Sq RSS AIC
+ species 2 13141806 28653568 3793.8
+ island 2 6037165 35758209 3867.5
+ bill_depth_mm 1 3667377 38127997 3886.9
<none> 41795374 3915.5
+ bill_length_mm 1 144990 41650383 3916.3
Step: AIC=3793.76
body_mass_g ~ flipper_length_mm + sex + species
Df Sum of Sq RSS AIC
+ bill_depth_mm 1 1196096 27457472 3781.6
+ bill_length_mm 1 780776 27872792 3786.6
<none> 28653568 3793.8
+ island 2 61695 28591873 3797.0
Step: AIC=3781.56
body_mass_g ~ flipper_length_mm + sex + species + bill_depth_mm
Df Sum of Sq RSS AIC
+ bill_length_mm 1 541825 26915647 3776.9
<none> 27457472 3781.6
+ island 2 59488 27397984 3784.8
Step: AIC=3776.93
body_mass_g ~ flipper_length_mm + sex + species + bill_depth_mm +
bill_length_mm
Df Sum of Sq RSS AIC
<none> 26915647 3776.9
+ island 2 56214 26859432 3780.2
Call:
lm(formula = body_mass_g ~ flipper_length_mm + sex + species +
bill_depth_mm + bill_length_mm, data = df1)
Coefficients:
(Intercept) flipper_length_mm sexMALE speciesChinstrap
-1460.99 15.95 389.89 -251.48
speciesGentoo bill_depth_mm bill_length_mm
1014.63 67.22 18.20
#Step: AIC=3776.93
#body_mass_g ~ flipper_length_mm + sex + species + bill_depth_mm +
# bill_length_mmWith Interaction
mod1_full<-lm(body_mass_g~.+(.)^2,df1)
mod1_null<-lm(body_mass_g~1,df1)
step(mod1_null,direction="forward",scope=formula(mod1_full),trace=0)
Call:
lm(formula = body_mass_g ~ flipper_length_mm + sex + species +
bill_depth_mm + bill_length_mm + sex:species + sex:bill_depth_mm,
data = df1)
Coefficients:
(Intercept) flipper_length_mm sexMALE
-2357.33 16.69 1605.64
speciesChinstrap speciesGentoo bill_depth_mm
-87.13 1088.53 105.46
bill_length_mm sexMALE:speciesChinstrap sexMALE:speciesGentoo
19.67 -360.63 -173.40
sexMALE:bill_depth_mm
-64.11
#Step: AIC=3754.04
#body_mass_g ~ flipper_length_mm + sex + species + bill_depth_mm +
# bill_length_mm + sex:species + sex:bill_depth_mm