The tidyverse

Libraries

library(plotly)
library(dplyr)
library(gmodels)
library(mvnormtest)
library(Hotelling)
library(ICSNP)
library(DescTools)
library(car)

Import data

df <- read.csv("Data.csv")
dim(df)
## [1] 504  10
names(df)
##  [1] "ID"                 "Group"              "Grade"             
##  [4] "Smoking"            "InitialCholesterol" "FinalCholesterol"  
##  [7] "dBP"                "Creatinine"         "Urea"              
## [10] "Weight"
str(df)
## 'data.frame':    504 obs. of  10 variables:
##  $ ID                : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Group             : Factor w/ 2 levels "A","P": 1 2 2 2 1 2 1 2 2 1 ...
##  $ Grade             : Factor w/ 3 levels "I","II","III": 2 2 2 2 1 1 2 2 1 3 ...
##  $ Smoking           : int  0 1 0 0 0 1 0 1 1 0 ...
##  $ InitialCholesterol: num  7.2 6.4 7.6 6.9 5.9 8.8 7.7 6.4 8.9 6.9 ...
##  $ FinalCholesterol  : num  6.5 6.2 6.9 6.9 5.9 9.2 7.1 6.2 8.9 6.1 ...
##  $ dBP               : int  81 86 77 71 82 75 79 82 78 85 ...
##  $ Creatinine        : int  63 56 69 67 55 86 75 58 77 66 ...
##  $ Urea              : num  7.8 6.7 8.2 8.6 7.4 12.1 8.7 7.2 8.2 9.1 ...
##  $ Weight            : int  89 86 105 71 88 88 92 87 72 100 ...

Managing categorical variables

  • Changing the Group variable
df$Group <- factor(df$Group,
                   levels = c("P", "A"),
                   labels = c("Placebo", "Active"))
  • Changing Smoking to a categorical variable
df$Smoking <- factor(df$Smoking,
                     levels = c(0, 1),
                     labels = c("No", "Yes"))
str(df)
## 'data.frame':    504 obs. of  10 variables:
##  $ ID                : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Group             : Factor w/ 2 levels "Placebo","Active": 2 1 1 1 2 1 2 1 1 2 ...
##  $ Grade             : Factor w/ 3 levels "I","II","III": 2 2 2 2 1 1 2 2 1 3 ...
##  $ Smoking           : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 2 1 ...
##  $ InitialCholesterol: num  7.2 6.4 7.6 6.9 5.9 8.8 7.7 6.4 8.9 6.9 ...
##  $ FinalCholesterol  : num  6.5 6.2 6.9 6.9 5.9 9.2 7.1 6.2 8.9 6.1 ...
##  $ dBP               : int  81 86 77 71 82 75 79 82 78 85 ...
##  $ Creatinine        : int  63 56 69 67 55 86 75 58 77 66 ...
##  $ Urea              : num  7.8 6.7 8.2 8.6 7.4 12.1 8.7 7.2 8.2 9.1 ...
##  $ Weight            : int  89 86 105 71 88 88 92 87 72 100 ...

Descriptive statistics

var_1 <- c(20,20,20,20,21,24,44,60,90,94,101)
var_2 <- c(1.73,1.65,2.02,1.89,2.61,1.36,2.37,2.08,2.69,2.32,3.67)
str(var_2)
##  num [1:11] 1.73 1.65 2.02 1.89 2.61 1.36 2.37 2.08 2.69 2.32 ...
typeof(var_1)
## [1] "double"
typeof(var_2)
## [1] "double"
mean(var_1)
## [1] 46.72727
exp(mean(log(var_1)))  # Geometric mean
## [1] 37.07464
1 / var_1  # Elementwise operation
##  [1] 0.05000000 0.05000000 0.05000000 0.05000000 0.04761905 0.04166667
##  [7] 0.02272727 0.01666667 0.01111111 0.01063830 0.00990099
1 / mean(1 / var_1)  # Harmonic mean
## [1] 30.52757
median(var_1)
## [1] 24
min(var_1)
## [1] 20
max(var_1)
## [1] 101
range(var_1)
## [1]  20 101
sd(var_1)
## [1] 33.57407
var(var_1)
## [1] 1127.218
quantile(var_1, 0.25)
## 25% 
##  20
quantile(var_1, c(0.25, 0.75))  # c() creates a vector object
## 25% 75% 
##  20  75
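  • Base R's summary() combines several of these statistics in one call:
summary(var_1)  # Minimum, quartiles, median, mean and maximum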
plot(var_2 ~ var_1, xlab = "Variable 1", ylab = "Variable 2")

Summary statistics using data munging

df %>% group_by(Group) %>% summarise(AVERAGE = mean(InitialCholesterol, na.rm = T),
                                     STD = sd(InitialCholesterol),
                                     MIN = min(InitialCholesterol),
                                     MEDIAN = median(InitialCholesterol),
                                     MAX = max(InitialCholesterol),
                                     IQRange = IQR(InitialCholesterol))
## # A tibble: 2 x 7
##   Group   AVERAGE   STD   MIN MEDIAN   MAX IQRange
##   <fct>     <dbl> <dbl> <dbl>  <dbl> <dbl>   <dbl>
## 1 Placebo    6.93  2.00   2.6    7    11.8    2.95
## 2 Active     6.98  1.99   0.8    7.1  11.3    2.7
df <- df %>% mutate(DifferenceCholesterol = FinalCholesterol - InitialCholesterol)
head(df)
##   ID   Group Grade Smoking InitialCholesterol FinalCholesterol dBP Creatinine
## 1  1  Active    II      No                7.2              6.5  81         63
## 2  2 Placebo    II     Yes                6.4              6.2  86         56
## 3  3 Placebo    II      No                7.6              6.9  77         69
## 4  4 Placebo    II      No                6.9              6.9  71         67
## 5  5  Active     I      No                5.9              5.9  82         55
## 6  6 Placebo     I     Yes                8.8              9.2  75         86
##   Urea Weight DifferenceCholesterol
## 1  7.8     89                  -0.7
## 2  6.7     86                  -0.2
## 3  8.2    105                  -0.7
## 4  8.6     71                   0.0
## 5  7.4     88                   0.0
## 6 12.1     88                   0.4

Common inferential tests

Comparing two means

  • Compare means of initial cholesterol between groups
    • We have already seen the summary statistics for this question
    • Visualize the data
boxplot(InitialCholesterol ~ Group,
        data = df,
        main = "Difference in initial cholesterol levels",
        xlab = "Group",
        ylab = "Cholesterol",
        las = 1,
        col = c("deepskyblue", "orange"))

  • Check for normality
shapiro.test(df %>% filter(Group == "Active") %>% select(InitialCholesterol) %>% pull())
## 
##  Shapiro-Wilk normality test
## 
## data:  df %>% filter(Group == "Active") %>% select(InitialCholesterol) %>%     pull()
## W = 0.99235, p-value = 0.2255
shapiro.test(df %>% filter(Group == "Placebo") %>% select(InitialCholesterol) %>% pull())
## 
##  Shapiro-Wilk normality test
## 
## data:  df %>% filter(Group == "Placebo") %>% select(InitialCholesterol) %>%     pull()
## W = 0.99041, p-value = 0.09184
  • Visual check for normality
qqnorm(df$InitialCholesterol[df$Group == "Active"])  # Use qqplot for other distributions
qqline(df$InitialCholesterol[df$Group == "Active"],
       col = "steelblue",
       lwd = 2)

  • Test for equal variances
bartlett.test(InitialCholesterol ~ Group,
              data = df)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  InitialCholesterol by Group
## Bartlett's K-squared = 0.0052034, df = 1, p-value = 0.9425
  • Student’s t test
t.test(InitialCholesterol ~ Group,
       data = df,
       var.equal = T)
## 
##  Two Sample t-test
## 
## data:  InitialCholesterol by Group
## t = -0.31235, df = 502, p-value = 0.7549
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.4055720  0.2943034
## sample estimates:
## mean in group Placebo  mean in group Active 
##              6.925490              6.981124
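  • Had the variances differed, the Welch version of the test (the t.test default, var.equal = FALSE) would be the safer choice; a minimal sketch:
t.test(InitialCholesterol ~ Group,
       data = df)  # Welch two sample t-test (unequal variances assumed)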
  • Nonparametric alternative
wilcox.test(InitialCholesterol ~ Group,
            data = df)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  InitialCholesterol by Group
## W = 30910, p-value = 0.6084
## alternative hypothesis: true location shift is not equal to 0

Comparing three or more means

  • See ANOVA later

  • Is there a difference in urea values between the different grades?

df %>% group_by(Grade) %>% summarise(AVERAGE = mean(Urea, na.rm = T),
                                     STD = sd(Urea),
                                     MIN = min(Urea),
                                     MEDIAN = median(Urea),
                                     MAX = max(Urea),
                                     IQRange = IQR(Urea))
## # A tibble: 3 x 7
##   Grade AVERAGE   STD   MIN MEDIAN   MAX IQRange
##   <fct>   <dbl> <dbl> <dbl>  <dbl> <dbl>   <dbl>
## 1 I        8.26  2.37   2.2    8.3  13.7    3.1 
## 2 II       8.32  2.34   3.2    8.2  17.7    2.92
## 3 III      8.33  2.57   2.1    8.3  15      3.05
boxplot(Urea ~ Grade,
        data = df,
        main = "Difference in urea levels",
        xlab = "Grade",
        ylab = "Urea",
        las = 1,
        col = c("deepskyblue", "orange", "green"))

stripchart(df$Urea ~ df$Grade,
           vertical = T,
           method = "jitter",
           jitter = 0.1,
           main = "Difference in urea levels",
           xlab = "Grade",
           ylab = "Urea",
           las = 1,
           col = c("deepskyblue", "orange", "green"))

  • Test assumptions
shapiro.test(df$Urea[df$Grade == "I"])
## 
##  Shapiro-Wilk normality test
## 
## data:  df$Urea[df$Grade == "I"]
## W = 0.99312, p-value = 0.6123
shapiro.test(df$Urea[df$Grade == "II"])
## 
##  Shapiro-Wilk normality test
## 
## data:  df$Urea[df$Grade == "II"]
## W = 0.9736, p-value = 0.000336
shapiro.test(df$Urea[df$Grade == "III"])
## 
##  Shapiro-Wilk normality test
## 
## data:  df$Urea[df$Grade == "III"]
## W = 0.97979, p-value = 0.08782
qqnorm(df$Urea[df$Grade == "I"])
qqline(df$Urea[df$Grade == "I"],
       col = "deepskyblue",
       lwd = 2)

qqnorm(df$Urea[df$Grade == "II"])
qqline(df$Urea[df$Grade == "II"],
       col = "orange",
       lwd = 2)

qqnorm(df$Urea[df$Grade == "III"])
qqline(df$Urea[df$Grade == "III"],
       col = "green",
       lwd = 2)

summary(aov(Urea ~ Grade,
            data = df))
##              Df Sum Sq Mean Sq F value Pr(>F)
## Grade         2    0.5   0.261   0.045  0.956
## Residuals   501 2889.4   5.767
  • Nonparametric alternative
kruskal.test(Urea ~ Grade,
             data = df)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Urea by Grade
## Kruskal-Wallis chi-squared = 0.060917, df = 2, p-value = 0.97

Multivariate comparison of means

  • Hotelling’s \(T^2\) test
    • Comparing means of two variables between two groups
dep_matrix <- data.matrix(df[, 7:8])
dep_matrix[1:5, ]
##      dBP Creatinine
## [1,]  81         63
## [2,]  86         56
## [3,]  77         69
## [4,]  71         67
## [5,]  82         55
mvnormtest::mshapiro.test(t(dep_matrix))
## 
##  Shapiro-Wilk normality test
## 
## data:  Z
## W = 0.99559, p-value = 0.1676
det(cov(dep_matrix))  # Determinant of the covariance matrix (should be positive, i.e. non-singular)
## [1] 29272.43
hot_test <- Hotelling::hotelling.test(df %>% filter(Group == "Active") %>% select(c("dBP", "Creatinine")),
                                      df %>% filter(Group == "Placebo") %>% select(c("dBP", "Creatinine")))
hot_test
## Test stat:  0.19412 
## Numerator df:  2 
## Denominator df:  501 
## P-value:  0.8236
ICSNP::HotellingsT2(df %>% filter(Group == "Active") %>% select(c("dBP", "Creatinine")),
                    df %>% filter(Group == "Placebo") %>% select(c("dBP", "Creatinine")))
## 
##  Hotelling's two sample T2-test
## 
## data:  df %>% filter(Group == "Active") %>% select(c("dBP", "Creatinine")) and df %>% filter(Group == "Placebo") %>% select(c("dBP", "Creatinine"))
## T.2 = 0.19412, df1 = 2, df2 = 501, p-value = 0.8236
## alternative hypothesis: true location difference is not equal to c(0,0)
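  • For illustration, the F-scaled statistic can be computed by hand from the group mean difference and the pooled covariance matrix (a minimal sketch of the underlying calculation; the final value should match T.2 above):
act <- as.matrix(df[df$Group == "Active", c("dBP", "Creatinine")])
pla <- as.matrix(df[df$Group == "Placebo", c("dBP", "Creatinine")])
n1 <- nrow(act); n2 <- nrow(pla); p <- 2
d <- colMeans(act) - colMeans(pla)                                   # Difference in mean vectors
S_pooled <- ((n1 - 1) * cov(act) + (n2 - 1) * cov(pla)) / (n1 + n2 - 2)
T2 <- (n1 * n2 / (n1 + n2)) * drop(t(d) %*% solve(S_pooled) %*% d)   # Hotelling's T-squared
(n1 + n2 - p - 1) / (p * (n1 + n2 - 2)) * T2                         # F-scaled statistic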

\(\chi^2\) goodness of fit test

  • Observed proportions against expected proportions
table(df$Grade)
## 
##   I  II III 
## 168 224 112
chisq.test(table(df$Grade),
           p = rep(1/3, 3))
## 
##  Chi-squared test for given probabilities
## 
## data:  table(df$Grade)
## X-squared = 37.333, df = 2, p-value = 7.819e-09
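  • As a quick check, the statistic can be computed by hand from the observed and expected counts (a minimal sketch; the result should match X-squared above):
observed <- as.numeric(table(df$Grade))
expected <- sum(observed) * rep(1/3, 3)        # Equal expected proportions
sum((observed - expected)^2 / expected)        # Pearson chi-squared statistic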

Log-likelihood alternative (G test for goodness of fit)

DescTools::GTest(x = c(168, 224, 112),
                 p = rep(1/3, 3),
                 correct = "none")
## 
##  Log likelihood ratio (G-test) goodness of fit test
## 
## data:  c(168, 224, 112)
## G = 38.057, X-squared df = 2, p-value = 5.444e-09

Fisher exact test

vals <- matrix(c(3, 1, 3, 2),
               nrow = 2,
               dimnames = list(Group = c("Control", "Experimental"),
                               Outcome = c("Worsened", "Improved")))
vals
##               Outcome
## Group          Worsened Improved
##   Control             3        3
##   Experimental        1        2
fisher.test(vals)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  vals
## p-value = 1
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##    0.06060903 156.52286969
## sample estimates:
## odds ratio 
##   1.852496

\(\chi^2\) test for independence

  • Is there dependence between Grade and Smoking?

  • Contingency table

table(df$Grade,
      df$Smoking)
##      
##        No Yes
##   I   131  37
##   II  176  48
##   III 103   9
  • Using CrossTable from gmodels for cell proportions
gmodels::CrossTable(df$Grade,
                    df$Smoking)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  504 
## 
##  
##              | df$Smoking 
##     df$Grade |        No |       Yes | Row Total | 
## -------------|-----------|-----------|-----------|
##            I |       131 |        37 |       168 | 
##              |     0.235 |     1.025 |           | 
##              |     0.780 |     0.220 |     0.333 | 
##              |     0.320 |     0.394 |           | 
##              |     0.260 |     0.073 |           | 
## -------------|-----------|-----------|-----------|
##           II |       176 |        48 |       224 | 
##              |     0.212 |     0.927 |           | 
##              |     0.786 |     0.214 |     0.444 | 
##              |     0.429 |     0.511 |           | 
##              |     0.349 |     0.095 |           | 
## -------------|-----------|-----------|-----------|
##          III |       103 |         9 |       112 | 
##              |     1.551 |     6.767 |           | 
##              |     0.920 |     0.080 |     0.222 | 
##              |     0.251 |     0.096 |           | 
##              |     0.204 |     0.018 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       410 |        94 |       504 | 
##              |     0.813 |     0.187 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
chisq.test(table(df$Grade,
                 df$Smoking))
## 
##  Pearson's Chi-squared test
## 
## data:  table(df$Grade, df$Smoking)
## X-squared = 10.717, df = 2, p-value = 0.004708

Log likelihood ratio test (G test for independence)

\[G=2\sum_{i=1}^{N}{\text{observed}_{i}\cdot\ln{\frac{{\text{observed}}_i}{{\text{expected}}_i}}}\]

DescTools::GTest(table(df$Grade,
                       df$Smoking),
                 correct = "none")
## 
##  Log likelihood ratio (G-test) test of independence without correction
## 
## data:  table(df$Grade, df$Smoking)
## G = 12.415, X-squared df = 2, p-value = 0.002014
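  • A hand computation of \(G\) from the equation above (a minimal sketch; expected counts come from the row and column margins under independence):
observed <- table(df$Grade, df$Smoking)
expected <- outer(rowSums(observed), colSums(observed)) / sum(observed)
2 * sum(observed * log(observed / expected))   # Should reproduce the G value above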

Linear regression introduction

names(df)
##  [1] "ID"                    "Group"                 "Grade"                
##  [4] "Smoking"               "InitialCholesterol"    "FinalCholesterol"     
##  [7] "dBP"                   "Creatinine"            "Urea"                 
## [10] "Weight"                "DifferenceCholesterol"

Simple linear regression

  • Urea and Creatinine
plot(x = df$Urea,
     y = df$Creatinine,
     main = "Urea as predictor of creatinine",
     xlab = "Urea",
     ylab = "Creatinine",
     las = 1,
     col = "orange")

  • Fit the data to a linear model with Creatinine as the dependent variable and Urea as the independent variable
urea_creat <- lm(Creatinine ~ Urea,
                 data = df,
                 na.action = na.omit)
  • Summary of the model
summary(urea_creat)
## 
## Call:
## lm(formula = Creatinine ~ Urea, data = df, na.action = na.omit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -27.9441  -4.8015  -0.2036   4.8079  23.1357 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  15.1182     1.2622   11.98   <2e-16 ***
## Urea          6.0919     0.1461   41.71   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.852 on 502 degrees of freedom
## Multiple R-squared:  0.7761, Adjusted R-squared:  0.7756 
## F-statistic:  1740 on 1 and 502 DF,  p-value: < 2.2e-16
  • Regression line
plot(x = df$Urea,
     y = df$Creatinine,
     main = "Urea as predictor of creatinine",
     xlab = "Urea",
     ylab = "Creatinine",
     las = 1,
     col = "orange")
abline(urea_creat,
       col = "gray",
       lwd = 2)

confint(urea_creat, level = 0.95)  # level = 0.95 is default
##                 2.5 %    97.5 %
## (Intercept) 12.638402 17.598046
## Urea         5.804904  6.378807
attributes(urea_creat)
## $names
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"        
## 
## $class
## [1] "lm"
  • First five \(y\) and \(\hat{y}\)
df$Creatinine[1:5]
## [1] 63 56 69 67 55
urea_creat$fitted.values[1:5]
##        1        2        3        4        5 
## 62.63470 55.93366 65.07144 67.50818 60.19795
df$Creatinine[1:5] - urea_creat$fitted.values[1:5]
##           1           2           3           4           5 
##  0.36530339  0.06634438  3.92856122 -0.50818096 -5.19795443
urea_creat$residuals[1:5]
##           1           2           3           4           5 
##  0.36530339  0.06634438  3.92856122 -0.50818096 -5.19795443
  • From the F ratio
    • The sum of squared deviations from the mean (total variability, \(\text{SSY}\)) equals the sum of squared deviations of the predicted values from the mean (variability explained by the regression, \(\text{SSR}\)) plus the sum of squared deviations of the data about the regression line (residual variability, \(\text{SSE}\))
    • \(R^2\) is then \(\frac{\text{SSY} - \text{SSE}}{\text{SSY}}\), the fraction of the variance explained by the model (a manual check follows the ANOVA table below)
anova(urea_creat)
## Analysis of Variance Table
## 
## Response: Creatinine
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## Urea        1 107247  107247  1739.7 < 2.2e-16 ***
## Residuals 502  30947      62                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
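  • A manual check of this decomposition and of \(R^2\), using the fitted urea_creat model (a minimal sketch):
SSY <- sum((df$Creatinine - mean(df$Creatinine))^2)  # Total sum of squares
SSE <- sum(urea_creat$residuals^2)                   # Residual sum of squares
SSR <- SSY - SSE                                     # Variability explained by the regression
(SSY - SSE) / SSY                                    # Should match the Multiple R-squared above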
  • Test assumptions
par(mfrow = c(2, 2))
plot(urea_creat)

plot(urea_creat, which = 1)  # Suggests some non-linearity is present

plot(urea_creat, which = 2)

plot(urea_creat, which = 3)

plot(urea_creat, which = 4)

hist(urea_creat$residuals,
     main = "Checking assumptions of residual distribution",
     xlab = "Residuals",
     ylab = "Frequency",
     col = 8,
     las = 1)

shapiro.test(urea_creat$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  urea_creat$residuals
## W = 0.99568, p-value = 0.1802

Multiple linear regression model

  • Multiple regression
  • Creatinine given Urea and Weight
urea_weight_creat <- lm(Creatinine ~ Urea + Weight,
                        data = df)
summary(urea_weight_creat)
## 
## Call:
## lm(formula = Creatinine ~ Urea + Weight, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -27.8288  -4.7944  -0.2361   4.8110  23.1360 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 15.547397   2.548152   6.101 2.11e-09 ***
## Urea         6.094957   0.147066  41.444  < 2e-16 ***
## Weight      -0.004757   0.024528  -0.194    0.846    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.859 on 501 degrees of freedom
## Multiple R-squared:  0.7761, Adjusted R-squared:  0.7752 
## F-statistic: 868.2 on 2 and 501 DF,  p-value: < 2.2e-16
cor(df$Urea, df$Weight)
## [1] 0.1087418
par(mfrow = c(2, 2))
plot(urea_weight_creat)

confint(urea_weight_creat)
##                   2.5 %      97.5 %
## (Intercept) 10.54101713 20.55377713
## Urea         5.80601501  6.38389900
## Weight      -0.05294636  0.04343253
anova(urea_weight_creat)
## Analysis of Variance Table
## 
## Response: Creatinine
##            Df Sum Sq Mean Sq   F value Pr(>F)    
## Urea        1 107247  107247 1736.3647 <2e-16 ***
## Weight      1      2       2    0.0376 0.8463    
## Residuals 501  30944      62                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

One-way ANOVA

df %>% summarise(MIN = min(Urea),
                 MAX = max(Urea))
##   MIN  MAX
## 1 2.1 17.7
df$UreaCat <- cut(df$Urea,
                  breaks = c(0, 5, 10, 18),
                  labels = c("U1", "U2", "U3")) # Breaks are left-open and right-closed by default
# Use right = FALSE to change this
# Supplying a single integer to breaks creates that many equal-width bins
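  • For illustration, the same binning with left-closed (right-open) intervals:
table(cut(df$Urea, breaks = c(0, 5, 10, 18), right = FALSE))  # Counts per left-closed bin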
head(df)
##   ID   Group Grade Smoking InitialCholesterol FinalCholesterol dBP Creatinine
## 1  1  Active    II      No                7.2              6.5  81         63
## 2  2 Placebo    II     Yes                6.4              6.2  86         56
## 3  3 Placebo    II      No                7.6              6.9  77         69
## 4  4 Placebo    II      No                6.9              6.9  71         67
## 5  5  Active     I      No                5.9              5.9  82         55
## 6  6 Placebo     I     Yes                8.8              9.2  75         86
##   Urea Weight DifferenceCholesterol UreaCat
## 1  7.8     89                  -0.7      U2
## 2  6.7     86                  -0.2      U2
## 3  8.2    105                  -0.7      U2
## 4  8.6     71                   0.0      U2
## 5  7.4     88                   0.0      U2
## 6 12.1     88                   0.4      U3
df %>% group_by(UreaCat) %>% summarise(AVERAGE = mean(Creatinine))
## # A tibble: 3 x 2
##   UreaCat AVERAGE
##   <fct>     <dbl>
## 1 U1         34.4
## 2 U2         63.6
## 3 U3         84.2
urea_cat_plot <- plot_ly(data = df,
                         y = ~Creatinine,
                         color = ~UreaCat,
                         type = "box",
                         boxmean = "sd",
                         marker = list(symbol = "diamond"),
                         line = list(width = 1),
                         boxpoints = "all") %>% 
  layout(title = "Creatinine by urea category")
urea_cat_plot
cat_urea_creat <- lm(Creatinine ~ UreaCat,
                     data = df)
summary(cat_urea_creat)
## 
## Call:
## lm(formula = Creatinine ~ UreaCat, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -31.2381  -8.3947   0.3934   7.7619  29.3934 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   34.395      1.823   18.86   <2e-16 ***
## UreaCatU2     29.212      1.917   15.24   <2e-16 ***
## UreaCatU3     49.843      2.128   23.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.24 on 501 degrees of freedom
## Multiple R-squared:  0.542,  Adjusted R-squared:  0.5402 
## F-statistic: 296.5 on 2 and 501 DF,  p-value: < 2.2e-16
anova(cat_urea_creat)
## Analysis of Variance Table
## 
## Response: Creatinine
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## UreaCat     2  74907   37454   296.5 < 2.2e-16 ***
## Residuals 501  63286     126                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(1, 2))
plot(cat_urea_creat, which = c(1, 2))

hist(rstandard(cat_urea_creat),
     main = "Histogram of the standardized residuals",
     xlab = "Standadized residuals",
     ylab = "Frequency",
     las = 1,
     col = "gray")

TukeyHSD(aov(cat_urea_creat))
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = cat_urea_creat)
## 
## $UreaCat
##           diff      lwr      upr p adj
## U2-U1 29.21191 24.70609 33.71773     0
## U3-U1 49.84336 44.84170 54.84502     0
## U3-U2 20.63145 17.70206 23.56084     0

Two-way ANOVA

smoking_grade_creat <- lm(Creatinine ~ Smoking + Grade,
                          data = df)
anova(smoking_grade_creat)
## Analysis of Variance Table
## 
## Response: Creatinine
##            Df Sum Sq Mean Sq F value Pr(>F)
## Smoking     1      3   3.338  0.0121 0.9125
## Grade       2     74  37.244  0.1348 0.8739
## Residuals 500 138116 276.231

ANCOVA

weight_grade_model <- lm(Creatinine ~ Weight + Grade,
                         data = df)
weight_grade_model$coefficients
## (Intercept)      Weight     GradeII    GradeIII 
##  55.5825643   0.1055191   0.3675932  -0.6069257
weight_grade_creat_plot <- plot_ly(data = df,
                                   x = ~Weight,
                                   y = ~Creatinine,
                                   color = ~Grade,
                                   type = "scatter",
                                   mode = "markers",
                                   marker = list(size = 12)) %>% 
  layout(title = "Predicting creatinine by weight and grade",
         xaxis = list(title = "Weight",
                      zeroline = F),
         yaxis = list(title = "Creatinine",
                      zeroline = F))
weight_grade_creat_plot
anova(weight_grade_model)
## Analysis of Variance Table
## 
## Response: Creatinine
##            Df Sum Sq Mean Sq F value Pr(>F)  
## Weight      1   1163 1162.55  4.2441 0.0399 *
## Grade       2     71   35.55  0.1298 0.8783  
## Residuals 500 136960  273.92                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::Anova(weight_grade_model,
           type = "III")
## Anova Table (Type III tests)
## 
## Response: Creatinine
##             Sum Sq  Df  F value  Pr(>F)    
## (Intercept)  32613   1 119.0589 < 2e-16 ***
## Weight        1156   1   4.2212 0.04044 *  
## Grade           71   2   0.1298 0.87830    
## Residuals   136960 500                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
weight_grade_interaction_model <- lm(Creatinine ~ Weight * Grade,
                                     data = df)
anova(weight_grade_interaction_model)
## Analysis of Variance Table
## 
## Response: Creatinine
##               Df Sum Sq Mean Sq F value  Pr(>F)  
## Weight         1   1163 1162.55  4.2757 0.03918 *
## Grade          2     71   35.55  0.1308 0.87746  
## Weight:Grade   2   1554  777.13  2.8582 0.05832 .
## Residuals    498 135405  271.90                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
weight_grade_interaction_model$coefficients
##     (Intercept)          Weight         GradeII        GradeIII  Weight:GradeII 
##      40.5737205       0.2618322      22.4615001      26.1720236      -0.2304398 
## Weight:GradeIII 
##      -0.2799880

Randomized block design

variable <- c(0.958, 0.971, 0.927, 0.971, 0.986, 1.051, 0.891, 1.010, 0.925, 0.952, 0.829, 0.955)
experiment <- factor(rep(1:4, 3))
grade <- rep(c("Mild", "Moderate", "Severe"), each = 4)
dfr_block <- data.frame(Variable = variable,
                        Experiment = experiment,
                        Grade = grade)
head(dfr_block)
##   Variable Experiment    Grade
## 1    0.958          1     Mild
## 2    0.971          2     Mild
## 3    0.927          3     Mild
## 4    0.971          4     Mild
## 5    0.986          1 Moderate
## 6    1.051          2 Moderate
block_project_plot <- plot_ly(data = dfr_block,
                              x = ~Grade,
                              y = ~Variable,
                              type = "box",
                              boxmean = "sd",
                              marker = list(symbol = "diamond"),
                              line = list(width = 1),
                              boxpoints = "all") %>% 
  layout(title = "Variable by grade")
block_project_plot
grade_variable_model <- lm(Variable ~ Grade,
                           data = dfr_block)
anova(grade_variable_model)
## Analysis of Variance Table
## 
## Response: Variable
##           Df    Sum Sq   Mean Sq F value Pr(>F)
## Grade      2 0.0097172 0.0048586  1.7098 0.2348
## Residuals  9 0.0255745 0.0028416
grade_experiment_variable_model <- lm(Variable ~ Grade + Experiment,
                                      data = dfr_block)
anova(grade_experiment_variable_model)
## Analysis of Variance Table
## 
## Response: Variable
##            Df    Sum Sq   Mean Sq F value   Pr(>F)   
## Grade       2 0.0097172 0.0048586  6.9682 0.027259 * 
## Experiment  3 0.0213910 0.0071303 10.2264 0.008967 **
## Residuals   6 0.0041835 0.0006972                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(1, 2))
plot(grade_experiment_variable_model, which = c(1, 2))

TukeyHSD(aov(grade_experiment_variable_model), "Grade")
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = grade_experiment_variable_model)
## 
## $Grade
##                     diff         lwr         upr     p adj
## Moderate-Mild    0.02775 -0.02953929  0.08503929 0.3611755
## Severe-Mild     -0.04150 -0.09878929  0.01578929 0.1454294
## Severe-Moderate -0.06925 -0.12653929 -0.01196071 0.0232792

Factorial ANOVA

variable <- c(709, 679, 699, 657, 594, 677, 592, 538, 476, 508, 505, 539)
grade <- rep(c("I", "I", "I", "II", "II", "II"), 2)
dose <- rep(c("Low", "High"), each = 6)
dfr_fact <- data.frame(Variable = variable,
                       Grade = grade,
                       Dose = dose)
dfr_fact
##    Variable Grade Dose
## 1       709     I  Low
## 2       679     I  Low
## 3       699     I  Low
## 4       657    II  Low
## 5       594    II  Low
## 6       677    II  Low
## 7       592     I High
## 8       538     I High
## 9       476     I High
## 10      508    II High
## 11      505    II High
## 12      539    II High
interaction.plot(dfr_fact$Grade,
                 dfr_fact$Dose,
                 dfr_fact$Variable,
                 xlab = "Grade",
                 main = "Interaction plot",
                 ylab = "Mean of variable",
                 las = 1)

full_factorial_model <- lm(Variable ~ Grade * Dose,
                           data = dfr_fact)
anova(full_factorial_model)
## Analysis of Variance Table
## 
## Response: Variable
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## Grade       1   3781    3781  2.5925 0.1460358    
## Dose        1  61204   61204 41.9685 0.0001925 ***
## Grade:Dose  1    919     919  0.6300 0.4502546    
## Residuals   8  11667    1458                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(1, 2))
plot(full_factorial_model, which = c(1, 2))

Polynomial regression

poly_model <- lm(Creatinine ~ poly(Urea, degree = 2, raw = T),
                 data = df)  # For orthogonal polynomial use raw = FALSE
summary(poly_model)
## 
## Call:
## lm(formula = Creatinine ~ poly(Urea, degree = 2, raw = T), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.2487  -4.6587   0.0023   4.5604  21.7639 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      -12.6588     2.8769   -4.40 1.32e-05 ***
## poly(Urea, degree = 2, raw = T)1  12.9855     0.6684   19.43  < 2e-16 ***
## poly(Urea, degree = 2, raw = T)2  -0.3945     0.0375  -10.52  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.113 on 501 degrees of freedom
## Multiple R-squared:  0.8166, Adjusted R-squared:  0.8159 
## F-statistic:  1115 on 2 and 501 DF,  p-value: < 2.2e-16
poly_model_alt <- lm(Creatinine ~ Urea + I(Urea^2),
                     data = df)
summary(poly_model_alt)
## 
## Call:
## lm(formula = Creatinine ~ Urea + I(Urea^2), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.2487  -4.6587   0.0023   4.5604  21.7639 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -12.6588     2.8769   -4.40 1.32e-05 ***
## Urea         12.9855     0.6684   19.43  < 2e-16 ***
## I(Urea^2)    -0.3945     0.0375  -10.52  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.113 on 501 degrees of freedom
## Multiple R-squared:  0.8166, Adjusted R-squared:  0.8159 
## F-statistic:  1115 on 2 and 501 DF,  p-value: < 2.2e-16

Comparing models

\[F_{\text{stat}}=\frac{\frac{\text{SSE}_{\text{partial}} - \text{SSE}_{\text{full}}}{\Delta_{\text{params}}}}{\text{MSE}_{\text{full}}}\tag{1}\]

full_model <- lm(Creatinine ~ Urea + I(Urea^2),
                 data = df)
summary(full_model)
## 
## Call:
## lm(formula = Creatinine ~ Urea + I(Urea^2), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.2487  -4.6587   0.0023   4.5604  21.7639 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -12.6588     2.8769   -4.40 1.32e-05 ***
## Urea         12.9855     0.6684   19.43  < 2e-16 ***
## I(Urea^2)    -0.3945     0.0375  -10.52  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.113 on 501 degrees of freedom
## Multiple R-squared:  0.8166, Adjusted R-squared:  0.8159 
## F-statistic:  1115 on 2 and 501 DF,  p-value: < 2.2e-16
partial_model <- lm(Creatinine ~ Urea,
                    data = df)
summary(partial_model)
## 
## Call:
## lm(formula = Creatinine ~ Urea, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -27.9441  -4.8015  -0.2036   4.8079  23.1357 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  15.1182     1.2622   11.98   <2e-16 ***
## Urea          6.0919     0.1461   41.71   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.852 on 502 degrees of freedom
## Multiple R-squared:  0.7761, Adjusted R-squared:  0.7756 
## F-statistic:  1740 on 1 and 502 DF,  p-value: < 2.2e-16
anova(partial_model, full_model)
## Analysis of Variance Table
## 
## Model 1: Creatinine ~ Urea
## Model 2: Creatinine ~ Urea + I(Urea^2)
##   Res.Df   RSS Df Sum of Sq      F    Pr(>F)    
## 1    502 30947                                  
## 2    501 25346  1    5600.7 110.71 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
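  • A hand computation of the \(F\) statistic in equation (1), using the two models above (a minimal sketch; the result should match the F value in the table):
sse_partial <- sum(partial_model$residuals^2)
sse_full <- sum(full_model$residuals^2)
delta_params <- 1                              # The full model has one extra parameter
mse_full <- sse_full / full_model$df.residual
((sse_partial - sse_full) / delta_params) / mse_full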