The tidyverse

Libraries

library(plotly)
library(dplyr)
library(gmodels)
library(mvnormtest)
library(Hotelling)
library(ICSNP)
library(DescTools)
library(car)

Import data

df <- read.csv("Data.csv")
dim(df)
## [1] 504  10
names(df)
##  [1] "ID"                 "Group"              "Grade"             
##  [4] "Smoking"            "InitialCholesterol" "FinalCholesterol"  
##  [7] "dBP"                "Creatinine"         "Urea"              
## [10] "Weight"
str(df)
## 'data.frame':    504 obs. of  10 variables:
##  $ ID                : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Group             : Factor w/ 2 levels "A","P": 1 2 2 2 1 2 1 2 2 1 ...
##  $ Grade             : Factor w/ 3 levels "I","II","III": 2 2 2 2 1 1 2 2 1 3 ...
##  $ Smoking           : int  0 1 0 0 0 1 0 1 1 0 ...
##  $ InitialCholesterol: num  7.2 6.4 7.6 6.9 5.9 8.8 7.7 6.4 8.9 6.9 ...
##  $ FinalCholesterol  : num  6.5 6.2 6.9 6.9 5.9 9.2 7.1 6.2 8.9 6.1 ...
##  $ dBP               : int  81 86 77 71 82 75 79 82 78 85 ...
##  $ Creatinine        : int  63 56 69 67 55 86 75 58 77 66 ...
##  $ Urea              : num  7.8 6.7 8.2 8.6 7.4 12.1 8.7 7.2 8.2 9.1 ...
##  $ Weight            : int  89 86 105 71 88 88 92 87 72 100 ...

Managing categorical variables

  • Changing the Group variable
df$Group <- factor(df$Group,
                   levels = c("P", "A"),
                   labels = c("Placebo", "Active"))
  • Changing Smoking to a categorical variable
df$Smoking <- factor(df$Smoking,
                     levels = c(0, 1),
                     labels = c("No", "Yes"))
str(df)
## 'data.frame':    504 obs. of  10 variables:
##  $ ID                : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Group             : Factor w/ 2 levels "Placebo","Active": 2 1 1 1 2 1 2 1 1 2 ...
##  $ Grade             : Factor w/ 3 levels "I","II","III": 2 2 2 2 1 1 2 2 1 3 ...
##  $ Smoking           : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 2 1 ...
##  $ InitialCholesterol: num  7.2 6.4 7.6 6.9 5.9 8.8 7.7 6.4 8.9 6.9 ...
##  $ FinalCholesterol  : num  6.5 6.2 6.9 6.9 5.9 9.2 7.1 6.2 8.9 6.1 ...
##  $ dBP               : int  81 86 77 71 82 75 79 82 78 85 ...
##  $ Creatinine        : int  63 56 69 67 55 86 75 58 77 66 ...
##  $ Urea              : num  7.8 6.7 8.2 8.6 7.4 12.1 8.7 7.2 8.2 9.1 ...
##  $ Weight            : int  89 86 105 71 88 88 92 87 72 100 ...

Descriptive statistics

var_1 <- c(20,20,20,20,21,24,44,60,90,94,101)
var_2 <- c(1.73,1.65,2.02,1.89,2.61,1.36,2.37,2.08,2.69,2.32,3.67)
str(var_2)
##  num [1:11] 1.73 1.65 2.02 1.89 2.61 1.36 2.37 2.08 2.69 2.32 ...
typeof(var_1)
## [1] "double"
typeof(var_2)
## [1] "double"
mean(var_1)
## [1] 46.72727
exp(mean(log(var_1)))  # Geometric mean
## [1] 37.07464
1 / var_1  # Elementwise operation
##  [1] 0.05000000 0.05000000 0.05000000 0.05000000 0.04761905 0.04166667
##  [7] 0.02272727 0.01666667 0.01111111 0.01063830 0.00990099
1 / mean(1 / var_1)  # Harmonic mean
## [1] 30.52757
median(var_1)
## [1] 24
min(var_1)
## [1] 20
max(var_1)
## [1] 101
range(var_1)
## [1]  20 101
sd(var_1)
## [1] 33.57407
var(var_1)
## [1] 1127.218
quantile(var_1, 0.25)
## 25% 
##  20
quantile(var_1, c(0.25, 0.75))  # c() creates a vector object
## 25% 75% 
##  20  75
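  • Base R's summary() combines several of these statistics in one call:
summary(var_1)  # Minimum, quartiles, median, mean and maximum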
plot(var_2 ~ var_1, xlab = "Variable 1", ylab = "Variable 2")

Summary statistics using data munging

df %>% group_by(Group) %>% summarise(AVERAGE = mean(InitialCholesterol, na.rm = T),
                                     STD = sd(InitialCholesterol),
                                     MIN = min(InitialCholesterol),
                                     MEDIAN = median(InitialCholesterol),
                                     MAX = max(InitialCholesterol),
                                     IQRange = IQR(InitialCholesterol))
## # A tibble: 2 x 7
##   Group   AVERAGE   STD   MIN MEDIAN   MAX IQRange
##   <fct>     <dbl> <dbl> <dbl>  <dbl> <dbl>   <dbl>
## 1 Placebo    6.93  2.00   2.6    7    11.8    2.95
## 2 Active     6.98  1.99   0.8    7.1  11.3    2.7
df <- df %>% mutate(DifferenceCholesterol = FinalCholesterol - InitialCholesterol)
head(df)
##   ID   Group Grade Smoking InitialCholesterol FinalCholesterol dBP Creatinine
## 1  1  Active    II      No                7.2              6.5  81         63
## 2  2 Placebo    II     Yes                6.4              6.2  86         56
## 3  3 Placebo    II      No                7.6              6.9  77         69
## 4  4 Placebo    II      No                6.9              6.9  71         67
## 5  5  Active     I      No                5.9              5.9  82         55
## 6  6 Placebo     I     Yes                8.8              9.2  75         86
##   Urea Weight DifferenceCholesterol
## 1  7.8     89                  -0.7
## 2  6.7     86                  -0.2
## 3  8.2    105                  -0.7
## 4  8.6     71                   0.0
## 5  7.4     88                   0.0
## 6 12.1     88                   0.4

Common inferential tests

Comparing two means

  • Compare means of initial cholesterol between groups
    • We have already seen the summary statistics for this question
    • Visualize the data
boxplot(InitialCholesterol ~ Group,
        data = df,
        main = "Difference in initial cholesterol levels",
        xlab = "Group",
        ylab = "Cholesterol",
        las = 1,
        col = c("deepskyblue", "orange"))

  • Check for normality
shapiro.test(df %>% filter(Group == "Active") %>% select(InitialCholesterol) %>% pull())
## 
##  Shapiro-Wilk normality test
## 
## data:  df %>% filter(Group == "Active") %>% select(InitialCholesterol) %>%     pull()
## W = 0.99235, p-value = 0.2255
shapiro.test(df %>% filter(Group == "Placebo") %>% select(InitialCholesterol) %>% pull())
## 
##  Shapiro-Wilk normality test
## 
## data:  df %>% filter(Group == "Placebo") %>% select(InitialCholesterol) %>%     pull()
## W = 0.99041, p-value = 0.09184
  • Visual check for normality
qqnorm(df$InitialCholesterol[df$Group == "Active"])  # Use qqplot for other distributions
qqline(df$InitialCholesterol[df$Group == "Active"],
       col = "steelblue",
       lwd = 2)

  • Test for equal variances
bartlett.test(InitialCholesterol ~ Group,
              data = df)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  InitialCholesterol by Group
## Bartlett's K-squared = 0.0052034, df = 1, p-value = 0.9425
  • Student’s t test
t.test(InitialCholesterol ~ Group,
       data = df,
       var.equal = T)
## 
##  Two Sample t-test
## 
## data:  InitialCholesterol by Group
## t = -0.31235, df = 502, p-value = 0.7549
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.4055720  0.2943034
## sample estimates:
## mean in group Placebo  mean in group Active 
##              6.925490              6.981124
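  • Had the variances differed, the Welch version of the test (the t.test default, var.equal = FALSE) would be the safer choice; a minimal sketch:
t.test(InitialCholesterol ~ Group,
       data = df)  # Welch two sample t-test (unequal variances assumed)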
  • Nonparametric alternative
wilcox.test(InitialCholesterol ~ Group,
            data = df)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  InitialCholesterol by Group
## W = 30910, p-value = 0.6084
## alternative hypothesis: true location shift is not equal to 0

Comparing three or more means

  • See ANOVA later

  • Is there a difference in urea values between the different grades?

df %>% group_by(Grade) %>% summarise(AVERAGE = mean(Urea, na.rm = T),
                                     STD = sd(Urea),
                                     MIN = min(Urea),
                                     MEDIAN = median(Urea),
                                     MAX = max(Urea),
                                     IQRange = IQR(Urea))
## # A tibble: 3 x 7
##   Grade AVERAGE   STD   MIN MEDIAN   MAX IQRange
##   <fct>   <dbl> <dbl> <dbl>  <dbl> <dbl>   <dbl>
## 1 I        8.26  2.37   2.2    8.3  13.7    3.1 
## 2 II       8.32  2.34   3.2    8.2  17.7    2.92
## 3 III      8.33  2.57   2.1    8.3  15      3.05
boxplot(Urea ~ Grade,
        data = df,
        main = "Difference in urea levels",
        xlab = "Grade",
        ylab = "Urea",
        las = 1,
        col = c("deepskyblue", "orange", "green"))

stripchart(df$Urea ~ df$Grade,
           vertical = T,
           method = "jitter",
           jitter = 0.1,
           main = "Difference in urea levels",
           xlab = "Grade",
           ylab = "Urea",
           las = 1,
           col = c("deepskyblue", "orange", "green"))

  • Test assumptions
shapiro.test(df$Urea[df$Grade == "I"])
## 
##  Shapiro-Wilk normality test
## 
## data:  df$Urea[df$Grade == "I"]
## W = 0.99312, p-value = 0.6123
shapiro.test(df$Urea[df$Grade == "II"])
## 
##  Shapiro-Wilk normality test
## 
## data:  df$Urea[df$Grade == "II"]
## W = 0.9736, p-value = 0.000336
shapiro.test(df$Urea[df$Grade == "III"])
## 
##  Shapiro-Wilk normality test
## 
## data:  df$Urea[df$Grade == "III"]
## W = 0.97979, p-value = 0.08782
qqnorm(df$Urea[df$Grade == "I"])
qqline(df$Urea[df$Grade == "I"],
       col = "deepskyblue",
       lwd = 2)

qqnorm(df$Urea[df$Grade == "II"])
qqline(df$Urea[df$Grade == "II"],
       col = "orange",
       lwd = 2)

qqnorm(df$Urea[df$Grade == "III"])
qqline(df$Urea[df$Grade == "III"],
       col = "green",
       lwd = 2)

summary(aov(Urea ~ Grade,
            data = df))
##              Df Sum Sq Mean Sq F value Pr(>F)
## Grade         2    0.5   0.261   0.045  0.956
## Residuals   501 2889.4   5.767
  • Nonparametric alternative
kruskal.test(Urea ~ Grade,
             data = df)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Urea by Grade
## Kruskal-Wallis chi-squared = 0.060917, df = 2, p-value = 0.97

Multivariate comparison of means

  • Hotelling’s \(T^2\) test
    • Comparing means of two variables between two groups
dep_matrix <- data.matrix(df[, 7:8])
dep_matrix[1:5, ]
##      dBP Creatinine
## [1,]  81         63
## [2,]  86         56
## [3,]  77         69
## [4,]  71         67
## [5,]  82         55
mvnormtest::mshapiro.test(t(dep_matrix))
## 
##  Shapiro-Wilk normality test
## 
## data:  Z
## W = 0.99559, p-value = 0.1676
det(cov(dep_matrix))  # Determinant of the covariance matrix (should be positive, i.e. non-singular)
## [1] 29272.43
hot_test <- Hotelling::hotelling.test(df %>% filter(Group == "Active") %>% select(c("dBP", "Creatinine")),
                                      df %>% filter(Group == "Placebo") %>% select(c("dBP", "Creatinine")))
hot_test
## Test stat:  0.19412 
## Numerator df:  2 
## Denominator df:  501 
## P-value:  0.8236
ICSNP::HotellingsT2(df %>% filter(Group == "Active") %>% select(c("dBP", "Creatinine")),
                    df %>% filter(Group == "Placebo") %>% select(c("dBP", "Creatinine")))
## 
##  Hotelling's two sample T2-test
## 
## data:  df %>% filter(Group == "Active") %>% select(c("dBP", "Creatinine")) and df %>% filter(Group == "Placebo") %>% select(c("dBP", "Creatinine"))
## T.2 = 0.19412, df1 = 2, df2 = 501, p-value = 0.8236
## alternative hypothesis: true location difference is not equal to c(0,0)
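  • For illustration, the F-scaled statistic can be computed by hand from the group mean difference and the pooled covariance matrix (a minimal sketch of the underlying calculation; the final value should match T.2 above):
act <- as.matrix(df[df$Group == "Active", c("dBP", "Creatinine")])
pla <- as.matrix(df[df$Group == "Placebo", c("dBP", "Creatinine")])
n1 <- nrow(act); n2 <- nrow(pla); p <- 2
d <- colMeans(act) - colMeans(pla)                                   # Difference in mean vectors
S_pooled <- ((n1 - 1) * cov(act) + (n2 - 1) * cov(pla)) / (n1 + n2 - 2)
T2 <- (n1 * n2 / (n1 + n2)) * drop(t(d) %*% solve(S_pooled) %*% d)   # Hotelling's T-squared
(n1 + n2 - p - 1) / (p * (n1 + n2 - 2)) * T2                         # F-scaled statistic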

\(\chi^2\) goodness of fit test

  • Observed proportions against expected proportions
table(df$Grade)
## 
##   I  II III 
## 168 224 112
chisq.test(table(df$Grade),
           p = rep(1/3, 3))
## 
##  Chi-squared test for given probabilities
## 
## data:  table(df$Grade)
## X-squared = 37.333, df = 2, p-value = 7.819e-09
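  • As a quick check, the statistic can be computed by hand from the observed and expected counts (a minimal sketch; the result should match X-squared above):
observed <- as.numeric(table(df$Grade))
expected <- sum(observed) * rep(1/3, 3)        # Equal expected proportions
sum((observed - expected)^2 / expected)        # Pearson chi-squared statistic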

Log-likelihood alternative (G test for goodness of fit)

DescTools::GTest(x = c(168, 224, 112),
                 p = rep(1/3, 3),
                 correct = "none")
## 
##  Log likelihood ratio (G-test) goodness of fit test
## 
## data:  c(168, 224, 112)
## G = 38.057, X-squared df = 2, p-value = 5.444e-09

Fisher exact test

vals <- matrix(c(3, 1, 3, 2),
               nrow = 2,
               dimnames = list(Group = c("Control", "Experimental"),
                               Outcome = c("Worsened", "Improved")))
vals
##               Outcome
## Group          Worsened Improved
##   Control             3        3
##   Experimental        1        2
fisher.test(vals)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  vals
## p-value = 1
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##    0.06060903 156.52286969
## sample estimates:
## odds ratio 
##   1.852496

\(\chi^2\) test for independence

  • Is there dependence between Grade and Smoking?

  • Contingency table

table(df$Grade,
      df$Smoking)
##      
##        No Yes
##   I   131  37
##   II  176  48
##   III 103   9
  • Using CrossTable from gmodels for cell proportions
gmodels::CrossTable(df$Grade,
                    df$Smoking)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  504 
## 
##  
##              | df$Smoking 
##     df$Grade |        No |       Yes | Row Total | 
## -------------|-----------|-----------|-----------|
##            I |       131 |        37 |       168 | 
##              |     0.235 |     1.025 |           | 
##              |     0.780 |     0.220 |     0.333 | 
##              |     0.320 |     0.394 |           | 
##              |     0.260 |     0.073 |           | 
## -------------|-----------|-----------|-----------|
##           II |       176 |        48 |       224 | 
##              |     0.212 |     0.927 |           | 
##              |     0.786 |     0.214 |     0.444 | 
##              |     0.429 |     0.511 |           | 
##              |     0.349 |     0.095 |           | 
## -------------|-----------|-----------|-----------|
##          III |       103 |         9 |       112 | 
##              |     1.551 |     6.767 |           | 
##              |     0.920 |     0.080 |     0.222 | 
##              |     0.251 |     0.096 |           | 
##              |     0.204 |     0.018 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       410 |        94 |       504 | 
##              |     0.813 |     0.187 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
chisq.test(table(df$Grade,
                 df$Smoking))
## 
##  Pearson's Chi-squared test
## 
## data:  table(df$Grade, df$Smoking)
## X-squared = 10.717, df = 2, p-value = 0.004708

Log likelihood ratio test (G test for independence)

\[G=2\sum_{i=1}^{N}{\text{observed}_{i}\cdot\ln{\frac{{\text{observed}}_i}{{\text{expected}}_i}}}\]

DescTools::GTest(table(df$Grade,
                       df$Smoking),
                 correct = "none")
## 
##  Log likelihood ratio (G-test) test of independence without correction
## 
## data:  table(df$Grade, df$Smoking)
## G = 12.415, X-squared df = 2, p-value = 0.002014
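  • A hand computation of \(G\) from the equation above (a minimal sketch; expected counts come from the row and column margins under independence):
observed <- table(df$Grade, df$Smoking)
expected <- outer(rowSums(observed), colSums(observed)) / sum(observed)
2 * sum(observed * log(observed / expected))   # Should reproduce the G value above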

Linear regression introduction

names(df)
##  [1] "ID"                    "Group"                 "Grade"                
##  [4] "Smoking"               "InitialCholesterol"    "FinalCholesterol"     
##  [7] "dBP"                   "Creatinine"            "Urea"                 
## [10] "Weight"                "DifferenceCholesterol"

Simple linear regression

  • Urea and Creatinine
plot(x = df$Urea,
     y = df$Creatinine,
     main = "Urea as predictor of creatinine",
     xlab = "Urea",
     ylab = "Creatinine",
     las = 1,
     col = "orange")

  • Fit the data to a linear model with Creatinine as the dependent variable and Urea as the independent variable
urea_creat <- lm(Creatinine ~ Urea,
                 data = df,
                 na.action = na.omit)
  • Summary of the model
summary(urea_creat)
## 
## Call:
## lm(formula = Creatinine ~ Urea, data = df, na.action = na.omit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -27.9441  -4.8015  -0.2036   4.8079  23.1357 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  15.1182     1.2622   11.98   <2e-16 ***
## Urea          6.0919     0.1461   41.71   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.852 on 502 degrees of freedom
## Multiple R-squared:  0.7761, Adjusted R-squared:  0.7756 
## F-statistic:  1740 on 1 and 502 DF,  p-value: < 2.2e-16
  • Regression line
plot(x = df$Urea,
     y = df$Creatinine,
     main = "Urea as predictor of creatinine",
     xlab = "Urea",
     ylab = "Creatinine",
     las = 1,
     col = "orange")
abline(urea_creat,
       col = "gray",
       lwd = 2)

confint(urea_creat, level = 0.95)  # level = 0.95 is default
##                 2.5 %    97.5 %
## (Intercept) 12.638402 17.598046
## Urea         5.804904  6.378807
attributes(urea_creat)
## $names
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"        
## 
## $class
## [1] "lm"
  • First five \(y\) and \(\hat{y}\)
df$Creatinine[1:5]
## [1] 63 56 69 67 55
urea_creat$fitted.values[1:5]
##        1        2        3        4        5 
## 62.63470 55.93366 65.07144 67.50818 60.19795
df$Creatinine[1:5] - urea_creat$fitted.values[1:5]
##           1           2           3           4           5 
##  0.36530339  0.06634438  3.92856122 -0.50818096 -5.19795443
urea_creat$residuals[1:5]
##           1           2           3           4           5 
##  0.36530339  0.06634438  3.92856122 -0.50818096 -5.19795443
  • From the F ratio
    • The sum of squared deviations from the mean (total variability, \(\text{SSY}\)) equals the sum of squared deviations of the predicted values from the mean (variability explained by the regression, \(\text{SSR}\)) plus the sum of squared deviations of the data about the regression line (residual variability, \(\text{SSE}\))
    • \(R^2\) is then \(\frac{\text{SSY} - \text{SSE}}{\text{SSY}}\), the fraction of the variance explained by the model (a manual check follows the ANOVA table below)
anova(urea_creat)
## Analysis of Variance Table
## 
## Response: Creatinine
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## Urea        1 107247  107247  1739.7 < 2.2e-16 ***
## Residuals 502  30947      62                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
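  • A manual check of this decomposition and of \(R^2\), using the fitted urea_creat model (a minimal sketch):
SSY <- sum((df$Creatinine - mean(df$Creatinine))^2)  # Total sum of squares
SSE <- sum(urea_creat$residuals^2)                   # Residual sum of squares
SSR <- SSY - SSE                                     # Variability explained by the regression
(SSY - SSE) / SSY                                    # Should match the Multiple R-squared above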
  • Test assumptions
par(mfrow = c(2, 2))
plot(urea_creat)

plot(urea_creat, which = 1)  # Suggests some non-linearity is present

plot(urea_creat, which = 2)

plot(urea_creat, which = 3)

plot(urea_creat, which = 4)

hist(urea_creat$residuals,
     main = "Checking assumptions of residual distribution",
     xlab = "Residuals",
     ylab = "Frequency",
     col = 8,
     las = 1)

shapiro.test(urea_creat$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  urea_creat$residuals
## W = 0.99568, p-value = 0.1802

Multiple linear regression model

  • Multiple regression
  • Creatinine given Urea and Weight
urea_weight_creat <- lm(Creatinine ~ Urea + Weight,
                        data = df)
summary(urea_weight_creat)
## 
## Call:
## lm(formula = Creatinine ~ Urea + Weight, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -27.8288  -4.7944  -0.2361   4.8110  23.1360 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 15.547397   2.548152   6.101 2.11e-09 ***
## Urea         6.094957   0.147066  41.444  < 2e-16 ***
## Weight      -0.004757   0.024528  -0.194    0.846    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.859 on 501 degrees of freedom
## Multiple R-squared:  0.7761, Adjusted R-squared:  0.7752 
## F-statistic: 868.2 on 2 and 501 DF,  p-value: < 2.2e-16
cor(df$Urea, df$Weight)
## [1] 0.1087418
par(mfrow = c(2, 2))
plot(urea_weight_creat)

confint(urea_weight_creat)
##                   2.5 %      97.5 %
## (Intercept) 10.54101713 20.55377713
## Urea         5.80601501  6.38389900
## Weight      -0.05294636  0.04343253
anova(urea_weight_creat)
## Analysis of Variance Table
## 
## Response: Creatinine
##            Df Sum Sq Mean Sq   F value Pr(>F)    
## Urea        1 107247  107247 1736.3647 <2e-16 ***
## Weight      1      2       2    0.0376 0.8463    
## Residuals 501  30944      62                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

One-way ANOVA

df %>% summarise(MIN = min(Urea),
                 MAX = max(Urea))
##   MIN  MAX
## 1 2.1 17.7
df$UreaCat <- cut(df$Urea,
                  breaks = c(0, 5, 10, 18),
                  labels = c("U1", "U2", "U3")) # Breaks are left-open and right-closed by default
# Use right = FALSE to change this
# Supplying a single integer to breaks creates that many equal-width bins
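  • For illustration, the same binning with left-closed (right-open) intervals:
table(cut(df$Urea, breaks = c(0, 5, 10, 18), right = FALSE))  # Counts per left-closed bin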
head(df)
##   ID   Group Grade Smoking InitialCholesterol FinalCholesterol dBP Creatinine
## 1  1  Active    II      No                7.2              6.5  81         63
## 2  2 Placebo    II     Yes                6.4              6.2  86         56
## 3  3 Placebo    II      No                7.6              6.9  77         69
## 4  4 Placebo    II      No                6.9              6.9  71         67
## 5  5  Active     I      No                5.9              5.9  82         55
## 6  6 Placebo     I     Yes                8.8              9.2  75         86
##   Urea Weight DifferenceCholesterol UreaCat
## 1  7.8     89                  -0.7      U2
## 2  6.7     86                  -0.2      U2
## 3  8.2    105                  -0.7      U2
## 4  8.6     71                   0.0      U2
## 5  7.4     88                   0.0      U2
## 6 12.1     88                   0.4      U3
df %>% group_by(UreaCat) %>% summarise(AVERAGE = mean(Creatinine))
## # A tibble: 3 x 2
##   UreaCat AVERAGE
##   <fct>     <dbl>
## 1 U1         34.4
## 2 U2         63.6
## 3 U3         84.2
urea_cat_plot <- plot_ly(data = df,
                         y = ~Creatinine,
                         color = ~UreaCat,
                         type = "box",
                         boxmean = "sd",
                         marker = list(symbol = "diamond"),
                         line = list(width = 1),
                         boxpoints = "all") %>% 
  layout(title = "Creatinine by urea category")
urea_cat_plot
cat_urea_creat <- lm(Creatinine ~ UreaCat,
                     data = df)
summary(cat_urea_creat)
## 
## Call:
## lm(formula = Creatinine ~ UreaCat, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -31.2381  -8.3947   0.3934   7.7619  29.3934 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   34.395      1.823   18.86   <2e-16 ***
## UreaCatU2     29.212      1.917   15.24   <2e-16 ***
## UreaCatU3     49.843      2.128   23.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.24 on 501 degrees of freedom
## Multiple R-squared:  0.542,  Adjusted R-squared:  0.5402 
## F-statistic: 296.5 on 2 and 501 DF,  p-value: < 2.2e-16
anova(cat_urea_creat)
## Analysis of Variance Table
## 
## Response: Creatinine
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## UreaCat     2  74907   37454   296.5 < 2.2e-16 ***
## Residuals 501  63286     126                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(1, 2))
plot(cat_urea_creat, which = c(1, 2))

hist(rstandard(cat_urea_creat),
     main = "Histogram of the standardized residuals",
     xlab = "Standadized residuals",
     ylab = "Frequency",
     las = 1,
     col = "gray")

TukeyHSD(aov(cat_urea_creat))
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = cat_urea_creat)
## 
## $UreaCat
##           diff      lwr      upr p adj
## U2-U1 29.21191 24.70609 33.71773     0
## U3-U1 49.84336 44.84170 54.84502     0
## U3-U2 20.63145 17.70206 23.56084     0

Two-way ANOVA

smoking_grade_creat <- lm(Creatinine ~ Smoking + Grade,
                          data = df)
anova(smoking_grade_creat)
## Analysis of Variance Table
## 
## Response: Creatinine
##            Df Sum Sq Mean Sq F value Pr(>F)
## Smoking     1      3   3.338  0.0121 0.9125
## Grade       2     74  37.244  0.1348 0.8739
## Residuals 500 138116 276.231

ANCOVA

weight_grade_model <- lm(Creatinine ~ Weight + Grade,
                         data = df)
weight_grade_model$coefficients
## (Intercept)      Weight     GradeII    GradeIII 
##  55.5825643   0.1055191   0.3675932  -0.6069257
weight_grade_creat_plot <- plot_ly(data = df,
                                   x = ~Weight,
                                   y = ~Creatinine,
                                   color = ~Grade,
                                   type = "scatter",
                                   mode = "markers",
                                   marker = list(size = 12)) %>% 
  layout(title = "Predicting creatinine by weight and grade",
         xaxis = list(title = "Weight",
                      zeroline = F),
         yaxis = list(title = "Creatinine",
                      zeroline = F))
weight_grade_creat_plot
anova(weight_grade_model)
## Analysis of Variance Table
## 
## Response: Creatinine
##            Df Sum Sq Mean Sq F value Pr(>F)  
## Weight      1   1163 1162.55  4.2441 0.0399 *
## Grade       2     71   35.55  0.1298 0.8783  
## Residuals 500 136960  273.92                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::Anova(weight_grade_model,
           type = "III")
## Anova Table (Type III tests)
## 
## Response: Creatinine
##             Sum Sq  Df  F value  Pr(>F)    
## (Intercept)  32613   1 119.0589 < 2e-16 ***
## Weight        1156   1   4.2212 0.04044 *  
## Grade           71   2   0.1298 0.87830    
## Residuals   136960 500                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
weight_grade_interaction_model <- lm(Creatinine ~ Weight * Grade,
                                     data = df)
anova(weight_grade_interaction_model)
## Analysis of Variance Table
## 
## Response: Creatinine
##               Df Sum Sq Mean Sq F value  Pr(>F)  
## Weight         1   1163 1162.55  4.2757 0.03918 *
## Grade          2     71   35.55  0.1308 0.87746  
## Weight:Grade   2   1554  777.13  2.8582 0.05832 .
## Residuals    498 135405  271.90                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
weight_grade_interaction_model$coefficients
##     (Intercept)          Weight         GradeII        GradeIII  Weight:GradeII 
##      40.5737205       0.2618322      22.4615001      26.1720236      -0.2304398 
## Weight:GradeIII 
##      -0.2799880

Randomized block design

variable <- c(0.958, 0.971, 0.927, 0.971, 0.986, 1.051, 0.891, 1.010, 0.925, 0.952, 0.829, 0.955)
experiment <- factor(rep(1:4, 3))
grade <- rep(c("Mild", "Moderate", "Severe"), each = 4)
dfr_block <- data.frame(Variable = variable,
                        Experiment = experiment,
                        Grade = grade)
head(dfr_block)
##   Variable Experiment    Grade
## 1    0.958          1     Mild
## 2    0.971          2     Mild
## 3    0.927          3     Mild
## 4    0.971          4     Mild
## 5    0.986          1 Moderate
## 6    1.051          2 Moderate
block_project_plot <- plot_ly(data = dfr_block,
                              x = ~Grade,
                              y = ~Variable,
                              type = "box",
                              boxmean = "sd",
                              marker = list(symbol = "diamond"),
                              line = list(width = 1),
                              boxpoints = "all") %>% 
  layout(title = "Variable by grade")
block_project_plot
grade_variable_model <- lm(Variable ~ Grade,
                           data = dfr_block)
anova(grade_variable_model)
## Analysis of Variance Table
## 
## Response: Variable
##           Df    Sum Sq   Mean Sq F value Pr(>F)
## Grade      2 0.0097172 0.0048586  1.7098 0.2348
## Residuals  9 0.0255745 0.0028416
grade_experiment_variable_model <- lm(Variable ~ Grade + Experiment,
                                      data = dfr_block)
anova(grade_experiment_variable_model)
## Analysis of Variance Table
## 
## Response: Variable
##            Df    Sum Sq   Mean Sq F value   Pr(>F)   
## Grade       2 0.0097172 0.0048586  6.9682 0.027259 * 
## Experiment  3 0.0213910 0.0071303 10.2264 0.008967 **
## Residuals   6 0.0041835 0.0006972                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(1, 2))
plot(grade_experiment_variable_model, which = c(1, 2))

TukeyHSD(aov(grade_experiment_variable_model), "Grade")
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = grade_experiment_variable_model)
## 
## $Grade
##                     diff         lwr         upr     p adj
## Moderate-Mild    0.02775 -0.02953929  0.08503929 0.3611755
## Severe-Mild     -0.04150 -0.09878929  0.01578929 0.1454294
## Severe-Moderate -0.06925 -0.12653929 -0.01196071 0.0232792

Factorial ANOVA

variable <- c(709, 679, 699, 657, 594, 677, 592, 538, 476, 508, 505, 539)
grade <- rep(c("I", "I", "I", "II", "II", "II"), 2)
dose <- rep(c("Low", "High"), each = 6)
dfr_fact <- data.frame(Variable = variable,
                       Grade = grade,
                       Dose = dose)
dfr_fact
##    Variable Grade Dose
## 1       709     I  Low
## 2       679     I  Low
## 3       699     I  Low
## 4       657    II  Low
## 5       594    II  Low
## 6       677    II  Low
## 7       592     I High
## 8       538     I High
## 9       476     I High
## 10      508    II High
## 11      505    II High
## 12      539    II High
interaction.plot(dfr_fact$Grade,
                 dfr_fact$Dose,
                 dfr_fact$Variable,
                 xlab = "Grade",
                 main = "Interaction plot",
                 ylab = "Mean of variable",
                 las = 1)

full_factorial_model <- lm(Variable ~ Grade * Dose,
                           data = dfr_fact)
anova(full_factorial_model)
## Analysis of Variance Table
## 
## Response: Variable
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## Grade       1   3781    3781  2.5925 0.1460358    
## Dose        1  61204   61204 41.9685 0.0001925 ***
## Grade:Dose  1    919     919  0.6300 0.4502546    
## Residuals   8  11667    1458                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(1, 2))
plot(full_factorial_model, which = c(1, 2))

Polynomial regression

poly_model <- lm(Creatinine ~ poly(Urea, degree = 2, raw = T),
                 data = df)  # For orthogonal polynomial use raw = FALSE
summary(poly_model)
## 
## Call:
## lm(formula = Creatinine ~ poly(Urea, degree = 2, raw = T), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.2487  -4.6587   0.0023   4.5604  21.7639 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      -12.6588     2.8769   -4.40 1.32e-05 ***
## poly(Urea, degree = 2, raw = T)1  12.9855     0.6684   19.43  < 2e-16 ***
## poly(Urea, degree = 2, raw = T)2  -0.3945     0.0375  -10.52  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.113 on 501 degrees of freedom
## Multiple R-squared:  0.8166, Adjusted R-squared:  0.8159 
## F-statistic:  1115 on 2 and 501 DF,  p-value: < 2.2e-16
poly_model_alt <- lm(Creatinine ~ Urea + I(Urea^2),
                     data = df)
summary(poly_model_alt)
## 
## Call:
## lm(formula = Creatinine ~ Urea + I(Urea^2), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.2487  -4.6587   0.0023   4.5604  21.7639 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -12.6588     2.8769   -4.40 1.32e-05 ***
## Urea         12.9855     0.6684   19.43  < 2e-16 ***
## I(Urea^2)    -0.3945     0.0375  -10.52  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.113 on 501 degrees of freedom
## Multiple R-squared:  0.8166, Adjusted R-squared:  0.8159 
## F-statistic:  1115 on 2 and 501 DF,  p-value: < 2.2e-16

Comparing models

\[F_{\text{stat}}=\frac{\frac{\text{SSE}_{\text{partial}} - \text{SSE}_{\text{full}}}{\Delta_{\text{params}}}}{\text{MSE}_{\text{full}}}\tag{1}\]

full_model <- lm(Creatinine ~ Urea + I(Urea^2),
                 data = df)
summary(full_model)
## 
## Call:
## lm(formula = Creatinine ~ Urea + I(Urea^2), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.2487  -4.6587   0.0023   4.5604  21.7639 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -12.6588     2.8769   -4.40 1.32e-05 ***
## Urea         12.9855     0.6684   19.43  < 2e-16 ***
## I(Urea^2)    -0.3945     0.0375  -10.52  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.113 on 501 degrees of freedom
## Multiple R-squared:  0.8166, Adjusted R-squared:  0.8159 
## F-statistic:  1115 on 2 and 501 DF,  p-value: < 2.2e-16
partial_model <- lm(Creatinine ~ Urea,
                    data = df)
summary(partial_model)
## 
## Call:
## lm(formula = Creatinine ~ Urea, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -27.9441  -4.8015  -0.2036   4.8079  23.1357 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  15.1182     1.2622   11.98   <2e-16 ***
## Urea          6.0919     0.1461   41.71   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.852 on 502 degrees of freedom
## Multiple R-squared:  0.7761, Adjusted R-squared:  0.7756 
## F-statistic:  1740 on 1 and 502 DF,  p-value: < 2.2e-16
anova(partial_model, full_model)
## Analysis of Variance Table
## 
## Model 1: Creatinine ~ Urea
## Model 2: Creatinine ~ Urea + I(Urea^2)
##   Res.Df   RSS Df Sum of Sq      F    Pr(>F)    
## 1    502 30947                                  
## 2    501 25346  1    5600.7 110.71 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
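  • A hand computation of the \(F\) statistic in equation (1), using the two models above (a minimal sketch; the result should match the F value in the table):
sse_partial <- sum(partial_model$residuals^2)
sse_full <- sum(full_model$residuals^2)
delta_params <- 1                              # The full model has one extra parameter
mse_full <- sse_full / full_model$df.residual
((sse_partial - sse_full) / delta_params) / mse_full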