library(plotly)
library(dplyr)
library(gmodels)
library(mvnormtest)
library(Hotelling)
library(ICSNP)
library(DescTools)
library(car)
df <- read.csv("Data.csv")
dim(df)
## [1] 504 10
names(df)
## [1] "ID" "Group" "Grade"
## [4] "Smoking" "InitialCholesterol" "FinalCholesterol"
## [7] "dBP" "Creatinine" "Urea"
## [10] "Weight"
str(df)
## 'data.frame': 504 obs. of 10 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Group : Factor w/ 2 levels "A","P": 1 2 2 2 1 2 1 2 2 1 ...
## $ Grade : Factor w/ 3 levels "I","II","III": 2 2 2 2 1 1 2 2 1 3 ...
## $ Smoking : int 0 1 0 0 0 1 0 1 1 0 ...
## $ InitialCholesterol: num 7.2 6.4 7.6 6.9 5.9 8.8 7.7 6.4 8.9 6.9 ...
## $ FinalCholesterol : num 6.5 6.2 6.9 6.9 5.9 9.2 7.1 6.2 8.9 6.1 ...
## $ dBP : int 81 86 77 71 82 75 79 82 78 85 ...
## $ Creatinine : int 63 56 69 67 55 86 75 58 77 66 ...
## $ Urea : num 7.8 6.7 8.2 8.6 7.4 12.1 8.7 7.2 8.2 9.1 ...
## $ Weight : int 89 86 105 71 88 88 92 87 72 100 ...
df$Group <- factor(df$Group,
levels = c("P", "A"),
labels = c("Placebo", "Active"))
df$Smoking <- factor(df$Smoking,
levels = c(0, 1),
labels = c("No", "Yes"))
str(df)
## 'data.frame': 504 obs. of 10 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Group : Factor w/ 2 levels "Placebo","Active": 2 1 1 1 2 1 2 1 1 2 ...
## $ Grade : Factor w/ 3 levels "I","II","III": 2 2 2 2 1 1 2 2 1 3 ...
## $ Smoking : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 2 1 ...
## $ InitialCholesterol: num 7.2 6.4 7.6 6.9 5.9 8.8 7.7 6.4 8.9 6.9 ...
## $ FinalCholesterol : num 6.5 6.2 6.9 6.9 5.9 9.2 7.1 6.2 8.9 6.1 ...
## $ dBP : int 81 86 77 71 82 75 79 82 78 85 ...
## $ Creatinine : int 63 56 69 67 55 86 75 58 77 66 ...
## $ Urea : num 7.8 6.7 8.2 8.6 7.4 12.1 8.7 7.2 8.2 9.1 ...
## $ Weight : int 89 86 105 71 88 88 92 87 72 100 ...
var_1 <- c(20,20,20,20,21,24,44,60,90,94,101)
var_2 <- c(1.73,1.65,2.02,1.89,2.61,1.36,2.37,2.08,2.69,2.32,3.67)
str(var_2)
## num [1:11] 1.73 1.65 2.02 1.89 2.61 1.36 2.37 2.08 2.69 2.32 ...
typeof(var_1)
## [1] "double"
typeof(var_2)
## [1] "double"
mean(var_1)
## [1] 46.72727
exp(mean(log(var_1)))
## [1] 37.07464
1 / var_1 # Elementwise operation
## [1] 0.05000000 0.05000000 0.05000000 0.05000000 0.04761905 0.04166667
## [7] 0.02272727 0.01666667 0.01111111 0.01063830 0.00990099
1 / mean(1 / var_1)
## [1] 30.52757
median(var_1)
## [1] 24
min(var_1)
## [1] 20
max(var_1)
## [1] 101
range function returns both extremarange(var_1)
## [1] 20 101
sd(var_1)
## [1] 33.57407
var(var_1)
## [1] 1127.218
quantile(var_1, 0.25)
## 25%
## 20
quantile(var_1, c(0.25, 0.75)) # c() creates a vector object
## 25% 75%
## 20 75
plot(var_2 ~ var_1, xlab = "Variable 1", ylab = "Variable 2")
df %>% group_by(Group) %>% summarise(AVERAGE = mean(InitialCholesterol, na.rm = T),
STD = sd(InitialCholesterol),
MIN = min(InitialCholesterol),
MEDIAN = median(InitialCholesterol),
MAX = max(InitialCholesterol),
IQRange = IQR(InitialCholesterol))
## # A tibble: 2 x 7
## Group AVERAGE STD MIN MEDIAN MAX IQRange
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Placebo 6.93 2.00 2.6 7 11.8 2.95
## 2 Active 6.98 1.99 0.8 7.1 11.3 2.7
df <- df %>% mutate(DifferenceCholesterol = FinalCholesterol - InitialCholesterol)
head(df)
## ID Group Grade Smoking InitialCholesterol FinalCholesterol dBP Creatinine
## 1 1 Active II No 7.2 6.5 81 63
## 2 2 Placebo II Yes 6.4 6.2 86 56
## 3 3 Placebo II No 7.6 6.9 77 69
## 4 4 Placebo II No 6.9 6.9 71 67
## 5 5 Active I No 5.9 5.9 82 55
## 6 6 Placebo I Yes 8.8 9.2 75 86
## Urea Weight DifferenceCholesterol
## 1 7.8 89 -0.7
## 2 6.7 86 -0.2
## 3 8.2 105 -0.7
## 4 8.6 71 0.0
## 5 7.4 88 0.0
## 6 12.1 88 0.4
boxplot(InitialCholesterol ~ Group,
data = df,
main = "Difference in initial cholesterol levels",
xlab = "Group",
ylab = "Cholesterol",
las = 1,
col = c("deepskyblue", "orange"))
shapiro.test(df %>% filter(Group == "Active") %>% select(InitialCholesterol) %>% pull())
##
## Shapiro-Wilk normality test
##
## data: df %>% filter(Group == "Active") %>% select(InitialCholesterol) %>% pull()
## W = 0.99235, p-value = 0.2255
shapiro.test(df %>% filter(Group == "Placebo") %>% select(InitialCholesterol) %>% pull())
##
## Shapiro-Wilk normality test
##
## data: df %>% filter(Group == "Placebo") %>% select(InitialCholesterol) %>% pull()
## W = 0.99041, p-value = 0.09184
qqnorm(df$InitialCholesterol[df$Group == "Active"]) # Use qqplot for other distributions
qqline(df$InitialCholesterol[df$Group == "Active"],
col = "steelblue",
lwd = 2)
bartlett.test(InitialCholesterol ~ Group,
data = df)
##
## Bartlett test of homogeneity of variances
##
## data: InitialCholesterol by Group
## Bartlett's K-squared = 0.0052034, df = 1, p-value = 0.9425
t.test(InitialCholesterol ~ Group,
data = df,
var.equal = T)
##
## Two Sample t-test
##
## data: InitialCholesterol by Group
## t = -0.31235, df = 502, p-value = 0.7549
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.4055720 0.2943034
## sample estimates:
## mean in group Placebo mean in group Active
## 6.925490 6.981124
wilcox.test(InitialCholesterol ~ Group,
data = df)
##
## Wilcoxon rank sum test with continuity correction
##
## data: InitialCholesterol by Group
## W = 30910, p-value = 0.6084
## alternative hypothesis: true location shift is not equal to 0
See ANOVA later
Is there a difference in urea values between the different grades
df %>% group_by(Grade) %>% summarise(AVERAGE = mean(Urea, na.rm = T),
STD = sd(Urea),
MIN = min(Urea),
MEDIAN = median(Urea),
MAX = max(Urea),
IQRange = IQR(Urea))
## # A tibble: 3 x 7
## Grade AVERAGE STD MIN MEDIAN MAX IQRange
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 I 8.26 2.37 2.2 8.3 13.7 3.1
## 2 II 8.32 2.34 3.2 8.2 17.7 2.92
## 3 III 8.33 2.57 2.1 8.3 15 3.05
boxplot(Urea ~ Grade,
data = df,
main = "Difference in urea levels",
xlab = "Grade",
ylab = "Urea",
las = 1,
col = c("deepskyblue", "orange", "green"))
stripchart(df$Urea ~ df$Grade,
vertical = T,
method = "jitter",
jitter = 0.1,
main = "Difference in urea levels",
xlab = "Grade",
ylab = "Urea",
las = 1,
col = c("deepskyblue", "orange", "green"))
shapiro.test(df$Urea[df$Grade == "I"])
##
## Shapiro-Wilk normality test
##
## data: df$Urea[df$Grade == "I"]
## W = 0.99312, p-value = 0.6123
shapiro.test(df$Urea[df$Grade == "II"])
##
## Shapiro-Wilk normality test
##
## data: df$Urea[df$Grade == "II"]
## W = 0.9736, p-value = 0.000336
shapiro.test(df$Urea[df$Grade == "III"])
##
## Shapiro-Wilk normality test
##
## data: df$Urea[df$Grade == "III"]
## W = 0.97979, p-value = 0.08782
qqnorm(df$Urea[df$Grade == "I"])
qqline(df$Urea[df$Grade == "I"],
col = "deepskyblue",
lwd = 2)
qqnorm(df$Urea[df$Grade == "II"])
qqline(df$Urea[df$Grade == "II"],
col = "orange",
lwd = 2)
qqnorm(df$Urea[df$Grade == "III"])
qqline(df$Urea[df$Grade == "III"],
col = "green",
lwd = 2)
summary(aov(Urea ~ Grade,
data = df))
## Df Sum Sq Mean Sq F value Pr(>F)
## Grade 2 0.5 0.261 0.045 0.956
## Residuals 501 2889.4 5.767
kruskal.test(Urea ~ Grade,
data = df)
##
## Kruskal-Wallis rank sum test
##
## data: Urea by Grade
## Kruskal-Wallis chi-squared = 0.060917, df = 2, p-value = 0.97
dep_matrix <- data.matrix(df[, 7:8])
dep_matrix[1:5, ]
## dBP Creatinine
## [1,] 81 63
## [2,] 86 56
## [3,] 77 69
## [4,] 71 67
## [5,] 82 55
mvnormtest::mshapiro.test(t(dep_matrix))
##
## Shapiro-Wilk normality test
##
## data: Z
## W = 0.99559, p-value = 0.1676
det(cov(dep_matrix))
## [1] 29272.43
hot_test <- Hotelling::hotelling.test(df %>% filter(Group == "Active") %>% select(c("dBP", "Creatinine")),
df %>% filter(Group == "Placebo") %>% select(c("dBP", "Creatinine")))
hot_test
## Test stat: 0.19412
## Numerator df: 2
## Denominator df: 501
## P-value: 0.8236
ICSNP::HotellingsT2(df %>% filter(Group == "Active") %>% select(c("dBP", "Creatinine")),
df %>% filter(Group == "Placebo") %>% select(c("dBP", "Creatinine")))
##
## Hotelling's two sample T2-test
##
## data: df %>% filter(Group == "Active") %>% select(c("dBP", "Creatinine")) and df %>% filter(Group == "Placebo") %>% select(c("dBP", "Creatinine"))
## T.2 = 0.19412, df1 = 2, df2 = 501, p-value = 0.8236
## alternative hypothesis: true location difference is not equal to c(0,0)
table(df$Grade)
##
## I II III
## 168 224 112
chisq.test(table(df$Grade),
p =rep(1/3, 3))
##
## Chi-squared test for given probabilities
##
## data: table(df$Grade)
## X-squared = 37.333, df = 2, p-value = 7.819e-09
DescTools::GTest(x = c(168, 224, 112),
p = rep(1/3, 3),
correct = "none")
##
## Log likelihood ratio (G-test) goodness of fit test
##
## data: c(168, 224, 112)
## G = 38.057, X-squared df = 2, p-value = 5.444e-09
vals <- matrix(c(3, 1, 3, 2),
nrow = 2,
dimnames = list(Group = c("Control", "Experimental"),
Outcome = c("Worsened", "Improved")))
vals
## Outcome
## Group Worsened Improved
## Control 3 3
## Experimental 1 2
fisher.test(vals)
##
## Fisher's Exact Test for Count Data
##
## data: vals
## p-value = 1
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.06060903 156.52286969
## sample estimates:
## odds ratio
## 1.852496
Is there dependence between Grade and Smoking
Contingency table
table(df$Grade,
df$Smoking)
##
## No Yes
## I 131 37
## II 176 48
## III 103 9
CrossTable from gmodels for fractionsgmodels::CrossTable(df$Grade,
df$Smoking)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 504
##
##
## | df$Smoking
## df$Grade | No | Yes | Row Total |
## -------------|-----------|-----------|-----------|
## I | 131 | 37 | 168 |
## | 0.235 | 1.025 | |
## | 0.780 | 0.220 | 0.333 |
## | 0.320 | 0.394 | |
## | 0.260 | 0.073 | |
## -------------|-----------|-----------|-----------|
## II | 176 | 48 | 224 |
## | 0.212 | 0.927 | |
## | 0.786 | 0.214 | 0.444 |
## | 0.429 | 0.511 | |
## | 0.349 | 0.095 | |
## -------------|-----------|-----------|-----------|
## III | 103 | 9 | 112 |
## | 1.551 | 6.767 | |
## | 0.920 | 0.080 | 0.222 |
## | 0.251 | 0.096 | |
## | 0.204 | 0.018 | |
## -------------|-----------|-----------|-----------|
## Column Total | 410 | 94 | 504 |
## | 0.813 | 0.187 | |
## -------------|-----------|-----------|-----------|
##
##
chisq.test(table(df$Grade,
df$Smoking))
##
## Pearson's Chi-squared test
##
## data: table(df$Grade, df$Smoking)
## X-squared = 10.717, df = 2, p-value = 0.004708
\[G=2\sum_{i=1}^{N}{\text{observed}_{i}\cdot\ln{\frac{{\text{observed}}_i}{{\text{expected}}_i}}}\]
DescTools::GTest(table(df$Grade,
df$Smoking),
correct = "none")
##
## Log likelihood ratio (G-test) test of independence without correction
##
## data: table(df$Grade, df$Smoking)
## G = 12.415, X-squared df = 2, p-value = 0.002014
names(df)
## [1] "ID" "Group" "Grade"
## [4] "Smoking" "InitialCholesterol" "FinalCholesterol"
## [7] "dBP" "Creatinine" "Urea"
## [10] "Weight" "DifferenceCholesterol"
Urea and Creatinineplot(x = df$Urea,
y = df$Creatinine,
main = "Urea as predictor of creatinine",
xlab = "Urea",
ylab = "Creatinine",
las = 1,
col = "orange")
Creatine as dependent variable and Urea as independent variableurea_creat <- lm(Creatinine ~ Urea,
data = df,
na.action = na.omit)
summary(urea_creat)
##
## Call:
## lm(formula = Creatinine ~ Urea, data = df, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.9441 -4.8015 -0.2036 4.8079 23.1357
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.1182 1.2622 11.98 <2e-16 ***
## Urea 6.0919 0.1461 41.71 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.852 on 502 degrees of freedom
## Multiple R-squared: 0.7761, Adjusted R-squared: 0.7756
## F-statistic: 1740 on 1 and 502 DF, p-value: < 2.2e-16
plot(x = df$Urea,
y = df$Creatinine,
main = "Urea as predictor of creatinine",
xlab = "Urea",
ylab = "Creatinine",
las = 1,
col = "orange")
abline(urea_creat,
col = "gray",
lwd = 2)
confint(urea_creat, level = 0.95) # level = 0.95 is default
## 2.5 % 97.5 %
## (Intercept) 12.638402 17.598046
## Urea 5.804904 6.378807
attributes(urea_creat)
## $names
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
##
## $class
## [1] "lm"
df$Creatinine[1:5]
## [1] 63 56 69 67 55
urea_creat$fitted.values[1:5]
## 1 2 3 4 5
## 62.63470 55.93366 65.07144 67.50818 60.19795
df$Creatinine[1:5] - urea_creat$fitted.values[1:5]
## 1 2 3 4 5
## 0.36530339 0.06634438 3.92856122 -0.50818096 -5.19795443
urea_creat$residuals[1:5]
## 1 2 3 4 5
## 0.36530339 0.06634438 3.92856122 -0.50818096 -5.19795443
anova(urea_creat)
## Analysis of Variance Table
##
## Response: Creatinine
## Df Sum Sq Mean Sq F value Pr(>F)
## Urea 1 107247 107247 1739.7 < 2.2e-16 ***
## Residuals 502 30947 62
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(2, 2))
plot(urea_creat)
plot(urea_creat, which = 1) # Seems like a non linearity is present
plot(urea_creat, which = 2)
plot(urea_creat, which = 3)
plot(urea_creat, which = 4)
hist(urea_creat$residuals,
main = "Checking assumptions of residual distribution",
xlab = "Residuals",
ylab = "Frequency",
col = 8,
las = 1)
shapiro.test(urea_creat$residuals)
##
## Shapiro-Wilk normality test
##
## data: urea_creat$residuals
## W = 0.99568, p-value = 0.1802
Urea given Creatinine and Weighturea_weight_creat <- lm(Creatinine ~ Urea + Weight,
data = df)
summary(urea_weight_creat)
##
## Call:
## lm(formula = Creatinine ~ Urea + Weight, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.8288 -4.7944 -0.2361 4.8110 23.1360
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.547397 2.548152 6.101 2.11e-09 ***
## Urea 6.094957 0.147066 41.444 < 2e-16 ***
## Weight -0.004757 0.024528 -0.194 0.846
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.859 on 501 degrees of freedom
## Multiple R-squared: 0.7761, Adjusted R-squared: 0.7752
## F-statistic: 868.2 on 2 and 501 DF, p-value: < 2.2e-16
cor(df$Urea, df$Weight)
## [1] 0.1087418
par(mfrow = c(2, 2))
plot(urea_weight_creat)
confint(urea_weight_creat)
## 2.5 % 97.5 %
## (Intercept) 10.54101713 20.55377713
## Urea 5.80601501 6.38389900
## Weight -0.05294636 0.04343253
anova(urea_weight_creat)
## Analysis of Variance Table
##
## Response: Creatinine
## Df Sum Sq Mean Sq F value Pr(>F)
## Urea 1 107247 107247 1736.3647 <2e-16 ***
## Weight 1 2 2 0.0376 0.8463
## Residuals 501 30944 62
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
df %>% summarise(MIN = min(Urea),
MAX = max(Urea))
## MIN MAX
## 1 2.1 17.7
df$UreaCat <- cut(df$Urea,
breaks = c(0, 5, 10, 18),
labels = c("U1", "U2", "U3")) # Breaks are left open and right closed
# Use right = FALSE to change this
# Use breaks = integer will create an integer number of equal sized bins
head(df)
## ID Group Grade Smoking InitialCholesterol FinalCholesterol dBP Creatinine
## 1 1 Active II No 7.2 6.5 81 63
## 2 2 Placebo II Yes 6.4 6.2 86 56
## 3 3 Placebo II No 7.6 6.9 77 69
## 4 4 Placebo II No 6.9 6.9 71 67
## 5 5 Active I No 5.9 5.9 82 55
## 6 6 Placebo I Yes 8.8 9.2 75 86
## Urea Weight DifferenceCholesterol UreaCat
## 1 7.8 89 -0.7 U2
## 2 6.7 86 -0.2 U2
## 3 8.2 105 -0.7 U2
## 4 8.6 71 0.0 U2
## 5 7.4 88 0.0 U2
## 6 12.1 88 0.4 U3
df %>% group_by(UreaCat) %>% summarise(AVEAGE = mean(Creatinine))
## # A tibble: 3 x 2
## UreaCat AVEAGE
## <fct> <dbl>
## 1 U1 34.4
## 2 U2 63.6
## 3 U3 84.2
urea_cat_plot <- plot_ly(data = df,
y = ~Creatinine,
color = ~UreaCat,
type = "box",
boxmean = "sd",
marker = list(symbol = "diamond"),
line = list(width = 1),
boxpoints = "all") %>%
layout(title = " Data")
urea_cat_plot
Estimate below and \(Ui\) is the dummy variable valuecat_urea_creat <- lm(Creatinine ~ UreaCat,
data = df)
summary(cat_urea_creat)
##
## Call:
## lm(formula = Creatinine ~ UreaCat, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.2381 -8.3947 0.3934 7.7619 29.3934
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.395 1.823 18.86 <2e-16 ***
## UreaCatU2 29.212 1.917 15.24 <2e-16 ***
## UreaCatU3 49.843 2.128 23.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.24 on 501 degrees of freedom
## Multiple R-squared: 0.542, Adjusted R-squared: 0.5402
## F-statistic: 296.5 on 2 and 501 DF, p-value: < 2.2e-16
anova(cat_urea_creat)
## Analysis of Variance Table
##
## Response: Creatinine
## Df Sum Sq Mean Sq F value Pr(>F)
## UreaCat 2 74907 37454 296.5 < 2.2e-16 ***
## Residuals 501 63286 126
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(1, 2))
plot(cat_urea_creat, which = c(1, 2))
hist(rstandard(cat_urea_creat),
main = "Histogram of the standardized residuals",
xlab = "Standadized residuals",
ylab = "Frequency",
las = 1,
col = "gray")
TukeyHSD(aov(cat_urea_creat))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = cat_urea_creat)
##
## $UreaCat
## diff lwr upr p adj
## U2-U1 29.21191 24.70609 33.71773 0
## U3-U1 49.84336 44.84170 54.84502 0
## U3-U2 20.63145 17.70206 23.56084 0
smoking_grade_creat <- lm(Creatinine ~ Smoking + Grade,
data = df)
anova(smoking_grade_creat)
## Analysis of Variance Table
##
## Response: Creatinine
## Df Sum Sq Mean Sq F value Pr(>F)
## Smoking 1 3 3.338 0.0121 0.9125
## Grade 2 74 37.244 0.1348 0.8739
## Residuals 500 138116 276.231
Weight and Gradeweight_grade_model <- lm(Creatinine ~ Weight + Grade,
data = df)
weight_grade_model$coefficients
## (Intercept) Weight GradeII GradeIII
## 55.5825643 0.1055191 0.3675932 -0.6069257
weight_grade_creat_plot <- plot_ly(data = df,
x = ~Weight,
y = ~Creatinine,
color = ~Grade,
type = "scatter",
mode = "markers",
marker = list(size = 12)) %>%
layout(title = "Predicting creatinine by weight and grade",
xaxis = list(title = "Weight",
zeroline = F),
yaxis = list(title = "Creatinine",
zeroline = F))
weight_grade_creat_plot
anova(weight_grade_model)
## Analysis of Variance Table
##
## Response: Creatinine
## Df Sum Sq Mean Sq F value Pr(>F)
## Weight 1 1163 1162.55 4.2441 0.0399 *
## Grade 2 71 35.55 0.1298 0.8783
## Residuals 500 136960 273.92
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::Anova(weight_grade_model,
type = "III")
## Anova Table (Type III tests)
##
## Response: Creatinine
## Sum Sq Df F value Pr(>F)
## (Intercept) 32613 1 119.0589 < 2e-16 ***
## Weight 1156 1 4.2212 0.04044 *
## Grade 71 2 0.1298 0.87830
## Residuals 136960 500
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
weight_grade_interaction_model <- lm(Creatinine ~ Weight * Grade,
data = df)
anova(weight_grade_interaction_model)
## Analysis of Variance Table
##
## Response: Creatinine
## Df Sum Sq Mean Sq F value Pr(>F)
## Weight 1 1163 1162.55 4.2757 0.03918 *
## Grade 2 71 35.55 0.1308 0.87746
## Weight:Grade 2 1554 777.13 2.8582 0.05832 .
## Residuals 498 135405 271.90
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
weight_grade_interaction_model$coefficients
## (Intercept) Weight GradeII GradeIII Weight:GradeII
## 40.5737205 0.2618322 22.4615001 26.1720236 -0.2304398
## Weight:GradeIII
## -0.2799880
variable <- c(0.958, 0.971, 0.927, 0.971, 0.986, 1.051, 0.891, 1.010, 0.925, 0.952, 0.829, 0.955)
experiment <- factor(rep(1:4, 3))
grade <- rep(c("Mild", "Moderate", "Severe"), each = 4)
dfr_block <- data.frame(Variable = variable,
Experiment = experiment,
Grade = grade)
head(dfr_block)
## Variable Experiment Grade
## 1 0.958 1 Mild
## 2 0.971 2 Mild
## 3 0.927 3 Mild
## 4 0.971 4 Mild
## 5 0.986 1 Moderate
## 6 1.051 2 Moderate
block_project_plot <- plot_ly(data = dfr_block,
x = ~Grade,
y = ~Variable,
type = "box",
boxmean = "sd",
marker = list(symbol = "diamond"),
line = list(width = 1),
boxpoints = "all") %>%
layout(title = " Data")
block_project_plot
grade_variable_model <- lm(Variable ~ Grade,
data = dfr_block)
anova(grade_variable_model)
## Analysis of Variance Table
##
## Response: Variable
## Df Sum Sq Mean Sq F value Pr(>F)
## Grade 2 0.0097172 0.0048586 1.7098 0.2348
## Residuals 9 0.0255745 0.0028416
grade_experiment_variable_model <- lm(Variable ~ Grade + Experiment,
data = dfr_block)
anova(grade_experiment_variable_model)
## Analysis of Variance Table
##
## Response: Variable
## Df Sum Sq Mean Sq F value Pr(>F)
## Grade 2 0.0097172 0.0048586 6.9682 0.027259 *
## Experiment 3 0.0213910 0.0071303 10.2264 0.008967 **
## Residuals 6 0.0041835 0.0006972
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Grade is significant (we can stick to Type I here since there is no colinearlity)par(mfrow = c(1, 2))
plot(grade_experiment_variable_model, which = c(1, 2))
TukeyHSD(aov(model, "variable")) test
TukeyHSD(aov(grade_experiment_variable_model), "Grade")
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = grade_experiment_variable_model)
##
## $Grade
## diff lwr upr p adj
## Moderate-Mild 0.02775 -0.02953929 0.08503929 0.3611755
## Severe-Mild -0.04150 -0.09878929 0.01578929 0.1454294
## Severe-Moderate -0.06925 -0.12653929 -0.01196071 0.0232792
variable <- c(709, 679, 699, 657, 594, 677, 592, 538, 476, 508, 505, 539)
grade <- rep(c("I", "I", "I", "II", "II", "II"), 2)
dose <- rep(c("Low", "High"), each = 6)
dfr_fact <- data.frame(Variable = variable,
Grade = grade,
Dose = dose)
dfr_fact
## Variable Grade Dose
## 1 709 I Low
## 2 679 I Low
## 3 699 I Low
## 4 657 II Low
## 5 594 II Low
## 6 677 II Low
## 7 592 I High
## 8 538 I High
## 9 476 I High
## 10 508 II High
## 11 505 II High
## 12 539 II High
interaction.plot(dfr_fact$Grade,
dfr_fact$Dose,
dfr_fact$Variable,
xlab = "Grade",
main = "Interaction plot",
ylab = "Mean of variable",
las = 1)
full_factorial_model <- lm(Variable ~ Grade * Dose,
data = dfr_fact)
anova(full_factorial_model)
## Analysis of Variance Table
##
## Response: Variable
## Df Sum Sq Mean Sq F value Pr(>F)
## Grade 1 3781 3781 2.5925 0.1460358
## Dose 1 61204 61204 41.9685 0.0001925 ***
## Grade:Dose 1 919 919 0.6300 0.4502546
## Residuals 8 11667 1458
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(1, 2))
plot(full_factorial_model, which = c(1, 2))
car library and Anova(model, type = "III) functionoptions(contrasts = c(unordered = "contr.sum", ordered = "contr.poly))poly_model <- lm(Creatinine ~ poly(Urea, degree = 2, raw = T),
data = df) # For orthogonal polynomial use raw = FALSE
summary(poly_model)
##
## Call:
## lm(formula = Creatinine ~ poly(Urea, degree = 2, raw = T), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.2487 -4.6587 0.0023 4.5604 21.7639
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12.6588 2.8769 -4.40 1.32e-05 ***
## poly(Urea, degree = 2, raw = T)1 12.9855 0.6684 19.43 < 2e-16 ***
## poly(Urea, degree = 2, raw = T)2 -0.3945 0.0375 -10.52 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.113 on 501 degrees of freedom
## Multiple R-squared: 0.8166, Adjusted R-squared: 0.8159
## F-statistic: 1115 on 2 and 501 DF, p-value: < 2.2e-16
poly_model_alt <- lm(Creatinine ~ Urea + I(Urea^2),
data = df)
summary(poly_model_alt)
##
## Call:
## lm(formula = Creatinine ~ Urea + I(Urea^2), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.2487 -4.6587 0.0023 4.5604 21.7639
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12.6588 2.8769 -4.40 1.32e-05 ***
## Urea 12.9855 0.6684 19.43 < 2e-16 ***
## I(Urea^2) -0.3945 0.0375 -10.52 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.113 on 501 degrees of freedom
## Multiple R-squared: 0.8166, Adjusted R-squared: 0.8159
## F-statistic: 1115 on 2 and 501 DF, p-value: < 2.2e-16
\[F_{\text{stat}}=\frac{\frac{\text{SSE}_{\text{partial}} - \text{SSE}_{\text{full}}}{\Delta_{\text{params}}}}{\text{MSE}_{\text{full}}}\tag{1}\]
Using ANOVA
Full model
full_model <- lm(Creatinine ~ Urea + I(Urea^2),
data = df)
summary(full_model)
##
## Call:
## lm(formula = Creatinine ~ Urea + I(Urea^2), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.2487 -4.6587 0.0023 4.5604 21.7639
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12.6588 2.8769 -4.40 1.32e-05 ***
## Urea 12.9855 0.6684 19.43 < 2e-16 ***
## I(Urea^2) -0.3945 0.0375 -10.52 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.113 on 501 degrees of freedom
## Multiple R-squared: 0.8166, Adjusted R-squared: 0.8159
## F-statistic: 1115 on 2 and 501 DF, p-value: < 2.2e-16
partial_model <- lm(Creatinine ~ Urea,
data = df)
summary(partial_model)
##
## Call:
## lm(formula = Creatinine ~ Urea, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.9441 -4.8015 -0.2036 4.8079 23.1357
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.1182 1.2622 11.98 <2e-16 ***
## Urea 6.0919 0.1461 41.71 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.852 on 502 degrees of freedom
## Multiple R-squared: 0.7761, Adjusted R-squared: 0.7756
## F-statistic: 1740 on 1 and 502 DF, p-value: < 2.2e-16
anova(partial_model, full_model)
## Analysis of Variance Table
##
## Model 1: Creatinine ~ Urea
## Model 2: Creatinine ~ Urea + I(Urea^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 502 30947
## 2 501 25346 1 5600.7 110.71 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1