library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.4.4 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ car::recode() masks dplyr::recode()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggpubr)
library(rstatix)
##
## Attaching package: 'rstatix'
##
## The following object is masked from 'package:stats':
##
## filter
library(readxl)
Data <- read_excel("D:/SNSHS STATS/Cielo/CIELO.xlsx")
View(Data)
library(rmarkdown)
paged_table(Data)
shapiro.test(Data$`Weight`)
##
## Shapiro-Wilk normality test
##
## data: Data$Weight
## W = 0.9511, p-value = 0.05592
Since the p-value = 0.05592 > 0.05, we fail to reject the null hypothesis; that is, the Weight variable can be assumed to be approximately normally distributed.
x <- c(157.3, 153.3, 152.6, 171.6, 175.52, 172.48, 198.32, 194.4, 193.18, 206.61, 208.96, 209.93, 225.56, 223.06, 222.54)
y <- c(153.4, 155.1, 151.1, 168.27, 173.08, 170.15, 194.03, 192.1, 191.28, 201.56, 204.53, 203.87, 218.95, 215.64, 217.61)
z <- c(151.6, 152.2, 154.1, 163.07, 168.12, 167.05, 189.9, 185.86, 186.74, 185.23, 184.75, 182.62, 203.6, 204.64, 207.45)
shapiro.test(x)
##
## Shapiro-Wilk normality test
##
## data: x
## W = 0.91996, p-value = 0.1924
shapiro.test(y)
##
## Shapiro-Wilk normality test
##
## data: y
## W = 0.9161, p-value = 0.1679
shapiro.test(z)
##
## Shapiro-Wilk normality test
##
## data: z
## W = 0.92252, p-value = 0.2104
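The vectors x, y, and z appear to be the Weight values of the three treatment groups; since all three p-values exceed 0.05, normality can be assumed within each group. The same per-group check can be run directly on the data frame without retyping the values; a minimal sketch using the already-loaded rstatix and dplyr packages (column names as in Data):
# Per-group Shapiro-Wilk test on Weight, one row of results per Treatment.
Data %>%
  group_by(Treatment) %>%
  rstatix::shapiro_test(Weight)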
leveneTest(`Weight` ~ Treatment,
data = Data)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 2.0261 0.1445
## 42
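Since Pr(>F) = 0.1445 > 0.05, homogeneity of variance across the treatments can be assumed. The warning about coercion can be avoided by declaring Treatment as a factor before calling the test; a small sketch:
# Declare Treatment as a factor so leveneTest() does not need to coerce it.
Data <- Data %>% mutate(Treatment = as.factor(Treatment))
leveneTest(Weight ~ Treatment, data = Data)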
group_by(Data, Treatment) %>%
summarise(
count = n(),
mean = mean(Weight, na.rm = TRUE),
sd = sd(Weight, na.rm = TRUE)
)
## # A tibble: 3 × 4
## Treatment count mean sd
## <chr> <int> <dbl> <dbl>
## 1 Treatment 1 15 203. 32.5
## 2 Treatment 2 15 187. 23.8
## 3 Treatment 3 15 179. 18.9
library("ggpubr")
ggboxplot(Data, x = "Treatment", y = "Weight",
color = "Treatment", palette = c("#00AFBB", "#E7B800", "#FC4E07"),
order = c("Treatment 1", "Treatment 2", "Treatment 3"),
ylab = "Weight", xlab = "Treatment")
ggline(Data, x = "Treatment", y = "Weight",
add = c("mean_se", "jitter"),
order = c("Treatment 1", "Treatment 2", "Treatment 3"),
ylab = "Weight", xlab = "Treatment")
library("gplots")
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
plotmeans(Weight ~ Treatment, data = Data,
xlab = "Treatment", ylab = "Weight",
main = "Mean Plot with 95% CI")
res.aov <- aov(Weight ~ Treatment, data = Data)
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## Treatment 2 4579 2289.5 3.468 0.0404 *
## Residuals 42 27725 660.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since the p-value = 0.0404 < 0.05, we reject the null hypothesis; that is, there is a significant difference in mean Weight between at least two of the treatments.
TukeyHSD(res.aov)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Weight ~ Treatment, data = Data)
##
## $Treatment
## diff lwr upr p adj
## Treatment 2-Treatment 1 -16.046000 -38.83852 6.746522 0.2132634
## Treatment 3-Treatment 1 -24.295333 -47.08786 -1.502811 0.0344202
## Treatment 3-Treatment 2 -8.249333 -31.04186 14.543189 0.6561816
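Only the Treatment 3 vs Treatment 1 comparison is significant (adjusted p = 0.034); the other two pairwise differences are not. To filter the comparisons programmatically, the Tukey results can be converted to a data frame; a sketch assuming the broom package is installed:
# Tidy the Tukey HSD output and keep only the significant comparisons.
library(broom)
tidy(TukeyHSD(res.aov)) %>%
  filter(adj.p.value < 0.05)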
library(correlation)
##
## Attaching package: 'correlation'
## The following object is masked from 'package:rstatix':
##
## cor_test
library(corrplot)
## corrplot 0.92 loaded
library(tidyverse)
library(psych)
## Warning: package 'psych' was built under R version 4.3.3
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
## The following object is masked from 'package:car':
##
## logit
shapiro.test(Data$`Salinity`)
##
## Shapiro-Wilk normality test
##
## data: Data$Salinity
## W = 0.75778, p-value = 3.16e-07
shapiro.test(Data$`pH level`)
##
## Shapiro-Wilk normality test
##
## data: Data$`pH level`
## W = 0.6223, p-value = 1.584e-09
shapiro.test(Data$`Temperature`)
##
## Shapiro-Wilk normality test
##
## data: Data$Temperature
## W = 0.6223, p-value = 1.584e-09
shapiro.test(Data$`Weight`)
##
## Shapiro-Wilk normality test
##
## data: Data$Weight
## W = 0.9511, p-value = 0.05592
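Salinity, pH level, and Temperature all deviate significantly from normality (p < 0.05), while Weight is borderline; this is why the rank-based Spearman correlation is used below rather than Pearson. The four tests can also be run in one call; a compact sketch with the already-loaded rstatix package:
# All four Shapiro-Wilk tests in a single call.
Data %>%
  rstatix::shapiro_test(Salinity, `pH level`, Temperature, Weight)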
cor(Data$Salinity, Data$Weight,
method = "spearman"
)
## [1] -0.6566769
cor(Data$`pH level`, Data$Weight,
method = "spearman"
)
## [1] 0.6217091
cor(Data$`Temperature`, Data$Weight,
method = "spearman"
)
## [1] 0.3492748
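cor() returns only the coefficient. To also obtain a p-value for each Spearman correlation, cor.test() can be used; a sketch for Salinity vs Weight (ties in the ranks may trigger a warning about the exact p-value):
# Spearman correlation with a significance test (coefficient plus p-value).
cor.test(Data$Salinity, Data$Weight, method = "spearman")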
corrplot(corr = cor(Data[3:6], method = "spearman"),
addCoef.col = "black",
number.cex = 0.8,
number.digits = 2,
diag = FALSE,
bg = "grey",
outline = "black",
addgrid.col = "white",
mar = c(1,1,1,1))
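Selecting the numeric variables by position (Data[3:6]) is fragile if the column order ever changes. A name-based alternative, assuming columns 3 to 6 are Salinity, pH level, Temperature, and Weight:
# Select the numeric variables by name instead of by position.
num_vars <- dplyr::select(Data, Salinity, `pH level`, Temperature, Weight)
corrplot(cor(num_vars, method = "spearman"), addCoef.col = "black")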
pairs.panels(Data[3:6],
smooth = FALSE,
scale = FALSE,
density = TRUE,
ellipses = TRUE,
method = "spearman",
pch = 21,
bg = c("red", "yellow", "blue"),
lm = FALSE,
cor = TRUE,
jiggle = FALSE,
factor = 0,
hist.col = 4,
stars = TRUE,
ci = FALSE,
main = "Correlation Matrix of the Variables")
Correlation coefficients whose magnitudes are between 0.9 and 1.0 indicate variables that can be considered very highly correlated. Magnitudes between 0.7 and 0.9 indicate highly correlated variables, magnitudes between 0.5 and 0.7 indicate moderately correlated variables, and magnitudes between 0.3 and 0.5 indicate a low correlation; coefficients whose magnitudes are less than 0.3 indicate little if any (linear) correlation. By this guideline, Salinity (rho = -0.66) and pH level (rho = 0.62) are moderately correlated with Weight, while Temperature (rho = 0.35) shows only a low correlation.
multiple <- lm( `Weight`~ `Salinity` + `Temperature` + `pH level`, data = Data)
multiple
##
## Call:
## lm(formula = Weight ~ Salinity + Temperature + `pH level`, data = Data)
##
## Coefficients:
## (Intercept) Salinity Temperature `pH level`
## -6923.389 -26.063 3.875 1043.598
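The fitted model implied by these coefficients is Weight ≈ -6923.389 - 26.063 * Salinity + 3.875 * Temperature + 1043.598 * pH level. As an illustration, predictions can be obtained with predict(); the water-quality values below are hypothetical, not taken from the data:
# Illustrative prediction; Salinity, Temperature and pH values are made up.
predict(multiple, newdata = tibble(Salinity = 32, Temperature = 28, `pH level` = 7.9))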
library(performance)
check_model(multiple)
Linearity (top-left plot) is not perfect; a variable could be removed or added, or a transformation applied, to improve linearity. Homogeneity of variance (top-right plot) is respected. Multicollinearity (middle-left plot) is not an issue (using the common threshold of 10 for the VIF, all values are below 10). There are no influential points (middle-right plot). Normality of the residuals (two bottom plots) is not perfect either, with three points deviating from the reference line, but it still seems acceptable. In any case, the number of observations is large enough given the number of parameters, and the deviation from normality is small, so tests on the coefficients are approximately valid whether or not the errors follow a normal distribution.
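The VIF values behind the collinearity panel can also be printed directly; a one-line check using the already-loaded car package (performance::check_collinearity(multiple) gives the same information):
# Variance inflation factors for the three predictors; values below 10
# support the multicollinearity statement above.
vif(multiple)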
summary(multiple)
##
## Call:
## lm(formula = Weight ~ Salinity + Temperature + `pH level`, data = Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.73 -12.11 -5.14 12.26 43.61
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6923.389 3107.287 -2.228 0.03142 *
## Salinity -26.063 8.041 -3.241 0.00237 **
## Temperature 3.875 4.531 0.855 0.39744
## `pH level` 1043.598 472.328 2.209 0.03278 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.89 on 41 degrees of freedom
## Multiple R-squared: 0.4977, Adjusted R-squared: 0.461
## F-statistic: 13.54 on 3 and 41 DF, p-value: 2.78e-06
The summary output displays the estimated (unstandardized) regression coefficients. Salinity (p = 0.00237) and pH level (p = 0.03278) are significant predictors of Weight, while Temperature (p = 0.39744) is not. Because the predictors are measured on different scales, the raw coefficient magnitudes are not directly comparable, so judging which independent variable is most important requires standardized coefficients, as sketched below.
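A minimal sketch of standardized coefficients, refitting the model on z-scored variables (column names as above):
# Refit the regression on standardized (z-scored) variables so the
# coefficient magnitudes are directly comparable across predictors.
Data_std <- Data %>%
  mutate(across(c(Weight, Salinity, Temperature, `pH level`),
                ~ as.numeric(scale(.x))))
multiple_std <- lm(Weight ~ Salinity + Temperature + `pH level`, data = Data_std)
summary(multiple_std)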