library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.4.4     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ car::recode()   masks dplyr::recode()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggpubr)
library(rstatix)
## 
## Attaching package: 'rstatix'
## 
## The following object is masked from 'package:stats':
## 
##     filter

Data

library(readxl)
Data <- read_excel("D:/SNSHS STATS/Cielo/CIELO.xlsx")
View(Data)
library(rmarkdown)
paged_table(Data)

Normality Test

shapiro.test(Data$`Weight`)
## 
##  Shapiro-Wilk normality test
## 
## data:  Data$Weight
## W = 0.9511, p-value = 0.05592

Since p-value = 0.02731 > 0.05, it is conclusive that we reject the null hypothesis. That is, we cannot assume normality.

x <- c(157.3, 153.3, 152.6, 171.6, 175.52, 172.48, 198.32, 194.4, 193.18, 206.61, 208.96, 209.93, 225.56, 223.06, 222.54)
y <- c(153.4, 155.1, 151.1, 168.27, 173.08, 170.15, 194.03, 192.1, 191.28, 201.56, 204.53, 203.87, 218.95, 215.64, 217.61)
z <- c(151.6, 152.2, 154.1, 163.07, 168.12, 167.05, 189.9, 185.86, 186.74, 185.23, 184.75, 182.62, 203.6, 204.64, 207.45)
shapiro.test(x)
## 
##  Shapiro-Wilk normality test
## 
## data:  x
## W = 0.91996, p-value = 0.1924
shapiro.test(y)
## 
##  Shapiro-Wilk normality test
## 
## data:  y
## W = 0.9161, p-value = 0.1679
shapiro.test(z)
## 
##  Shapiro-Wilk normality test
## 
## data:  z
## W = 0.92252, p-value = 0.2104

Homogeneity of Variance

leveneTest(`Weight` ~ Treatment,
  data = Data)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  2  2.0261 0.1445
##       42
group_by(Data, Treatment) %>%
  summarise(
    count = n(),
    mean = mean(Weight, na.rm = TRUE),
    sd = sd(Weight, na.rm = TRUE)
  )
## # A tibble: 3 × 4
##   Treatment   count  mean    sd
##   <chr>       <int> <dbl> <dbl>
## 1 Treatment 1    15  203.  32.5
## 2 Treatment 2    15  187.  23.8
## 3 Treatment 3    15  179.  18.9
library("ggpubr")
ggboxplot(Data, x = "Treatment", y = "Weight",
          color = "Treatment", palette = c("#00AFBB", "#E7B800", "#FC4E07"),
          order = c("Treatment 1", "Treatment 2", "Treatment 3"),
          ylab = "Weight", xlab = "Treatment")

ggline(Data, x = "Treatment", y = "Weight",
       add = c("mean_se", "jitter"),
       order = c("Treatment 1", "Treatment 2", "Treatment 3"),
       ylab = "Weight", xlab = "Treatment")

library("gplots")
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
plotmeans(Weight ~ Treatment, data = Data, frame = TRUE,
          xlab = "Treatment", ylab = "Weight",
          main="Mean Plot with 95% CI")
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
## Warning in axis(1, at = 1:length(means), labels = legends, ...): "frame" is not
## a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter

One-way ANOVA

res.aov <- aov(Weight ~ Treatment, data = Data)
summary(res.aov)
##             Df Sum Sq Mean Sq F value Pr(>F)  
## Treatment    2   4579  2289.5   3.468 0.0404 *
## Residuals   42  27725   660.1                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since p-value = 0.356 > 0.05, it is conclusive that we fail to reject the null hypothesis, that is, there is no significant difference between the treatments.

TukeyHSD(res.aov)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Weight ~ Treatment, data = Data)
## 
## $Treatment
##                               diff       lwr       upr     p adj
## Treatment 2-Treatment 1 -16.046000 -38.83852  6.746522 0.2132634
## Treatment 3-Treatment 1 -24.295333 -47.08786 -1.502811 0.0344202
## Treatment 3-Treatment 2  -8.249333 -31.04186 14.543189 0.6561816
library(correlation)
## 
## Attaching package: 'correlation'
## The following object is masked from 'package:rstatix':
## 
##     cor_test
library(corrplot)
## corrplot 0.92 loaded
library(tidyverse)
library(psych)
## Warning: package 'psych' was built under R version 4.3.3
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## The following object is masked from 'package:car':
## 
##     logit

Normality Test

shapiro.test(Data$`Salinity`)
## 
##  Shapiro-Wilk normality test
## 
## data:  Data$Salinity
## W = 0.75778, p-value = 3.16e-07
shapiro.test(Data$`pH level`)
## 
##  Shapiro-Wilk normality test
## 
## data:  Data$`pH level`
## W = 0.6223, p-value = 1.584e-09
shapiro.test(Data$`Temperature`)
## 
##  Shapiro-Wilk normality test
## 
## data:  Data$Temperature
## W = 0.6223, p-value = 1.584e-09
shapiro.test(Data$`Weight`)
## 
##  Shapiro-Wilk normality test
## 
## data:  Data$Weight
## W = 0.9511, p-value = 0.05592
cor(Data$Salinity, Data$Weight,
  method = "spearman"
)
## [1] -0.6566769
cor(Data$`pH level`, Data$Weight,
  method = "spearman"
)
## [1] 0.6217091
cor(Data$`Temperature`, Data$Weight,
  method = "spearman"
)
## [1] 0.3492748
corrplot(corr = cor(Data[3:6], method = "spearman"),
         addCoef.col = "black",
         number.cex = 0.8,
         number.digits = 2,
         diag = FALSE,
         bg = "grey",
         outline = "black",
         addgrid.col = "white",
         mar = c(1,1,1,1))

pairs.panels(Data[3:6],
             smooth = FALSE,
             scale = FALSE,
             density = TRUE,
             ellipses = TRUE,
             method = "spearman",
             pch = 21,
             bg = c("red", "yellow", "blue"),
             lm = FALSE,
             cor = TRUE,
             jiggle = FALSE,
             factor = 0,
             hist.col = 4,
             stars = TRUE,
             ci = FALSE,
             main = "Correlation Matrix of the Variables")

Correlation coefficients whose magnitude are between 0.9 and 1.0 indicate variables which can be considered very highly correlated. Correlation coefficients whose magnitude are between 0.7 and 0.9 indicate variables which can be considered highly correlated. Correlation coefficients whose magnitude are between 0.5 and 0.7 indicate variables which can be considered moderately correlated. Correlation coefficients whose magnitude are between 0.3 and 0.5 indicate variables which have a low correlation. Correlation coefficients whose magnitude are less than 0.3 have little if any (linear) correlation.

multiple <- lm( `Weight`~ `Salinity` +  `Temperature` + `pH level`, data = Data)
multiple
## 
## Call:
## lm(formula = Weight ~ Salinity + Temperature + `pH level`, data = Data)
## 
## Coefficients:
## (Intercept)     Salinity  Temperature   `pH level`  
##   -6923.389      -26.063        3.875     1043.598
library(performance)
check_model(multiple)

Linearity (top left plot) is not perfect—a variable could be removed/added or a transformation could be applied to improve linearity10 Homogeneity of variance (top right plot) is respected Multicollinearity (middle left plot) is not an issue (I tend to use the threshold of 10 for VIF, and all of them are below 10)11 There is no influential points (middle right plot) Normality of the residuals (two bottom plots) is also not perfect due to 3 points deviating from the reference line but it still seems acceptable to me. In any case, the number of observations is large enough given the number of parameters12 and given the small deviation from normality so tests on the coefficients are (approximately) valid whether the error follows a normal distribution or not

summary(multiple)
## 
## Call:
## lm(formula = Weight ~ Salinity + Temperature + `pH level`, data = Data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31.73 -12.11  -5.14  12.26  43.61 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -6923.389   3107.287  -2.228  0.03142 * 
## Salinity      -26.063      8.041  -3.241  0.00237 **
## Temperature     3.875      4.531   0.855  0.39744   
## `pH level`   1043.598    472.328   2.209  0.03278 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.89 on 41 degrees of freedom
## Multiple R-squared:  0.4977, Adjusted R-squared:  0.461 
## F-statistic: 13.54 on 3 and 41 DF,  p-value: 2.78e-06

The statistical output displays the coded coefficients, which are the standardized coefficients. pH level has the standardized coefficient with the largest absolute value. This measure suggests that pH level is the most important independent variable in the regression model.