library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.4.4     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ car::recode()   masks dplyr::recode()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggpubr)
library(rstatix)
## 
## Attaching package: 'rstatix'
## 
## The following object is masked from 'package:stats':
## 
##     filter

Data

library(readxl)
Data <- read_excel("D:/SNSHS STATS/Gelia/GELIA.xlsx")
View(Data)
library(rmarkdown)
paged_table(Data)

Normality Test

res_aov <- aov(`Shannon's Index` ~ Location,
  data = Data
)
shapiro.test(res_aov$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  res_aov$residuals
## W = 0.70657, p-value = 0.001693

Data is not normally distributed.

res_aov <- aov(`Shannon's Index` ~ `Substrate Type`,
  data = Data
)
shapiro.test(res_aov$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  res_aov$residuals
## W = 0.87266, p-value = 0.1313

Data is normally distributed.

res_aov <- aov(`Population` ~ Location,
  data = Data
)
shapiro.test(res_aov$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  res_aov$residuals
## W = 0.92565, p-value = 0.441

Data is normally distributed.

res_aov <- aov(`Population` ~ `Substrate Type`,
  data = Data
)
shapiro.test(res_aov$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  res_aov$residuals
## W = 0.95023, p-value = 0.6926

Data is not normally distributed.

Homogeneity of Variance

leveneTest(`Shannon's Index` ~ Location,
  data = Data)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value  Pr(>F)  
## group  2  3.9631 0.07997 .
##        6                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The p-value is greater than the 0.05 level of significance. Thus, the homogeneity assumption of the variance is met.

leveneTest(`Shannon's Index` ~ `Substrate Type`,
  data = Data)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  2  0.5872 0.5849
##        6

The p-value is greater than the 0.05 level of significance. Thus, the homogeneity assumption of the variance is met.

leveneTest(`Population` ~ Location,
  data = Data)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  2  0.1624 0.8537
##        6

The p-value is greater than the 0.05 level of significance. Thus, the homogeneity assumption of the variance is met.

leveneTest(`Population` ~ `Substrate Type`,
  data = Data)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  2  0.2591   0.78
##        6

The p-value is greater than the 0.05 level of significance. Thus, the homogeneity assumption of the variance is met.

Shannon’s Index (Location)

ggboxplot(Data, x = "Location", y = "Shannon's Index",
          color = "Location", palette = c("#FC4E07", "#00FF00", "#4E07FC"),
          order = c("Catabaan", "RSK", "Boulevard"),
          ylab = "Shannon's Index", xlab = "Location")

ggline(Data, x = "Location", y = "Shannon's Index",
       add = c("mean_se", "jitter"),
       order = c("Catabaan", "RSK", "Boulevard"),
       ylab = "Shannon's Index", xlab = "Location")

library("gplots")
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
plotmeans(`Shannon's Index` ~ Location, data = Data, frame = TRUE,
          xlab = "Location", ylab = "Shannon's Index",
          main="Mean Plot with 95% CI")
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
## Warning in axis(1, at = 1:length(means), labels = legends, ...): "frame" is not
## a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter

PlantGrowth <- Data %>%
  reorder_levels(Location, order = c("Catabaan", "RSK", "Boulevard"))
PlantGrowth %>%  
group_by(Location) %>%
  get_summary_stats(`Shannon's Index`, type = "common")
## # A tibble: 3 × 11
##   Location  variable          n   min   max median   iqr  mean    sd    se    ci
##   <fct>     <fct>         <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Catabaan  Shannon's In…     3     0 0      0     0     0     0     0     0    
## 2 RSK       Shannon's In…     3     0 0      0     0     0     0     0     0    
## 3 Boulevard Shannon's In…     3     0 0.377  0.199 0.189 0.192 0.189 0.109 0.469
ggboxplot(PlantGrowth, x = "Location", y = "Shannon's Index", fill="Location")

Kruskal-wallis Test

res.kruskal <- PlantGrowth %>% kruskal_test(`Shannon's Index` ~ Location)
res.kruskal
## # A tibble: 1 × 6
##   .y.                 n statistic    df     p method        
## * <chr>           <int>     <dbl> <int> <dbl> <chr>         
## 1 Shannon's Index     9      4.50     2 0.105 Kruskal-Wallis

Based on the p-value, there is no significant difference was observed between the group pairs in terms of length.

Pairwise Comparisons

res1<- PlantGrowth %>%
  dunn_test(`Shannon's Index` ~ Location, p.adjust.method = "bonferroni")
res1
## # A tibble: 3 × 9
##   .y.             group1  group2    n1    n2 statistic      p p.adj p.adj.signif
## * <chr>           <chr>   <chr>  <int> <int>     <dbl>  <dbl> <dbl> <chr>       
## 1 Shannon's Index Cataba… RSK        3     3      0    1      1     ns          
## 2 Shannon's Index Cataba… Boule…     3     3      1.84 0.0662 0.199 ns          
## 3 Shannon's Index RSK     Boule…     3     3      1.84 0.0662 0.199 ns

Shannon’s Index (Substrate Type)

group_by(Data, `Substrate Type`) %>%
  summarise(
    count = n(),
    mean = mean(`Shannon's Index`, na.rm = TRUE),
    sd = sd(`Shannon's Index`, na.rm = TRUE)
  )
## # A tibble: 3 × 4
##   `Substrate Type`        count   mean    sd
##   <chr>                   <int>  <dbl> <dbl>
## 1 Intertidal Rocky Shores     3 0.126  0.218
## 2 Sandy Beaches               3 0      0    
## 3 Sea Grass Beds              3 0.0663 0.115
library("ggpubr")
ggboxplot(Data, x = "Substrate Type", y = "Shannon's Index",
          color = "Substrate Type", palette = c("#00AFBB", "#E7B800", "#FC4E07"),
          order = c("Intertidal Rocky Shores", "Sandy Beaches", "Sea Grass Beds"),
          ylab = "Shannon's Index", xlab = "Substrate Type")

ggline(Data, x = "Substrate Type", y = "Shannon's Index",
       add = c("mean_se", "jitter"),
       order = c("Intertidal Rocky Shores", "Sandy Beaches", "Sea Grass Beds"),
       ylab = "Shannon's Index", xlab = "Substrate Type")

library("gplots")
plotmeans(`Shannon's Index` ~ `Substrate Type`, data = Data, frame = TRUE,
          xlab = "Substrate Type", ylab = "Shannon's Index",
          main="Mean Plot with 95% CI")
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
## Warning in axis(1, at = 1:length(means), labels = legends, ...): "frame" is not
## a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter

One-way ANOVA

res.aov <- aov(`Shannon's Index` ~ `Substrate Type`, data = Data)
summary(res.aov)
##                  Df  Sum Sq Mean Sq F value Pr(>F)
## `Substrate Type`  2 0.02371 0.01186   0.587  0.585
## Residuals         6 0.12115 0.02019

Since p-value = 0.585 < 0.05, it is conclusive that we fail to reject the null hypothesis, that is, there is no significant difference between the substrate type in terms of Shannon’s index

Population (Location)

group_by(Data, `Location`) %>%
  summarise(
    count = n(),
    mean = mean(`Population`, na.rm = TRUE),
    sd = sd(`Population`, na.rm = TRUE)
  )
## # A tibble: 3 × 4
##   Location  count  mean    sd
##   <chr>     <int> <dbl> <dbl>
## 1 Boulevard     3  36.3  4.04
## 2 Catabaan      3  24    6.56
## 3 RSK           3  21.7  5.69
library("ggpubr")
ggboxplot(Data, x = "Location", y = "Population",
          color = "Location", palette = c("#00AFBB", "#E7B800", "#FC4E07"),
          order = c("Catabaan", "RSK", "Boulevard"),
          ylab = "Population", xlab = "Location")

ggline(Data, x = "Location", y = "Population",
       add = c("mean_se", "jitter"),
       order = c("Catabaan", "RSK", "Boulevard"),
       ylab = "Population", xlab = "Location")

library("gplots")
plotmeans(`Population` ~ `Location`, data = Data, frame = TRUE,
          xlab = "Location", ylab = "Population",
          main="Mean Plot with 95% CI")
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
## Warning in axis(1, at = 1:length(means), labels = legends, ...): "frame" is not
## a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter

One-way ANOVA

res.aov <- aov(`Population` ~ `Location`, data = Data)
summary(res.aov)
##             Df Sum Sq Mean Sq F value Pr(>F)  
## Location     2  372.7  186.33   6.098 0.0359 *
## Residuals    6  183.3   30.56                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since p-value = 0.0359 < 0.05, it is conclusive that we reject the null hypothesis, that is, there is significant difference between the location in terms of population.

TukeyHSD(res.aov)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Population ~ Location, data = Data)
## 
## $Location
##                          diff       lwr        upr     p adj
## Catabaan-Boulevard -12.333333 -26.18154  1.5148774 0.0759949
## RSK-Boulevard      -14.666667 -28.51488 -0.8184559 0.0400674
## RSK-Catabaan        -2.333333 -16.18154 11.5148774 0.8661040

Population (Substrate Type)

group_by(Data, `Substrate Type`) %>%
  summarise(
    count = n(),
    mean = mean(`Population`, na.rm = TRUE),
    sd = sd(`Population`, na.rm = TRUE)
  )
## # A tibble: 3 × 4
##   `Substrate Type`        count  mean    sd
##   <chr>                   <int> <dbl> <dbl>
## 1 Intertidal Rocky Shores     3  27.7  4.51
## 2 Sandy Beaches               3  24   11.3 
## 3 Sea Grass Beds              3  30.3 10.0
library("ggpubr")
ggboxplot(Data, x = "Substrate Type", y = "Population",
          color = "Substrate Type", palette = c("#00AFBB", "#E7B800", "#FC4E07"),
          order = c("Intertidal Rocky Shores", "Sandy Beaches", "Sea Grass Beds"),
          ylab = "Population", xlab = "Substrate Type")

ggline(Data, x = "Substrate Type", y = "Population",
       add = c("mean_se", "jitter"),
       order = c("Intertidal Rocky Shores", "Sandy Beaches", "Sea Grass Beds"),
       ylab = "Population", xlab = "Substrate Type")

library("gplots")
plotmeans(`Population` ~ `Substrate Type`, data = Data, frame = TRUE,
          xlab = "Substrate Type", ylab = "Population",
          main="Mean Plot with 95% CI")
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter
## Warning in axis(1, at = 1:length(means), labels = legends, ...): "frame" is not
## a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "frame" is not a
## graphical parameter

One-way ANOVA

res.aov <- aov(`Population` ~ `Substrate Type`, data = Data)
summary(res.aov)
##                  Df Sum Sq Mean Sq F value Pr(>F)
## `Substrate Type`  2   60.7   30.33   0.367  0.707
## Residuals         6  495.3   82.56

Since p-value = 0.707 > 0.05, it is conclusive that we fail to reject the null hypothesis, that is, there is no significant difference between the substrate type in terms of population.

Correlation Analysis

library(correlation)
## 
## Attaching package: 'correlation'
## The following object is masked from 'package:rstatix':
## 
##     cor_test
library(corrplot)
## corrplot 0.92 loaded
library(tidyverse)
library(psych)
## Warning: package 'psych' was built under R version 4.3.3
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## The following object is masked from 'package:car':
## 
##     logit
corrplot(corr = cor(Data[3:7], method = "spearman"),
         addCoef.col = "black",
         number.cex = 0.8,
         number.digits = 2,
         diag = FALSE,
         bg = "grey",
         outline = "black",
         addgrid.col = "white",
         mar = c(1,1,1,1))

pairs.panels(Data[3:7],
             smooth = FALSE,
             scale = FALSE,
             density = TRUE,
             ellipses = TRUE,
             method = "spearman",
             pch = 21,
             bg = c("red", "yellow", "blue"),
             lm = FALSE,
             cor = TRUE,
             jiggle = FALSE,
             factor = 0,
             hist.col = 4,
             stars = TRUE,
             ci = FALSE,
             main = "Correlation Matrix of the Variables")

Correlation coefficients whose magnitude are between 0.9 and 1.0 indicate variables which can be considered very highly correlated. Correlation coefficients whose magnitude are between 0.7 and 0.9 indicate variables which can be considered highly correlated. Correlation coefficients whose magnitude are between 0.5 and 0.7 indicate variables which can be considered moderately correlated. Correlation coefficients whose magnitude are between 0.3 and 0.5 indicate variables which have a low correlation. Correlation coefficients whose magnitude are less than 0.3 have little if any (linear) correlation.

multiple <- lm( `Shannon's Index`~ `Salinity` +  `Water Temperature` + `pH Level`, data = Data)
multiple
## 
## Call:
## lm(formula = `Shannon's Index` ~ Salinity + `Water Temperature` + 
##     `pH Level`, data = Data)
## 
## Coefficients:
##         (Intercept)             Salinity  `Water Temperature`  
##            1.914349             0.005509             0.092164  
##          `pH Level`  
##           -0.591327
library(performance)
check_model(multiple)

multiple1 <- lm( `Population`~ `Salinity` +  `Water Temperature` + `pH Level`, data = Data)
multiple1
## 
## Call:
## lm(formula = Population ~ Salinity + `Water Temperature` + `pH Level`, 
##     data = Data)
## 
## Coefficients:
##         (Intercept)             Salinity  `Water Temperature`  
##             38.8721               0.1209               5.1989  
##          `pH Level`  
##            -20.9810
library(performance)
check_model(multiple1)

summary(multiple)
## 
## Call:
## lm(formula = `Shannon's Index` ~ Salinity + `Water Temperature` + 
##     `pH Level`, data = Data)
## 
## Residuals:
##         1         2         3         4         5         6         7         8 
## -0.023295 -0.012395  0.168716  0.017811  0.005489 -0.196604 -0.005548  0.017169 
##         9 
##  0.028657 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)
## (Intercept)          1.914349   7.497661   0.255    0.809
## Salinity             0.005509   0.010622   0.519    0.626
## `Water Temperature`  0.092164   0.093725   0.983    0.371
## `pH Level`          -0.591327   1.306208  -0.453    0.670
## 
## Residual standard error: 0.1177 on 5 degrees of freedom
## Multiple R-squared:  0.5216, Adjusted R-squared:  0.2345 
## F-statistic: 1.817 on 3 and 5 DF,  p-value: 0.2609
summary(multiple1)
## 
## Call:
## lm(formula = Population ~ Salinity + `Water Temperature` + `pH Level`, 
##     data = Data)
## 
## Residuals:
##       1       2       3       4       5       6       7       8       9 
## -1.3595  5.7905 -5.0552 -5.4498 -4.4193  0.4412  6.5574 -0.9229  4.4177 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)          38.8721   375.3735   0.104    0.922
## Salinity              0.1209     0.5318   0.227    0.829
## `Water Temperature`   5.1989     4.6924   1.108    0.318
## `pH Level`          -20.9810    65.3958  -0.321    0.761
## 
## Residual standard error: 5.894 on 5 degrees of freedom
## Multiple R-squared:  0.6875, Adjusted R-squared:  0.5001 
## F-statistic: 3.667 on 3 and 5 DF,  p-value: 0.0979

The statistical output displays the coded coefficients, which are the standardized coefficients. pH level has the standardized coefficient with the largest absolute value. This measure suggests that pH level is the most important independent variable in the regression model.