Marine Species Abundance and Spatial Patterns Analysis

Data Cleaning & Preprocessing

Standardize taxonomic names and remove duplicates.

Verify site coordinates and integrate spatial data.

Check for missing values and inconsistencies in observer records.

# Load necessary libraries
library(tidyverse)
library(readxl)
library(janitor)
library(sf)
library(sp)
library(gstat)
library(ggplot2)
library(viridis) # For better color scales
library(raster)
library(rnaturalearth)
# Define file path
file_path <- "C:/Users/Cordio ea/Documents/Jane_Repository/Attachment/Fish_data_2008_2009_recovered from oldserver.xls"

# Read different sheets (update sheet names if necessary)
raw_data <- read_xls(file_path, sheet = "RawData") %>% clean_names()
comments <- read_xls(file_path, sheet = "Comments") %>% clean_names()
gps_sites <- read_xls(file_path, sheet = "GPS-Sites") %>% clean_names()
species_data <- read_xls(file_path, sheet = "species_encountered") %>% clean_names()

# View structure of the data

colnames(raw_data)
##  [1] "date"                  "site"                  "area"                 
##  [4] "zone"                  "strata_depth"          "method"               
##  [7] "observer_name"         "functional_group"      "family"               
## [10] "taxonomic_groups"      "long_swim"             "size_class"           
## [13] "transect_1"            "transect_2"            "transect_3"           
## [16] "swim_distance"         "mean"                  "stdev"                
## [19] "comments"              "x20"                   "size_class_categories"
## [22] "x22"                   "x23"
colnames(comments)
## [1] "data_source"                     "salvaged_from_cordio_old_server"
colnames(gps_sites)
##  [1] "sites"  "lat_s"  "long_e" "x4"     "x5"     "x6"     "x7"     "x8"    
##  [9] "x9"     "x10"    "x11"    "x12"    "x13"    "x14"
colnames(species_data)
## [1] "functional_group" "family"           "species"          "encountered"     
## [5] "x5"               "x6"               "x7"               "x8"              
## [9] "x9"
species_data <- species_data %>% 
  distinct() %>% 
  mutate(species = str_to_title(species))
print(head(species_data))  
## # A tibble: 6 × 9
##   functional_group family   species    encountered x5    x6    x7    x8    x9   
##   <chr>            <chr>    <chr>      <chr>       <chr> <chr> <chr> <lgl> <chr>
## 1 Excavators       Scaridae Bolbometo… <NA>        <NA>  Inst… <NA>  NA    <NA> 
## 2 Excavators       Scaridae Chlorurus… Y           <NA>  <NA>  <NA>  NA    <NA> 
## 3 Excavators       Scaridae Chlororus… ?           <NA>  <NA>  <NA>  NA    <NA> 
## 4 Excavators       Scaridae Chlorurus… <NA>        <NA>  <NA>  <NA>  NA    <NA> 
## 5 Excavators       Scaridae Cetoscaru… y           only… <NA>  <NA>  NA    <NA> 
## 6 Scrapers         Scaridae Hipposcar… <NA>        <NA>  <NA>  <NA>  NA    <NA>
raw_data <- raw_data %>% 
  distinct() %>% 
  mutate(taxonomic_groups = str_to_title(taxonomic_groups))
print(head(raw_data))  
## # A tibble: 6 × 23
##   date                site         area  zone  strata_depth method observer_name
##   <dttm>              <chr>        <chr> <chr>        <dbl> <chr>  <chr>        
## 1 2008-02-26 00:00:00 Coral Garden Momb… Inner            3 Snork… Mwaura       
## 2 2008-02-26 00:00:00 Coral Garden Momb… Inner            3 Snork… Mwaura       
## 3 2008-02-26 00:00:00 Coral Garden Momb… Inner            3 Snork… Mwaura       
## 4 2008-02-26 00:00:00 Coral Garden Momb… Inner            3 Snork… Mwaura       
## 5 2008-02-26 00:00:00 Coral Garden Momb… Inner            3 Snork… Mwaura       
## 6 2008-02-26 00:00:00 Coral Garden Momb… Inner            3 Snork… Mwaura       
## # ℹ 16 more variables: functional_group <chr>, family <chr>,
## #   taxonomic_groups <chr>, long_swim <dbl>, size_class <dbl>,
## #   transect_1 <dbl>, transect_2 <dbl>, transect_3 <dbl>, swim_distance <dbl>,
## #   mean <dbl>, stdev <dbl>, comments <chr>, x20 <lgl>,
## #   size_class_categories <chr>, x22 <chr>, x23 <dbl>
gps_sites1 <- gps_sites %>%
  filter(!is.na(lat_s) & !is.na(long_e)) %>%
  st_as_sf(coords = c("long_e","lat_s"), crs = 4326)
# Print first few rows to verify
print(gps_sites1)
## Simple feature collection with 14 features and 12 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: 41.24666 ymin: 1.732389 xmax: 41.54981 ymax: 2.042957
## Geodetic CRS:  WGS 84
## # A tibble: 14 × 13
##    sites       x4       x5 x6    x7    x8    x9    x10   x11   x12   x13   x14  
##  * <chr>       <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
##  1 Boso        Shal…  NA   2000  Boso  <NA>  1.73… 41.5… 1.73… 41.5… 1.74… 41.5…
##  2 Ch. Cha Bo… 1.74…  41.5 1.74… 41.5… <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  3 Ch.Chano    Deep…  NA   1998  Ch. … <NA>  2.00… 41.3… 2.00… 41.3… <NA>  <NA> 
##  4 Ch.mongo s… Deep…  NA   2000  Ch. … <NA>  1.73… 41.5… <NA>  <NA>  <NA>  <NA> 
##  5 Chole       Shal…  NA   2000  Ch. … <NA>  1.85… 41.4… <NA>  <NA>  <NA>  <NA> 
##  6 Kui         1.83…  41.4 2000  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  7 Kupi        <NA>   NA   <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  8 Kwa Radi    <NA>   NA   <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
##  9 Mikes inner <NA>   NA   2000  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA> 
## 10 Mikes outer Shal…  NA   2000  Mago… <NA>  1.89… 41.4… 1.89… 41.4… <NA>  <NA> 
## 11 Coral Gard… <NA>   NA   1998  Mike… <NA>  1.99… 41.2… <NA>  <NA>  <NA>  <NA> 
## 12 Mlango wa … <NA>   NA   1998  Mkok… WWF … 1.96… 41.3… <NA>  <NA>  <NA>  <NA> 
## 13 Mwamba Mku… Shal…  NA   2000  Mwam… <NA>  1.77… 41.5… <NA>  <NA>  <NA>  <NA> 
## 14 Shimo la t… Shal…  NA   2000  Rubu… <NA>  1.88… 41.4… <NA>  <NA>  <NA>  <NA> 
## # ℹ 1 more variable: geometry <POINT [°]>
# Count missing values for each column
missing_values <- raw_data %>% summarise(across(everything(), ~sum(is.na(.))))
print(missing_values)
## # A tibble: 1 × 23
##    date  site  area  zone strata_depth method observer_name functional_group
##   <int> <int> <int> <int>        <int>  <int>         <int>            <int>
## 1     0     0   364   364            0      0             0                0
## # ℹ 15 more variables: family <int>, taxonomic_groups <int>, long_swim <int>,
## #   size_class <int>, transect_1 <int>, transect_2 <int>, transect_3 <int>,
## #   swim_distance <int>, mean <int>, stdev <int>, comments <int>, x20 <int>,
## #   size_class_categories <int>, x22 <int>, x23 <int>
# Identify rows where observer_name is missing or empty
observer_issues <- raw_data %>% filter(is.na(observer_name) | observer_name == "")
print(observer_issues)
## # A tibble: 0 × 23
## # ℹ 23 variables: date <dttm>, site <chr>, area <chr>, zone <chr>,
## #   strata_depth <dbl>, method <chr>, observer_name <chr>,
## #   functional_group <chr>, family <chr>, taxonomic_groups <chr>,
## #   long_swim <dbl>, size_class <dbl>, transect_1 <dbl>, transect_2 <dbl>,
## #   transect_3 <dbl>, swim_distance <dbl>, mean <dbl>, stdev <dbl>,
## #   comments <chr>, x20 <lgl>, size_class_categories <chr>, x22 <chr>,
## #   x23 <dbl>
#Since all values are 0, there are no missing values in the dataset. 
#Since the output is empty, all observer names are present. ✅

Descriptive & Exploratory Analysis

Summarize species richness and functional group distributions across sites.

Compute the mean and standard deviation of species counts by zone and method.

Create plots showing variations in taxonomic groups over different sites and depths.

species_richness <- species_data %>% 
  group_by(functional_group) %>% 
  summarise(Richness = n_distinct(species))
print(species_richness)
## # A tibble: 6 × 2
##   functional_group      Richness
##   <chr>                    <int>
## 1 Browsers                    14
## 2 Excavators                   5
## 3 Grazers                      9
## 4 Grazers/detritivorous        9
## 5 Scrapers                    10
## 6 <NA>                         1
#This groups species data by functional_group and counts unique species.
species_summary <- raw_data %>% 
  group_by(zone, method) %>% 
  summarise(Mean_Count = mean(mean, na.rm = TRUE), 
            SD_Count = sd(mean, na.rm = TRUE))
print(species_summary)
## # A tibble: 6 × 4
## # Groups:   zone [4]
##   zone  method      Mean_Count SD_Count
##   <chr> <chr>            <dbl>    <dbl>
## 1 Deep  Scuba            11.1     42.8 
## 2 Inner Scuba             3.86    13.1 
## 3 Inner Snorkelling       2.01     6.08
## 4 Outer Scuba             3.35    14.0 
## 5 Outer Snorkelling       2.53    10.6 
## 6 <NA>  Scuba             1.26     3.48
#Calculates Mean and Standard Deviation of species count for each zone and method.
ggplot(species_data, aes(x = functional_group, fill = family)) +
  geom_bar() +
  theme_minimal() +
  labs(title = "Functional Group Distribution")

#This bar plot visualizes the number of species in each functional group.

Statistical Analysis

Conduct ANOVA or Kruskal-Wallis tests to compare species abundance across different sites and depths.

Use correlation or regression analysis to explore relationships between fish counts and swim distance.

#str(raw_data$strata_depth)
#table(raw_data$strata_depth)
#raw_data <- raw_data %>% mutate(strata_depth = as.factor(strata_depth))
#sum(is.na(raw_data$mean))
anova_result <- aov(mean ~ site + strata_depth, data = raw_data)
summary(anova_result)
##               Df Sum Sq Mean Sq F value Pr(>F)    
## site          43  62265  1448.0   5.431 <2e-16 ***
## Residuals   2167 577750   266.6                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#ANOVA (Analysis of Variance) checks if species counts vary significantly by site and depth.
kruskal_result <- kruskal.test(mean ~ site, data = raw_data)
print(kruskal_result)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  mean by site
## Kruskal-Wallis chi-squared = 104.29, df = 43, p-value = 5.265e-07
#It compares the distributions of mean across different site groups to determine if at least one group is significantly different.

#Unlike ANOVA (which assumes normality and equal variance), Kruskal-Wallis works on ranks and is useful when:

#Data is not normally distributed

#Variances are unequal across groups
#If p < 0.05, there is a significant difference in mean across sites.

#If p > 0.05, there is no significant difference across sites.
cor_test <- cor.test(raw_data$mean, raw_data$swim_distance)
print(cor_test)
## 
##  Pearson's product-moment correlation
## 
## data:  raw_data$mean and raw_data$swim_distance
## t = -0.53597, df = 1268, p-value = 0.5921
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.06999918  0.03999079
## sample estimates:
##         cor 
## -0.01504973
regression_model <- lm(mean ~ swim_distance, data = raw_data)
summary(regression_model)
## 
## Call:
## lm(formula = mean ~ swim_distance, data = raw_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -4.26  -3.75  -3.36  -2.08 369.58 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    4.532938   1.860478   2.436    0.015 *
## swim_distance -0.003921   0.007317  -0.536    0.592  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.18 on 1268 degrees of freedom
##   (941 observations deleted due to missingness)
## Multiple R-squared:  0.0002265,  Adjusted R-squared:  -0.000562 
## F-statistic: 0.2873 on 1 and 1268 DF,  p-value: 0.5921
#cor.test() checks if fish counts and swim distances are correlated.

#lm() performs linear regression to model the relationship.

Spatial Analysis (Requires Coordinates File)

Map species distribution across sites using GIS.

Use spatial interpolation to model species abundance across sampled areas.

# Ensure column names match
gps_sites <- gps_sites %>% rename(site = sites)

# Convert coordinates to numeric before converting to sf
gps_sites_sf <- gps_sites %>%
  filter(!is.na(lat_s) & !is.na(long_e)) %>%
  mutate(across(c(lat_s, long_e), as.numeric),
         lat_s = lat_s * -1) %>%  # Convert latitude to negative
  st_as_sf(coords = c("long_e", "lat_s"), crs = 4326)

# Merge species data with GPS coordinates and convert latitudes to negative
species_sites <- raw_data %>%
  left_join(gps_sites, by = "site") %>%
  filter(!is.na(lat_s) & !is.na(long_e), !is.na(mean)) %>%  # Remove NAs
  mutate(across(c(lat_s, long_e), as.numeric),
         lat_s = lat_s * -1) %>%  # Convert latitude to negative
  st_as_sf(coords = c("long_e", "lat_s"), crs = 4326)

# Plot species abundance distribution
ggplot() +
  geom_sf(data = species_sites, aes(color = mean), size = 3, alpha = 0.7) +
  scale_color_viridis_c(option = "C") +  # Use _c for continuous values
  theme_minimal() +
  labs(title = "Species Abundance Distribution", color = "Mean Abundance")

colnames(species_sites)
##  [1] "date"                  "site"                  "area"                 
##  [4] "zone"                  "strata_depth"          "method"               
##  [7] "observer_name"         "functional_group"      "family"               
## [10] "taxonomic_groups"      "long_swim"             "size_class"           
## [13] "transect_1"            "transect_2"            "transect_3"           
## [16] "swim_distance"         "mean"                  "stdev"                
## [19] "comments"              "x20"                   "size_class_categories"
## [22] "x22"                   "x23"                   "x4"                   
## [25] "x5"                    "x6"                    "x7"                   
## [28] "x8"                    "x9"                    "x10"                  
## [31] "x11"                   "x12"                   "x13"                  
## [34] "x14"                   "geometry"
# Define bounding box for interpolation grid
grid <- st_make_grid(species_sites, cellsize = 0.01, what = "centers") %>%
  st_as_sf()

# Convert to spatial object (sp package)
species_sp <- as(species_sites, "Spatial")
grid_sp <- as(grid, "Spatial")

idw_model <- gstat(formula = mean ~ 1, locations = species_sp, nmax = 10, set = list(idp = 2))
idw_result <- predict(idw_model, grid_sp)
## [inverse distance weighted interpolation]
# Convert to data frame for plotting
idw_df <- as.data.frame(idw_result)
colnames(idw_df)[1:3] <- c("long", "lat", "predicted_abundance")

ggplot() +
  geom_tile(data = idw_df, aes(x = long, y = lat, fill = predicted_abundance)) +
  geom_sf(data = species_sites, color = "red", size = 2) +
  scale_fill_viridis(option = "C") +
  theme_minimal() +
  labs(title = "Interpolated Species Abundance", fill = "Predicted Abundance")

Reporting & Visualization

Develop maps, bar charts, and scatter plots for findings.

Write a research report summarizing key trends and statistical results

# Load a world basemap
world <- ne_countries(scale = "medium", returnclass = "sf")
# Mapping Species Abundance (Spatial Visualization)
ggplot() +
  geom_sf(data = world, fill = "gray90", color = "white") +  # Add world map
  geom_sf(data = species_sites, aes(color = mean), size = 3, alpha = 0.7) +
  scale_color_viridis_c(option = "C") +
  theme_minimal() +
  labs(title = "Species Abundance Distribution", color = "Mean Abundance")

# Bar Chart: Species Count by Site
ggplot(raw_data, aes(x = site, fill = functional_group)) +
  geom_bar(width = 0.8) +  # Increase bar width (default is 0.5)
  theme_minimal() +
  labs(title = "Species Count per Site", x = "Site", y = "Count", fill = "Functional Group") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Scatter Plot: Mean Abundance vs. Swim Distance

ggplot(raw_data, aes(x = swim_distance, y = mean, color = taxonomic_groups)) +
  geom_point(size = 3, alpha = 0.7) +
  theme_minimal() +
  labs(title = "Mean Abundance vs. Swim Distance", x = "Swim Distance", y = "Mean Abundance", color = "Taxonomic Groups")