Data Cleaning & Preprocessing
Standardize taxonomic names and remove duplicates.
Verify site coordinates and integrate spatial data.
# Load necessary libraries
library(tidyverse)
library(readxl)
library(janitor)
library(sf)
library(sp)
library(gstat)
library(ggplot2)
library(viridis) # For better color scales
library(raster)
library(rnaturalearth)
# Define file path
file_path <- "C:/Users/Cordio ea/Documents/Jane_Repository/Attachment/Fish_data_2008_2009_recovered from oldserver.xls"
# Read different sheets (update sheet names if necessary)
raw_data <- read_xls(file_path, sheet = "RawData") %>% clean_names()
comments <- read_xls(file_path, sheet = "Comments") %>% clean_names()
gps_sites <- read_xls(file_path, sheet = "GPS-Sites") %>% clean_names()
species_data <- read_xls(file_path, sheet = "species_encountered") %>% clean_names()
# View structure of the data
colnames(raw_data)
## [1] "date" "site" "area"
## [4] "zone" "strata_depth" "method"
## [7] "observer_name" "functional_group" "family"
## [10] "taxonomic_groups" "long_swim" "size_class"
## [13] "transect_1" "transect_2" "transect_3"
## [16] "swim_distance" "mean" "stdev"
## [19] "comments" "x20" "size_class_categories"
## [22] "x22" "x23"
colnames(comments)
## [1] "data_source" "salvaged_from_cordio_old_server"
colnames(gps_sites)
## [1] "sites" "lat_s" "long_e" "x4" "x5" "x6" "x7" "x8"
## [9] "x9" "x10" "x11" "x12" "x13" "x14"
colnames(species_data)
## [1] "functional_group" "family" "species" "encountered"
## [5] "x5" "x6" "x7" "x8"
## [9] "x9"
species_data <- species_data %>%
distinct() %>%
mutate(species = str_to_title(species))
print(head(species_data))
## # A tibble: 6 × 9
## functional_group family species encountered x5 x6 x7 x8 x9
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <lgl> <chr>
## 1 Excavators Scaridae Bolbometo… <NA> <NA> Inst… <NA> NA <NA>
## 2 Excavators Scaridae Chlorurus… Y <NA> <NA> <NA> NA <NA>
## 3 Excavators Scaridae Chlororus… ? <NA> <NA> <NA> NA <NA>
## 4 Excavators Scaridae Chlorurus… <NA> <NA> <NA> <NA> NA <NA>
## 5 Excavators Scaridae Cetoscaru… y only… <NA> <NA> NA <NA>
## 6 Scrapers Scaridae Hipposcar… <NA> <NA> <NA> <NA> NA <NA>
raw_data <- raw_data %>%
distinct() %>%
mutate(taxonomic_groups = str_to_title(taxonomic_groups))
print(head(raw_data))
## # A tibble: 6 × 23
## date site area zone strata_depth method observer_name
## <dttm> <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 2008-02-26 00:00:00 Coral Garden Momb… Inner 3 Snork… Mwaura
## 2 2008-02-26 00:00:00 Coral Garden Momb… Inner 3 Snork… Mwaura
## 3 2008-02-26 00:00:00 Coral Garden Momb… Inner 3 Snork… Mwaura
## 4 2008-02-26 00:00:00 Coral Garden Momb… Inner 3 Snork… Mwaura
## 5 2008-02-26 00:00:00 Coral Garden Momb… Inner 3 Snork… Mwaura
## 6 2008-02-26 00:00:00 Coral Garden Momb… Inner 3 Snork… Mwaura
## # ℹ 16 more variables: functional_group <chr>, family <chr>,
## # taxonomic_groups <chr>, long_swim <dbl>, size_class <dbl>,
## # transect_1 <dbl>, transect_2 <dbl>, transect_3 <dbl>, swim_distance <dbl>,
## # mean <dbl>, stdev <dbl>, comments <chr>, x20 <lgl>,
## # size_class_categories <chr>, x22 <chr>, x23 <dbl>
gps_sites1 <- gps_sites %>%
filter(!is.na(lat_s) & !is.na(long_e)) %>%
st_as_sf(coords = c("long_e","lat_s"), crs = 4326)
# Print first few rows to verify
print(gps_sites1)
## Simple feature collection with 14 features and 12 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: 41.24666 ymin: 1.732389 xmax: 41.54981 ymax: 2.042957
## Geodetic CRS: WGS 84
## # A tibble: 14 × 13
## sites x4 x5 x6 x7 x8 x9 x10 x11 x12 x13 x14
## * <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Boso Shal… NA 2000 Boso <NA> 1.73… 41.5… 1.73… 41.5… 1.74… 41.5…
## 2 Ch. Cha Bo… 1.74… 41.5 1.74… 41.5… <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 3 Ch.Chano Deep… NA 1998 Ch. … <NA> 2.00… 41.3… 2.00… 41.3… <NA> <NA>
## 4 Ch.mongo s… Deep… NA 2000 Ch. … <NA> 1.73… 41.5… <NA> <NA> <NA> <NA>
## 5 Chole Shal… NA 2000 Ch. … <NA> 1.85… 41.4… <NA> <NA> <NA> <NA>
## 6 Kui 1.83… 41.4 2000 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 7 Kupi <NA> NA <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 8 Kwa Radi <NA> NA <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 9 Mikes inner <NA> NA 2000 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 10 Mikes outer Shal… NA 2000 Mago… <NA> 1.89… 41.4… 1.89… 41.4… <NA> <NA>
## 11 Coral Gard… <NA> NA 1998 Mike… <NA> 1.99… 41.2… <NA> <NA> <NA> <NA>
## 12 Mlango wa … <NA> NA 1998 Mkok… WWF … 1.96… 41.3… <NA> <NA> <NA> <NA>
## 13 Mwamba Mku… Shal… NA 2000 Mwam… <NA> 1.77… 41.5… <NA> <NA> <NA> <NA>
## 14 Shimo la t… Shal… NA 2000 Rubu… <NA> 1.88… 41.4… <NA> <NA> <NA> <NA>
## # ℹ 1 more variable: geometry <POINT [°]>
# Count missing values for each column
missing_values <- raw_data %>% summarise(across(everything(), ~sum(is.na(.))))
print(missing_values)
## # A tibble: 1 × 23
## date site area zone strata_depth method observer_name functional_group
## <int> <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 364 364 0 0 0 0
## # ℹ 15 more variables: family <int>, taxonomic_groups <int>, long_swim <int>,
## # size_class <int>, transect_1 <int>, transect_2 <int>, transect_3 <int>,
## # swim_distance <int>, mean <int>, stdev <int>, comments <int>, x20 <int>,
## # size_class_categories <int>, x22 <int>, x23 <int>
# Identify rows where observer_name is missing or empty
observer_issues <- raw_data %>% filter(is.na(observer_name) | observer_name == "")
print(observer_issues)
## # A tibble: 0 × 23
## # ℹ 23 variables: date <dttm>, site <chr>, area <chr>, zone <chr>,
## # strata_depth <dbl>, method <chr>, observer_name <chr>,
## # functional_group <chr>, family <chr>, taxonomic_groups <chr>,
## # long_swim <dbl>, size_class <dbl>, transect_1 <dbl>, transect_2 <dbl>,
## # transect_3 <dbl>, swim_distance <dbl>, mean <dbl>, stdev <dbl>,
## # comments <chr>, x20 <lgl>, size_class_categories <chr>, x22 <chr>,
## # x23 <dbl>
#Since all values are 0, there are no missing values in the dataset.
#Since the output is empty, all observer names are present. ✅
Descriptive & Exploratory Analysis
Summarize species richness and functional group distributions across
sites.
Compute the mean and standard deviation of species counts by zone and
method.
species_richness <- species_data %>%
group_by(functional_group) %>%
summarise(Richness = n_distinct(species))
print(species_richness)
## # A tibble: 6 × 2
## functional_group Richness
## <chr> <int>
## 1 Browsers 14
## 2 Excavators 5
## 3 Grazers 9
## 4 Grazers/detritivorous 9
## 5 Scrapers 10
## 6 <NA> 1
#This groups species data by functional_group and counts unique species.
species_summary <- raw_data %>%
group_by(zone, method) %>%
summarise(Mean_Count = mean(mean, na.rm = TRUE),
SD_Count = sd(mean, na.rm = TRUE))
print(species_summary)
## # A tibble: 6 × 4
## # Groups: zone [4]
## zone method Mean_Count SD_Count
## <chr> <chr> <dbl> <dbl>
## 1 Deep Scuba 11.1 42.8
## 2 Inner Scuba 3.86 13.1
## 3 Inner Snorkelling 2.01 6.08
## 4 Outer Scuba 3.35 14.0
## 5 Outer Snorkelling 2.53 10.6
## 6 <NA> Scuba 1.26 3.48
#Calculates Mean and Standard Deviation of species count for each zone and method.
ggplot(species_data, aes(x = functional_group, fill = family)) +
geom_bar() +
theme_minimal() +
labs(title = "Functional Group Distribution")
#This bar plot visualizes the number of species in each functional group.
Statistical Analysis
Conduct ANOVA or Kruskal-Wallis tests to compare species abundance
across different sites and depths.
#str(raw_data$strata_depth)
#table(raw_data$strata_depth)
#raw_data <- raw_data %>% mutate(strata_depth = as.factor(strata_depth))
#sum(is.na(raw_data$mean))
anova_result <- aov(mean ~ site + strata_depth, data = raw_data)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## site 43 62265 1448.0 5.431 <2e-16 ***
## Residuals 2167 577750 266.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#ANOVA (Analysis of Variance) checks if species counts vary significantly by site and depth.
kruskal_result <- kruskal.test(mean ~ site, data = raw_data)
print(kruskal_result)
##
## Kruskal-Wallis rank sum test
##
## data: mean by site
## Kruskal-Wallis chi-squared = 104.29, df = 43, p-value = 5.265e-07
#It compares the distributions of mean across different site groups to determine if at least one group is significantly different.
#Unlike ANOVA (which assumes normality and equal variance), Kruskal-Wallis works on ranks and is useful when:
#Data is not normally distributed
#Variances are unequal across groups
#If p < 0.05, there is a significant difference in mean across sites.
#If p > 0.05, there is no significant difference across sites.
cor_test <- cor.test(raw_data$mean, raw_data$swim_distance)
print(cor_test)
##
## Pearson's product-moment correlation
##
## data: raw_data$mean and raw_data$swim_distance
## t = -0.53597, df = 1268, p-value = 0.5921
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06999918 0.03999079
## sample estimates:
## cor
## -0.01504973
regression_model <- lm(mean ~ swim_distance, data = raw_data)
summary(regression_model)
##
## Call:
## lm(formula = mean ~ swim_distance, data = raw_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.26 -3.75 -3.36 -2.08 369.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.532938 1.860478 2.436 0.015 *
## swim_distance -0.003921 0.007317 -0.536 0.592
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.18 on 1268 degrees of freedom
## (941 observations deleted due to missingness)
## Multiple R-squared: 0.0002265, Adjusted R-squared: -0.000562
## F-statistic: 0.2873 on 1 and 1268 DF, p-value: 0.5921
#cor.test() checks if fish counts and swim distances are correlated.
#lm() performs linear regression to model the relationship.
Spatial Analysis (Requires Coordinates File)
Map species distribution across sites using GIS.
# Ensure column names match
gps_sites <- gps_sites %>% rename(site = sites)
# Convert coordinates to numeric before converting to sf
gps_sites_sf <- gps_sites %>%
filter(!is.na(lat_s) & !is.na(long_e)) %>%
mutate(across(c(lat_s, long_e), as.numeric),
lat_s = lat_s * -1) %>% # Convert latitude to negative
st_as_sf(coords = c("long_e", "lat_s"), crs = 4326)
# Merge species data with GPS coordinates and convert latitudes to negative
species_sites <- raw_data %>%
left_join(gps_sites, by = "site") %>%
filter(!is.na(lat_s) & !is.na(long_e), !is.na(mean)) %>% # Remove NAs
mutate(across(c(lat_s, long_e), as.numeric),
lat_s = lat_s * -1) %>% # Convert latitude to negative
st_as_sf(coords = c("long_e", "lat_s"), crs = 4326)
# Plot species abundance distribution
ggplot() +
geom_sf(data = species_sites, aes(color = mean), size = 3, alpha = 0.7) +
scale_color_viridis_c(option = "C") + # Use _c for continuous values
theme_minimal() +
labs(title = "Species Abundance Distribution", color = "Mean Abundance")
colnames(species_sites)
## [1] "date" "site" "area"
## [4] "zone" "strata_depth" "method"
## [7] "observer_name" "functional_group" "family"
## [10] "taxonomic_groups" "long_swim" "size_class"
## [13] "transect_1" "transect_2" "transect_3"
## [16] "swim_distance" "mean" "stdev"
## [19] "comments" "x20" "size_class_categories"
## [22] "x22" "x23" "x4"
## [25] "x5" "x6" "x7"
## [28] "x8" "x9" "x10"
## [31] "x11" "x12" "x13"
## [34] "x14" "geometry"
# Define bounding box for interpolation grid
grid <- st_make_grid(species_sites, cellsize = 0.01, what = "centers") %>%
st_as_sf()
# Convert to spatial object (sp package)
species_sp <- as(species_sites, "Spatial")
grid_sp <- as(grid, "Spatial")
idw_model <- gstat(formula = mean ~ 1, locations = species_sp, nmax = 10, set = list(idp = 2))
idw_result <- predict(idw_model, grid_sp)
## [inverse distance weighted interpolation]
# Convert to data frame for plotting
idw_df <- as.data.frame(idw_result)
colnames(idw_df)[1:3] <- c("long", "lat", "predicted_abundance")
ggplot() +
geom_tile(data = idw_df, aes(x = long, y = lat, fill = predicted_abundance)) +
geom_sf(data = species_sites, color = "red", size = 2) +
scale_fill_viridis(option = "C") +
theme_minimal() +
labs(title = "Interpolated Species Abundance", fill = "Predicted Abundance")
Reporting & Visualization
Develop maps, bar charts, and scatter plots for findings.
# Load a world basemap
world <- ne_countries(scale = "medium", returnclass = "sf")
# Mapping Species Abundance (Spatial Visualization)
ggplot() +
geom_sf(data = world, fill = "gray90", color = "white") + # Add world map
geom_sf(data = species_sites, aes(color = mean), size = 3, alpha = 0.7) +
scale_color_viridis_c(option = "C") +
theme_minimal() +
labs(title = "Species Abundance Distribution", color = "Mean Abundance")
# Bar Chart: Species Count by Site
ggplot(raw_data, aes(x = site, fill = functional_group)) +
geom_bar(width = 0.8) + # Increase bar width (default is 0.5)
theme_minimal() +
labs(title = "Species Count per Site", x = "Site", y = "Count", fill = "Functional Group") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Scatter Plot: Mean Abundance vs. Swim Distance
ggplot(raw_data, aes(x = swim_distance, y = mean, color = taxonomic_groups)) +
geom_point(size = 3, alpha = 0.7) +
theme_minimal() +
labs(title = "Mean Abundance vs. Swim Distance", x = "Swim Distance", y = "Mean Abundance", color = "Taxonomic Groups")