# Data Cleaning
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.3
# Site 1: Camorin-Jacarepaguá, Brazil
camorin <- read_excel("CAMORIN.xls")
## New names:
## • `voucher` -> `voucher...8`
## • `voucher` -> `voucher...10`
## • `voucher` -> `voucher...11`
## • `voucher` -> `voucher...12`
## • `STEMSDBH` -> `STEMSDBH...15`
## • `STEMSDBH` -> `STEMSDBH...16`
## • `STEMSDBH` -> `STEMSDBH...17`
## • `STEMSDBH` -> `STEMSDBH...18`
## • `STEMSDBH` -> `STEMSDBH...19`
camorin$Line <- as.numeric(camorin$Line)
camorin <- camorin[order(camorin$Line),]

# Remove rows with missing values (number of individuals)
camorin <- camorin[!is.na(camorin[,"N(IND.)"]),]

# Total individuals per transect (N. Ind)
camorin_line <- aggregate(camorin[,"N(IND.)"], list(Line=camorin$Line), sum)
colnames(camorin_line)[2] <- "N(IND.)"

# Per species calculations (proportions)
camorin[,"Prop"] <- camorin[,"N(IND.)"]/camorin_line[match(camorin$Line, camorin_line$Line), "N(IND.)"]
camorin[,"PropSquared"] <- camorin$Prop^2

# Simpson's index calculations
camorin_line[,"D"] <- aggregate(list(D=camorin$PropSquared), list(Line=camorin$Line), sum)$D
camorin_line[,"1-D"] <- 1 - camorin_line[,"D"]
camorin_line[,"1/D"] <- 1 / camorin_line[,"D"]
camorin_line[,"S"] <- aggregate(list(S=camorin$Line), list(Line=camorin$Line), length)$S
camorin_line[,"Esimpson"] <- camorin_line[,"1/D"] / camorin_line[,"S"]

# Shannon index calculations
# H = -sum(pi * ln(pi)) per transect
camorin[,"PlogP"] <- ifelse(camorin$Prop > 0, camorin$Prop * log(camorin$Prop), 0)
camorin_line[,"H"] <- -aggregate(list(H=camorin$PlogP), list(Line=camorin$Line), sum)$H
camorin_line[,"Eshannon"] <- camorin_line[,"H"] / log(camorin_line[,"S"])  # Evenness = H / ln(S)

# Camorin averages 
camorin_line$Mean_D <- mean(camorin_line$D)
camorin_line$Mean_1minusD <- mean(camorin_line$`1-D`)
camorin_line$Mean_1overD <- mean(camorin_line$`1/D`)
camorin_line$Mean_Esimpson <- mean(camorin_line$Esimpson)
camorin_line$Mean_H <- mean(camorin_line$H)
camorin_line$Mean_Eshannon <- mean(camorin_line$Eshannon)
camorin_line$Mean_S <- mean(camorin_line$S)

# DBH calculations
dbh_cols_camorin <- grep("STEMSDBH", colnames(camorin))
camorin[, dbh_cols_camorin] <- lapply(camorin[, dbh_cols_camorin], as.numeric)
camorin[,"DBHsum"] <- rowSums(camorin[, dbh_cols_camorin], na.rm=TRUE)
camorin_line[,"DBHsum"] <- aggregate(camorin$DBHsum, list(camorin$Line), sum)$x
camorin_line[,"DBHmean"] <- camorin_line[,"DBHsum"] / camorin_line[,"N(IND.)"]
# SITE 2: Ruissalo, Finland
ruissalo <- read_excel("RUISSALO.xls")
## New names:
## • `voucher` -> `voucher...5`
## • `voucher` -> `voucher...6`
## • `voucher` -> `voucher...7`
## • `voucher` -> `voucher...8`
## • `voucher` -> `voucher...9`
## • `voucher` -> `voucher...10`
## • `voucher` -> `voucher...11`
## • `voucher` -> `voucher...12`
## • `Stemdbh` -> `Stemdbh...15`
## • `Stemdbh` -> `Stemdbh...16`
## • `Stemdbh` -> `Stemdbh...17`
## • `Stemdbh` -> `Stemdbh...18`
## • `Stemdbh` -> `Stemdbh...19`
## • `Stemdbh` -> `Stemdbh...20`
## • `Stemdbh` -> `Stemdbh...21`
## • `Stemdbh` -> `Stemdbh...22`
## • `Stemdbh` -> `Stemdbh...23`
## • `Stemdbh` -> `Stemdbh...24`
## • `Stemdbh` -> `Stemdbh...25`
## • `Stemdbh` -> `Stemdbh...26`
## • `Stemdbh` -> `Stemdbh...27`
## • `Stemdbh` -> `Stemdbh...28`
## • `Stemdbh` -> `Stemdbh...29`
## • `Stemdbh` -> `Stemdbh...30`
## • `Stemdbh` -> `Stemdbh...31`
## • `Stemdbh` -> `Stemdbh...32`
## • `Stemdbh` -> `Stemdbh...33`
ruissalo$Line <- as.numeric(ruissalo$Line)
ruissalo <- ruissalo[order(ruissalo$Line),]

# Remove rows with missing values (number of individuals)
ruissalo <- ruissalo[!is.na(ruissalo[,"N(Ind.)"]),]

# Total individuals per transect (N. Ind)
ruissalo_line <- aggregate(ruissalo[,"N(Ind.)"], list(Line=ruissalo$Line), sum)
colnames(ruissalo_line)[2] <- "N(IND.)"

# Per species calculations (proportions)
ruissalo[,"Prop"] <- ruissalo[,"N(Ind.)"]/ruissalo_line[match(ruissalo$Line, ruissalo_line$Line), "N(IND.)"]
ruissalo[,"PropSquared"] <- ruissalo$Prop^2

# Simpson's index calculations
ruissalo_line[,"D"] <- aggregate(list(D=ruissalo$PropSquared), list(Line=ruissalo$Line), sum)$D
ruissalo_line[,"1-D"] <- 1 - ruissalo_line[,"D"]
ruissalo_line[,"1/D"] <- 1 / ruissalo_line[,"D"]
ruissalo_line[,"S"] <- aggregate(list(S=ruissalo$Line), list(Line=ruissalo$Line), length)$S
ruissalo_line[,"Esimpson"] <- ruissalo_line[,"1/D"] / ruissalo_line[,"S"]

# Shannon index
ruissalo[,"PlogP"] <- ifelse(ruissalo$Prop > 0, ruissalo$Prop * log(ruissalo$Prop), 0)
ruissalo_line[,"H"] <- -aggregate(list(H=ruissalo$PlogP), list(Line=ruissalo$Line), sum)$H
ruissalo_line[,"Eshannon"] <- ruissalo_line[,"H"] / log(ruissalo_line[,"S"])

# Site averages
ruissalo_line$Mean_D    <- mean(ruissalo_line$D)
ruissalo_line$Mean_1minusD  <- mean(ruissalo_line$`1-D`)
ruissalo_line$Mean_1overD <- mean(ruissalo_line$`1/D`)
ruissalo_line$Mean_Esimpson <- mean(ruissalo_line$Esimpson)
ruissalo_line$Mean_H    <- mean(ruissalo_line$H)
ruissalo_line$Mean_Eshannon <- mean(ruissalo_line$Eshannon)
ruissalo_line$Mean_S    <- mean(ruissalo_line$S)

# DBH calculations
dbh_cols_ruissalo <- grep("Stemdbh", colnames(ruissalo))
ruissalo[, dbh_cols_ruissalo] <- lapply(ruissalo[, dbh_cols_ruissalo], as.numeric)
ruissalo[,"DBHsum"] <- rowSums(ruissalo[, dbh_cols_ruissalo], na.rm=TRUE)
ruissalo_line[,"DBHsum"] <- aggregate(ruissalo$DBHsum, list(ruissalo$Line), sum)$x
ruissalo_line[,"DBHmean"] <- ruissalo_line[,"DBHsum"] / ruissalo_line[,"N(IND.)"]
# Summary Tables
# Simpson's table
simpsons_table <- data.frame(
  Site     = c("Camorin, Brazil", "Ruissalo, Finland"),
  D        = c(camorin_line$Mean_D[1],        ruissalo_line$Mean_D[1]),
  "1-D"    = c(camorin_line$Mean_1minusD[1],  ruissalo_line$Mean_1minusD[1]),
  "1/D"    = c(camorin_line$Mean_1overD[1],   ruissalo_line$Mean_1overD[1]),
  Esimpson = c(camorin_line$Mean_Esimpson[1], ruissalo_line$Mean_Esimpson[1])
)

# Shannon table
shannon_table <- data.frame(
  Site        = c("Camorin, Brazil", "Ruissalo, Finland"),
  H           = c(camorin_line$Mean_H[1],         ruissalo_line$Mean_H[1]),
  Eshannon    = c(camorin_line$Mean_Eshannon[1],  ruissalo_line$Mean_Eshannon[1])
)

# Print tables
print(simpsons_table)
##                Site          D      X1.D    X1.D.1  Esimpson
## 1   Camorin, Brazil 0.04758249 0.9524175 21.696706 0.7903022
## 2 Ruissalo, Finland 0.28832369 0.7116763  3.614451 0.7017526
print(shannon_table)
##                Site        H  Eshannon
## 1   Camorin, Brazil 3.192966 0.9674377
## 2 Ruissalo, Finland 1.405109 0.8622421
# Visualizations
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(ggridges)
## Warning: package 'ggridges' was built under R version 4.5.3
# Rename columns
colnames(simpsons_table) <- c("Site", "D", "1-D", "1/D", "Esimpson")

# Plot 1: Side-by-side average DBH bar plot
dbh_matrix <- rbind(camorin_line$DBHmean, ruissalo_line$DBHmean)
rownames(dbh_matrix) <- c("Camorin, Brazil", "Ruissalo, Finland")
colnames(dbh_matrix) <- camorin_line$Line

barplot(dbh_matrix,
        beside    = TRUE,
        col       = c("forestgreen", "lightblue"),
        xlab      = "Transect Number",
        ylab      = "Average DBH (cm)",
        main      = "Average Diameter at Breast Height (DBH) per Transect",
        legend.text = TRUE,
        args.legend = list(x = "topleft"),
        cex.main  = 0.9)

# Plot 2: Ridge plot of DBH distributions by transect and site
# Camorin long format
camorin_raw2  <- read_excel("CAMORIN.xls")
## New names:
## • `voucher` -> `voucher...8`
## • `voucher` -> `voucher...10`
## • `voucher` -> `voucher...11`
## • `voucher` -> `voucher...12`
## • `STEMSDBH` -> `STEMSDBH...15`
## • `STEMSDBH` -> `STEMSDBH...16`
## • `STEMSDBH` -> `STEMSDBH...17`
## • `STEMSDBH` -> `STEMSDBH...18`
## • `STEMSDBH` -> `STEMSDBH...19`
ruissalo_raw2 <- read_excel("RUISSALO.xls")
## New names:
## • `voucher` -> `voucher...5`
## • `voucher` -> `voucher...6`
## • `voucher` -> `voucher...7`
## • `voucher` -> `voucher...8`
## • `voucher` -> `voucher...9`
## • `voucher` -> `voucher...10`
## • `voucher` -> `voucher...11`
## • `voucher` -> `voucher...12`
## • `Stemdbh` -> `Stemdbh...15`
## • `Stemdbh` -> `Stemdbh...16`
## • `Stemdbh` -> `Stemdbh...17`
## • `Stemdbh` -> `Stemdbh...18`
## • `Stemdbh` -> `Stemdbh...19`
## • `Stemdbh` -> `Stemdbh...20`
## • `Stemdbh` -> `Stemdbh...21`
## • `Stemdbh` -> `Stemdbh...22`
## • `Stemdbh` -> `Stemdbh...23`
## • `Stemdbh` -> `Stemdbh...24`
## • `Stemdbh` -> `Stemdbh...25`
## • `Stemdbh` -> `Stemdbh...26`
## • `Stemdbh` -> `Stemdbh...27`
## • `Stemdbh` -> `Stemdbh...28`
## • `Stemdbh` -> `Stemdbh...29`
## • `Stemdbh` -> `Stemdbh...30`
## • `Stemdbh` -> `Stemdbh...31`
## • `Stemdbh` -> `Stemdbh...32`
## • `Stemdbh` -> `Stemdbh...33`
# Camorin long format
dbh_cols1 <- grep("STEMSDBH", colnames(camorin_raw2))
camorin_long <- camorin_raw2[, c(1, dbh_cols1)]  # column 1 = Line
colnames(camorin_long)[1] <- "Line"
camorin_long[, -1] <- lapply(camorin_long[, -1], as.numeric)
camorin_long <- reshape(as.data.frame(camorin_long),
                        varying   = colnames(camorin_long)[-1],
                        v.names   = "DBH",
                        timevar   = "stem",
                        direction = "long")
camorin_long <- camorin_long[!is.na(camorin_long$DBH),]
camorin_long$Site <- "Camorin, Brazil"
camorin_long$Transect <- factor(paste("T", camorin_long$Line, sep=""), 
                                levels = c("T1","T2","T3","T4","T5",
                                           "T6","T7","T8","T9","T10"))

# Ruissalo long format
dbh_cols2 <- grep("Stemdbh", colnames(ruissalo_raw2))
ruissalo_long <- ruissalo_raw2[, c(1, dbh_cols2)]
colnames(ruissalo_long)[1] <- "Line"
ruissalo_long[, -1] <- lapply(ruissalo_long[, -1], as.numeric)
ruissalo_long <- reshape(as.data.frame(ruissalo_long),
                         varying   = colnames(ruissalo_long)[-1],
                         v.names   = "DBH",
                         timevar   = "stem",
                         direction = "long")
ruissalo_long <- ruissalo_long[!is.na(ruissalo_long$DBH),]
ruissalo_long$Site <- "Ruissalo, Finland"
ruissalo_long$Transect <- factor(paste("T", ruissalo_long$Line, sep=""), 
                                 levels = c("T1","T2","T3","T4","T5",
                                            "T6","T7","T8","T9","T10"))

# Combine
dbh_long <- rbind(
  camorin_long[, c("Site", "Transect", "DBH")],
  ruissalo_long[, c("Site", "Transect", "DBH")]
)

# Ridge plot
ggplot(dbh_long, aes(x = DBH, y = Transect, fill = Site)) +
  geom_density_ridges(alpha = 0.7, scale = 1.2) +
  facet_wrap(~Site) +
  scale_fill_manual(values = c("Camorin, Brazil" = "forestgreen",
                               "Ruissalo, Finland" = "steelblue")) +
  labs(title    = "DBH Distribution by Transect",
    subtitle = "Camorin-Jacarepaguá, Brazil vs Ruissalo, Finland",
    x        = "Diameter at Breast Height (cm)",
    y        = "Transect") +
  theme_ridges() +
  theme(legend.position = "none")
## Picking joint bandwidth of 2.5
## Picking joint bandwidth of 1.4

# Plot 3: Rarefaction Curves
library(vegan)
## Warning: package 'vegan' was built under R version 4.5.3
## Loading required package: permute
## Warning: package 'permute' was built under R version 4.5.3
# Camorin

camorin_matrix <- aggregate(camorin[,"N(IND.)"],
                            list(Line    = camorin$Line,
                                 Species = paste(camorin$Genus, camorin$species)),
                            sum)
camorin_wide <- reshape(camorin_matrix,
                        idvar     = "Line",
                        timevar   = "Species",
                        direction = "wide")
rownames(camorin_wide) <- paste("Camorin_T", camorin_wide$Line, sep="")
camorin_wide$Line <- NULL
camorin_wide[is.na(camorin_wide)] <- 0

# Ruissalo
ruissalo_matrix <- aggregate(ruissalo[,"N(Ind.)"],
                             list(Line    = ruissalo$Line,
                                  Species = paste(ruissalo$Genus, ruissalo$Species)),
                             sum)
ruissalo_wide <- reshape(ruissalo_matrix,
                         idvar     = "Line",
                         timevar   = "Species",
                         direction = "wide")
rownames(ruissalo_wide) <- paste("Ruissalo_T", ruissalo_wide$Line, sep="")
ruissalo_wide$Line <- NULL
ruissalo_wide[is.na(ruissalo_wide)] <- 0
# Plot Rarefaction curves using vegan
par(mfrow = c(1,2))

rarecurve(camorin_wide,
          step    = 5,
          col     = "forestgreen",
          xlab    = "Number of Individuals",
          ylab    = "Number of Species",
          main    = "Camorin, Brazil",
          label   = FALSE)

rarecurve(ruissalo_wide,
          step    = 5,
          col     = "steelblue",
          xlab    = "Number of Individuals",
          ylab    = "Number of Species",
          main    = "Ruissalo, Finland",
          label   = FALSE)

# Tables 1 and 2: Biodiversity indices per transect
# Camorin per-transect table
camorin_table1 <- data.frame(
  Transect = camorin_line$Line,
  D        = round(camorin_line$D, 4),
  "1-D"    = round(camorin_line$`1-D`, 4),
  "1/D"    = round(camorin_line$`1/D`, 4),
  Esimpson = round(camorin_line$Esimpson, 4),
  H        = round(camorin_line$H, 4),
  Eshannon = round(camorin_line$Eshannon, 4),
  S        = camorin_line$S)

# Ruissalo per-transect table
ruissalo_table2 <- data.frame(
  Transect = ruissalo_line$Line,
  D        = round(ruissalo_line$D, 4),
  "1-D"    = round(ruissalo_line$`1-D`, 4),
  "1/D"    = round(ruissalo_line$`1/D`, 4),
  Esimpson = round(ruissalo_line$Esimpson, 4),
  H        = round(ruissalo_line$H, 4),
  Eshannon = round(ruissalo_line$Eshannon, 4),
  S        = ruissalo_line$S)

# Fix column names
colnames(camorin_table1)  <- c("Transect","D","1-D","1/D","Esimpson","H","Eshannon","S")
colnames(ruissalo_table2) <- c("Transect","D","1-D","1/D","Esimpson","H","Eshannon","S")

print(camorin_table1)
##    Transect      D    1-D     1/D Esimpson      H Eshannon  S
## 1         1 0.0432 0.9568 23.1429   0.8571 3.2224   0.9777 27
## 2         2 0.0446 0.9554 22.4426   0.8312 3.2080   0.9733 27
## 3         3 0.0443 0.9557 22.5882   0.6644 3.3461   0.9489 34
## 4         4 0.0457 0.9543 21.8788   0.7544 3.2452   0.9638 29
## 5         5 0.0308 0.9692 32.4386   0.8767 3.5556   0.9847 37
## 6         6 0.0542 0.9458 18.4576   0.8390 3.0027   0.9714 22
## 7         7 0.0587 0.9413 17.0435   0.7747 2.9856   0.9659 22
## 8         8 0.0585 0.9415 17.0899   0.6573 3.0662   0.9411 26
## 9         9 0.0510 0.9490 19.6122   0.8527 3.0594   0.9757 23
## 10       10 0.0449 0.9551 22.2727   0.7955 3.2385   0.9719 28
print(ruissalo_table2)
##    Transect      D    1-D    1/D Esimpson      H Eshannon S
## 1         1 0.2416 0.7584 4.1391   0.6898 1.5542   0.8674 6
## 2         2 0.4118 0.5882 2.4286   0.6071 1.0910   0.7870 4
## 3         3 0.2143 0.7857 4.6667   0.6667 1.6908   0.8689 7
## 4         4 0.2350 0.7650 4.2553   0.7092 1.5741   0.8785 6
## 5         5 0.3291 0.6709 3.0388   0.7597 1.2214   0.8811 4
## 6         6 0.2645 0.7355 3.7812   0.6302 1.5029   0.8388 6
## 7         7 0.2992 0.7008 3.3419   0.6684 1.3602   0.8451 5
## 8         8 0.2450 0.7550 4.0816   0.8163 1.4786   0.9187 5
## 9         9 0.2663 0.7337 3.7556   0.9389 1.3517   0.9750 4
## 10       10 0.3765 0.6235 2.6557   0.5311 1.2261   0.7618 5
# Table 3: Site-level averages
table3 <- data.frame(
  Site     = c("Camorin, Brazil", "Ruissalo, Finland"),
  D        = round(c(camorin_line$Mean_D[1],        ruissalo_line$Mean_D[1]), 4),
  "1-D"    = round(c(camorin_line$Mean_1minusD[1],  ruissalo_line$Mean_1minusD[1]), 4),
  "1/D"    = round(c(camorin_line$Mean_1overD[1],   ruissalo_line$Mean_1overD[1]), 4),
  Esimpson = round(c(camorin_line$Mean_Esimpson[1], ruissalo_line$Mean_Esimpson[1]), 4),
  H        = round(c(camorin_line$Mean_H[1],        ruissalo_line$Mean_H[1]), 4),
  Eshannon = round(c(camorin_line$Mean_Eshannon[1], ruissalo_line$Mean_Eshannon[1]), 4),
  S        = round(c(camorin_line$Mean_S[1],        ruissalo_line$Mean_S[1]), 1))

colnames(table3) <- c("Site","D","1-D","1/D","Esimpson","H","Eshannon","S")

print(table3)
##                Site      D    1-D     1/D Esimpson      H Eshannon    S
## 1   Camorin, Brazil 0.0476 0.9524 21.6967   0.7903 3.1930   0.9674 27.5
## 2 Ruissalo, Finland 0.2883 0.7117  3.6145   0.7018 1.4051   0.8622  5.2