knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)

library(tidyverse)
library(dplyr)
library(here)
library(broom)
library(GGally)
library(jtools)
library(kableExtra)
# read in data
palmetto <- read_csv(here("data", "palmetto.csv"),
                       col_types = cols(.default = 'c')) %>% 
  mutate(height = as.numeric(height))

# assign species 1 & 2 names, make species numeric
# 1 1 = s repens, 1 2 = s etonia 

palmetto_tidy <- palmetto %>% 
  filter(plant == "1") %>% 
  mutate(species = as.numeric(species)) %>% 
  mutate(species_name = case_when(species == 1 ~ "repens",
                                  species == 2 ~ "etonia",
                                  TRUE ~ "none")) %>% 
  select(year, species_name, height, length, width, green_lvs) %>% 
  mutate(width = as.numeric(width)) %>% 
  mutate(length = as.numeric(length)) %>% 
  mutate(green_lvs = as.numeric(green_lvs)) 
# Directions: use binary logistic regression to test feasibility of using variables plant height (height), canopy length (length), canopy width (width), and number of green leaves (green_lvs) to classify whether a palmetto is species Serenoa repens or Sabal etonia. Use code folding and hide all messages & warnings in your knitted HTML. 

Section 1: Explore the Data

# exploratory graphs
palmetto_tidy %>% 
  select(species_name, height:green_lvs) %>% 
  ggpairs(aes(color=species_name))

# Figure 1: Height v Green Leaves
height_grnlvs_figure <- ggplot(data = palmetto_tidy, aes(x = height, y = green_lvs)) +
  geom_point(aes(color = species_name)) +
  facet_wrap(~species_name) +
  theme(legend.position = 'none') +
  labs(x = "plant height", y = "# of green leaves")

height_grnlvs_figure
**Figure 1.** Height of plant vs number of green leaves on S. etonia and S. repens plant species.

Figure 1. Height of plant vs number of green leaves on S. etonia and S. repens plant species.

## --------------------------------------------
# etonia_height <- palmetto_tidy %>% 
#   filter(species_name == "etonia") %>% 
#   mutate(green_lvs = as.numeric(green_lvs))
# 
# mean(etonia_height$height) #79.26316
# mean(etonia_height$green_lvs) #4.263158

## --------------------------------------------
# repens_height <- palmetto_tidy %>% 
#   filter(species_name == "repens") %>% 
#   mutate(green_lvs = as.numeric(green_lvs))
# 
# mean(repens_height$height) #73.31579
# mean(repens_height$green_lvs) #6.842105

Figure 1 shows that overall, S. etonia plants are a little taller (mean height = 79 cm) than S. repens, whose average height is 73 cm. However, S. repens averaged a higher number of green leaves (mean = 6.8) while S. etonia averaged approximately 4 green leaves per plant.

# Figure 2: canopy length vs width
canopy_length_height <- ggplot(data=palmetto_tidy, aes(x=width, y=length)) + 
  geom_point(aes(color = species_name)) +
  scale_x_continuous(limits =c (55,143),
                     breaks = c(60, 80, 100, 120, 140)) +
  scale_y_continuous(limits = c(65, 190),
                     breaks = c(60, 90, 120, 150, 180)) +
  labs(x = "Canopy Width", y = "Canopy Length") +
  scale_color_discrete(name = "Species")

canopy_length_height
**Figure 2.** Width of plant canopy vs length of plant canopy for S. etonia and S. repens plant species.

Figure 2. Width of plant canopy vs length of plant canopy for S. etonia and S. repens plant species.

S. etonia, on average, has a much larger canopy width and length compared to S. repens. Some individuals (n=2) of S. etonia are relatively smaller and closer in size to S. repens.

# Figure 3: Height vs canopy width
height_canopy_width <- ggplot(data=palmetto_tidy, aes(x=height, y=width)) + 
  geom_point(aes(color = species_name)) +
  labs(x = "Plant Height", y = "Canopy Width") +
  scale_color_discrete(name = "Species")

height_canopy_width
**Figure 3.** Height of plant vs canopy width for S. entonia and S. repens plant species.

Figure 3. Height of plant vs canopy width for S. entonia and S. repens plant species.

S. entonia, overall, has a greater canopy width than S. repens, but is close to the same height of S. repens. As seen in Figure 1, S. entonia has a mean plant height of approximately 79 cm and S. repens has a mean plant height of approximately 73 cm.

Section 2: Binary Logistic Regression

# read in data
palmetto_new <- read_csv(here("data", "palmetto_two.csv")) %>% 
  mutate(species = case_when(species == 1 ~ "Serenoa_repens",
                             species == 2 ~ "Sabal_etonia")) %>% 
  select(species, length, width, height, green_lvs) 

palmetto_new$species <- as.factor(palmetto_new$species)

palmetto_blr_new <- glm(species ~ height + length + width + green_lvs,
                        data = palmetto_new,
                        family = "binomial") 

# summary(palmetto_blr_new)

blr_kable_new_tidy <- palmetto_blr_new %>% 
  broom::tidy ()

palmetto_table <- blr_kable_new_tidy %>% 
  kable(col.names = c("term",
                      "estimate",
                      "std error",
                      "statistic",
                      "p-value")) %>% 
  kable_styling(bootstrap_options = "bordered",
                full_width = F,
                position = "left") 
  
palmetto_table
term estimate std error statistic p-value
(Intercept) -3.2266851 0.1420708 -22.71180 0
height 0.0292173 0.0023061 12.66984 0
length -0.0458233 0.0018661 -24.55600 0
width -0.0394434 0.0021000 -18.78227 0
green_lvs 1.9084747 0.0388634 49.10728 0

Section 3: Table of BLR

palmetto_blr_fitted <- palmetto_blr_new %>%
  broom::augment(type.predict = "response")

# palmetto_blr_fitted
palmetto_blr_50 <- palmetto_blr_fitted %>% 
  mutate(prob_50 = case_when(.fitted >= .5 ~ "Serenoa_repens",
                             .fitted < .5 ~ "Sabal_etonia")) %>% 
  mutate(class_correctly = case_when(species == prob_50 ~ "Correct",
                                      species != prob_50 ~ "Incorrect"))

palmetto_count <- palmetto_blr_50 %>% 
  count(species, class_correctly) %>% 
  pivot_wider(names_from = class_correctly, values_from = n) %>% 
  mutate(total = rowSums(across(where(is.numeric)))) %>% 
  mutate(percent_correct = (Correct/total*100)) %>% 
  mutate(species = case_when(species == "Serenoa_repens" ~ "Serenoa repens",
                             species == "Sabal_etonia" ~ "Sabal etonia"))

palmetto_count_table <- kable(palmetto_count, col.names = c("Species", "Classified Correctly (n)",
                                                            "Classified Incorrectly (n)",
                                                            "Total Species (n)",
                                                            "Percent Correctly Classified")) %>% 
  kable_styling(bootstrap_options = "striped")

palmetto_count_table
Species Classified Correctly (n) Classified Incorrectly (n) Total Species (n) Percent Correctly Classified
Sabal etonia 5701 454 6155 92.62388
Serenoa repens 5548 564 6112 90.77225

Table 1. Percentage of correctly classified binomial regression for Sabal etonia and Serenoa repens.