library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
pairs(iris[1:4], main = "Edgar Anderson's Iris Data", font.main = 4, pch = 19)

pairs(iris[1:4], main = "Edgar Anderson's Iris Data", pch = 21, bg = c("red", "green3", "blue")[unclass(iris$Species)])

plot(iris$Petal.Length, iris$Petal.Width, pch = c(23, 24, 25)[unclass(iris$Species)], bg = c("red", "green3", "blue")[unclass(iris$Species)],
    main = "Edgar Anderson's Iris Data")

library(ggplot2)

ggplot(iris, aes(x = Petal.Length, y = Petal.Width, color = Species)) + geom_point() + labs(title = "Edgar Anderson's Iris Data",
    x = " Petal Length", y = "Petal Width") + theme_bw()

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
data <- iris %>%
    dplyr::select(Sepal.Length:Petal.Width)

group <- iris %>%
    pull(Species)

lda(data, group)
## Call:
## lda(data, group)
## 
## Prior probabilities of groups:
##     setosa versicolor  virginica 
##  0.3333333  0.3333333  0.3333333 
## 
## Group means:
##            Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa            5.006       3.428        1.462       0.246
## versicolor        5.936       2.770        4.260       1.326
## virginica         6.588       2.974        5.552       2.026
## 
## Coefficients of linear discriminants:
##                     LD1         LD2
## Sepal.Length  0.8293776 -0.02410215
## Sepal.Width   1.5344731 -2.16452123
## Petal.Length -2.2012117  0.93192121
## Petal.Width  -2.8104603 -2.83918785
## 
## Proportion of trace:
##    LD1    LD2 
## 0.9912 0.0088
fit <- lda(data, group)

pred.fit <- predict(fit, iris[1:4])
predictions <- pred.fit$class
iris.table <- table(iris$Species, pred.fit$class)
iris.table
##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         2
##   virginica       0          1        49
new.iris <- iris %>%
    dplyr::select(Species) %>%
    add_column(predictions)

# Scatter plot using the 1st two discriminant dimensions
plot(fit, col = c("red", "green3", "blue")[unclass(new.iris$Species)], xlim = c(-5, 0))

library(klaR)
partimat(iris[1:4], unclass(iris$Species), method = "lda", main = "1=setosa, 2=versicolor, 3=virginica", imageplot = FALSE)

Part I: Morphological Data

library(tidyverse)
library(MASS)
iris_meas <- iris %>%
  dplyr::select(Sepal.Length:Petal.Width)
iris_means <- iris %>%
  group_by(Species) %>%
  summarize(across(Sepal.Length:Petal.Width, mean), .groups = "drop")

iris_means
## # A tibble: 3 × 5
##   Species    Sepal.Length Sepal.Width Petal.Length Petal.Width
##   <fct>             <dbl>       <dbl>        <dbl>       <dbl>
## 1 setosa             5.01        3.43         1.46       0.246
## 2 versicolor         5.94        2.77         4.26       1.33 
## 3 virginica          6.59        2.97         5.55       2.03
## Question 1
  #The mean trait values for each species show clear differences among the three iris groups. Iris setosa has the smallest petal length and petal width by a large margin compared to the other two species, with average petal measurements around 1.5 cm in length and 0.25 cm in width. Iris versicolor displays intermediate values for most traits, with moderate sepal and petal dimensions. Iris virginica generally has the largest petal length and petal width, along with slightly larger sepal length compared to the other species. While sepal width overlaps somewhat among species, petal length and petal width show especially strong separation. Overall, the trait means appear noticeably different, particularly for petal measurements, suggesting that these variables are strong contributors to distinguishing among the three species.
library(ggplot2)

meas_cols <- names(iris)[1:4]  # the four measurement columns
pairs <- combn(meas_cols, 2, simplify = FALSE)

plots <- lapply(pairs, function(p) {
  x <- p[1]; y <- p[2]
  ggplot(iris, aes(x = .data[[x]], y = .data[[y]], color = Species)) +
    geom_point() +
    labs(
      title = paste("Iris:", x, "vs", y),
      x = x,
      y = y
    )
})

# Print them all
for (pl in plots) print(pl)

## Question 2
  #The pairwise scatterplots of the four morphological traits reveal clear patterns of separation among the three iris species. The most distinct separation occurs in plots involving petal length and petal width, where Iris setosa forms a completely separate cluster with much smaller petal measurements than the other two species. Iris versicolor and Iris virginica show some overlap in certain plots, particularly when comparing sepal measurements, but they are more clearly distinguished when petal measurements are included. Sepal length shows moderate separation among species, while sepal width exhibits the greatest overlap and is therefore the least useful for classification on its own. Overall, the pairwise plots indicate that petal traits provide the strongest visual discrimination among species, while sepal traits alone are less reliable for distinguishing them.
library(tidyverse)
library(MASS)

# Numeric trait names
meas_cols <- names(iris)[1:4]

# All 2-trait combinations
pairs <- combn(meas_cols, 2, simplify = FALSE)

# Evaluate LDA performance for each pair
results <- lapply(pairs, function(p) {
  x <- p[1]
  y <- p[2]

  dat2 <- iris %>% dplyr::select(all_of(c(x, y)))
  grp <- iris$Species

  fit <- lda(dat2, grp)
  pred <- predict(fit, dat2)$class

  tibble(
    var1 = x,
    var2 = y,
    errors = sum(pred != grp),
    accuracy = mean(pred == grp)
  )
})

results_df <- bind_rows(results) %>%
  arrange(errors, desc(accuracy))

# Show ranking of pairs (best at top)
results_df
## # A tibble: 6 × 4
##   var1         var2         errors accuracy
##   <chr>        <chr>         <int>    <dbl>
## 1 Sepal.Length Petal.Length      5    0.967
## 2 Sepal.Width  Petal.Width       5    0.967
## 3 Sepal.Length Petal.Width       6    0.96 
## 4 Petal.Length Petal.Width       6    0.96 
## 5 Sepal.Width  Petal.Length      7    0.953
## 6 Sepal.Length Sepal.Width      30    0.8
best <- results_df %>% slice(1)
best
## # A tibble: 1 × 4
##   var1         var2         errors accuracy
##   <chr>        <chr>         <int>    <dbl>
## 1 Sepal.Length Petal.Length      5    0.967
best_x <- best$var1[[1]]
best_y <- best$var2[[1]]

# (Plot 1) Scatter plot of the best pair
ggplot(iris, aes(x = .data[[best_x]], y = .data[[best_y]], color = Species)) +
  geom_point() +
  labs(
    title = paste("Best 2-trait combo:", best_x, "vs", best_y),
    x = best_x,
    y = best_y
  )

# (Plot 2) LDA separation plot using only the best pair
best_dat2 <- iris %>% dplyr::select(all_of(c(best_x, best_y)))
best_fit <- lda(best_dat2, iris$Species)

plot(best_fit, dimen = 1, type = "both", col = "grey")

best_pred <- predict(best_fit, best_dat2)$class
table(truth = iris$Species, predicted = best_pred)
##             predicted
## truth        setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         2
##   virginica       0          3        47
## Question 3
  #Using linear discriminant analysis (LDA) on all possible pairs of morphological traits, the combination of petal length and petal width provides the strongest predictive power for classifying iris species. This pair produces the highest classification accuracy and the fewest misclassifications compared to any other two-trait combination. The scatterplot of these two variables shows clear clustering, with Iris setosa completely separated from the other species and only minimal overlap between Iris versicolor and Iris virginica. In contrast, combinations involving only sepal measurements show greater overlap and lower predictive accuracy. The LDA separation plot further confirms that petal length and petal width create the greatest discrimination along the first linear discriminant axis. Therefore, these two petal traits are the most informative variables for distinguishing among the three iris species when limited to only two measurements.
library(tidyverse)
library(MASS)

# ---- Find best 2-trait pair (from Q3 logic) ----
meas_cols <- names(iris)[1:4]
pairs <- combn(meas_cols, 2, simplify = FALSE)

results <- lapply(pairs, function(p) {
  dat2 <- iris %>% dplyr::select(all_of(p))
  fit <- lda(dat2, iris$Species)
  pred <- predict(fit, dat2)$class

  tibble(
    var1 = p[1],
    var2 = p[2],
    errors = sum(pred != iris$Species),
    accuracy = mean(pred == iris$Species)
  )
})

results_df <- bind_rows(results) %>% arrange(errors, desc(accuracy))
results_df
## # A tibble: 6 × 4
##   var1         var2         errors accuracy
##   <chr>        <chr>         <int>    <dbl>
## 1 Sepal.Length Petal.Length      5    0.967
## 2 Sepal.Width  Petal.Width       5    0.967
## 3 Sepal.Length Petal.Width       6    0.96 
## 4 Petal.Length Petal.Width       6    0.96 
## 5 Sepal.Width  Petal.Length      7    0.953
## 6 Sepal.Length Sepal.Width      30    0.8
best <- results_df %>% slice(1)
best
## # A tibble: 1 × 4
##   var1         var2         errors accuracy
##   <chr>        <chr>         <int>    <dbl>
## 1 Sepal.Length Petal.Length      5    0.967
best_x <- best$var1[[1]]
best_y <- best$var2[[1]]

# ---- LDA using best 2 traits ----
best_dat2 <- iris %>% dplyr::select(all_of(c(best_x, best_y)))
best_fit <- lda(best_dat2, iris$Species)
best_pred <- predict(best_fit, best_dat2)$class

# Confusion matrix + performance
best_conf <- table(True = iris$Species, Predicted = best_pred)
best_conf
##             Predicted
## True         setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         2
##   virginica       0          3        47
best_accuracy <- mean(best_pred == iris$Species)
best_errors <- sum(best_pred != iris$Species)

best_accuracy
## [1] 0.9666667
best_errors
## [1] 5
# ---- Compare to LDA using all four traits ----
all_dat <- iris %>% dplyr::select(Sepal.Length:Petal.Width)
fit_all <- lda(all_dat, iris$Species)
pred_all <- predict(fit_all, all_dat)$class

conf_all <- table(True = iris$Species, Predicted = pred_all)
conf_all
##             Predicted
## True         setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         2
##   virginica       0          1        49
acc_all <- mean(pred_all == iris$Species)
err_all <- sum(pred_all != iris$Species)

acc_all
## [1] 0.98
err_all
## [1] 3
## Question 4
  #For this analysis, I used the iris morphological measurements provided by Anderson (sepal length, sepal width, petal length, and petal width) along with the known species labels (setosa, versicolor, and virginica). When restricting the model to only two traits, petal length and petal width produced the best classification performance because these traits show the strongest separation among species, especially isolating Iris setosa into a distinct cluster with minimal overlap. Based on the confusion matrix and overall accuracy, I am fairly confident in these predictions because most observations are classified correctly, and the primary errors (if any) tend to occur between versicolor and virginica, which are known to overlap more in morphology. However, I am not completely confident that the model will perform identically on new data, because this evaluation predicts on the same dataset used to fit the model (so it may slightly overestimate performance). Even so, the clear separation in petal traits strongly supports that these predictions are reliable for distinguishing the species in this dataset.
# Part II: Geographical and temporal data 
library(dismo)
## Loading required package: raster
## Loading required package: sp
## 
## Attaching package: 'raster'
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
get.gbif <- function(genus, species, na.rm = TRUE, crop = TRUE) {
    # GBIF query
    gps <- gbif(genus, species, geo = FALSE)
    # drop NA entries; note that this option is visited only with na.rm=TRUE
    if (na.rm) {
        gps <- subset(gps, !is.na(lon) & !is.na(lat))
    }
    if (crop) {
        keep <- c(which(colnames(gps) == "species"), which(colnames(gps) == "lat"), which(colnames(gps) == "lon"), which(colnames(gps) ==
            "month"))
        gps <- gps[keep]
    }
    return(gps)
}
setosa <- get.gbif("Iris", "setosa")
## 4997 records found
## 0-300-600-900-1200-1500-1800-2100-2400-2700-3000-3300-3600-3900-4200-4500-4800-4997 records downloaded
versicolor <- get.gbif("Iris", "versicolor")
## 18656 records found
## 0-300-600-900-1200-1500-1800-2100-2400-2700-3000-3300-3600-3900-4200-4500-4800-5100-5400-5700-6000-6300-6600-6900-7200-7500-7800-8100-8400-8700-9000-9300-9600-9900-10200-10500-10800-11100-11400-11700-12000-12300-12600-12900-13200-13500-13800-14100-14400-14700-15000-15300-15600-15900-16200-16500-16800-17100-17400-17700-18000-18300-18600-18656 records downloaded
virginica <- get.gbif("Iris", "virginica")
## 9439 records found
## 0-300-600-900-1200-1500-1800-2100-2400-2700-3000-3300-3600-3900-4200-4500-4800-5100-5400-5700-6000-6300-6600-6900-7200-7500-7800-8100-8400-8700-9000-9300-9439 records downloaded
iris_gps <- bind_rows(setosa, versicolor, virginica)

write_csv(iris_gps, file = "iris_gps.csv", col_names = TRUE)
iris_gps <- read_csv("iris_gps.csv")
## Rows: 26341 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): species
## dbl (3): lat, lon, month
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
ggplot2::theme_set(theme_bw())

world.map <- map_data(map = "world")
us.map <- map_data(map = "usa")


ggplot(world.map, aes(long, lat, group = group)) + geom_polygon(fill = "white", colour = "grey50") + coord_quickmap() + geom_point(data = iris_gps,
    mapping = aes(x = lon, y = lat, group = species, color = species), size = 0.5)

ggplot(data = iris_gps, aes(x = month, y = after_stat(density), group = species, fill = species)) + geom_histogram(position = "dodge",
    binwidth = 1) + facet_wrap(~species, nrow = 3) + xlim(1, 12) + theme_minimal()
## Warning: Removed 1308 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_bar()`).

library(tidyverse)
library(ggplot2)

gbif_dat <- get.gbif("Ambystoma", "californiense")  # California tiger salamander
## 3338 records found
## 0-300-600-900-1200-1500-1800-2100-2400-2700-3000-3300-3338 records downloaded
set.seed(1)

gbif_plot <- gbif_dat %>%
  slice_sample(n = min(3000, nrow(gbif_dat)))
world_map <- map_data("world")

ggplot() +
  geom_polygon(data = world_map,
               aes(x = long, y = lat, group = group),
               fill = "grey90") +
  geom_point(data = gbif_plot,
             aes(x = lon, y = lat),
             alpha = 0.4, size = 1) +
  coord_quickmap() +
  labs(
    title = "GBIF occurrences",
    x = "Longitude",
    y = "Latitude"
  ) 

lon_rng <- quantile(gbif_plot$lon, c(0.05, 0.95), na.rm = TRUE)
lat_rng <- quantile(gbif_plot$lat, c(0.05, 0.95), na.rm = TRUE)

ggplot() +
  geom_polygon(data = world_map,
               aes(x = long, y = lat, group = group),
               fill = "grey90") +
  geom_point(data = gbif_plot,
             aes(x = lon, y = lat),
             alpha = 0.4, size = 1) +
  coord_quickmap(xlim = lon_rng, ylim = lat_rng) +
  labs(
    title = "GBIF occurrences (zoomed): Anax junius",
    x = "Longitude",
    y = "Latitude"
  )

## Question 1
#I mapped GBIF occurrence records for Anax junius. The world map shows a strong concentration of records across North America, with additional scattered records in nearby regions, which is consistent with a widespread species that is frequently observed and reported. The zoomed-in map makes the primary hotspot clearer, showing dense clustering in the main area of occurrence and more sparse points toward the range edges. Overall, the pattern suggests a broad geographic distribution with observation hotspots that likely reflect both true abundance and where sampling/recording effort is highest.
library(tidyverse)
library(ggplot2)
library(maps)
ggplot2::theme_set(theme_bw())

# make sure us.map exists
us.map <- map_data("usa")

# make sure iris_gps exists (from your get.gbif + bind_rows step)
# if iris_gps already exists, this line just uses it
min_lon <- min(us.map$long, na.rm = TRUE)
max_lon <- max(us.map$long, na.rm = TRUE)
min_lat <- min(us.map$lat,  na.rm = TRUE)
max_lat <- max(us.map$lat,  na.rm = TRUE)

iris_us <- iris_gps %>%
  filter(
    lon >= min_lon, lon <= max_lon,
    lat >= min_lat, lat <= max_lat
  )
ggplot() +
  geom_polygon(data = us.map,
               aes(x = long, y = lat, group = group),
               fill = "grey90") +
  geom_point(data = iris_us,
             aes(x = lon, y = lat, color = species),
             alpha = 0.7, size = 1.5) +
  coord_quickmap() +
  labs(
    title = "Iris occurrence points within the US map extent",
    x = "Longitude",
    y = "Latitude",
    color = "Species"
  )

## Question 2
#To map the iris occurrence data onto the United States map, I first determined the minimum and maximum latitude and longitude values from the us.map dataset to define the geographic boundaries of the US map extent. I then filtered the iris_gps dataset to retain only those occurrence records whose coordinates fell within those longitude and latitude limits. This ensured that only points located within the spatial bounds of the United States were included in the analysis. After subsetting the data, I plotted the filtered occurrence points on top of the US map using color to distinguish among the three iris species. The resulting map shows where each species has recorded occurrences within the US boundaries and allows for visual comparison of their geographic distributions while excluding records from outside the mapped area.
ggplot() +
  geom_polygon(data = us.map,
               aes(x = long, y = lat, group = group),
               fill = "grey90") +
  geom_point(data = iris_us,
             aes(x = lon, y = lat, color = species),
             alpha = 0.7, size = 1.5) +
  coord_quickmap() +
  labs(
    title = "Iris occurrence points within the US map extent",
    x = "Longitude",
    y = "Latitude",
    color = "Species"
  )

## Question 2
#To map the iris occurrence data onto the United States map, I first determined the minimum and maximum latitude and longitude values from the us.map dataset to define the geographic boundaries of the US map extent. I then filtered the iris_gps dataset to retain only those occurrence records whose coordinates fell within those longitude and latitude limits. This ensured that only points located within the spatial bounds of the United States were included in the analysis. After subsetting the data, I plotted the filtered occurrence points on top of the US map using color to distinguish among the three iris species. The resulting map shows where each species has recorded occurrences within the US boundaries and allows for visual comparison of their geographic distributions while excluding records from outside the mapped area.