The iris data set records sepal length, sepal width, petal length, and petal width, measured in centimeters, for three species: Iris setosa, Iris versicolor, and Iris virginica. In this analysis, the traits of each species were compared to one another by calculating the minimum, maximum, mean, and median value of each trait. Additionally, sepal width and sepal length were analyzed across all species together to determine whether sepal length is a good predictor of sepal width.
library(tidyverse)
library(ggplot2)
library(knitr)
library(kableExtra)
library(dplyr)
# Summarise a vector with four descriptive statistics.
#
# x: the values to summarise (every call in this file passes a numeric
#    column of iris).
#
# Returns a named vector with elements "min", "max", "mean" and "median",
# in that order. The mean is computed as sum(x) / length(x). Missing values
# are not removed before any of the four calculations.
statistics <- function(x) {
  c(
    min = min(x),
    max = max(x),
    mean = sum(x) / length(x),
    median = median(x)
  )
}
# Split iris into one data frame per species so that each species can be
# summarised on its own.
setosa <- iris %>%
  filter(Species == "setosa")
versicolor <- iris %>%
  filter(Species == "versicolor")
virginica <- iris %>%
  filter(Species == "virginica")
# Apply the "statistics" function to each of the four setosa traits.
setosa_SL <- statistics(setosa[["Sepal.Length"]])
setosa_SW <- statistics(setosa[["Sepal.Width"]])
setosa_PL <- statistics(setosa[["Petal.Length"]])
setosa_PW <- statistics(setosa[["Petal.Width"]])
# Collect the four named vectors in one data frame: one column per trait,
# one row per statistic.
setosa_df <- data.frame(
  setosa_SL = setosa_SL,
  setosa_SW = setosa_SW,
  setosa_PL = setosa_PL,
  setosa_PW = setosa_PW
)
# Transpose so that traits become rows and statistics become columns.
setosa_statistics <- t(setosa_df)
# Apply the "statistics" function to each of the four versicolor traits.
versicolor_SL <- statistics(versicolor[["Sepal.Length"]])
versicolor_SW <- statistics(versicolor[["Sepal.Width"]])
versicolor_PL <- statistics(versicolor[["Petal.Length"]])
versicolor_PW <- statistics(versicolor[["Petal.Width"]])
# Collect the four named vectors in one data frame: one column per trait,
# one row per statistic.
versicolor_df <- data.frame(
  versicolor_SL = versicolor_SL,
  versicolor_SW = versicolor_SW,
  versicolor_PL = versicolor_PL,
  versicolor_PW = versicolor_PW
)
# Transpose so that traits become rows and statistics become columns.
versicolor_statistics <- t(versicolor_df)
# Apply the "statistics" function to each of the four virginica traits.
virginica_SL <- statistics(virginica$Sepal.Length)
virginica_SW <- statistics(virginica$Sepal.Width)
virginica_PL <- statistics(virginica$Petal.Length)
virginica_PW <- statistics(virginica$Petal.Width)
# Collect the four named vectors in one data frame: one column per trait,
# one row per statistic. Every column must be a virginica vector; mixing in
# another species' vector would silently report that species' values in the
# virginica rows of the final table.
virginica_df <- data.frame(
  virginica_SL,
  virginica_SW,
  virginica_PL,
  virginica_PW
)
# Transpose so that traits become rows and statistics become columns.
virginica_statistics <- t(virginica_df)
# Stack the per-species statistics in the order setosa, versicolor,
# virginica.
final_statistics <- rbind(
  setosa_statistics,
  versicolor_statistics,
  virginica_statistics
)
# Replace the variable-derived row names with readable "Trait (species)"
# labels. The label order must match the stacking order above.
rownames(final_statistics) <- c(
  "Sepal Length (setosa)",
  "Sepal Width (setosa)",
  "Petal Length (setosa)",
  "Petal Width (setosa)",
  "Sepal Length (versicolor)",
  "Sepal Width (versicolor)",
  "Petal Length (versicolor)",
  "Petal Width (versicolor)",
  "Sepal Length (virginica)",
  "Sepal Width (virginica)",
  "Petal Length (virginica)",
  "Petal Width (virginica)"
)
# Render the combined statistics as Table 1: centered columns, readable
# column headers, and the kableExtra "classic" theme at natural width.
final_statistics %>%
  knitr::kable(
    caption = "Table 1. Statistics for Each Species",
    align = "cccc",
    col.names = c("Minimum Value", "Maximum Value", "Mean", "Median")
  ) %>%
  kable_classic(
    full_width = FALSE,
    position = "center",
    html_font = "Cambria"
  )
| | Minimum Value | Maximum Value | Mean | Median |
|---|---|---|---|---|
| Sepal Length (setosa) | 4.3 | 5.8 | 5.006 | 5.00 |
| Sepal Width (setosa) | 2.3 | 4.4 | 3.428 | 3.40 |
| Petal Length (setosa) | 1.0 | 1.9 | 1.462 | 1.50 |
| Petal Width (setosa) | 0.1 | 0.6 | 0.246 | 0.20 |
| Sepal Length (versicolor) | 4.9 | 7.0 | 5.936 | 5.90 |
| Sepal Width (versicolor) | 2.0 | 3.4 | 2.770 | 2.80 |
| Petal Length (versicolor) | 3.0 | 5.1 | 4.260 | 4.35 |
| Petal Width (versicolor) | 1.0 | 1.8 | 1.326 | 1.30 |
| Sepal Length (virginica) | 4.9 | 7.9 | 6.588 | 6.50 |
| Sepal Width (virginica) | 2.2 | 3.8 | 2.974 | 3.00 |
| Petal Length (virginica) | 4.5 | 6.9 | 5.552 | 5.55 |
| Petal Width (virginica) | 1.4 | 2.5 | 2.026 | 2.00 |
# Fit a simple linear regression with sepal width as the response and sepal
# length as the predictor, using all rows of iris (all species pooled).
lm_sepal_width <- lm(Sepal.Width ~ Sepal.Length, data = iris)
# summary() supplies the coefficient table and the r^2 of the fit.
summary_lm_sepal_width <- summary(lm_sepal_width)
# Index the coefficient table by name rather than by position so that each
# extracted value is self-describing.
lm_coefficients <- summary_lm_sepal_width$coefficients
# One-row table holding the intercept, the slope, the p-value of the slope,
# and r^2. A base data.frame is used because the row is labelled through its
# row name, and assigning row names to a tibble is deprecated and raises a
# warning. check.names = FALSE keeps "p-value" and "r^2" from being rewritten
# into syntactic names.
lm_statistics <- data.frame(
  "Intercept" = lm_coefficients["(Intercept)", "Estimate"],
  "Slope" = lm_coefficients["Sepal.Length", "Estimate"],
  "p-value" = lm_coefficients["Sepal.Length", "Pr(>|t|)"],
  "r^2" = summary_lm_sepal_width$r.squared,
  row.names = "Sepal Width",
  check.names = FALSE
)
# These four statistics are presented in the table that follows.
# Render the regression statistics as Table 2: centered columns and the
# kableExtra "classic" theme at natural width.
lm_statistics %>%
  knitr::kable(
    caption = "Table 2. Sepal length as a predictor of sepal width. Sepal width was dependent on sepal length, and the relationship is negative.",
    align = "cccc",
    col.names = c("Intercept", "slope", "p-value", "r^2")
  ) %>%
  kable_classic(
    full_width = FALSE,
    position = "center",
    html_font = "Cambria"
  )
| | Intercept | slope | p-value | r^2 |
|---|---|---|---|---|
| Sepal Width | 3.418947 | -0.0618848 | 0.1518983 | 0.0138227 |
# Figure 1: sepal width against sepal length for every row of iris, with a
# red linear-model fit line drawn without its confidence band (se = FALSE).
ggplot(data = iris, mapping = aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(x = "Sepal Length (cm)", y = "Sepal Width (cm)") +
  theme_classic()
Figure 1. Scatter plot of sepal width as a function of sepal length for setosa, versicolor, and virginica from the iris data set. The fitted regression line has a negative slope, but the relationship is not statistically significant (p = 0.15, r^2 = 0.014).
Table 1 shows that no single species has the highest values for every trait. For example, iris virginica has the highest average sepal length, but iris setosa has the highest average sepal width. Based on the p-value of the regression line (Figure 1) and linear model (Table 2), sepal length is not a good predictor of sepal width when all species are pooled: the p-value is greater than 0.05, so the relationship is not statistically significant, and sepal length explains only about 1.4% of the variation in sepal width (r^2 = 0.014).