Explore the dataset and variation across species
# Explore dataset
# Lines starting with "##" below are the console output recorded for the
# call immediately above them.
# Load the iris data set into the workspace.
data(iris)
# Preview the first six rows.
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Inspect the structure: 150 observations of four numeric measurements plus
# the Species factor (three levels).
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Five-number summary and mean for each numeric column, and the number of
# rows per species (50 each).
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Distribution of petal length, one histogram panel per species
ggplot(data = iris, mapping = aes(x = Petal.Length, fill = Species)) +
  # Semi-transparent bars, filled by species
  geom_histogram(alpha = 0.5) +
  # One row of panels per species
  facet_grid(rows = vars(Species)) +
  # Fixed colour for each species
  scale_fill_manual(values = c("darkorange", "purple", "cyan4")) +
  # Title and axis labels
  labs(
    title = "Petal Length Variation within Species",
    x = "Petal Length / cm",
    y = "Count"
  ) +
  # Minimal theme; the facet strips already name the species, so hide the legend
  theme_minimal() +
  theme(legend.position = "none")

# Distribution of petal width, one histogram panel per species
ggplot(data = iris, mapping = aes(x = Petal.Width, fill = Species)) +
  # Semi-transparent bars, filled by species
  geom_histogram(alpha = 0.5) +
  # One row of panels per species
  facet_grid(rows = vars(Species)) +
  # Fixed colour for each species
  scale_fill_manual(values = c("darkorange", "purple", "cyan4")) +
  # Title and axis labels
  labs(
    title = "Petal Width Variation within Species",
    x = "Petal Width / cm",
    y = "Count"
  ) +
  # Minimal theme; the facet strips already name the species, so hide the legend
  theme_minimal() +
  theme(legend.position = "none")

# Mean and 95% confidence interval of a flower dimension for each iris species
# Petal length: per-species count, mean and sample standard deviation, then
# the standard error and a t-based 95% confidence interval around the mean
iris_petal_length_summary <- iris %>%
  group_by(Species) %>%
  summarise(
    count = n(),
    mean = mean(Petal.Length, na.rm = TRUE),
    ssd = sd(Petal.Length, na.rm = TRUE)
  ) %>%
  mutate(
    se = ssd / sqrt(count),
    lower_ci = mean - se * qt(1 - 0.05 / 2, df = count - 1),
    upper_ci = mean + se * qt(1 - 0.05 / 2, df = count - 1)
  )
# Point = mean, error bar = confidence interval, coloured by species
p1 <- ggplot(
  data = iris_petal_length_summary,
  mapping = aes(x = Species, colour = Species)
) +
  geom_point(aes(y = mean), size = 3) +
  geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci), width = 0.2, size = 1) +
  scale_colour_manual(values = c("darkorange", "purple", "cyan4")) +
  labs(x = "Species", y = "Petal Length / cm") +
  theme_minimal() +
  # Hide the legend and all x-axis annotation for this panel
  theme(
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )
# Petal width: per-species count, mean and sample standard deviation, then
# the standard error and a t-based 95% confidence interval around the mean
iris_petal_width_summary <- iris %>%
  group_by(Species) %>%
  summarise(
    count = n(),
    mean = mean(Petal.Width, na.rm = TRUE),
    ssd = sd(Petal.Width, na.rm = TRUE)
  ) %>%
  mutate(
    se = ssd / sqrt(count),
    lower_ci = mean - se * qt(1 - 0.05 / 2, df = count - 1),
    upper_ci = mean + se * qt(1 - 0.05 / 2, df = count - 1)
  )
# Point = mean, error bar = confidence interval, coloured by species
p2 <- ggplot(
  data = iris_petal_width_summary,
  mapping = aes(x = Species, colour = Species)
) +
  geom_point(aes(y = mean), size = 3) +
  geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci), width = 0.2, size = 1) +
  scale_colour_manual(values = c("darkorange", "purple", "cyan4")) +
  labs(x = "Species", y = "Petal Width / cm") +
  theme_minimal() +
  # Hide the legend and all x-axis annotation for this panel
  theme(
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )
# Sepal length: per-species count, mean and sample standard deviation, then
# the standard error and a t-based 95% confidence interval around the mean
iris_sepal_length_summary <- iris %>%
  group_by(Species) %>%
  summarise(
    count = n(),
    mean = mean(Sepal.Length, na.rm = TRUE),
    ssd = sd(Sepal.Length, na.rm = TRUE)
  ) %>%
  mutate(
    se = ssd / sqrt(count),
    lower_ci = mean - se * qt(1 - 0.05 / 2, df = count - 1),
    upper_ci = mean + se * qt(1 - 0.05 / 2, df = count - 1)
  )
# Point = mean, error bar = confidence interval, coloured by species;
# this panel keeps its x-axis labels
p3 <- ggplot(
  data = iris_sepal_length_summary,
  mapping = aes(x = Species, colour = Species)
) +
  geom_point(aes(y = mean), size = 3) +
  geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci), width = 0.2, size = 1) +
  scale_colour_manual(values = c("darkorange", "purple", "cyan4")) +
  labs(x = "Species", y = "Sepal Length / cm") +
  theme_minimal() +
  theme(legend.position = "none")
# Sepal width: per-species count, mean and sample standard deviation, then
# the standard error and a t-based 95% confidence interval around the mean.
# The result is stored under its own name so that it neither overwrites the
# petal width summary nor gets confused with the sepal length summary.
iris_sepal_width_summary <-
  iris %>%
  group_by(Species) %>%
  summarise(count = n(),
            mean = mean(Sepal.Width, na.rm = TRUE),
            ssd = sd(Sepal.Width, na.rm = TRUE)) %>%
  mutate(se = ssd / sqrt(count),
         lower_ci = mean - qt(1 - (0.05 / 2), count - 1) * se,
         upper_ci = mean + qt(1 - (0.05 / 2), count - 1) * se)
# Plot the sepal width summary (not the sepal length one) so the data shown
# match the "Sepal Width / cm" axis label.
# Point = mean, error bar = confidence interval, coloured by species.
p4 <- iris_sepal_width_summary %>% ggplot() +
  geom_point(aes(x = Species, y = mean,
                 colour = Species),
             size = 3) +
  geom_errorbar(aes(x = Species,
                    ymin = lower_ci,
                    ymax = upper_ci,
                    colour = Species),
                width = 0.2, size = 1) +
  labs(x = "Species", y = "Sepal Width / cm") +
  scale_colour_manual(values = c("darkorange",
                                 "purple",
                                 "cyan4")) +
  theme_minimal() +
  theme(legend.position = "none")
# Lay the four mean/CI panels out in a two-column grid under a shared title
grid.arrange(
  grobs = list(p1, p2, p3, p4),
  ncol = 2,
  top = "Petal and Sepal Dimensions across Species"
)

Correlation between sepal and petal dimensions
# Sepal length against petal length: scatter plus a linear fit with its
# confidence band, coloured by species
p5 <- ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Sepal.Length, colour = Species)
) +
  geom_point(size = 2, alpha = 0.7) +
  geom_smooth(method = "lm", se = TRUE) +
  theme_minimal() +
  labs(
    title = "Correlation between Sepal Length and Petal Length",
    x = "Petal Length / cm",
    y = "Sepal Length / cm"
  )
# Sepal width against petal width: same layers as above
p6 <- ggplot(
  data = iris,
  mapping = aes(x = Petal.Width, y = Sepal.Width, colour = Species)
) +
  geom_point(size = 2, alpha = 0.7) +
  geom_smooth(method = "lm", se = TRUE) +
  theme_minimal() +
  labs(
    title = "Correlation between Sepal Width and Petal Width",
    x = "Petal Width / cm",
    y = "Sepal Width / cm"
  )
# Show both plots together; `/` on plot objects is presumably patchwork's
# vertical stacking operator -- confirm the package is loaded
p5 / p6

# Simple linear regressions of each sepal dimension on the matching petal
# dimension, fitted on all rows of iris (species pooled together)
length_model <- lm(Sepal.Length ~ Petal.Length, data = iris)
length_r_squared <- summary(length_model)$r.squared
width_model <- lm(Sepal.Width ~ Petal.Width, data = iris)
width_r_squared <- summary(width_model)$r.squared
# The reported quantity is the coefficient of determination (R-squared), not
# the correlation coefficient r, so the printed label says so.
# Lines starting with "##" are the corresponding console output.
print(paste("The R-squared value for petal vs sepal length is", round(length_r_squared, 5)))
## [1] "The R-squared value for petal vs sepal length is 0.75995"
print(paste("The R-squared value for petal vs sepal width is", round(width_r_squared, 5)))
## [1] "The R-squared value for petal vs sepal width is 0.13405"