The aim of this practical is to investigate a dataset on iris flower morphology and environmental characteristics, to see if we can produce a linear model to predict the petal area of an iris flower using other variables.
This dataset is an extension of the classical iris dataset and contains the following information on 1200 plant samples:
# Install only if needed:
# install.packages("ggplot2")
# install.packages("ggpubr")
library(ggplot2)
library(ggpubr)
setwd("~/Downloads")
iris <- read.csv("Iris.csv")
head(iris)
## species elevation soil_type sepal_length sepal_width petal_length petal_width
## 1 setosa 161.8 sandy 5.16 3.41 1.64 0.26
## 2 setosa 291.4 clay 5.48 4.05 1.53 0.37
## 3 setosa 144.3 sandy 5.10 2.80 1.47 0.38
## 4 setosa 114.6 clay 4.64 3.44 1.53 0.17
## 5 setosa 110.9 loamy 4.85 2.87 1.23 0.26
## 6 setosa 164.4 clay 4.91 3.19 1.86 0.35
## sepal_area petal_area
## 1 17.5956 0.4264
## 2 22.1940 0.5661
## 3 14.2800 0.5586
## 4 15.9616 0.2601
## 5 13.9195 0.3198
## 6 15.6629 0.6510
summary(iris)
## species elevation soil_type sepal_length
## Length:1200 Min. : 50.1 Length:1200 Min. :4.040
## Class :character 1st Qu.:110.9 Class :character 1st Qu.:5.210
## Mode :character Median :171.3 Mode :character Median :5.840
## Mean :173.5 Mean :5.896
## 3rd Qu.:239.1 3rd Qu.:6.520
## Max. :299.9 Max. :9.120
## sepal_width petal_length petal_width sepal_area
## Min. :1.700 Min. :0.960 Min. :0.05 Min. : 9.048
## 1st Qu.:2.690 1st Qu.:1.640 1st Qu.:0.36 1st Qu.:15.555
## Median :3.030 Median :4.300 Median :1.29 Median :17.646
## Mean :3.053 Mean :3.808 Mean :1.19 Mean :17.871
## 3rd Qu.:3.390 3rd Qu.:5.200 3rd Qu.:1.74 3rd Qu.:20.007
## Max. :4.760 Max. :7.840 Max. :2.99 Max. :29.723
## petal_area
## Min. : 0.0860
## 1st Qu.: 0.5658
## Median : 5.5254
## Mean : 5.6891
## 3rd Qu.: 9.2567
## Max. :23.1280
set.seed(123)
train_data <- sample(seq_len(nrow(iris)), size = 600)
iris_train <- iris[train_data, ]
iris_test <- iris[-train_data, ]
head(iris_train)
## species elevation soil_type sepal_length sepal_width petal_length
## 415 versicolor 291.5 sandy 5.40 2.73 4.25
## 463 versicolor 291.8 clay 5.52 3.52 4.54
## 179 setosa 259.1 sandy 4.92 3.18 1.37
## 526 versicolor 79.2 loamy 5.80 3.18 4.74
## 195 setosa 197.7 sandy 4.86 3.01 1.38
## 938 virginica 69.6 clay 6.43 3.40 6.52
## petal_width sepal_area petal_area
## 415 1.34 14.7420 5.6950
## 463 1.46 19.4304 6.6284
## 179 0.32 15.6456 0.4384
## 526 1.13 18.4440 5.3562
## 195 0.28 14.6286 0.3864
## 938 1.76 21.8620 11.4752
head(iris_test)
## species elevation soil_type sepal_length sepal_width petal_length petal_width
## 1 setosa 161.8 sandy 5.16 3.41 1.64 0.26
## 2 setosa 291.4 clay 5.48 4.05 1.53 0.37
## 3 setosa 144.3 sandy 5.10 2.80 1.47 0.38
## 4 setosa 114.6 clay 4.64 3.44 1.53 0.17
## 5 setosa 110.9 loamy 4.85 2.87 1.23 0.26
## 7 setosa 239.9 clay 4.47 3.91 1.42 0.41
## sepal_area petal_area
## 1 17.5956 0.4264
## 2 22.1940 0.5661
## 3 14.2800 0.5586
## 4 15.9616 0.2601
## 5 13.9195 0.3198
## 7 17.4777 0.5822
qqnorm(iris_train$petal_area)
qqline(iris_train$petal_area)
shapiro.test(iris_train$petal_area)
##
## Shapiro-Wilk normality test
##
## data: iris_train$petal_area
## W = 0.90845, p-value < 2.2e-16
The model will use the main effects of elevation, soil type, and sepal area.
linear_model <- lm(petal_area ~ elevation + soil_type + sepal_area,
data = iris_train)
summary(linear_model)
##
## Call:
## lm(formula = petal_area ~ elevation + soil_type + sepal_area,
## data = iris_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.0031 -3.9735 0.1302 3.1612 11.4461
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.884310 1.400853 -3.487 0.000525 ***
## elevation 0.001716 0.002481 0.691 0.489546
## soil_typeloamy 0.911265 0.429380 2.122 0.034227 *
## soil_typesandy 1.409856 0.503244 2.802 0.005251 **
## sepal_area 0.530427 0.062662 8.465 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.276 on 595 degrees of freedom
## Multiple R-squared: 0.1158, Adjusted R-squared: 0.1098
## F-statistic: 19.47 on 4 and 595 DF, p-value: 4.517e-15
par(mfrow = c(2,2))
plot(linear_model)
ggscatter(
data = iris_train,
x = "sepal_area",
y = "petal_area",
color = "species",
shape = "species",
size = 1.5,
add = "reg.line",
conf.int = TRUE,
cor.coef = TRUE,
cor.method = "pearson",
xlab = "Sepal area (cm^2)",
ylab = "Petal area (cm^2)"
) +
scale_colour_manual(values = c("orange", "steelblue", "forestgreen")) +
scale_fill_manual(values = c("orange", "steelblue", "forestgreen")) +
labs_pubr()
predictions <- predict(linear_model, newdata = iris_test)
plot(iris_test$petal_area, predictions,
xlab = "Actual Petal Area",
ylab = "Predicted Petal Area")
abline(lm(predictions ~ iris_test$petal_area), col="red")
cor.test(iris_test$petal_area, predictions)
##
## Pearson's product-moment correlation
##
## data: iris_test$petal_area and predictions
## t = 9.4233, df = 598, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2878135 0.4273194
## sample estimates:
## cor
## 0.3595741
# install.packages("Metrics")
library(Metrics)
rmse(iris_test$petal_area, predictions)
## [1] 4.327137
ggplot(data = iris_test,
aes(x = petal_area, y = predictions, color = species)) +
geom_point() +
theme_pubr() +
labs_pubr() +
labs(
title = "Actual Petal Area vs. Predicted Petal Area",
x = "Actual Petal Area",
y = "Predicted Petal Area",
color = "Species"
)