Extended Practical - Linear Models

Introduction

The aim of this practical is to investigate a dataset on iris flower morphology and environmental characteristics, to see if we can produce a linear model to predict the petal area of an iris flower using other variables.

This dataset is an extension of the classical iris dataset and contains the following information on 1200 plant samples:

  • Species (Setosa, Versicolor, Virginica)
  • Elevation level where the iris was found (m)
  • Soil type (Loamy, Sandy, Clay)
  • Sepal length (cm)
  • Sepal width (cm)
  • Petal length (cm)
  • Petal width (cm)
  • Sepal area (length × width)
  • Petal area (length × width)

Load packages

# Install only if needed:
# install.packages("ggplot2")
# install.packages("ggpubr")

library(ggplot2)
library(ggpubr)

Load dataset

setwd("~/Downloads")
iris <- read.csv("Iris.csv")

head(iris)
##   species elevation soil_type sepal_length sepal_width petal_length petal_width
## 1  setosa     161.8     sandy         5.16        3.41         1.64        0.26
## 2  setosa     291.4      clay         5.48        4.05         1.53        0.37
## 3  setosa     144.3     sandy         5.10        2.80         1.47        0.38
## 4  setosa     114.6      clay         4.64        3.44         1.53        0.17
## 5  setosa     110.9     loamy         4.85        2.87         1.23        0.26
## 6  setosa     164.4      clay         4.91        3.19         1.86        0.35
##   sepal_area petal_area
## 1    17.5956     0.4264
## 2    22.1940     0.5661
## 3    14.2800     0.5586
## 4    15.9616     0.2601
## 5    13.9195     0.3198
## 6    15.6629     0.6510
summary(iris)
##    species            elevation      soil_type          sepal_length  
##  Length:1200        Min.   : 50.1   Length:1200        Min.   :4.040  
##  Class :character   1st Qu.:110.9   Class :character   1st Qu.:5.210  
##  Mode  :character   Median :171.3   Mode  :character   Median :5.840  
##                     Mean   :173.5                      Mean   :5.896  
##                     3rd Qu.:239.1                      3rd Qu.:6.520  
##                     Max.   :299.9                      Max.   :9.120  
##   sepal_width     petal_length    petal_width     sepal_area    
##  Min.   :1.700   Min.   :0.960   Min.   :0.05   Min.   : 9.048  
##  1st Qu.:2.690   1st Qu.:1.640   1st Qu.:0.36   1st Qu.:15.555  
##  Median :3.030   Median :4.300   Median :1.29   Median :17.646  
##  Mean   :3.053   Mean   :3.808   Mean   :1.19   Mean   :17.871  
##  3rd Qu.:3.390   3rd Qu.:5.200   3rd Qu.:1.74   3rd Qu.:20.007  
##  Max.   :4.760   Max.   :7.840   Max.   :2.99   Max.   :29.723  
##    petal_area     
##  Min.   : 0.0860  
##  1st Qu.: 0.5658  
##  Median : 5.5254  
##  Mean   : 5.6891  
##  3rd Qu.: 9.2567  
##  Max.   :23.1280

Split into training and testing sets

set.seed(123)
train_data <- sample(seq_len(nrow(iris)), size = 600)

iris_train <- iris[train_data, ]
iris_test  <- iris[-train_data, ]

head(iris_train)
##        species elevation soil_type sepal_length sepal_width petal_length
## 415 versicolor     291.5     sandy         5.40        2.73         4.25
## 463 versicolor     291.8      clay         5.52        3.52         4.54
## 179     setosa     259.1     sandy         4.92        3.18         1.37
## 526 versicolor      79.2     loamy         5.80        3.18         4.74
## 195     setosa     197.7     sandy         4.86        3.01         1.38
## 938  virginica      69.6      clay         6.43        3.40         6.52
##     petal_width sepal_area petal_area
## 415        1.34    14.7420     5.6950
## 463        1.46    19.4304     6.6284
## 179        0.32    15.6456     0.4384
## 526        1.13    18.4440     5.3562
## 195        0.28    14.6286     0.3864
## 938        1.76    21.8620    11.4752
head(iris_test)
##   species elevation soil_type sepal_length sepal_width petal_length petal_width
## 1  setosa     161.8     sandy         5.16        3.41         1.64        0.26
## 2  setosa     291.4      clay         5.48        4.05         1.53        0.37
## 3  setosa     144.3     sandy         5.10        2.80         1.47        0.38
## 4  setosa     114.6      clay         4.64        3.44         1.53        0.17
## 5  setosa     110.9     loamy         4.85        2.87         1.23        0.26
## 7  setosa     239.9      clay         4.47        3.91         1.42        0.41
##   sepal_area petal_area
## 1    17.5956     0.4264
## 2    22.1940     0.5661
## 3    14.2800     0.5586
## 4    15.9616     0.2601
## 5    13.9195     0.3198
## 7    17.4777     0.5822

Check if the response variable is normally distributed

qqnorm(iris_train$petal_area)
qqline(iris_train$petal_area)

shapiro.test(iris_train$petal_area)
## 
##  Shapiro-Wilk normality test
## 
## data:  iris_train$petal_area
## W = 0.90845, p-value < 2.2e-16

Build the linear model

The model will use the main effects of elevation, soil type, and sepal area.

linear_model <- lm(petal_area ~ elevation + soil_type + sepal_area,
                   data = iris_train)

summary(linear_model)
## 
## Call:
## lm(formula = petal_area ~ elevation + soil_type + sepal_area, 
##     data = iris_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.0031 -3.9735  0.1302  3.1612 11.4461 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -4.884310   1.400853  -3.487 0.000525 ***
## elevation       0.001716   0.002481   0.691 0.489546    
## soil_typeloamy  0.911265   0.429380   2.122 0.034227 *  
## soil_typesandy  1.409856   0.503244   2.802 0.005251 ** 
## sepal_area      0.530427   0.062662   8.465  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.276 on 595 degrees of freedom
## Multiple R-squared:  0.1158, Adjusted R-squared:  0.1098 
## F-statistic: 19.47 on 4 and 595 DF,  p-value: 4.517e-15

Linear model diagnostic plots

par(mfrow = c(2,2))
plot(linear_model)


Scatterplot with regression line

ggscatter(
  data = iris_train,
  x = "sepal_area",
  y = "petal_area",
  color = "species",
  shape = "species",
  size = 1.5,
  add = "reg.line",
  conf.int = TRUE,
  cor.coef = TRUE,
  cor.method = "pearson",
  xlab = "Sepal area (cm^2)",
  ylab = "Petal area (cm^2)"
) +
  scale_colour_manual(values = c("orange", "steelblue", "forestgreen")) +
  scale_fill_manual(values = c("orange", "steelblue", "forestgreen")) +
  labs_pubr()


Make predictions on the test set

predictions <- predict(linear_model, newdata = iris_test)

plot(iris_test$petal_area, predictions,
     xlab = "Actual Petal Area",
     ylab = "Predicted Petal Area")
abline(lm(predictions ~ iris_test$petal_area), col="red")

cor.test(iris_test$petal_area, predictions)
## 
##  Pearson's product-moment correlation
## 
## data:  iris_test$petal_area and predictions
## t = 9.4233, df = 598, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2878135 0.4273194
## sample estimates:
##       cor 
## 0.3595741

RMSE Calculation

# install.packages("Metrics")
library(Metrics)

rmse(iris_test$petal_area, predictions)
## [1] 4.327137

Actual vs Predicted Plot (ggplot)

ggplot(data = iris_test,
       aes(x = petal_area, y = predictions, color = species)) +
  geom_point() +
  theme_pubr() +
  labs_pubr() +
  labs(
    title = "Actual Petal Area vs. Predicted Petal Area",
    x = "Actual Petal Area",
    y = "Predicted Petal Area",
    color = "Species"
  )