Project 3 PCA

Author

Sophia Tweed

Loading Data and predictors

library(readxl)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(ggfortify)
library(plotly)

Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout
# Read data
data <- read_excel("Project3-Data.xls", sheet = "Barley")

data$Soil_Type <- as.factor(data$Soil_Type)
predictors <- c("Soil_pH", "Temperature", "Humidity", "Wind_Speed", "N", "P", "K")
X <- scale(data[, predictors])

Correlation

pairs(data[,2:8], col = as.integer(data$Crop_Yield))

cor(data[,2:8])
                Soil_pH Temperature    Humidity  Wind_Speed           N
Soil_pH      1.00000000  0.14614622 -0.11645531  0.02087753  0.34684399
Temperature  0.14614622  1.00000000 -0.88489424 -0.16313294 -0.08425913
Humidity    -0.11645531 -0.88489424  1.00000000  0.08580919  0.05136120
Wind_Speed   0.02087753 -0.16313294  0.08580919  1.00000000 -0.13372454
N            0.34684399 -0.08425913  0.05136120 -0.13372454  1.00000000
P            0.34684399 -0.08425913  0.05136120 -0.13372454  1.00000000
K            0.34684399 -0.08425913  0.05136120 -0.13372454  1.00000000
                      P           K
Soil_pH      0.34684399  0.34684399
Temperature -0.08425913 -0.08425913
Humidity     0.05136120  0.05136120
Wind_Speed  -0.13372454 -0.13372454
N            1.00000000  1.00000000
P            1.00000000  1.00000000
K            1.00000000  1.00000000
Corrmatrix <- cor(data[,2:8])

PCA

# Select only numeric predictor columns for PCA (excluding Crop_Yield and Soil_Type)
pca1 <- prcomp(X, scale. = TRUE)
pca2 <- princomp(X, cor = TRUE)
summary(pca1)
Importance of components:
                          PC1    PC2    PC3    PC4     PC5       PC6       PC7
Standard deviation     1.7875 1.3972 1.0057 0.8557 0.32985 2.977e-16 4.715e-32
Proportion of Variance 0.4565 0.2789 0.1445 0.1046 0.01554 0.000e+00 0.000e+00
Cumulative Proportion  0.4565 0.7354 0.8799 0.9845 1.00000 1.000e+00 1.000e+00
summary(pca2)
Importance of components:
                          Comp.1    Comp.2    Comp.3    Comp.4     Comp.5
Standard deviation     1.7875470 1.3972170 1.0057252 0.8556745 0.32984592
Proportion of Variance 0.4564749 0.2788879 0.1444976 0.1045970 0.01554262
Cumulative Proportion  0.4564749 0.7353628 0.8798604 0.9844574 1.00000000
                             Comp.6       Comp.7
Standard deviation     8.669829e-09 5.500858e-09
Proportion of Variance 1.073799e-17 4.322776e-18
Cumulative Proportion  1.000000e+00 1.000000e+00

we dont want todo 8 –> we are only using the first 3 PCA as they explain 87.99% of the variance. the first 3 PCA were retained for fither analysis

Scree Plot

# Scree plot 
screeplot(pca1, type="lines", main = "Scree Plot of PCA")

Loadings

loadings <- pca1$rotation
loadings
                    PC1          PC2         PC3        PC4         PC5
Soil_pH      0.25544030 -0.188632470 -0.48313423 -0.8144540  0.04913339
Temperature -0.05808065 -0.688224834 -0.01786900  0.1086632 -0.71473507
Humidity     0.04511995  0.677946578  0.07449269 -0.2288685 -0.69312530
Wind_Speed  -0.09272889  0.176390016 -0.87127855  0.4424873 -0.07325689
N            0.55399833 -0.003622558  0.02299665  0.1598762 -0.01779920
P            0.55399833 -0.003622558  0.02299665  0.1598762 -0.01779920
K            0.55399833 -0.003622558  0.02299665  0.1598762 -0.01779920
                      PC6           PC7
Soil_pH      0.000000e+00  0.000000e+00
Temperature -3.450188e-15 -4.588216e-17
Humidity    -3.395512e-15 -6.558255e-17
Wind_Speed   2.527902e-16  4.857769e-17
N           -8.164966e-01  1.845026e-16
P            4.082483e-01 -7.071068e-01
K            4.082483e-01  7.071068e-01

PC1 (46% of variance):Dominated by N, P, K (positive loadings ~0.55), Moderate contribution from Soil_pH –> Nutrient avaliablity

PC2 (28%): High positive loading for Temperature (0.69), Strong negative loading for Humidity (-0.68) –> Temp and humidity inversely related

PC3 (14%): Driven mainly by Wind_Speed (0.87), Moderate Soil_pH influence –> wind or air flow

Regression Model

# regression model 
pca_scores <- as.data.frame(pca1$x)
data_pca <- cbind(data, pca_scores)

model <- lm(Crop_Yield ~ PC1 + PC2 + PC3, data = data_pca)
summary(model)

Call:
lm(formula = Crop_Yield ~ PC1 + PC2 + PC3, data = data_pca)

Residuals:
    Min      1Q  Median      3Q     Max 
-54.405 -13.497   1.331  14.911  55.939 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   40.542      2.223  18.238  < 2e-16 ***
PC1            3.479      1.250   2.783  0.00648 ** 
PC2            7.300      1.599   4.565 1.48e-05 ***
PC3            5.682      2.221   2.558  0.01209 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 22.23 on 96 degrees of freedom
Multiple R-squared:  0.2679,    Adjusted R-squared:  0.245 
F-statistic: 11.71 on 3 and 96 DF,  p-value: 1.321e-06

Biplot - PC1 vs PC2 ~ 73%

biplot(pca1, scale = 0,
       cex = 0.7,
       main = "PC1 vs PC2")

3D Biplot - PC1 vs PC2 vs PC3

Based off crop Yield

Based off Soil Texture

plot_ly(pca_scores, 
        x = ~PC1, y = ~PC2, z = ~PC3, 
        color = ~data$Soil_Type, 
        type = "scatter3d", 
        mode = "markers") %>%
  layout(title = "3D PCA Plot Colored by Soil Type",
         scene = list(xaxis = list(title = "PC1"),
                      yaxis = list(title = "PC2"),
                      zaxis = list(title = "PC3")))
plot_ly(pca_scores, 
        x = ~PC1, y = ~PC2, z = ~PC3, 
        color = ~data$Crop_Yield, 
        type = "scatter3d", 
        mode = "markers") %>%
  layout(title = "3D PCA Plot Colored by Crop Yield",
         scene = list(xaxis = list(title = "PC1"),
                      yaxis = list(title = "PC2"),
                      zaxis = list(title = "PC3")))