Final Project

# These are the package and libraries I needed for the motorins data. 
options(repos = list(CRAN = "http://cran.us.r-project.org"))
install.packages("GLMsData")

## 
## The downloaded binary packages are in
##  /var/folders/c6/d2zbrqtx28x8kgkvh9_m0v800000gn/T//Rtmpe4AX3Z/downloaded_packages

install.packages(c("reshape2", "ggplot2"))

## 
## The downloaded binary packages are in
##  /var/folders/c6/d2zbrqtx28x8kgkvh9_m0v800000gn/T//Rtmpe4AX3Z/downloaded_packages

install.packages("Hmisc")

## 
## The downloaded binary packages are in
##  /var/folders/c6/d2zbrqtx28x8kgkvh9_m0v800000gn/T//Rtmpe4AX3Z/downloaded_packages

install.packages(c("tidymodels", "GGally"))

## 
## The downloaded binary packages are in
##  /var/folders/c6/d2zbrqtx28x8kgkvh9_m0v800000gn/T//Rtmpe4AX3Z/downloaded_packages

install.packages("rsample")

## 
## The downloaded binary packages are in
##  /var/folders/c6/d2zbrqtx28x8kgkvh9_m0v800000gn/T//Rtmpe4AX3Z/downloaded_packages

install.packages("caret")

## 
## The downloaded binary packages are in
##  /var/folders/c6/d2zbrqtx28x8kgkvh9_m0v800000gn/T//Rtmpe4AX3Z/downloaded_packages

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.3.0 ──

## ✔ broom        1.0.8     ✔ recipes      1.3.1
## ✔ dials        1.4.0     ✔ rsample      1.3.0
## ✔ dplyr        1.1.4     ✔ tibble       3.3.0
## ✔ ggplot2      3.5.2     ✔ tidyr        1.3.1
## ✔ infer        1.0.9     ✔ tune         1.3.0
## ✔ modeldata    1.4.0     ✔ workflows    1.2.0
## ✔ parsnip      1.3.2     ✔ workflowsets 1.1.1
## ✔ purrr        1.0.4     ✔ yardstick    1.3.2

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ purrr::discard() masks scales::discard()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ recipes::step()  masks stats::step()

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(rsample)
library(dplyr)
library(mlbench)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.4     ✔ stringr   1.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ stringr::fixed()    masks recipes::fixed()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ readr::spec()       masks yardstick::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(reshape2)

## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths

library(Hmisc)

## 
## Attaching package: 'Hmisc'
## 
## The following object is masked from 'package:parsnip':
## 
##     translate
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(GLMsData) 
library(Hmisc)
library(ggplot2)
library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following objects are masked from 'package:yardstick':
## 
##     precision, recall, sensitivity, specificity
## 
## The following object is masked from 'package:purrr':
## 
##     lift

#Data for auto insurance in 1977
data(motorins) 
# There are 2182 observations and 7 variables
dim(motorins)

## [1] 2182    7

head(motorins)

##   Kilometres Zone Bonus Make Insured Claims Payment
## 1          1    1     1    1  455.13    108  392491
## 2          1    1     1    2   69.17     19   46221
## 3          1    1     1    3   72.88     13   15694
## 4          1    1     1    4 1292.39    124  422201
## 5          1    1     1    5  191.01     40  119373
## 6          1    1     1    6  477.66     57  170913

glimpse(motorins)

## Rows: 2,182
## Columns: 7
## $ Kilometres <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Zone       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Bonus      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,…
## $ Make       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2,…
## $ Insured    <dbl> 455.13, 69.17, 72.88, 1292.39, 191.01, 477.66, 105.58, 32.5…
## $ Claims     <int> 108, 19, 13, 124, 40, 57, 23, 14, 1704, 45, 10, 5, 48, 11, …
## $ Payment    <int> 392491, 46221, 15694, 422201, 119373, 170913, 56940, 77487,…

describe(motorins)

## motorins 
## 
##  7  Variables      2182  Observations
## --------------------------------------------------------------------------------
## Kilometres 
##        n  missing distinct     Info     Mean  pMedian      Gmd 
##     2182        0        5     0.96    2.986        3    1.596 
##                                         
## Value          1     2     3     4     5
## Frequency    439   441   441   434   427
## Proportion 0.201 0.202 0.202 0.199 0.196
## 
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## Zone 
##        n  missing distinct     Info     Mean  pMedian      Gmd 
##     2182        0        7     0.98     3.97        4    2.273 
##                                                     
## Value          1     2     3     4     5     6     7
## Frequency    315   315   315   315   313   315   294
## Proportion 0.144 0.144 0.144 0.144 0.143 0.144 0.135
## 
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## Bonus 
##        n  missing distinct     Info     Mean  pMedian      Gmd 
##     2182        0        7     0.98    4.015        4    2.287 
##                                                     
## Value          1     2     3     4     5     6     7
## Frequency    307   312   310   310   313   315   315
## Proportion 0.141 0.143 0.142 0.142 0.143 0.144 0.144
## 
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## Make 
##        n  missing distinct     Info     Mean  pMedian      Gmd 
##     2182        0        9    0.988    4.992        5    2.969 
##                                                                 
## Value          1     2     3     4     5     6     7     8     9
## Frequency    245   245   242   238   244   244   242   237   245
## Proportion 0.112 0.112 0.111 0.109 0.112 0.112 0.111 0.109 0.112
## 
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## Insured 
##        n  missing distinct     Info     Mean  pMedian      Gmd      .05 
##     2182        0     2070        1     1092      173     1932    2.311 
##      .10      .25      .50      .75      .90      .95 
##    5.701   21.610   81.525  389.783 1688.186 4711.983 
## 
## lowest : 0.01    0.03    0.04    0.08    0.09   
## highest: 60140.2 65617.5 79615   121293  127687 
## --------------------------------------------------------------------------------
## Claims 
##        n  missing distinct     Info     Mean  pMedian      Gmd      .05 
##     2182        0      256    0.993    51.87       10     89.6      0.0 
##      .10      .25      .50      .75      .90      .95 
##      0.0      1.0      5.0     21.0     87.9    233.9 
## 
## lowest :    0    1    2    3    4, highest: 2087 2127 2548 2894 3338
## --------------------------------------------------------------------------------
## Payment 
##        n  missing distinct     Info     Mean  pMedian      Gmd      .05 
##     2182        0     1752    0.995   257008    51132   445992        0 
##      .10      .25      .50      .75      .90      .95 
##        0     2989    27404   111954   439778  1102675 
## 
## lowest :        0       72      148      303      309
## highest: 10315455 10518198 13203616 15540162 18245026
## --------------------------------------------------------------------------------

# View the description, structure and summary
?motorins

# Description Provided by R
# The data give details of third-party motor insurance claims in Sweden for the year 1977.

# Format
# A data frame with 2182 observations on the following 7 variables
# 
# Kilometres
# the number of kilometres travelled per year; a numeric vector with levels 1 (less than 10000), 2 (from 10000 to 15000), 3 (15000 to 20000), 4 (20000 to 25000) or 5 (more than 25000)
# 
# Zone
# geographical zone (only in motorins); a numeric vector with levels 1 to 7 ()see Details below)
# 
# Bonus
# no claim bonus; a numeric vector equal to the number of years plus one since the last claim
# 
# Make
# the make of vehicle; a numeric vector with levels from 1 to 8 representing eight common cart models, and 9 representing all other models
# 
# Insured
# the number of insured in policy-years; a numeric vector
# 
# Claims
# the number of claims; a numeric vector
# 
# Payment
# the total value of payments in Skoner; a numeric vector
# 
# Details
# For variable Zone, the geographical zones are:
# 
# 1 Stockholm, Goteborg, Malmo with surroundings
# 2 Other large cities and surroundings
# 3 Small cities in northern Sweden
# 4 Small cities in southern Sweden
# 5 Rural areas in northern Sweden
# 6 Rural areas in southern Sweden
# 7 Gotland
# 
# “In Sweden all motor insurance companies apply identical risk arguments to classify customers, and thus their portfolios and their claims statistics can be combined. The data were compiled by a Swedish Committee on the Analysis of Risk Premium in Motor Insurance. The Committee was asked to look into the problem of analyzing the real influence on claims of the risk arguments and to compare this structure with the actual tariff” (Andrews and Herzberg (1985), p. 413).
# 
# Make 4 is the Volkswagen 1200, which was discontinued shortly after 1977. The other makes could not be identified because of the potential for the data to impact on sales of those cars.

str(motorins)

## 'data.frame':    2182 obs. of  7 variables:
##  $ Kilometres: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Zone      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Bonus     : int  1 1 1 1 1 1 1 1 1 2 ...
##  $ Make      : int  1 2 3 4 5 6 7 8 9 1 ...
##  $ Insured   : num  455.1 69.2 72.9 1292.4 191 ...
##  $ Claims    : int  108 19 13 124 40 57 23 14 1704 45 ...
##  $ Payment   : int  392491 46221 15694 422201 119373 170913 56940 77487 6805992 214011 ...

summary(motorins)

##    Kilometres         Zone          Bonus            Make      
##  Min.   :1.000   Min.   :1.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:2.00   1st Qu.:2.000   1st Qu.:3.000  
##  Median :3.000   Median :4.00   Median :4.000   Median :5.000  
##  Mean   :2.986   Mean   :3.97   Mean   :4.015   Mean   :4.992  
##  3rd Qu.:4.000   3rd Qu.:6.00   3rd Qu.:6.000   3rd Qu.:7.000  
##  Max.   :5.000   Max.   :7.00   Max.   :7.000   Max.   :9.000  
##     Insured              Claims           Payment        
##  Min.   :     0.01   Min.   :   0.00   Min.   :       0  
##  1st Qu.:    21.61   1st Qu.:   1.00   1st Qu.:    2989  
##  Median :    81.53   Median :   5.00   Median :   27404  
##  Mean   :  1092.19   Mean   :  51.87   Mean   :  257008  
##  3rd Qu.:   389.78   3rd Qu.:  21.00   3rd Qu.:  111954  
##  Max.   :127687.27   Max.   :3338.00   Max.   :18245026

# This will show the factor variables (Kilometres, Zone, Make)
sapply(motorins, is.factor)

## Kilometres       Zone      Bonus       Make    Insured     Claims    Payment 
##      FALSE      FALSE      FALSE      FALSE      FALSE      FALSE      FALSE

# Display in a table with the proper labels
Zone_labels <- c(
  "1" = "Stockholm, Göteborg, Malmö & surroundings",
  "2" = "Other large cities & surroundings",
  "3" = "Small cities (northern Sweden)",
  "4" = "Small cities (southern Sweden)",
  "5" = "Rural areas (northern Sweden)",
  "6" = "Rural areas (southern Sweden)",
  "7" = "Gotland"
)

# Convert numeric Zone to labeled factor
motorins$ZoneLabel <- factor(as.character(motorins$Zone),
                             levels = names(Zone_labels),
                             labels = Zone_labels)
table(motorins$ZoneLabel)

## 
## Stockholm, Göteborg, Malmö & surroundings 
##                                       315 
##         Other large cities & surroundings 
##                                       315 
##            Small cities (northern Sweden) 
##                                       315 
##            Small cities (southern Sweden) 
##                                       315 
##             Rural areas (northern Sweden) 
##                                       313 
##             Rural areas (southern Sweden) 
##                                       315 
##                                   Gotland 
##                                       294

# This will plot claims vs insured after making numeric

motor_raw <- motorins %>%
  mutate(
    Kilometres = as.numeric(as.character(Kilometres)),
    Zone       = as.numeric(as.character(Zone)),
    Make       = as.numeric(as.character(Make))
  )

ggplot(motor_raw, aes(x = Insured, y = Claims)) +
  geom_point(alpha = 0.5, color = "blue") +
  geom_smooth(method = "lm", color = "darkblue", se = FALSE) +
  labs(
    title = "Claims vs Insured (Number of Policies)",
    x = "Insured (policy-years)",
    y = "Number of Claims"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

#This will plot payment vs insured
ggplot(motor_raw, aes(x = Insured, y = Payment)) +
  geom_point(alpha = 0.5, color = "darkgreen") +
  geom_smooth(method = "lm", color = "forestgreen", se = FALSE) +
  labs(
    title = "Payment vs Insured (Total Claim Costs)",
    x = "Insured (policy-years)",
    y = "Payment (SEK)"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

#Summary information for the payment variable
x <- motorins$Payment  

mean(x, na.rm = TRUE)

## [1] 257007.6

median(x, na.rm = TRUE)

## [1] 27403.5

min(x, na.rm = TRUE)

## [1] 0

max(x, na.rm = TRUE)

## [1] 18245026

quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)

##       25%       75% 
##   2988.75 111953.75

summary(x)  # gives Min, 1st Qu., Median, Mean, 3rd Qu., Max

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##        0     2989    27404   257008   111954 18245026

# This chunk is to find the correlation matrix to answer the first assignment question.

# Find numeric columns in motorins
numeric_vars <- sapply(motorins, is.numeric)

# Compute correlation matrix
cm <- cor(motorins[, numeric_vars], use = "pairwise.complete.obs")

# Create a matrix of the output.
cm <- as.matrix(cm)

# Remove diagonal and duplicates. Set diagonal to 0 to ignore self-correlation, which will always return 1.0
diag(cm) <- 0
cm[upper.tri(cm)] <- NA

# I am isolating the variables with correlation absolute value greater than 0.5.
filtered <- which(abs(cm) > 0.5, arr.ind = TRUE)

# 4. List variable pairs and their correlations
result <- data.frame(
  var1 = rownames(cm)[filtered[, 1]],
  var2 = colnames(cm)[filtered[, 2]],
  cor  = cm[filtered]
)
result

##      var1    var2       cor
## 1  Claims Insured 0.9103478
## 2 Payment Insured 0.9332170
## 3 Payment  Claims 0.9954003

# I don't want duplicates, only the unique pairs.
library(reshape2)
cm[upper.tri(cm, diag = TRUE)] <- NA

# This converts to long format, then filters by threshold
df <- melt(cm, na.rm = TRUE)
colnames(df) <- c("var1", "var2", "correlation")

strong_pairs <- subset(df, abs(correlation) > 0.5)
strong_pairs

##       var1    var2 correlation
## 34  Claims Insured   0.9103478
## 35 Payment Insured   0.9332170
## 42 Payment  Claims   0.9954003

# This starts the second part of the assignment: preprocessing and visualization.
# install.packages(c("tidymodels", "GGally"))
# install.packages("rsample")
# library(tidymodels)
# library(GGally)
# library(rsample)

# I'm starting the process by separating the data into two parts, train and test. It specifies 75% used to train and 25% used to test. We didn't do this in class but the code is straightforward enough for me to understand.
set.seed(2025)
data_split <- initial_split(motorins, prop = 0.75)

# Name the new data sets.
train_data <- training(data_split)
test_data  <- testing(data_split)

# Check for new dimensions.
dim(train_data)

## [1] 1636    8

dim(test_data)

## [1] 546   8

# This "recipe" is predicting payment using all the other data
# The 2nd line is marking the variables as categorical
# The 3rd is making the categories binary (yes or no, 0 or 1)
# The 4th removes columns that don't vary (zero variance)
# The 5th line removes values that are highly correlated
# The last line normalizes by centering and scaling
motor_rec <- recipe(Payment ~ ., data = train_data) %>%
  update_role(Zone, Make, Kilometres, new_role = "nominal") %>%  
  step_dummy(all_nominal_predictors()) %>%                       
  step_zv(all_predictors()) %>%                                  
  step_corr(all_numeric_predictors(), threshold = 0.8) %>%       
  step_normalize(all_numeric_predictors())                     

motor_rec

##

## ── Recipe ──────────────────────────────────────────────────────────────────────

##

## ── Inputs

## Number of variables by role

## outcome:   1
## predictor: 4
## nominal:   3

##

## ── Operations

## • Dummy variables from: all_nominal_predictors()

## • Zero variance filter on: all_predictors()

## • Correlation filter on: all_numeric_predictors()

## • Centering and scaling for: all_numeric_predictors()

library(GGally)
# The "prep" function prepares the new data for the recipe to be applied to in the "bake" function. It makes the actual updates to the data whereas the recipe just provides the plan.
motor_rec_prep <- prep(motor_rec, training = train_data)

# The "bake" function now applies the recipe transforms to both training and test sets. It executes the plan.
train_clean <- bake(motor_rec_prep, new_data = train_data)
test_clean  <- bake(motor_rec_prep, new_data = test_data)

# View the outcome.
dim(train_clean)

## [1] 1636   12

dim(test_clean)

## [1] 546  12

glimpse(train_clean)

## Rows: 1,636
## Columns: 12
## $ Kilometres                                  <int> 3, 5, 1, 4, 1, 3, 2, 3, 5,…
## $ Zone                                        <int> 1, 4, 5, 4, 3, 7, 1, 2, 3,…
## $ Bonus                                       <dbl> 0.98914282, 0.48767469, -0…
## $ Make                                        <int> 7, 5, 9, 6, 7, 5, 4, 2, 7,…
## $ Claims                                      <dbl> -0.18113922, -0.26009655, …
## $ Payment                                     <int> 90830, 4525, 481752, 13911…
## $ ZoneLabel_Other.large.cities...surroundings <dbl> -0.4114647, -0.4114647, -0…
## $ ZoneLabel_Small.cities..northern.Sweden.    <dbl> -0.4012589, -0.4012589, -0…
## $ ZoneLabel_Small.cities..southern.Sweden.    <dbl> -0.4155169, 2.4051702, -0.…
## $ ZoneLabel_Rural.areas..northern.Sweden.     <dbl> -0.3961129, -0.3961129, 2.…
## $ ZoneLabel_Rural.areas..southern.Sweden.     <dbl> -0.4365405, -0.4365405, -0…
## $ ZoneLabel_Gotland                           <dbl> -0.3961129, -0.3961129, -0…

# Plot the pairwise relationships among numeric predictors
GGally::ggpairs(
  select(train_clean, where(is.numeric)),
  title = "Pairwise Relationships of Numeric Features (Post-Processing)"
)

ggplot(train_clean, aes(
  x = factor(Kilometres), 
  y = Payment, 
  fill = factor(Kilometres)
)) +
  geom_boxplot(alpha = 0.7) +
  labs(
    x = "Kilometres Category",
    y = "Scaled Payment",
    title = "Post-Processed Payment by Usage"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# Now start for the second dataset
# This dataset we've used and preprocessed before in a previous homework.

data(iris)
dim(iris)

## [1] 150   5

# Summary statistics
summary(iris[, 1:4])

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500

# Compute means, medians, SDs
sapply(iris[, 1:4], mean)

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##     5.843333     3.057333     3.758000     1.199333

sapply(iris[, 1:4], median)

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##         5.80         3.00         4.35         1.30

sapply(iris[, 1:4], sd)

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##    0.8280661    0.4358663    1.7652982    0.7622377

# a) Scatterplot matrix with species coloring
GGally::ggpairs(iris, aes(color = Species, alpha = 0.5))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# b) Scatterplot: Petal.Length vs Petal.Width
ggplot(iris, aes(x = Petal.Width, y = Petal.Length, color = Species)) +
  geom_point(size = 2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Petal Length vs Width by Species",
    x = "Petal Width",
    y = "Petal Length"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

# Preprocess (center, scale, PCA)
preProc <- preProcess(iris[, 1:4], method = c("center", "scale", "pca"), thresh = 0.95)
print(preProc)   # shows mean, SD, PCs used

## Created from 150 samples and 4 variables
## 
## Pre-processing:
##   - centered (4)
##   - ignored (0)
##   - principal component signal extraction (4)
##   - scaled (4)
## 
## PCA needed 2 components to capture 95 percent of the variance

iris_pca <- predict(preProc, iris[, 1:4])
summary(iris_pca)

##       PC1               PC2          
##  Min.   :-2.7651   Min.   :-2.67732  
##  1st Qu.:-2.0957   1st Qu.:-0.59205  
##  Median : 0.4169   Median :-0.01744  
##  Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 1.3385   3rd Qu.: 0.59649  
##  Max.   : 3.2996   Max.   : 2.64521

# Verify scaled mean ≈ 0 and sd ≈ 1
apply(iris_pca, 2, mean)

##           PC1           PC2 
## -9.399888e-17 -7.736867e-17

apply(iris_pca, 2, sd)

##       PC1       PC2 
## 1.7083611 0.9560494

# PCA plot with ggplot2 (first 2 components)
iris_pca_df <- data.frame(iris_pca[, 1:2], Species = iris$Species)

# Calculate variance percentages
preProc <- preProcess(iris[, 1:4], method = c("center", "scale", "pca"), thresh = 0.95)

# This line finds the standard deviations of PC’s
stds <- preProc$std

# Compute variance-percentage for the first two components
vars <- round(100 * stds[1:2]^2 / sum(stds^2), 1)

print(vars)

## Sepal.Length  Sepal.Width 
##         15.0          4.2

ggplot(iris_pca_df, aes(x = PC1, y = PC2, color = Species, shape = Species)) +
  geom_point(size = 2.5) +
  labs(
    title = "PCA on Iris (Scaled, Centered)",
    x = paste0("PC1 (", vars[1], "%)"),
    y = paste0("PC2 (", vars[2], "%)")
  ) +
  theme_minimal()

Final Project

LLJ

2025-07-07