Discussion: Principal Component Analysis

Setup

Empty variables and functions in the environment tab/window

First, empty the environment so that we can load the data into a clean workspace.

# Clear the workspace
  rm(list = ls()) # Clear environment
  gc()            # Clear unused memory
          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells  597869 32.0    1358077 72.6         NA   700240 37.4
Vcells 1105861  8.5    8388608 64.0      49152  1963441 15.0
  cat("\f")       # Clear the console
  graphics.off()  # Clear the charts (the parentheses matter; without them R just prints the function body)

Load packages

Now, we load the packages.

# Prepare needed libraries
packages <- c("visdat",
              "psych",
              "tidyverse",
              "ggplot2",
              "ggcorrplot",
              "stargazer", 
              "stats",
              "factoextra", 
              "FactoMineR"
              )

  for (i in seq_along(packages)) {
    if (!packages[i] %in% rownames(installed.packages())) {
      install.packages(packages[i]
                       , repos = "http://cran.rstudio.com/"
                       , dependencies = TRUE
                       )
    }
    library(packages[i], character.only = TRUE)
  }
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ ggplot2::%+%()   masks psych::%+%()
✖ ggplot2::alpha() masks psych::alpha()
✖ dplyr::filter()  masks stats::filter()
✖ dplyr::lag()     masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Please cite as: 

 Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
 R package version 5.2.3. https://CRAN.R-project.org/package=stargazer 
Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
rm(packages)

Load raw data

# Set working directory and path to data
cd <- "/Users/arvindsharma/Dropbox/WCAS/Econometrics/" ### CHANGE !!!
setwd(cd)

df_train    <- read.csv("moneyball-training-data.csv")
df_eval     <- read.csv("moneyball-evaluation-data.csv")

str(df_train)
'data.frame':   2276 obs. of  17 variables:
 $ INDEX           : int  1 2 3 4 5 6 7 8 11 12 ...
 $ TARGET_WINS     : int  39 70 86 70 82 75 80 85 86 76 ...
 $ TEAM_BATTING_H  : int  1445 1339 1377 1387 1297 1279 1244 1273 1391 1271 ...
 $ TEAM_BATTING_2B : int  194 219 232 209 186 200 179 171 197 213 ...
 $ TEAM_BATTING_3B : int  39 22 35 38 27 36 54 37 40 18 ...
 $ TEAM_BATTING_HR : int  13 190 137 96 102 92 122 115 114 96 ...
 $ TEAM_BATTING_BB : int  143 685 602 451 472 443 525 456 447 441 ...
 $ TEAM_BATTING_SO : int  842 1075 917 922 920 973 1062 1027 922 827 ...
 $ TEAM_BASERUN_SB : int  NA 37 46 43 49 107 80 40 69 72 ...
 $ TEAM_BASERUN_CS : int  NA 28 27 30 39 59 54 36 27 34 ...
 $ TEAM_BATTING_HBP: int  NA NA NA NA NA NA NA NA NA NA ...
 $ TEAM_PITCHING_H : int  9364 1347 1377 1396 1297 1279 1244 1281 1391 1271 ...
 $ TEAM_PITCHING_HR: int  84 191 137 97 102 92 122 116 114 96 ...
 $ TEAM_PITCHING_BB: int  927 689 602 454 472 443 525 459 447 441 ...
 $ TEAM_PITCHING_SO: int  5456 1082 917 928 920 973 1062 1033 922 827 ...
 $ TEAM_FIELDING_E : int  1011 193 175 164 138 123 136 112 127 131 ...
 $ TEAM_FIELDING_DP: int  NA 155 153 156 168 149 186 136 169 159 ...

Missing Observations

vis_dat(df_train)

vis_miss(df_train)
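
As a quick numeric complement to these plots, you can count the missing values in each column directly (a one-liner sketch, not part of the original output):

# Count missing values per column
colSums(is.na(df_train))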

Correlation Plot

  • cor(): computes the correlation of x and y if these are vectors; given a data frame or matrix (the entire dataset in our example below), it returns the correlation matrix of all columns.

  • cor_pmat(): computes a matrix of correlation p-values, which tell us whether each correlation coefficient is significantly different from 0.

?cor
mycorr <- cor(x = df_train[ , 1:ncol(df_train)], # all columns
              use = "pairwise.complete.obs"
              )

p.mat <- ggcorrplot::cor_pmat(x = df_train[,1:ncol(df_train)])

Now, let's plot it.

myplot <- ggcorrplot(corr     = mycorr,    # correlation matrix to visualize
                     method   = "square",  # "square" (default) or "circle"
                     type     = "lower",   # "full" (default), "lower", or "upper" display
                     title    = "Correlation Plot",
                     colors   = c("red", "white", "green"), # low, mid & high correlation values
                     lab      = TRUE,      # add corr coefficients on the plot
                     lab_size = 2,         # label size; used when lab = TRUE
                     p.mat    = p.mat,     # matrix of p-values; if NULL, sig.level, insig, pch, pch.col, pch.cex are ignored
                     insig    = "pch",     # insignificant coefficients: "pch" (default) marks them with a character, "blank" wipes the glyph
                     pch      = 4,         # character drawn on insignificant glyphs (only when insig = "pch")
                     hc.order = FALSE,     # if TRUE, reorder the matrix using hclust
                     tl.cex   = 8,         # size of the text labels
                     tl.col   = "black",   # color of the text labels
                     digits   = 2
                     )

myplot 

Graphs

# melt the dataframe into long format
df_melted <- reshape2::melt(df_train)
No id variables; using all as measure variables
str(df_melted)
'data.frame':   38692 obs. of  2 variables:
 $ variable: Factor w/ 17 levels "INDEX","TARGET_WINS",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ value   : int  1 2 3 4 5 6 7 8 11 12 ...
# create histogram using ggplot2
ggplot(df_melted, aes(value)) + 
  geom_histogram() + 
  facet_wrap(~variable, scales = "free_x")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 3478 rows containing non-finite outside the scale range
(`stat_bin()`).

PCA (prcomp)

PCA is a powerful technique for dimensionality reduction, data exploration, and feature selection in R.

  • Use prcomp() if you prefer Principal Component Analysis via the Singular Value Decomposition (SVD) method. SVD is a numerically stable way to decompose a data matrix into orthogonal components.

You can adapt the steps below to your specific dataset and analysis goals.

Batting

We will use the six batting variables to create one composite batting variable.

\(Batting = f(TEAM\_BATTING_{H}, TEAM\_BATTING_{2B}, TEAM\_BATTING_{3B}, TEAM\_BATTING_{HR}, TEAM\_BATTING_{BB}, TEAM\_BATTING_{SO})\)

Step 1: Load the Data

We create a subset of all batting variables.

batting_data <- df_train[, 3:8]   # the six TEAM_BATTING_* variables

Do your exploratory data analysis (EDA).

stargazer(batting_data,
          type  = "text",
          title = "Team Batting Summary Statistics",
          covariate.labels = c("Singles", 
                               "Doubles", 
                               "Triples", 
                               "Home Runs", 
                               "BB", 
                               "Strike Outs" 
                               )
          )

Team Batting Summary Statistics
==============================================
Statistic     N     Mean    St. Dev. Min  Max 
----------------------------------------------
Singles     2,276 1,469.270 144.591  891 2,554
Doubles     2,276  241.247   46.801  69   458 
Triples     2,276  55.250    27.939   0   223 
Home Runs   2,276  99.612    60.547   0   264 
BB          2,276  501.559  122.671   0   878 
Strike Outs 2,174  735.605  248.526   0  1,399
----------------------------------------------

Step 2: Preprocess Your Data:

PCA is sensitive to the scale of your variables, so it’s a good practice to standardize (center and scale) your data if the variables have different units or scales. You can use the scale() function for this purpose.
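For reference, scale() applies the usual z-score transformation to each column \(j\): \( z_{ij} = \frac{x_{ij} - \bar{x}_j}{s_j} \), where \(\bar{x}_j\) is the column mean and \(s_j\) is the column standard deviation.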

# typeof(batting_data)
# class(batting_data)

#    Standardize the data
scaled_batting_data <- scale(batting_data)

# typeof(scaled_batting_data)
# class(scaled_batting_data)

#    Store as dataframe instead of a matrix
scaled_batting_data <- as.data.frame(scaled_batting_data)

Check whether the scaling was applied correctly.

stargazer(scaled_batting_data,
          type  = "text",
          title = "Team Batting Summary Statistics (Scaled)",
          covariate.labels = c("Singles", 
                               "Doubles", 
                               "Triples", 
                               "Home Runs", 
                               "BB", 
                               "Strike Outs" 
                               )
          )

Team Batting Summary Statistics (Scaled)
==============================================
Statistic     N    Mean  St. Dev.  Min    Max 
----------------------------------------------
Singles     2,276 -0.000  1.000   -3.999 7.502
Doubles     2,276 -0.000  1.000   -3.680 4.631
Triples     2,276 -0.000  1.000   -1.978 6.004
Home Runs   2,276 -0.000  1.000   -1.645 2.715
BB          2,276 -0.000  1.000   -4.089 3.069
Strike Outs 2,174 0.000   1.000   -2.960 2.669
----------------------------------------------

Step 3: Perform PCA

Use the prcomp() function to perform PCA on your scaled data. You need to provide the standardized data as input, and you can specify additional options if needed.

## Perform PCA
# pca_result <- prcomp(scaled_batting_data)

You cannot have any missing observations in the variables: either impute the missing values or drop the rows that contain them. I will go with the latter approach. (You could also drop the columns with missing values.)

scaled_batting_data <- 
  scaled_batting_data %>% 
    filter(!is.na(TEAM_BATTING_SO))

batting_data_noMissing <- 
  batting_data %>% 
    filter(!is.na(TEAM_BATTING_SO))

## Perform PCA
# pca_result <- prcomp(scaled_batting_data[,1:5])
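
As an aside, the same row-drop can be written with tidyr (loaded with the tidyverse); this drop_na() call is an equivalent alternative, shown for reference only:

# Alternative: drop rows with NA in TEAM_BATTING_SO using tidyr
# (equivalent to the filter() above; not run in the original)
tidyr::drop_na(scaled_batting_data, TEAM_BATTING_SO)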

Now, let's run the PCA.

?prcomp
# Perform PCA
pca_result  <- prcomp(scaled_batting_data) 
                  
# Alternatively, let prcomp() center and scale the data directly
pca_result2 <- prcomp(batting_data_noMissing, 
                      scale. = TRUE,  # note the trailing dot in the argument name
                      center = TRUE 
                      )
?princomp

The two outputs should be essentially identical.

Step 4: Explore PCA Results:

summary(pca_result)
Importance of components:
                          PC1    PC2    PC3     PC4     PC5     PC6
Standard deviation     1.6808 1.2762 0.8502 0.57506 0.54282 0.34029
Proportion of Variance 0.4774 0.2752 0.1221 0.05588 0.04979 0.01957
Cumulative Proportion  0.4774 0.7526 0.8748 0.93064 0.98043 1.00000
summary(pca_result2)
Importance of components:
                         PC1    PC2    PC3     PC4     PC5     PC6
Standard deviation     1.688 1.2955 0.8476 0.57428 0.55289 0.34334
Proportion of Variance 0.475 0.2797 0.1197 0.05497 0.05095 0.01965
Cumulative Proportion  0.475 0.7547 0.8744 0.92941 0.98035 1.00000

The pca_result object contains information about the PCA. You can access various properties and summaries to understand the results:

  1. pca_result$rotation: This matrix contains the loadings of each variable on each principal component.

    pca_result$rotation
                           PC1         PC2        PC3          PC4         PC5
    TEAM_BATTING_H   0.2547810 -0.65558650  0.1008227  0.185016680 -0.42690792
    TEAM_BATTING_2B -0.1404044 -0.66620983  0.2294521 -0.136496292  0.63878726
    TEAM_BATTING_3B  0.4981842 -0.15038757 -0.3352853 -0.722857927 -0.17559528
    TEAM_BATTING_HR -0.5039161 -0.23701363  0.1039518 -0.004360086 -0.60539712
    TEAM_BATTING_BB -0.3604193 -0.20199249 -0.8850989  0.133007673  0.10538982
    TEAM_BATTING_SO -0.5323085  0.08232831  0.1748263 -0.637890238 -0.03541284
                           PC6
    TEAM_BATTING_H   0.5278695
    TEAM_BATTING_2B -0.2390029
    TEAM_BATTING_3B -0.2518384
    TEAM_BATTING_HR -0.5590729
    TEAM_BATTING_BB  0.1307626
    TEAM_BATTING_SO  0.5207182
    • This matrix contains the loadings of each original variable on each principal component.

    • The loadings indicate the strength and direction of the relationship between the original variables and the principal components.

      • Loadings close to zero suggest that a variable has little influence on a particular principal component, while loadings with higher absolute values indicate stronger influence.
    • This matrix is crucial for understanding which variables contribute most to each principal component and how they are related.

  2. pca_result$x: This matrix contains the scores for each observation on each principal component.

    head(pca_result$x, 
         n = 10)
                 PC1       PC2        PC3         PC4         PC5         PC6
     [1,]  1.4099105 1.9128879  2.4526104 -0.15181188  0.08086077  0.95707782
     [2,] -2.7196330 0.6209773 -0.7375865  0.05636948 -0.49544881  0.02723595
     [3,] -1.4364842 0.4854542 -0.4066407  0.05008817 -0.03077116  0.05226250
     [4,] -0.5221928 1.1621587  0.4741936 -0.12126081 -0.11356499  0.40728155
     [5,] -0.9152782 1.8981078  0.2880590  0.14273135 -0.13429360  0.25811375
     [6,] -0.7735991 1.8364429  0.4654937 -0.32074846  0.12089213  0.21225043
     [7,] -1.1325381 1.9741995 -0.3554179 -0.91168984 -0.41772521  0.02633511
     [8,] -1.0246188 2.1775221  0.2908082 -0.39588270 -0.48999426  0.24405140
     [9,] -0.5815386 1.2401993  0.4539139 -0.13852401 -0.48514704  0.29466675
    [10,] -0.8623647 1.7238234  0.6582588  0.46909686  0.41416893 -0.06606063
    • Each row corresponds to an observation, and each column corresponds to a principal component.

    • These scores represent how each observation is positioned in the reduced-dimensional space defined by the principal components.

    • PCA reduces the dimensionality of the data by projecting the original observations onto these new axes, and the scores tell you the coordinates of each observation in this new space.

  3. pca_result$sdev: This vector contains the standard deviations of the principal components.

    pca_result$sdev
    [1] 1.6808431 1.2762151 0.8502429 0.5750593 0.5428196 0.3402945
    • The standard deviations (proportional to the singular values of the centered data matrix) represent the spread or variability along each principal component axis.

    • They are useful for understanding how much variance is explained by each principal component (a quick check is sketched just after this list).

    • Larger standard deviations correspond to principal components that capture more variance in the data.

  4. pca_result$center and pca_result$scale: These values contain the centering and scaling information used in the PCA.

    pca_result$center  
     TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB 
       3.711099e-02    5.643633e-02   -2.854042e-02    6.177540e-02    2.893662e-02 
    TEAM_BATTING_SO 
       1.801176e-16 
    pca_result$scale   # we scaled manually !
    [1] FALSE
    • Centering involves subtracting the mean of each variable from the data to make the variables have zero means.

    • Scaling involves dividing the centered data by the standard deviation of each variable to ensure that variables have the same scale.

    • These centering and scaling transformations are applied to the data before PCA to ensure that variables with different units or scales do not dominate the results.
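
As flagged in item 3 above, the variance explained can be computed directly from the standard deviations (a quick sketch; the numbers should match the summary(pca_result) output above):

# Proportion of variance explained by each component
var_explained <- pca_result$sdev^2 / sum(pca_result$sdev^2)
round(var_explained, 4)          # should match the Proportion of Variance row
round(cumsum(var_explained), 4)  # should match the Cumulative Proportion row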

Understanding and interpreting these components is essential for making sense of the results of a PCA analysis and for using PCA effectively in data exploration, dimensionality reduction, and feature selection tasks.

You can visualize the results, create biplots, and explore how much variance each principal component explains.

Visualization:

You can create visualizations to explore the results. For example, you can plot the variance explained by each principal component to determine how many components to retain:

Graph of Variables

# Plot the variance explained by each component 

plot(pca_result)

biplot(pca_result)
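
Since factoextra is already loaded, a scree plot with percentage labels is one line; this is a sketch, not part of the original output:

# Scree plot with percentage labels (factoextra)
fviz_eig(pca_result, addlabels = TRUE)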

Interpretation:

Interpret the results, including the loadings of variables on principal components and the scores of observations on principal components. This can help you understand which variables contribute the most to each component and how observations are positioned in the reduced-dimensional space.
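
For example, to see which variables load most heavily on the first component, you can rank the absolute loadings (a sketch using the pca_result object from above):

# Variables ranked by absolute loading on PC1
sort(abs(pca_result$rotation[, "PC1"]), decreasing = TRUE)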

Alternative Commands

PCA (princomp)

princomp() is another standard function in base R that performs Principal Component Analysis, but it uses the eigendecomposition of the covariance (or correlation) matrix rather than the SVD.

?princomp
# Perform PCA using princomp

princomp_pca_result <- princomp(scaled_batting_data)

summary(princomp_pca_result)
Importance of components:
                          Comp.1    Comp.2    Comp.3     Comp.4     Comp.5
Standard deviation     1.6804565 1.2759216 0.8500474 0.57492701 0.54269471
Proportion of Variance 0.4773952 0.2752146 0.1221546 0.05587904 0.04978915
Cumulative Proportion  0.4773952 0.7526098 0.8747644 0.93064343 0.98043258
                           Comp.6
Standard deviation     0.34021626
Proportion of Variance 0.01956742
Cumulative Proportion  1.00000000

It should give an answer similar to the prcomp() function, which is generally preferred since the SVD-based computation is more numerically stable.

summary(pca_result)
Importance of components:
                          PC1    PC2    PC3     PC4     PC5     PC6
Standard deviation     1.6808 1.2762 0.8502 0.57506 0.54282 0.34029
Proportion of Variance 0.4774 0.2752 0.1221 0.05588 0.04979 0.01957
Cumulative Proportion  0.4774 0.7526 0.8748 0.93064 0.98043 1.00000
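
The tiny numerical differences come from the divisor: princomp() divides by n while prcomp() divides by n - 1, so rescaling one set of standard deviations recovers the other (a quick check, assuming the objects above):

# Rescale prcomp's sdev by sqrt((n - 1) / n); should match princomp's sdev
n <- nrow(scaled_batting_data)
pca_result$sdev * sqrt((n - 1) / n)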

PCA (FactoMineR)

# Install and load FactoMineR
#install.packages("FactoMineR")
library(FactoMineR)

?PCA
# Perform PCA using FactoMineR
pca_result3 <- PCA(scaled_batting_data, 
                   scale.unit = TRUE, 
                   graph = TRUE
                   )

summary(pca_result3)

Call:
PCA(X = scaled_batting_data, scale.unit = TRUE, graph = TRUE) 


Eigenvalues
                       Dim.1   Dim.2   Dim.3   Dim.4   Dim.5   Dim.6
Variance               2.850   1.678   0.718   0.330   0.306   0.118
% of var.             47.501  27.971  11.973   5.497   5.095   1.965
Cumulative % of var.  47.501  75.471  87.444  92.941  98.035 100.000

Individuals (the 10 first)
                    Dist    Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3
1               |  3.559 | -1.449  0.034  0.166 | -1.883  0.097  0.280 | -2.462
2               |  2.942 |  2.718  0.119  0.854 | -0.684  0.013  0.054 |  0.739
3               |  1.576 |  1.430  0.033  0.824 | -0.518  0.007  0.108 |  0.404
4               |  1.439 |  0.501  0.004  0.121 | -1.181  0.038  0.673 | -0.480
5               |  2.173 |  0.882  0.013  0.165 | -1.936  0.103  0.794 | -0.300
6               |  2.101 |  0.744  0.009  0.125 | -1.862  0.095  0.785 | -0.481
7               |  2.540 |  1.107  0.020  0.190 | -2.028  0.113  0.637 |  0.338
8               |  2.546 |  0.993  0.016  0.152 | -2.224  0.136  0.763 | -0.304
9               |  1.574 |  0.564  0.005  0.128 | -1.266  0.044  0.647 | -0.458
10              |  2.139 |  0.834  0.011  0.152 | -1.740  0.083  0.661 | -0.673
                   ctr   cos2  
1                0.388  0.479 |
2                0.035  0.063 |
3                0.010  0.066 |
4                0.015  0.111 |
5                0.006  0.019 |
6                0.015  0.052 |
7                0.007  0.018 |
8                0.006  0.014 |
9                0.013  0.085 |
10               0.029  0.099 |

Variables
                   Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3    ctr
TEAM_BATTING_H  | -0.419  6.156  0.175 |  0.847 42.703  0.717 | -0.074  0.766
TEAM_BATTING_2B |  0.255  2.276  0.065 |  0.874 45.557  0.765 | -0.190  5.002
TEAM_BATTING_3B | -0.831 24.215  0.690 |  0.199  2.362  0.040 |  0.283 11.127
TEAM_BATTING_HR |  0.870 26.569  0.757 |  0.296  5.214  0.087 | -0.081  0.919
TEAM_BATTING_BB |  0.603 12.748  0.363 |  0.238  3.367  0.057 |  0.754 79.161
TEAM_BATTING_SO |  0.894 28.036  0.799 | -0.116  0.797  0.013 | -0.147  3.025
                  cos2  
TEAM_BATTING_H   0.006 |
TEAM_BATTING_2B  0.036 |
TEAM_BATTING_3B  0.080 |
TEAM_BATTING_HR  0.007 |
TEAM_BATTING_BB  0.569 |
TEAM_BATTING_SO  0.022 |
summary(pca_result)
Importance of components:
                          PC1    PC2    PC3     PC4     PC5     PC6
Standard deviation     1.6808 1.2762 0.8502 0.57506 0.54282 0.34029
Proportion of Variance 0.4774 0.2752 0.1221 0.05588 0.04979 0.01957
Cumulative Proportion  0.4774 0.7526 0.8748 0.93064 0.98043 1.00000
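
To reconcile the two outputs: FactoMineR reports eigenvalues (variances), while prcomp() reports standard deviations; squaring connects them (a quick check using the objects above):

# prcomp's sdev squared should match FactoMineR's Variance row (2.850, 1.678, ...)
pca_result2$sdev^2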

There is a link to a YouTube video in the help file that appears if you type ?PCA.

PCA (factoextra)

The factoextra package provides functions for extracting and visualizing PCA results.

  • It is often used in combination with other PCA packages to create informative plots and charts.

Graph of Individuals

# Install and load factoextra
library(factoextra)

# Visualize PCA results using factoextra
fviz_pca_ind(pca_result)
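
factoextra also has companion functions for the variable side and for a combined biplot; these calls are sketches using the pca_result object from above:

# Graph of variables (loadings) and a combined biplot
fviz_pca_var(pca_result)
fviz_pca_biplot(pca_result)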

Online guides may also be helpful for seeing how people build clusters on top of PCA results.

Predict

Principal Component Analysis (PCA) is primarily a dimensionality reduction technique and does not perform predictions in the traditional sense, such as making predictions for new or unseen data points. Instead, PCA is used for data exploration, feature selection, and reducing the dimensionality of a dataset while retaining as much variance as possible.

However, you can use PCA in combination with statistical techniques for prediction tasks. Here’s a general outline of how to incorporate PCA into a predictive modeling workflow (a minimal sketch follows the list):

  1. Data Preparation: Preprocess your data, including data cleaning, missing value imputation, and feature scaling. Standardize or normalize your data if it’s not already done.

  2. PCA: Perform PCA on your dataset to reduce its dimensionality. Determine the number of principal components to retain based on the explained variance or other criteria.

  3. Feature Selection: You can choose to use the retained principal components as features for your prediction model. These components capture the most important information in the original data.

  4. Split Data: Split your data into training and testing sets for model development and evaluation.

  5. Model Building: Build a predictive model using the retained principal components (or other selected features) and the target variable. You can use various modeling techniques, such as regression, classification, or clustering, depending on your prediction task.

  6. Model Training: Train your predictive model on the training data.

  7. Model Evaluation: Evaluate the model’s performance using the testing dataset. Common evaluation metrics depend on the specific prediction task but may include accuracy, precision, recall, F1-score, or mean squared error, among others.

  8. Prediction: Use the trained model to make predictions on new or unseen data.

  9. Inverse PCA (Optional): If needed, you can perform an inverse PCA transformation to obtain predictions or representations in the original data space. This step may be necessary if you want predictions in the original feature space.

  10. Assess Model Performance: Evaluate the performance of your prediction model on the original data space or in the space of the retained principal components, depending on your goals.
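
As a minimal sketch of steps 3–6 using the objects already created, we can regress wins on the first two batting components with lm(). The row filtering mirrors the NA drop used before the PCA, so the scores align with the target; this is an illustration, not a tuned model:

# Align TARGET_WINS with the rows used in the PCA (NA strikeout rows dropped)
wins     <- df_train$TARGET_WINS[!is.na(df_train$TEAM_BATTING_SO)]
model_df <- data.frame(TARGET_WINS = wins, pca_result$x[, 1:2])

# Regress wins on the first two principal components
fit <- lm(TARGET_WINS ~ PC1 + PC2, data = model_df)
summary(fit)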

It’s important to note that while PCA can be a useful preprocessing step to reduce dimensionality and remove multicollinearity in your data, it may not always improve predictive model performance. The decision to use PCA should be based on the specific characteristics of your dataset and the goals of your prediction task.

In summary, PCA is a valuable tool in data preprocessing, but its primary role is not prediction. Instead, it can help improve the efficiency and interpretability of predictive models when used as part of a broader machine learning or statistical analysis pipeline.

Session Info

R version 4.4.1 (2024-06-14)
Platform: aarch64-apple-darwin20
Running under: macOS Sonoma 14.6.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/New_York
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] FactoMineR_2.11    factoextra_1.0.7   stargazer_5.2.3    ggcorrplot_0.1.4.1
 [5] lubridate_1.9.3    forcats_1.0.0      stringr_1.5.1      dplyr_1.1.4       
 [9] purrr_1.0.2        readr_2.1.5        tidyr_1.3.1        tibble_3.2.1      
[13] ggplot2_3.5.1      tidyverse_2.0.0    psych_2.4.6.26     visdat_0.6.0      

loaded via a namespace (and not attached):
 [1] gtable_0.3.5         xfun_0.46            htmlwidgets_1.6.4   
 [4] rstatix_0.7.2        ggrepel_0.9.5        lattice_0.22-6      
 [7] tzdb_0.4.0           vctrs_0.6.5          tools_4.4.1         
[10] generics_0.1.3       parallel_4.4.1       sandwich_3.1-0      
[13] fansi_1.0.6          cluster_2.1.6        pkgconfig_2.0.3     
[16] Matrix_1.7-0         scatterplot3d_0.3-44 lifecycle_1.0.4     
[19] farver_2.1.2         compiler_4.4.1       munsell_0.5.1       
[22] mnormt_2.1.1         leaps_3.2            codetools_0.2-20    
[25] carData_3.0-5        htmltools_0.5.8.1    yaml_2.3.10         
[28] car_3.1-2            ggpubr_0.6.0         pillar_1.9.0        
[31] MASS_7.3-60.2        DT_0.33              flashClust_1.01-2   
[34] abind_1.4-5          multcomp_1.4-26      nlme_3.1-164        
[37] tidyselect_1.2.1     digest_0.6.36        mvtnorm_1.2-5       
[40] stringi_1.8.4        reshape2_1.4.4       labeling_0.4.3      
[43] splines_4.4.1        fastmap_1.2.0        grid_4.4.1          
[46] colorspace_2.1-1     cli_3.6.3            magrittr_2.0.3      
[49] survival_3.6-4       utf8_1.2.4           broom_1.0.6         
[52] TH.data_1.1-2        withr_3.0.1          backports_1.5.0     
[55] scales_1.3.0         timechange_0.3.0     estimability_1.5.1  
[58] rmarkdown_2.28       emmeans_1.10.3       ggsignif_0.6.4      
[61] zoo_1.8-12           hms_1.1.3            evaluate_0.24.0     
[64] knitr_1.48           rlang_1.1.4          Rcpp_1.0.13         
[67] xtable_1.8-4         glue_1.7.0           rstudioapi_0.16.0   
[70] jsonlite_1.8.8       plyr_1.8.9           R6_2.5.1            
[73] multcompView_0.1-10