library(psych)
library(corrplot)
## corrplot 0.95 loaded
library(FactoMineR)
library(factoextra)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
library(readr)

Principal Component Analysis (PCA)

A. Stage 1: Make Objective

The multi-channel stock market dataset containing 19 quantitative financial indicators (Open, High, Low, Close, Volume, SMA_10, SMA_20, EMA_10, RSI, MACD, Signal, BB_Middle, BB_Upper, BB_Lower, Sentiment_Pos, Sentiment_Neg, Sentiment_Neu, Sentiment_Compound, and Target) is examined for the following reasons:

A.1. Data Summarization with Interpretation

Understand whether these financial indicators can be grouped into a smaller number of meaningful components. By grouping these variables and performing component and factor interpretation, the analysis can identify the main underlying dimensions that represent stock market behavior, such as price movement, trend indicators, volatility measures, and sentiment influence. This helps provide a clearer overall understanding of the relationships among financial indicators and reveals the fundamental structure of the stock market dataset

A.2. Data Reduction

If the 19 financial variables can be represented by a smaller number of principal components or factors, then the dataset can be simplified without losing significant information. This reduction helps minimize redundancy caused by high correlations among variables and makes further multivariate analysis more efficient, stable, and easier to interpret, while still preserving the essential characteristics of the original stock market data.

A.3. Identification of Underlying Market Structure

Determine whether the numerical indicators represent underlying latent components such as price movement, technical trend strength, volatility, and market sentiment. By identifying these latent components, the analysis can reveal the fundamental dimensions that drive stock market behavior, allowing for a clearer understanding of how different technical and sentiment indicators collectively describe overall market dynamics.

B. Stage 2: Design

# Import the raw stock dataset; readr guesses the column types from the file.
data <- read_csv(file = "stock_dataset.csv")
## Rows: 1000 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (1): Clean_Headline
## dbl  (19): Open, High, Low, Close, Volume, SMA_10, SMA_20, EMA_10, RSI, MACD...
## date  (1): Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the structure of the imported table (column names, types, first values).
str(object = data)
## spc_tbl_ [1,000 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Date              : Date[1:1000], format: "2020-01-01" "2020-01-02" ...
##  $ Open              : num [1:1000] 0.0564 0.0847 0.204 0.2748 0.1779 ...
##  $ High              : num [1:1000] 0.0182 -0.00942 0.16957 0.34351 0.17785 ...
##  $ Low               : num [1:1000] 0.146 0.169 0.222 0.333 0.212 ...
##  $ Close             : num [1:1000] 0.0409 0.1298 0.1624 0.292 0.2364 ...
##  $ Volume            : num [1:1000] -1.1509 -0.0775 0.4726 -0.1679 -1.6229 ...
##  $ SMA_10            : num [1:1000] NA NA NA NA NA ...
##  $ SMA_20            : num [1:1000] NA NA NA NA NA NA NA NA NA NA ...
##  $ EMA_10            : num [1:1000] 0.051 0.0675 0.0871 0.1273 0.1498 ...
##  $ RSI               : num [1:1000] NA NA NA NA NA NA NA NA NA NA ...
##  $ MACD              : num [1:1000] -0.130093 -0.069361 -0.000217 0.140952 0.210517 ...
##  $ Signal            : num [1:1000] -0.141933 -0.129043 -0.104055 -0.054101 0.000627 ...
##  $ BB_Middle         : num [1:1000] NA NA NA NA NA NA NA NA NA NA ...
##  $ BB_Upper          : num [1:1000] NA NA NA NA NA NA NA NA NA NA ...
##  $ BB_Lower          : num [1:1000] NA NA NA NA NA NA NA NA NA NA ...
##  $ Clean_Headline    : chr [1:1000] "Economic slowdown affects growth" "Market faces uncertainty" "Company reports strong earnings" "Market faces uncertainty" ...
##  $ Sentiment_Pos     : num [1:1000] 0.464 0 0.524 0 0 0 0 0 0 0.524 ...
##  $ Sentiment_Neg     : num [1:1000] 0 0.545 0 0.545 0.318 0.545 0.545 0.318 0.545 0 ...
##  $ Sentiment_Neu     : num [1:1000] 0.536 0.455 0.476 0.455 0.682 0.455 0.455 0.682 0.455 0.476 ...
##  $ Sentiment_Compound: num [1:1000] 0.382 -0.34 0.511 -0.34 -0.103 ...
##  $ Target            : num [1:1000] 1 1 1 0 0 1 0 0 0 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Date = col_date(format = ""),
##   ..   Open = col_double(),
##   ..   High = col_double(),
##   ..   Low = col_double(),
##   ..   Close = col_double(),
##   ..   Volume = col_double(),
##   ..   SMA_10 = col_double(),
##   ..   SMA_20 = col_double(),
##   ..   EMA_10 = col_double(),
##   ..   RSI = col_double(),
##   ..   MACD = col_double(),
##   ..   Signal = col_double(),
##   ..   BB_Middle = col_double(),
##   ..   BB_Upper = col_double(),
##   ..   BB_Lower = col_double(),
##   ..   Clean_Headline = col_character(),
##   ..   Sentiment_Pos = col_double(),
##   ..   Sentiment_Neg = col_double(),
##   ..   Sentiment_Neu = col_double(),
##   ..   Sentiment_Compound = col_double(),
##   ..   Target = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Keep only the numeric columns of the imported table.
# vapply() is used instead of sapply() so the column mask is guaranteed to be
# a logical vector with exactly one entry per column.
is_numeric_col <- vapply(data, is.numeric, logical(1))
numeric_data <- data[, is_numeric_col]
ncol(numeric_data)
## [1] 19
# List the names of the retained numeric columns.
names(numeric_data)
##  [1] "Open"               "High"               "Low"               
##  [4] "Close"              "Volume"             "SMA_10"            
##  [7] "SMA_20"             "EMA_10"             "RSI"               
## [10] "MACD"               "Signal"             "BB_Middle"         
## [13] "BB_Upper"           "BB_Lower"           "Sentiment_Pos"     
## [16] "Sentiment_Neg"      "Sentiment_Neu"      "Sentiment_Compound"
## [19] "Target"
# Listwise deletion: drop every row that has an NA in any numeric column.
# The str() output shows leading NAs in SMA_10, SMA_20, RSI and the BB_*
# columns, so the first rows of the series are the ones affected.
numeric_data <- na.omit(numeric_data)

B.1. Variable selection and measurement issues

The variables selected for this analysis include 19 numerical stock market indicators: Open, High, Low, Close, Volume, SMA_10, SMA_20, EMA_10, RSI, MACD, Signal, BB_Middle, BB_Upper, BB_Lower, Sentiment_Pos, Sentiment_Neg, Sentiment_Neu, Sentiment_Compound, and Target. All variables are numerical: the price, technical, and sentiment measures are continuous, while Target is coded as a binary 0/1 indicator. Together they represent stock price movement, technical indicators, and sentiment scores, making them appropriate for Principal Component Analysis (PCA).

B.2. What types of variables can be used?

Principal Component Analysis requires metric (numerical) variables measured on an interval or ratio scale. In this dataset, all selected variables are numerical and standardized technical and sentiment indicators, ensuring compatibility with PCA assumptions and allowing meaningful interpretation of component loadings.

B.3. How many variables should be included?

A total of 19 numerical variables are included in this analysis. This number exceeds the minimum requirement of 10 variables and provides sufficient dimensionality to identify meaningful underlying components while allowing effective dimensionality reduction.

B.4. Sample size

The dataset contains 1000 observations and 19 variables, resulting in an observation-to-variable ratio of approximately 52:1. This ratio exceeds the recommended minimum of 5:1 and the preferred minimum sample size of 100 observations, ensuring adequate statistical power and stable correlation estimates for PCA.

B.5. Correlations among variables or respondents

This analysis uses R-type factor analysis, which examines correlations among variables rather than observations. A correlation matrix among the 19 numerical variables is used as input to identify underlying component structures that explain relationships among stock price indicators and sentiment measures.

B.6. Data Information and Initial visualization

Descriptive Statistics

# Per-variable descriptive statistics, stored and then printed.
desc_stats <- describe(x = numeric_data)
print(desc_stats)
##                    vars   n  mean   sd median trimmed  mad   min  max range
## Open                  1 981  0.00 1.01  -0.27   -0.14 0.88 -1.72 2.98  4.70
## High                  2 981  0.00 1.01  -0.27   -0.14 0.86 -1.71 2.94  4.64
## Low                   3 981  0.00 1.01  -0.27   -0.14 0.88 -1.68 3.01  4.70
## Close                 4 981  0.00 1.01  -0.28   -0.14 0.87 -1.80 2.96  4.75
## Volume                5 981  0.01 1.00   0.02    0.01 1.30 -1.77 1.68  3.45
## SMA_10                6 981  0.00 1.00  -0.26   -0.14 0.87 -1.54 2.84  4.38
## SMA_20                7 981  0.00 1.00  -0.24   -0.14 0.90 -1.38 2.73  4.11
## EMA_10                8 981  0.00 1.01  -0.26   -0.15 0.90 -1.50 2.81  4.31
## RSI                   9 981  0.00 1.00  -0.03   -0.01 0.99 -2.74 3.46  6.19
## MACD                 10 981 -0.01 1.01  -0.09   -0.04 0.94 -2.38 3.13  5.51
## Signal               11 981 -0.01 1.01  -0.08   -0.03 0.93 -2.36 3.00  5.37
## BB_Middle            12 981  0.00 1.00  -0.24   -0.14 0.90 -1.38 2.73  4.11
## BB_Upper             13 981  0.00 1.00  -0.20   -0.14 0.95 -1.34 2.78  4.12
## BB_Lower             14 981  0.00 1.00  -0.31   -0.14 0.83 -1.61 2.78  4.39
## Sentiment_Pos        15 981  0.30 0.24   0.46    0.30 0.09  0.00 0.52  0.52
## Sentiment_Neg        16 981  0.17 0.22   0.00    0.14 0.00  0.00 0.54  0.54
## Sentiment_Neu        17 981  0.54 0.08   0.53    0.53 0.08  0.46 0.68  0.23
## Sentiment_Compound   18 981  0.21 0.35   0.38    0.23 0.24 -0.34 0.54  0.88
## Target               19 981  0.51 0.50   1.00    0.51 0.00  0.00 1.00  1.00
##                     skew kurtosis   se
## Open                1.01     0.31 0.03
## High                1.01     0.31 0.03
## Low                 1.02     0.32 0.03
## Close               1.01     0.31 0.03
## Volume             -0.02    -1.23 0.03
## SMA_10              1.05     0.39 0.03
## SMA_20              1.08     0.47 0.03
## EMA_10              1.06     0.39 0.03
## RSI                 0.12     0.00 0.03
## MACD                0.35     0.14 0.03
## Signal              0.35     0.28 0.03
## BB_Middle           1.08     0.47 0.03
## BB_Upper            1.04     0.36 0.03
## BB_Lower            1.09     0.60 0.03
## Sentiment_Pos      -0.42    -1.79 0.01
## Sentiment_Neg       0.74    -1.14 0.01
## Sentiment_Neu       0.95    -0.43 0.00
## Sentiment_Compound -0.51    -1.49 0.01
## Target             -0.03    -2.00 0.02
  • The descriptive statistics summarize 19 variables across 981 observations, including price, technical, sentiment, and the target variables. Most price and technical variables exhibit a mean close to 0 and a standard deviation approximately equal to 1. This indicates that the dataset has been standardized prior to analysis, ensuring that all variables are measured on the same scale.
  • Price variables (Open, High, Low, and Close) show highly similar statistical characteristics. Their means are approximately 0 and their standard deviations are about 1.01, confirming consistent scaling from the data preprocessing stage before the dataset was uploaded. The median values are slightly negative (around -0.27), while the trimmed means are closer to zero (-0.14), indicating that a small number of higher values slightly influence the distribution. The ranges of these variables are relatively large, between 4.64 and 4.75, reflecting considerable variability in standardized price movements. Skewness values around 1.01–1.02 indicate moderately right-skewed distributions, meaning that extreme positive values occur occasionally. The kurtosis values close to 0.3 indicate distributions that are moderately peaked but still relatively close to normal.
  • Technical variables such as moving averages (SMA_10, SMA_20, EMA_10) and Bollinger Band components (BB_Middle, BB_Upper, BB_Lower) also demonstrate similar statistical behavior. The median and trimmed mean values are slightly negative, suggesting that most observations cluster slightly below the standardized mean. The median absolute deviation (MAD) values between 0.83 and 0.95 indicate moderate dispersion around the median. These variables also show positive skewness values around 1.04–1.09, indicating occasional large positive deviations.
  • Among technical variables, the RSI variable exhibits the largest range (6.19), with values extending from -2.74 to 3.46. However, its skewness is only 0.12, suggesting that the distribution is relatively symmetric despite the wide spread. Similarly, MACD and Signal variables show moderate ranges (5.51 and 5.37 respectively) and mild positive skewness, indicating slightly longer right tails but no extreme asymmetry.
  • Sentiment variables differ substantially from the standardized price variables because they are naturally bounded between fixed values. The Sentiment_Pos variable has a mean of 0.30 with a standard deviation of 0.24, indicating moderate variation in positive sentiment intensity. Its negative skewness (-0.42) suggests that higher positive sentiment values occur more frequently than lower ones. In contrast, Sentiment_Neg shows a mean of 0.17 and positive skewness (0.74), indicating that most values cluster near zero with occasional higher negative sentiment scores.
  • The Sentiment_Neu variable has the highest mean (0.54) and the smallest standard deviation (0.08), indicating that neutral sentiment dominates the dataset and varies little across observations. The Sentiment_Compound variable, which summarizes overall sentiment polarity, has a mean of 0.21 and a range of 0.88, suggesting that the overall sentiment is slightly positive but still varies across the dataset.
  • The Target variable represents a binary outcome with values between 0 and 1. The mean value of 0.51 indicates that the two classes are nearly balanced in the dataset. Its skewness close to zero (-0.03) confirms that there is no significant imbalance between the classes, which is beneficial for predictive modeling and classification tasks.
  • In general, the descriptive statistics indicate that most numerical predictors exhibit moderate variability with slight positive skewness. Sentiment variables show bounded distributions with different dispersion patterns, reflecting the characteristics of sentiment scoring. These results indicate that the dataset is well-structured and suitable for PCA and FA.

Histogram

# Draw a 2 x 2 panel of histograms for four representative indicators.
# par() returns the previous values of the settings it changes, so they are
# saved here and restored at the end (the margins as well as the layout).
old_par <- par(mfrow = c(2, 2), mar = c(4, 4, 2, 1))

# One entry per panel: column name -> c(plot title, x-axis label).
hist_specs <- list(
  Open = c("Distribution of Open Price", "Open Price"),
  Close = c("Distribution of Close Price", "Close Price"),
  Volume = c("Distribution of Trading Volume", "Volume"),
  RSI = c("Distribution of RSI Indicator", "RSI")
)

for (var_name in names(hist_specs)) {
  hist(numeric_data[[var_name]],
       main = hist_specs[[var_name]][1],
       xlab = hist_specs[[var_name]][2])
}

par(old_par)

Open and Close price distributions are relatively symmetric around the center with slight positive skewness, which is consistent with the descriptive statistics. The Volume variable shows a wider spread, indicating fluctuations in trading activity, while the RSI indicator appears centered around the mean, indicating balanced momentum conditions.

Correlation Heatmap

# Correlation heatmap of the numeric indicators.
# numeric_data is filtered again for numeric columns; vapply() keeps the
# column mask strictly logical (sapply() does not guarantee its return type).
data_numeric <- numeric_data[, vapply(numeric_data, is.numeric, logical(1))]
data_clean <- na.omit(data_numeric)

# data_clean has no missing values after na.omit(), so cor() needs no `use`.
cor_matrix <- cor(data_clean)

# Shrink the margins for this plot only, then restore the previous settings.
old_par <- par(mar = c(1, 1, 1, 1))
corrplot(cor_matrix,
         method = "color",
         type = "upper",
         tl.cex = 0.5,
         tl.col = "black",
         tl.srt = 45)
par(old_par)

Strong positive correlations are observed among price-based variables (Open, High, Low, Close) and several technical variables. This occurs because these indicators are derived from similar historical price information. In contrast, sentiment variables show relatively weaker correlations with price and technical variables, indicating that sentiment variables capture a different dimension of market dynamics related to investor perception rather than direct price movement.

Summary design statement

  • Understanding the structure of stock market indicators requires R-type principal component analysis and a correlation matrix between variables, not observations.
  • All selected variables are metric and represent a homogeneous set of technical and sentiment indicators appropriate for PCA. The sample size in this analysis provides a 52:1 ratio of observations to variables, which exceeds the recommended minimum ratio.
  • The sample size of 1000 observations provides a strong basis for reliable correlation estimation and component extraction.

C. Stage 3: Assumptions

C.1. Correlations Among Variables

The initial examination is conducted to ensure that there are sufficiently strong correlations among variables. PCA is only appropriate if correlations exist among variables, because PCA aims to reduce variables based on shared variance.

# Correlations among the numeric indicators, rounded to 3 decimals for display.
mat_corr <- round(cor(numeric_data), digits = 3)
library(sjPlot)
## 
## Attaching package: 'sjPlot'
## The following object is masked from 'package:ggplot2':
## 
##     set_theme
# Render the correlation table with significance stars for all indicators.
tab_corr(data = numeric_data)
  Open High Low Close Volume SMA_10 SMA_20 EMA_10 RSI MACD Signal BB_Middle BB_Upper BB_Lower Sentiment_Pos Sentiment_Neg Sentiment_Neu Sentiment_Compound Target
Open   0.997*** 0.997*** 0.997*** 0.055 0.984*** 0.966*** 0.988*** 0.299*** 0.468*** 0.510*** 0.966*** 0.964*** 0.948*** 0.001 -0.026 0.070* 0.017 -0.002
High 0.997***   0.996*** 0.997*** 0.057 0.983*** 0.965*** 0.987*** 0.302*** 0.470*** 0.510*** 0.965*** 0.962*** 0.947*** 0.004 -0.029 0.069* 0.020 0.003
Low 0.997*** 0.996***   0.997*** 0.055 0.984*** 0.966*** 0.988*** 0.299*** 0.466*** 0.507*** 0.966*** 0.962*** 0.948*** -0.002 -0.023 0.070* 0.014 -0.000
Close 0.997*** 0.997*** 0.997***   0.055 0.985*** 0.967*** 0.989*** 0.304*** 0.468*** 0.507*** 0.967*** 0.964*** 0.949*** 0.002 -0.027 0.068* 0.018 -0.019
Volume 0.055 0.057 0.055 0.055   0.060 0.056 0.058 0.015 0.029 0.035 0.056 0.055 0.056 -0.020 0.006 0.044 -0.015 -0.025
SMA_10 0.984*** 0.983*** 0.984*** 0.985*** 0.060   0.991*** 0.999*** 0.184*** 0.398*** 0.475*** 0.991*** 0.986*** 0.975*** -0.002 -0.021 0.064* 0.012 0.002
SMA_20 0.966*** 0.965*** 0.966*** 0.967*** 0.056 0.991***   0.991*** 0.071* 0.289*** 0.386*** 1.000*** 0.990*** 0.989*** -0.008 -0.017 0.070* 0.006 0.004
EMA_10 0.988*** 0.987*** 0.988*** 0.989*** 0.058 0.999*** 0.991***   0.192*** 0.397*** 0.468*** 0.991*** 0.985*** 0.976*** -0.003 -0.021 0.067* 0.012 -0.001
RSI 0.299*** 0.302*** 0.299*** 0.304*** 0.015 0.184*** 0.071* 0.192***   0.768*** 0.590*** 0.071* 0.094** 0.045 0.049 -0.045 -0.020 0.050 -0.063*
MACD 0.468*** 0.470*** 0.466*** 0.468*** 0.029 0.398*** 0.289*** 0.397*** 0.768***   0.957*** 0.289*** 0.312*** 0.258*** 0.040 -0.047 0.011 0.046 -0.013
Signal 0.510*** 0.510*** 0.507*** 0.507*** 0.035 0.475*** 0.386*** 0.468*** 0.590*** 0.957***   0.386*** 0.402*** 0.359*** 0.036 -0.046 0.021 0.042 0.006
BB_Middle 0.966*** 0.965*** 0.966*** 0.967*** 0.056 0.991*** 1.000*** 0.991*** 0.071* 0.289*** 0.386***   0.990*** 0.989*** -0.008 -0.017 0.070* 0.006 0.004
BB_Upper 0.964*** 0.962*** 0.962*** 0.964*** 0.055 0.986*** 0.990*** 0.985*** 0.094** 0.312*** 0.402*** 0.990***   0.958*** -0.008 -0.018 0.073* 0.007 0.003
BB_Lower 0.948*** 0.947*** 0.948*** 0.949*** 0.056 0.975*** 0.989*** 0.976*** 0.045 0.258*** 0.359*** 0.989*** 0.958***   -0.008 -0.015 0.066* 0.005 0.004
Sentiment_Pos 0.001 0.004 -0.002 0.002 -0.020 -0.002 -0.008 -0.003 0.049 0.040 0.036 -0.008 -0.008 -0.008   -0.943*** -0.374*** 0.967*** -0.023
Sentiment_Neg -0.026 -0.029 -0.023 -0.027 0.006 -0.021 -0.017 -0.021 -0.045 -0.047 -0.046 -0.017 -0.018 -0.015 -0.943***   0.044 -0.982*** 0.045
Sentiment_Neu 0.070* 0.069* 0.070* 0.068* 0.044 0.064* 0.070* 0.067* -0.020 0.011 0.021 0.070* 0.073* 0.066* -0.374*** 0.044   -0.166*** -0.055
Sentiment_Compound 0.017 0.020 0.014 0.018 -0.015 0.012 0.006 0.012 0.050 0.046 0.042 0.006 0.007 0.005 0.967*** -0.982*** -0.166***   -0.044
Target -0.002 0.003 -0.000 -0.019 -0.025 0.002 0.004 -0.001 -0.063* -0.013 0.006 0.004 0.003 0.004 -0.023 0.045 -0.055 -0.044  
Computed correlation used pearson-method with listwise-deletion.
# Full-precision correlation matrix, printed for inspection.
cor_matrix <- cor(x = numeric_data)
print(x = cor_matrix)
##                            Open         High           Low        Close
## Open                1.000000000  0.996759149  0.9969443268  0.997368627
## High                0.996759149  1.000000000  0.9964644901  0.996875993
## Low                 0.996944327  0.996464490  1.0000000000  0.996904323
## Close               0.997368627  0.996875993  0.9969043226  1.000000000
## Volume              0.055483532  0.056561493  0.0550233711  0.054794943
## SMA_10              0.984413164  0.983396928  0.9836859642  0.984676662
## SMA_20              0.966388231  0.964795885  0.9655687275  0.966609329
## EMA_10              0.988417180  0.987452854  0.9877410209  0.988916060
## RSI                 0.298520083  0.301929683  0.2990593089  0.303863171
## MACD                0.468483455  0.469967501  0.4664553939  0.467595543
## Signal              0.510187017  0.510137567  0.5068308503  0.507359260
## BB_Middle           0.966388231  0.964795885  0.9655687275  0.966609329
## BB_Upper            0.963695413  0.962162197  0.9623197643  0.963548226
## BB_Lower            0.948189193  0.946567769  0.9479882106  0.948803201
## Sentiment_Pos       0.001017324  0.003983390 -0.0017417689  0.002216901
## Sentiment_Neg      -0.026115788 -0.028935408 -0.0231535418 -0.026706687
## Sentiment_Neu       0.069845390  0.068798794  0.0698715507  0.067888281
## Sentiment_Compound  0.017273906  0.019890605  0.0142897715  0.018043744
## Target             -0.002290956  0.002988989 -0.0001764206 -0.019150467
##                          Volume       SMA_10       SMA_20        EMA_10
## Open                0.055483532  0.984413164  0.966388231  0.9884171798
## High                0.056561493  0.983396928  0.964795885  0.9874528539
## Low                 0.055023371  0.983685964  0.965568727  0.9877410209
## Close               0.054794943  0.984676662  0.966609329  0.9889160599
## Volume              1.000000000  0.060175250  0.055839217  0.0583245316
## SMA_10              0.060175250  1.000000000  0.991034411  0.9991943449
## SMA_20              0.055839217  0.991034411  1.000000000  0.9911021653
## EMA_10              0.058324532  0.999194345  0.991102165  1.0000000000
## RSI                 0.014881255  0.183824449  0.071423033  0.1919698936
## MACD                0.029460477  0.397815029  0.289117042  0.3970189282
## Signal              0.034854014  0.475445724  0.385667117  0.4679745971
## BB_Middle           0.055839217  0.991034411  1.000000000  0.9911021653
## BB_Upper            0.054737251  0.985562560  0.990280927  0.9854670876
## BB_Lower            0.055809706  0.975298345  0.988654995  0.9755409009
## Sentiment_Pos      -0.020309412 -0.002246779 -0.007989124 -0.0026376653
## Sentiment_Neg       0.006271579 -0.020633890 -0.016616515 -0.0209851256
## Sentiment_Neu       0.043554662  0.064356129  0.070406257  0.0665118629
## Sentiment_Compound -0.014851503  0.011586392  0.006280960  0.0117481791
## Target             -0.024994517  0.002285671  0.003704787 -0.0007148014
##                            RSI        MACD       Signal    BB_Middle
## Open                0.29852008  0.46848345  0.510187017  0.966388231
## High                0.30192968  0.46996750  0.510137567  0.964795885
## Low                 0.29905931  0.46645539  0.506830850  0.965568727
## Close               0.30386317  0.46759554  0.507359260  0.966609329
## Volume              0.01488125  0.02946048  0.034854014  0.055839217
## SMA_10              0.18382445  0.39781503  0.475445724  0.991034411
## SMA_20              0.07142303  0.28911704  0.385667117  1.000000000
## EMA_10              0.19196989  0.39701893  0.467974597  0.991102165
## RSI                 1.00000000  0.76794718  0.590222306  0.071423033
## MACD                0.76794718  1.00000000  0.956779582  0.289117042
## Signal              0.59022231  0.95677958  1.000000000  0.385667117
## BB_Middle           0.07142303  0.28911704  0.385667117  1.000000000
## BB_Upper            0.09442503  0.31186541  0.402341003  0.990280927
## BB_Lower            0.04502155  0.25823474  0.359236305  0.988654995
## Sentiment_Pos       0.04852486  0.03977505  0.035873737 -0.007989124
## Sentiment_Neg      -0.04502517 -0.04677665 -0.046005208 -0.016616515
## Sentiment_Neu      -0.02020375  0.01099277  0.020568927  0.070406257
## Sentiment_Compound  0.05040054  0.04571434  0.041698559  0.006280960
## Target             -0.06319916 -0.01289870  0.006446226  0.003704787
##                        BB_Upper     BB_Lower Sentiment_Pos Sentiment_Neg
## Open                0.963695413  0.948189193   0.001017324  -0.026115788
## High                0.962162197  0.946567769   0.003983390  -0.028935408
## Low                 0.962319764  0.947988211  -0.001741769  -0.023153542
## Close               0.963548226  0.948803201   0.002216901  -0.026706687
## Volume              0.054737251  0.055809706  -0.020309412   0.006271579
## SMA_10              0.985562560  0.975298345  -0.002246779  -0.020633890
## SMA_20              0.990280927  0.988654995  -0.007989124  -0.016616515
## EMA_10              0.985467088  0.975540901  -0.002637665  -0.020985126
## RSI                 0.094425029  0.045021549   0.048524862  -0.045025172
## MACD                0.311865412  0.258234739   0.039775052  -0.046776647
## Signal              0.402341003  0.359236305   0.035873737  -0.046005208
## BB_Middle           0.990280927  0.988654995  -0.007989124  -0.016616515
## BB_Upper            1.000000000  0.958155548  -0.008080126  -0.017530831
## BB_Lower            0.958155548  1.000000000  -0.007716351  -0.015266152
## Sentiment_Pos      -0.008080126 -0.007716351   1.000000000  -0.943181875
## Sentiment_Neg      -0.017530831 -0.015266152  -0.943181875   1.000000000
## Sentiment_Neu       0.073232242  0.065816506  -0.373638072   0.044197036
## Sentiment_Compound  0.007133732  0.005222805   0.967148360  -0.982298899
## Target              0.003011107  0.004373026  -0.022871022   0.044503215
##                    Sentiment_Neu Sentiment_Compound        Target
## Open                  0.06984539        0.017273906 -0.0022909560
## High                  0.06879879        0.019890605  0.0029889892
## Low                   0.06987155        0.014289772 -0.0001764206
## Close                 0.06788828        0.018043744 -0.0191504672
## Volume                0.04355466       -0.014851503 -0.0249945166
## SMA_10                0.06435613        0.011586392  0.0022856706
## SMA_20                0.07040626        0.006280960  0.0037047873
## EMA_10                0.06651186        0.011748179 -0.0007148014
## RSI                  -0.02020375        0.050400537 -0.0631991618
## MACD                  0.01099277        0.045714337 -0.0128986962
## Signal                0.02056893        0.041698559  0.0064462263
## BB_Middle             0.07040626        0.006280960  0.0037047873
## BB_Upper              0.07323224        0.007133732  0.0030111066
## BB_Lower              0.06581651        0.005222805  0.0043730256
## Sentiment_Pos        -0.37363807        0.967148360 -0.0228710225
## Sentiment_Neg         0.04419704       -0.982298899  0.0445032153
## Sentiment_Neu         1.00000000       -0.165668480 -0.0554699297
## Sentiment_Compound   -0.16566848        1.000000000 -0.0440140527
## Target               -0.05546993       -0.044014053  1.0000000000
# Upper-triangle colour heatmap of the correlation matrix.
corrplot(
  corr = cor_matrix,
  type = "upper",
  method = "color",
  tl.col = "black"
)

The correlation matrix results show that many variables have very high correlations (> 0.30), especially among stock price variables and technical indicators. Examples of high correlations found:

  • Open - Close = 0.997
  • Open - SMA_10 = 0.984
  • SMA_10 - EMA_10 = 0.999
  • MACD – Signal = 0.957
  • Sentiment_Pos – Sentiment_Compound = 0.967
  • Sentiment_Neg – Sentiment_Compound = -0.982

This indicates that:

  • Stock price variables (Open, High, Low, Close) have very strong correlations
  • Technical indicator variables (SMA, EMA, Bollinger Bands) are highly correlated
  • Momentum variables (MACD, Signal, RSI) are also strongly correlated
  • Sentiment variables have strong correlations with each other

In general, there are many correlations > 0.30, indicating that the variables have sufficient shared variance. Because many significant correlations exist among variables, the dataset satisfies the initial requirement for PCA and FA.

C.2. Measure of Sampling Adequacy (MSA)

KMO is used to measure whether variables are sufficiently correlated to form factors or components. The range interpretation:

  • 0.90+: Excellent
  • 0.80+: Meritorious
  • 0.70+: Middling
  • 0.60+: Mediocre
  • 0.50+: Miserable (minimum acceptable)
  • <0.50: Unacceptable
# Kaiser-Meyer-Olkin measure of sampling adequacy.
# KMO() has to invert the correlation matrix, so it is given the
# full-precision correlations: rounding to 3 decimals first perturbs the
# matrix that is being inverted.
# NOTE(review): SMA_20 and BB_Middle correlate at exactly 1 in the printed
# matrix, so the inversion is still expected to fail on the full variable set
# until one of the duplicated indicators is removed.
mat_corr <- cor(numeric_data)
KMO(mat_corr)
## Error in solve.default(r) : 
##   Lapack routine dgesv: system is exactly singular: U[19,19] = 0
## matrix is not invertible, image not found
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = mat_corr)
## Overall MSA =  0.5
## MSA for each item = 
##               Open               High                Low              Close 
##                0.5                0.5                0.5                0.5 
##             Volume             SMA_10             SMA_20             EMA_10 
##                0.5                0.5                0.5                0.5 
##                RSI               MACD             Signal          BB_Middle 
##                0.5                0.5                0.5                0.5 
##           BB_Upper           BB_Lower      Sentiment_Pos      Sentiment_Neg 
##                0.5                0.5                0.5                0.5 
##      Sentiment_Neu Sentiment_Compound             Target 
##                0.5                0.5                0.5

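Overall MSA = 0.5 nominally sits at the minimum acceptable threshold for PCA. Note, however, that the output above reports "matrix is not invertible, image not found": the correlation matrix could not be inverted, so the identical value of 0.5 for every variable is a fallback result of the failed inversion rather than a genuine estimate of sampling adequacy. It should be treated as provisional until the redundant variables are removed.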

This means:

  • Correlations among variables are sufficient for PCA
  • The factor structure quality is relatively low
  • PCA can still be performed, but interpretation must be done carefully

C.3. Bartlett’s Test

Bartlett’s Test is used to test whether the correlation matrix is significantly different from the identity matrix. An identity matrix means that there are no correlations among variables, in which case PCA cannot be used.

Hypothesis:

  • H0: There are no correlations among variables
  • H1: There are correlations among variables
# Bartlett's test of sphericity (H0: the correlation matrix is an identity).
# Full-precision correlations are used because the test statistic depends on
# the determinant of the matrix; rounding first would distort it.
mat_corr <- cor(numeric_data)
library(psych)
n <- nrow(numeric_data)  # number of complete observations
p <- ncol(numeric_data)  # number of variables
cortest.bartlett(mat_corr, n = n, diag = TRUE)
## $chisq
## [1] Inf
## 
## $p.value
## [1] 0
## 
## $df
## [1] 171

Bartlett’s Test produced a p-value = 0 (< 0.05) with 171 degrees of freedom. A p-value below 0.05 means that H0 is rejected, so the result is statistically significant. This means:

  • The correlation matrix is significantly different from the identity matrix
  • Correlations exist among variables
  • PCA is appropriate to perform

The chi-square value is reported as Inf because the determinant of the correlation matrix is zero, which reflects extremely strong (in places perfect) correlations among variables, especially in price indicators and moving averages. These results indicate that significant correlations exist among variables, so PCA and FA can proceed.

However, there is an important issue: the correlation matrix was singular in the MSA test because of perfect multicollinearity. This happens because some variables have extremely high correlations (≈1.000). For example, SMA_20 - BB_Middle = 1.000, SMA_10 - EMA_10 = 0.999, Open - Close = 0.997. This problem makes PCA unstable, makes the KMO low, and prevents the matrix from being inverted.

The solution is to remove redundant variables. In this case, BB_Middle ≈ SMA_20, EMA_10 ≈ SMA_10, and Open, High, Low, and Close are highly redundant.

numeric_data_reduced

# Drop the indicators judged redundant and keep every other column in order.
redundant_vars <- c("BB_Middle", "EMA_10", "High", "Low")
kept_vars <- setdiff(colnames(numeric_data), redundant_vars)
numeric_data_reduced <- numeric_data[, kept_vars]

After removing them, re-compute the correlation matrix.

library(sjPlot)
# Correlation matrix of the reduced data, rounded to 3 decimals.
mat_corr <- round(cor(numeric_data_reduced), 3)
# Formatted correlation table of the same data.
tab_corr(numeric_data_reduced)
  Open Close Volume SMA_10 SMA_20 RSI MACD Signal BB_Upper BB_Lower Sentiment_Pos Sentiment_Neg Sentiment_Neu Sentiment_Compound Target
Open   0.997*** 0.055 0.984*** 0.966*** 0.299*** 0.468*** 0.510*** 0.964*** 0.948*** 0.001 -0.026 0.070* 0.017 -0.002
Close 0.997***   0.055 0.985*** 0.967*** 0.304*** 0.468*** 0.507*** 0.964*** 0.949*** 0.002 -0.027 0.068* 0.018 -0.019
Volume 0.055 0.055   0.060 0.056 0.015 0.029 0.035 0.055 0.056 -0.020 0.006 0.044 -0.015 -0.025
SMA_10 0.984*** 0.985*** 0.060   0.991*** 0.184*** 0.398*** 0.475*** 0.986*** 0.975*** -0.002 -0.021 0.064* 0.012 0.002
SMA_20 0.966*** 0.967*** 0.056 0.991***   0.071* 0.289*** 0.386*** 0.990*** 0.989*** -0.008 -0.017 0.070* 0.006 0.004
RSI 0.299*** 0.304*** 0.015 0.184*** 0.071*   0.768*** 0.590*** 0.094** 0.045 0.049 -0.045 -0.020 0.050 -0.063*
MACD 0.468*** 0.468*** 0.029 0.398*** 0.289*** 0.768***   0.957*** 0.312*** 0.258*** 0.040 -0.047 0.011 0.046 -0.013
Signal 0.510*** 0.507*** 0.035 0.475*** 0.386*** 0.590*** 0.957***   0.402*** 0.359*** 0.036 -0.046 0.021 0.042 0.006
BB_Upper 0.964*** 0.964*** 0.055 0.986*** 0.990*** 0.094** 0.312*** 0.402***   0.958*** -0.008 -0.018 0.073* 0.007 0.003
BB_Lower 0.948*** 0.949*** 0.056 0.975*** 0.989*** 0.045 0.258*** 0.359*** 0.958***   -0.008 -0.015 0.066* 0.005 0.004
Sentiment_Pos 0.001 0.002 -0.020 -0.002 -0.008 0.049 0.040 0.036 -0.008 -0.008   -0.943*** -0.374*** 0.967*** -0.023
Sentiment_Neg -0.026 -0.027 0.006 -0.021 -0.017 -0.045 -0.047 -0.046 -0.018 -0.015 -0.943***   0.044 -0.982*** 0.045
Sentiment_Neu 0.070* 0.068* 0.044 0.064* 0.070* -0.020 0.011 0.021 0.073* 0.066* -0.374*** 0.044   -0.166*** -0.055
Sentiment_Compound 0.017 0.018 -0.015 0.012 0.006 0.050 0.046 0.042 0.007 0.005 0.967*** -0.982*** -0.166***   -0.044
Target -0.002 -0.019 -0.025 0.002 0.004 -0.063* -0.013 0.006 0.003 0.004 -0.023 0.045 -0.055 -0.044  
Computed correlation used pearson-method with listwise-deletion.
# Unrounded correlation matrix of the reduced data, printed for inspection.
cor_matrix <- cor(numeric_data_reduced)
print(cor_matrix)
##                            Open        Close       Volume       SMA_10
## Open                1.000000000  0.997368627  0.055483532  0.984413164
## Close               0.997368627  1.000000000  0.054794943  0.984676662
## Volume              0.055483532  0.054794943  1.000000000  0.060175250
## SMA_10              0.984413164  0.984676662  0.060175250  1.000000000
## SMA_20              0.966388231  0.966609329  0.055839217  0.991034411
## RSI                 0.298520083  0.303863171  0.014881255  0.183824449
## MACD                0.468483455  0.467595543  0.029460477  0.397815029
## Signal              0.510187017  0.507359260  0.034854014  0.475445724
## BB_Upper            0.963695413  0.963548226  0.054737251  0.985562560
## BB_Lower            0.948189193  0.948803201  0.055809706  0.975298345
## Sentiment_Pos       0.001017324  0.002216901 -0.020309412 -0.002246779
## Sentiment_Neg      -0.026115788 -0.026706687  0.006271579 -0.020633890
## Sentiment_Neu       0.069845390  0.067888281  0.043554662  0.064356129
## Sentiment_Compound  0.017273906  0.018043744 -0.014851503  0.011586392
## Target             -0.002290956 -0.019150467 -0.024994517  0.002285671
##                          SMA_20         RSI        MACD       Signal
## Open                0.966388231  0.29852008  0.46848345  0.510187017
## Close               0.966609329  0.30386317  0.46759554  0.507359260
## Volume              0.055839217  0.01488125  0.02946048  0.034854014
## SMA_10              0.991034411  0.18382445  0.39781503  0.475445724
## SMA_20              1.000000000  0.07142303  0.28911704  0.385667117
## RSI                 0.071423033  1.00000000  0.76794718  0.590222306
## MACD                0.289117042  0.76794718  1.00000000  0.956779582
## Signal              0.385667117  0.59022231  0.95677958  1.000000000
## BB_Upper            0.990280927  0.09442503  0.31186541  0.402341003
## BB_Lower            0.988654995  0.04502155  0.25823474  0.359236305
## Sentiment_Pos      -0.007989124  0.04852486  0.03977505  0.035873737
## Sentiment_Neg      -0.016616515 -0.04502517 -0.04677665 -0.046005208
## Sentiment_Neu       0.070406257 -0.02020375  0.01099277  0.020568927
## Sentiment_Compound  0.006280960  0.05040054  0.04571434  0.041698559
## Target              0.003704787 -0.06319916 -0.01289870  0.006446226
##                        BB_Upper     BB_Lower Sentiment_Pos Sentiment_Neg
## Open                0.963695413  0.948189193   0.001017324  -0.026115788
## Close               0.963548226  0.948803201   0.002216901  -0.026706687
## Volume              0.054737251  0.055809706  -0.020309412   0.006271579
## SMA_10              0.985562560  0.975298345  -0.002246779  -0.020633890
## SMA_20              0.990280927  0.988654995  -0.007989124  -0.016616515
## RSI                 0.094425029  0.045021549   0.048524862  -0.045025172
## MACD                0.311865412  0.258234739   0.039775052  -0.046776647
## Signal              0.402341003  0.359236305   0.035873737  -0.046005208
## BB_Upper            1.000000000  0.958155548  -0.008080126  -0.017530831
## BB_Lower            0.958155548  1.000000000  -0.007716351  -0.015266152
## Sentiment_Pos      -0.008080126 -0.007716351   1.000000000  -0.943181875
## Sentiment_Neg      -0.017530831 -0.015266152  -0.943181875   1.000000000
## Sentiment_Neu       0.073232242  0.065816506  -0.373638072   0.044197036
## Sentiment_Compound  0.007133732  0.005222805   0.967148360  -0.982298899
## Target              0.003011107  0.004373026  -0.022871022   0.044503215
##                    Sentiment_Neu Sentiment_Compound       Target
## Open                  0.06984539        0.017273906 -0.002290956
## Close                 0.06788828        0.018043744 -0.019150467
## Volume                0.04355466       -0.014851503 -0.024994517
## SMA_10                0.06435613        0.011586392  0.002285671
## SMA_20                0.07040626        0.006280960  0.003704787
## RSI                  -0.02020375        0.050400537 -0.063199162
## MACD                  0.01099277        0.045714337 -0.012898696
## Signal                0.02056893        0.041698559  0.006446226
## BB_Upper              0.07323224        0.007133732  0.003011107
## BB_Lower              0.06581651        0.005222805  0.004373026
## Sentiment_Pos        -0.37363807        0.967148360 -0.022871022
## Sentiment_Neg         0.04419704       -0.982298899  0.044503215
## Sentiment_Neu         1.00000000       -0.165668480 -0.055469930
## Sentiment_Compound   -0.16566848        1.000000000 -0.044014053
## Target               -0.05546993       -0.044014053  1.000000000
# Colored correlation plot: upper triangle only, black variable labels.
corrplot(cor_matrix, method = "color", type = "upper", tl.col = "black")

Re-do the MSA-KMO Test too.

# KMO sampling adequacy for the reduced data.
# NOTE(review): KMO() is given a correlation matrix rounded to 3 decimals.
# Rounding may perturb a near-singular matrix enough to change the MSA values;
# consider passing the unrounded cor(numeric_data_reduced) -- TODO confirm.
mat_corr <- round(cor(numeric_data_reduced),3)
KMO(mat_corr)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = mat_corr)
## Overall MSA =  0.37
## MSA for each item = 
##               Open              Close             Volume             SMA_10 
##               0.57               0.55               0.55               0.65 
##             SMA_20                RSI               MACD             Signal 
##               0.32               0.42               0.40               0.42 
##           BB_Upper           BB_Lower      Sentiment_Pos      Sentiment_Neg 
##               0.42               0.40               0.18               0.17 
##      Sentiment_Neu Sentiment_Compound             Target 
##               0.02               0.84               0.02

And also the Bartlett’s Test.

# Bartlett's test on the 3-decimal rounded correlation matrix of the reduced
# data set.
library(psych)
mat_corr <- round(cor(numeric_data_reduced), 3)
n <- nrow(numeric_data_reduced)  # number of rows
p <- ncol(numeric_data_reduced)  # number of columns
cortest.bartlett(mat_corr, n = n, diag = TRUE)
## Warning in log(detR): NaNs produced
## $chisq
## [1] NaN
## 
## $p.value
## [1] NaN
## 
## $df
## [1] 105

After removing redundant variables (BB_Middle, EMA_10, High, and Low), the correlation matrix of the reduced dataset showed that stock price variables and technical indicators remained highly correlated. For example, Open - Close showed 0.997, SMA_10 - SMA_20 showed 0.991, BB_Upper - SMA_20 showed 0.990, and MACD - Signal showed 0.957. These high correlations indicate strong shared variance among technical indicators. In contrast, the sentiment variables (Sentiment_Pos, Sentiment_Neg, Sentiment_Neu) and Target showed extremely weak correlations with the price and technical indicators, with values close to zero, indicating poor integration into the shared variance structure.

The KMO test result showed an Overall MSA value of 0.37, which is below the acceptable threshold of 0.50, indicating that the dataset was not suitable for PCA. Bartlett’s Test produced NaN results due to matrix singularity, confirming that the correlation matrix was not invertible and PCA could not yet be performed. Therefore, we need to eliminate some variables with low MSA values.

data_step1

# Step 1: drop the four variables named below from the reduced data.
data_step1 <- numeric_data_reduced[, !(colnames(numeric_data_reduced) %in%
  c("Sentiment_Neu", "Target", "Sentiment_Neg", "Sentiment_Pos"))]

# Unrounded correlation matrix used by the Bartlett and KMO calls that follow.
mat_corr1 <- cor(data_step1)

library(psych)
# tab_corr() was brought into scope by the earlier library(sjPlot) call.
tab_corr(data_step1)
  Open Close Volume SMA_10 SMA_20 RSI MACD Signal BB_Upper BB_Lower Sentiment_Compound
Open   0.997*** 0.055 0.984*** 0.966*** 0.299*** 0.468*** 0.510*** 0.964*** 0.948*** 0.017
Close 0.997***   0.055 0.985*** 0.967*** 0.304*** 0.468*** 0.507*** 0.964*** 0.949*** 0.018
Volume 0.055 0.055   0.060 0.056 0.015 0.029 0.035 0.055 0.056 -0.015
SMA_10 0.984*** 0.985*** 0.060   0.991*** 0.184*** 0.398*** 0.475*** 0.986*** 0.975*** 0.012
SMA_20 0.966*** 0.967*** 0.056 0.991***   0.071* 0.289*** 0.386*** 0.990*** 0.989*** 0.006
RSI 0.299*** 0.304*** 0.015 0.184*** 0.071*   0.768*** 0.590*** 0.094** 0.045 0.050
MACD 0.468*** 0.468*** 0.029 0.398*** 0.289*** 0.768***   0.957*** 0.312*** 0.258*** 0.046
Signal 0.510*** 0.507*** 0.035 0.475*** 0.386*** 0.590*** 0.957***   0.402*** 0.359*** 0.042
BB_Upper 0.964*** 0.964*** 0.055 0.986*** 0.990*** 0.094** 0.312*** 0.402***   0.958*** 0.007
BB_Lower 0.948*** 0.949*** 0.056 0.975*** 0.989*** 0.045 0.258*** 0.359*** 0.958***   0.005
Sentiment_Compound 0.017 0.018 -0.015 0.012 0.006 0.050 0.046 0.042 0.007 0.005  
Computed correlation used pearson-method with listwise-deletion.
# Unrounded correlation matrix of data_step1, printed for inspection.
cor_matrix <- cor(data_step1)
print(cor_matrix)
##                          Open      Close      Volume     SMA_10     SMA_20
## Open               1.00000000 0.99736863  0.05548353 0.98441316 0.96638823
## Close              0.99736863 1.00000000  0.05479494 0.98467666 0.96660933
## Volume             0.05548353 0.05479494  1.00000000 0.06017525 0.05583922
## SMA_10             0.98441316 0.98467666  0.06017525 1.00000000 0.99103441
## SMA_20             0.96638823 0.96660933  0.05583922 0.99103441 1.00000000
## RSI                0.29852008 0.30386317  0.01488125 0.18382445 0.07142303
## MACD               0.46848345 0.46759554  0.02946048 0.39781503 0.28911704
## Signal             0.51018702 0.50735926  0.03485401 0.47544572 0.38566712
## BB_Upper           0.96369541 0.96354823  0.05473725 0.98556256 0.99028093
## BB_Lower           0.94818919 0.94880320  0.05580971 0.97529834 0.98865500
## Sentiment_Compound 0.01727391 0.01804374 -0.01485150 0.01158639 0.00628096
##                           RSI       MACD     Signal    BB_Upper    BB_Lower
## Open               0.29852008 0.46848345 0.51018702 0.963695413 0.948189193
## Close              0.30386317 0.46759554 0.50735926 0.963548226 0.948803201
## Volume             0.01488125 0.02946048 0.03485401 0.054737251 0.055809706
## SMA_10             0.18382445 0.39781503 0.47544572 0.985562560 0.975298345
## SMA_20             0.07142303 0.28911704 0.38566712 0.990280927 0.988654995
## RSI                1.00000000 0.76794718 0.59022231 0.094425029 0.045021549
## MACD               0.76794718 1.00000000 0.95677958 0.311865412 0.258234739
## Signal             0.59022231 0.95677958 1.00000000 0.402341003 0.359236305
## BB_Upper           0.09442503 0.31186541 0.40234100 1.000000000 0.958155548
## BB_Lower           0.04502155 0.25823474 0.35923630 0.958155548 1.000000000
## Sentiment_Compound 0.05040054 0.04571434 0.04169856 0.007133732 0.005222805
##                    Sentiment_Compound
## Open                      0.017273906
## Close                     0.018043744
## Volume                   -0.014851503
## SMA_10                    0.011586392
## SMA_20                    0.006280960
## RSI                       0.050400537
## MACD                      0.045714337
## Signal                    0.041698559
## BB_Upper                  0.007133732
## BB_Lower                  0.005222805
## Sentiment_Compound        1.000000000
# Colored correlation plot: upper triangle only, black variable labels.
corrplot(cor_matrix, method = "color", type = "upper", tl.col = "black")

# Bartlett's test on the unrounded step-1 correlation matrix.
cortest.bartlett(mat_corr1, n = nrow(data_step1))
## Warning in log(detR): NaNs produced
## $chisq
## [1] NaN
## 
## $p.value
## [1] NaN
## 
## $df
## [1] 55
# Kaiser-Meyer-Olkin sampling adequacy for the step-1 correlation matrix.
KMO(mat_corr1)
## Error in solve.default(r) : 
##   system is computationally singular: reciprocal condition number = 4.18472e-18
## matrix is not invertible, image not found
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = mat_corr1)
## Overall MSA =  0.5
## MSA for each item = 
##               Open              Close             Volume             SMA_10 
##                0.5                0.5                0.5                0.5 
##             SMA_20                RSI               MACD             Signal 
##                0.5                0.5                0.5                0.5 
##           BB_Upper           BB_Lower Sentiment_Compound 
##                0.5                0.5                0.5

In step 1, variables with extremely low MSA values were removed. These variables included Sentiment_Neu, Target, Sentiment_Pos, and Sentiment_Neg. After removing these variables, the correlation matrix showed improved structural consistency among the remaining variables, especially among price and technical indicators such as Open, Close, SMA_10, SMA_20, MACD, Signal, and Bollinger Bands.

The KMO output showed an Overall MSA value of 0.50, which nominally equals the minimum acceptable threshold. However, this value is not a genuine estimate: KMO() reported that the matrix is not invertible ("system is computationally singular"), and every individual MSA was returned as exactly 0.5. The correlation matrix therefore remained computationally singular due to severe multicollinearity among technical indicators. Bartlett’s Test still produced NaN values, confirming that the matrix was unstable. So we eliminate variables with low MSA values again. Because the MSA values in data_step1 are all identical and therefore uninformative, we remove the variables that had low MSA values in the numeric_data_reduced test.

data_step2

# Step 2: drop the three variables named below from data_step1.
data_step2 <- data_step1[, !(colnames(data_step1) %in%
  c("SMA_20", "MACD", "BB_Lower"))]

# Unrounded correlation matrix used by the Bartlett and KMO calls that follow.
mat_corr2 <- cor(data_step2)

library(psych)
# Same correlations again, printed for inspection and reused for plotting.
cor_matrix <- cor(data_step2)
print(cor_matrix)
##                          Open      Close      Volume     SMA_10        RSI
## Open               1.00000000 0.99736863  0.05548353 0.98441316 0.29852008
## Close              0.99736863 1.00000000  0.05479494 0.98467666 0.30386317
## Volume             0.05548353 0.05479494  1.00000000 0.06017525 0.01488125
## SMA_10             0.98441316 0.98467666  0.06017525 1.00000000 0.18382445
## RSI                0.29852008 0.30386317  0.01488125 0.18382445 1.00000000
## Signal             0.51018702 0.50735926  0.03485401 0.47544572 0.59022231
## BB_Upper           0.96369541 0.96354823  0.05473725 0.98556256 0.09442503
## Sentiment_Compound 0.01727391 0.01804374 -0.01485150 0.01158639 0.05040054
##                        Signal    BB_Upper Sentiment_Compound
## Open               0.51018702 0.963695413        0.017273906
## Close              0.50735926 0.963548226        0.018043744
## Volume             0.03485401 0.054737251       -0.014851503
## SMA_10             0.47544572 0.985562560        0.011586392
## RSI                0.59022231 0.094425029        0.050400537
## Signal             1.00000000 0.402341003        0.041698559
## BB_Upper           0.40234100 1.000000000        0.007133732
## Sentiment_Compound 0.04169856 0.007133732        1.000000000
# Colored correlation plot: upper triangle only, black variable labels.
corrplot(cor_matrix, method = "color", type = "upper", tl.col = "black")

# Formatted correlation table for the step-2 variables.
tab_corr(data_step2)
  Open Close Volume SMA_10 RSI Signal BB_Upper Sentiment_Compound
Open   0.997*** 0.055 0.984*** 0.299*** 0.510*** 0.964*** 0.017
Close 0.997***   0.055 0.985*** 0.304*** 0.507*** 0.964*** 0.018
Volume 0.055 0.055   0.060 0.015 0.035 0.055 -0.015
SMA_10 0.984*** 0.985*** 0.060   0.184*** 0.475*** 0.986*** 0.012
RSI 0.299*** 0.304*** 0.015 0.184***   0.590*** 0.094** 0.050
Signal 0.510*** 0.507*** 0.035 0.475*** 0.590***   0.402*** 0.042
BB_Upper 0.964*** 0.964*** 0.055 0.986*** 0.094** 0.402***   0.007
Sentiment_Compound 0.017 0.018 -0.015 0.012 0.050 0.042 0.007  
Computed correlation used pearson-method with listwise-deletion.
# Bartlett's test on the unrounded step-2 correlation matrix.
cortest.bartlett(mat_corr2, n = nrow(data_step2))
## $chisq
## [1] 13919.31
## 
## $p.value
## [1] 0
## 
## $df
## [1] 28
# Kaiser-Meyer-Olkin sampling adequacy for the step-2 correlation matrix.
KMO(mat_corr2)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = mat_corr2)
## Overall MSA =  0.79
## MSA for each item = 
##               Open              Close             Volume             SMA_10 
##               0.82               0.77               0.77               0.83 
##                RSI             Signal           BB_Upper Sentiment_Compound 
##               0.47               0.75               0.86               0.84

As before, in step 2 variables with low MSA values and high redundancy were removed. These variables included SMA_20, MACD, and BB_Lower. After removing these variables, the correlation matrix became more stable while maintaining strong correlations among core variables.

The KMO test result improved significantly, with an Overall MSA value of 0.79, indicating meritorious sampling adequacy. Bartlett’s Test produced a chi-square value of 13919.31 with a p-value of 0.000, confirming that the correlation matrix was statistically significant and suitable for PCA. However, individual MSA values showed that RSI still had an MSA value of 0.47, which is below the acceptable threshold of 0.50, indicating that this variable did not adequately share variance with the other variables.

Even though step 2 already achieved a high Overall KMO value of 0.79, PCA assumptions require that not only the Overall KMO but also the individual MSA values for each variable must be above 0.50. In Step 2, RSI had an individual MSA value of 0.47, which is below the acceptable threshold. So we apply the step 3 to eliminate RSI.

data_step3

# Step 3: drop RSI from data_step2.
data_step3 <- data_step2[, !(colnames(data_step2) %in% c("RSI"))]

# Unrounded correlation matrix used by the Bartlett and KMO calls that follow.
mat_corr3 <- cor(data_step3)

library(psych)
# tab_corr() was brought into scope by the earlier library(sjPlot) call.
tab_corr(data_step3)
  Open Close Volume SMA_10 Signal BB_Upper Sentiment_Compound
Open   0.997*** 0.055 0.984*** 0.510*** 0.964*** 0.017
Close 0.997***   0.055 0.985*** 0.507*** 0.964*** 0.018
Volume 0.055 0.055   0.060 0.035 0.055 -0.015
SMA_10 0.984*** 0.985*** 0.060   0.475*** 0.986*** 0.012
Signal 0.510*** 0.507*** 0.035 0.475***   0.402*** 0.042
BB_Upper 0.964*** 0.964*** 0.055 0.986*** 0.402***   0.007
Sentiment_Compound 0.017 0.018 -0.015 0.012 0.042 0.007  
Computed correlation used pearson-method with listwise-deletion.
# Bartlett's test on the unrounded step-3 correlation matrix.
cortest.bartlett(mat_corr3, n = nrow(data_step3))
## $chisq
## [1] 12611.73
## 
## $p.value
## [1] 0
## 
## $df
## [1] 21
# Kaiser-Meyer-Olkin sampling adequacy for the step-3 correlation matrix.
KMO(mat_corr3)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = mat_corr3)
## Overall MSA =  0.79
## MSA for each item = 
##               Open              Close             Volume             SMA_10 
##               0.79               0.78               0.79               0.81 
##             Signal           BB_Upper Sentiment_Compound 
##               0.80               0.79               0.63

RSI was removed to improve overall model stability. After removing this variable, the correlation matrix remained strong and stable, showing high correlations among the remaining price and technical indicators.

The KMO test result remained at 0.79, indicating continued strong sampling adequacy. Importantly, all remaining variables now had individual MSA values above 0.50. Bartlett’s Test produced a chi-square value of 12611.73 with a p-value of 0.000, confirming that the correlation matrix was statistically significant and fully suitable for PCA.

data_final_10

As an alternative, we also apply this 10-variable structure, in case the minimum number of variables required for PCA in this task is 10. This is the final data we use for the next steps.

# Final 10-variable data set, selected by name from the full numeric data.
data_final_10 <- numeric_data[, c(
  "Open", "Close", "Volume", "SMA_10", "SMA_20",
  "RSI", "Signal", "BB_Upper", "Sentiment_Compound", "MACD"
)]

# Unrounded correlation matrix used by the Bartlett and KMO calls that follow.
mat_corr_final10 <- cor(data_final_10)

library(psych)
# tab_corr() was brought into scope by the earlier library(sjPlot) call.
tab_corr(data_final_10)
  Open Close Volume SMA_10 SMA_20 RSI Signal BB_Upper Sentiment_Compound MACD
Open   0.997*** 0.055 0.984*** 0.966*** 0.299*** 0.510*** 0.964*** 0.017 0.468***
Close 0.997***   0.055 0.985*** 0.967*** 0.304*** 0.507*** 0.964*** 0.018 0.468***
Volume 0.055 0.055   0.060 0.056 0.015 0.035 0.055 -0.015 0.029
SMA_10 0.984*** 0.985*** 0.060   0.991*** 0.184*** 0.475*** 0.986*** 0.012 0.398***
SMA_20 0.966*** 0.967*** 0.056 0.991***   0.071* 0.386*** 0.990*** 0.006 0.289***
RSI 0.299*** 0.304*** 0.015 0.184*** 0.071*   0.590*** 0.094** 0.050 0.768***
Signal 0.510*** 0.507*** 0.035 0.475*** 0.386*** 0.590***   0.402*** 0.042 0.957***
BB_Upper 0.964*** 0.964*** 0.055 0.986*** 0.990*** 0.094** 0.402***   0.007 0.312***
Sentiment_Compound 0.017 0.018 -0.015 0.012 0.006 0.050 0.042 0.007   0.046
MACD 0.468*** 0.468*** 0.029 0.398*** 0.289*** 0.768*** 0.957*** 0.312*** 0.046  
Computed correlation used pearson-method with listwise-deletion.
# Bartlett's test on the unrounded correlation matrix of the final 10 variables.
cortest.bartlett(mat_corr_final10, n = nrow(data_final_10))
## $chisq
## [1] 24680.21
## 
## $p.value
## [1] 0
## 
## $df
## [1] 45
# Kaiser-Meyer-Olkin sampling adequacy for the final 10-variable matrix.
KMO(mat_corr_final10)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = mat_corr_final10)
## Overall MSA =  0.71
## MSA for each item = 
##               Open              Close             Volume             SMA_10 
##               0.91               0.75               0.74               0.73 
##             SMA_20                RSI             Signal           BB_Upper 
##               0.62               0.65               0.53               0.99 
## Sentiment_Compound               MACD 
##               0.80               0.51

The Kaiser–Meyer–Olkin (KMO) test produced an Overall MSA value of 0.71, indicating good sampling adequacy and confirming that the dataset is suitable for PCA, as it exceeds the minimum threshold of 0.50. All individual MSA values were also above 0.50, ranging from 0.51 to 0.99, confirming that each variable contributes adequately to the shared variance.

Bartlett’s Test showed a chi-square value of 24680.21 with a p-value of 0.000, indicating that the correlation matrix is not an identity matrix and that sufficient correlations exist among variables. As a result, although this configuration includes more variables, it still satisfies all PCA assumptions and can be used as a statistically valid alternative when retaining at least 10 variables is required.

D. Stage 4. Deriving Factors and Assessing Overall Fit

After the data satisfies all PCA assumptions, we need to scale it and apply PCA before deriving factors and assessing overall fit.

library(FactoMineR)
library(factoextra)

# PCA on the final 10 variables with unit scaling requested and the built-in
# plots switched off.
pca_final10 <- PCA(data_final_10, scale.unit = TRUE, graph = FALSE)

Then see the eigenvalue table with the variance and cumulative variance.

# Eigenvalue table of the PCA: eigenvalue, variance percent, and cumulative
# variance percent per dimension.
eig_values <- get_eigenvalue(pca_final10)

# Print the table.
eig_values
##          eigenvalue variance.percent cumulative.variance.percent
## Dim.1  5.5619369902     55.619369902                    55.61937
## Dim.2  1.9946199607     19.946199607                    75.56557
## Dim.3  1.0115725982     10.115725982                    85.68130
## Dim.4  0.9798320048      9.798320048                    95.47962
## Dim.5  0.4099887027      4.099887027                    99.57950
## Dim.6  0.0206605410      0.206605410                    99.78611
## Dim.7  0.0127125825      0.127125825                    99.91323
## Dim.8  0.0055062609      0.055062609                    99.96830
## Dim.9  0.0025449606      0.025449606                    99.99375
## Dim.10 0.0006253987      0.006253987                   100.00000

And also do the Scree Plot too.

library(ggplot2)

# Plotting table built from the eigenvalue table. Columns are selected by
# name so the plot does not depend on their position in eig_values.
eig_df <- data.frame(
  Component = seq_len(nrow(eig_values)),  # stays valid for a zero-row table
  Eigenvalue = eig_values[, "eigenvalue"],
  Variance = eig_values[, "variance.percent"]  # percentage of variance
)

# Scree plot: eigenvalue bars with an overlaid red line and points, a dashed
# reference line at eigenvalue = 1, and each bar labeled with its variance %.
ggplot(eig_df, aes(x = Component, y = Eigenvalue)) +
  geom_col(fill = "lightblue", color = "black") +
  geom_point(color = "red", size = 3) +
  geom_line(color = "red") +
  geom_hline(yintercept = 1, linetype = "dashed", color = "blue") +
  geom_text(aes(label = paste0(round(Variance, 2), "%")),
            vjust = -0.8,
            size = 3.5) +
  labs(title = "Scree Plot Final 10 Variables",
       x = "Component",
       y = "Eigenvalue") +
  theme_minimal()

D.1. A Priori Criterion

Practical considerations suggest retaining multiple variables per component, with at least 2 to 3 variables representing each component. Using more than three components would likely produce components with limited substantive contribution and smaller eigenvalues, which would reduce interpretability.

D.2. Latent Root Criterion (Eigenvalue > 1)

Based on the latent root criterion, the results show:

  • Component 1 eigenvalue = 5.562
  • Component 2 eigenvalue = 1.995
  • Component 3 eigenvalue = 1.012
  • Component 4 eigenvalue = 0.980

The first 3 components have eigenvalues greater than 1.0. Therefore, 3 components should be maintained according to the latent root criterion.

Although Component 4 has an eigenvalue only slightly below 1.0 (0.98) and explains nearly 10% of the variance, it was not retained under the strict Kaiser criterion.

Total variance explained by first three components: 55.62% + 19.95% + 10.12% = 85.68% This level of explained variance is considered excellent.

D.3. Percentage of Variance Criterion

The first 3 components explain 85.68% of the total variance, which exceeds the commonly accepted threshold of 60%. This indicates that the maintained components sufficiently summarize the original variables. This confirms that PCA achieves strong dimensional reduction while preserving most of the information.

D.4. Scree Test Criterion

The scree plot shows a sharp decline between Component 1, Component 2, and Component 3, followed by a noticeable flattening of the curve after Component 3. This indicates that the elbow occurs at Component 3. This suggests that retaining 3 components is appropriate.

D.5. Decision

Combining all criteria:

  • A priori criterion: suggests 2-3 components
  • Latent root criterion: suggests 3 components
  • Percentage variance criterion: suggests 3 components
  • Scree plot criterion: suggests 3 components

Final decision: 3 components are retained.

E. Stage 5: Interpreting The Factors

First, apply PCA with 3 components.

library(psych)

# Principal components analysis of the final 10 variables: 3 components,
# varimax rotation, component scores requested.
pca_stage5 <- principal(data_final_10, nfactors = 3, rotate = "varimax",
                        scores = TRUE)

# Print with a 0.40 loading cut-off and sorted items.
print(pca_stage5, cut = 0.40, sort = TRUE)
## Principal Components Analysis
## Call: principal(r = data_final_10, nfactors = 3, rotate = "varimax", 
##     scores = TRUE)
## Standardized loadings (pattern matrix) based upon correlation matrix
##                    item  RC1  RC2   RC3   h2     u2 com
## SMA_20                5 1.00            1.00 0.0028 1.0
## BB_Upper              8 0.99            0.99 0.0124 1.0
## SMA_10                4 0.98            1.00 0.0043 1.1
## Open                  1 0.96            0.99 0.0136 1.2
## Close                 2 0.96            0.99 0.0139 1.2
## MACD                 10      0.96       0.97 0.0303 1.1
## RSI                   6      0.87       0.77 0.2337 1.0
## Signal                7      0.87       0.86 0.1433 1.3
## Volume                3            0.72 0.52 0.4811 1.0
## Sentiment_Compound    9           -0.71 0.50 0.4965 1.0
## 
##                        RC1  RC2  RC3
## SS loadings           4.92 2.63 1.02
## Proportion Var        0.49 0.26 0.10
## Cumulative Var        0.49 0.76 0.86
## Proportion Explained  0.57 0.31 0.12
## Cumulative Proportion 0.57 0.88 1.00
## 
## Mean item complexity =  1.1
## Test of the hypothesis that 3 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.06 
##  with the empirical chi square  281.61  with prob <  2.9e-49 
## 
## Fit based upon off diagonal values = 0.99

Then show communalities.

# Communality of each variable under the 3-component solution.
pca_stage5$communality
##               Open              Close             Volume             SMA_10 
##          0.9863506          0.9861092          0.5188637          0.9957455 
##             SMA_20                RSI             Signal           BB_Upper 
##          0.9972188          0.7663040          0.8567283          0.9876485 
## Sentiment_Compound               MACD 
##          0.5034670          0.9696939

And also show variance explained.

# Variance accounted for by each rotated component.
pca_stage5$Vaccounted
##                             RC1       RC2       RC3
## SS loadings           4.9235717 2.6287684 1.0157894
## Proportion Var        0.4923572 0.2628768 0.1015789
## Cumulative Var        0.4923572 0.7552340 0.8568130
## Proportion Explained  0.5746379 0.3068077 0.1185544
## Cumulative Proportion 0.5746379 0.8814456 1.0000000

Interpreting the Principal Components

  1. SMA_20 (1.00)
  2. BB_Upper (0.99)
  3. SMA_10 (0.98)
  4. Open (0.96)
  5. Close (0.96)
  1. MACD (0.96)
  2. RSI (0.87)
  3. Signal (0.87)
  1. Volume (0.72)
  2. Sentiment_Compound (−0.71)

Communalities Interpretation

  1. The highest communalities are observed in SMA_20 (0.997), SMA_10 (0.996), and BB_Upper (0.988).
  2. The lowest communalities are Sentiment_Compound (0.503) and Volume (0.519).

Variance Explained and Rotation Interpretation

  1. Component 1 explains 49.24% of the variance
  2. Component 2 explains 26.29% of the variance
  3. Component 3 explains 10.16% of the variance

Principal Component Naming and Interpretation

F. Stage 6: Validation of Principal Component Analysis

Split-Sample Validation

  1. 3 retained components
  2. Varimax rotation
  3. Loading cut-off of 0.40
  1. The same 3-component structure appears.
  2. Variables load on the same components across subsamples.
  3. The variance explained remains relatively stable.
  1. The first PC shows extremely high loadings (≥ 0.96).
  2. The second PC shows strong and consistent momentum loadings.
  3. The third PC shows clear separation of Volume and Sentiment.
  4. No cross-loadings are present.
  5. Communalities are all above 0.50.
  6. High explained variance (85.68%)

G. Stage 7: Additional Uses of The Exploratory Factor Analysis Results

  1. Reduce dimensionality
  2. Minimize multicollinearity
  3. Improve modeling efficiency

In conclusion, the PCA results can be applied in further modeling. Below are some additional uses of the PCA results.

Because each PC shows strong loadings without significant cross-loadings, a single variable with the highest loading can be selected to represent each dimension. For example, SMA_20 can serve as a replacement for the Price Trend dimension, MACD for the Technical Momentum dimension, and Volume for the Market Activity & Sentiment dimension. This approach reduces multicollinearity in subsequent analyses while preserving the conceptual structure identified by the PCA.

Variables that load strongly on the same component can be combined into composite scores representing the extracted dimensions. For example, the price-based indicators (SMA_20, SMA_10, BB_Upper, Open, Close) can form a Price Trend Index; MACD, RSI, and Signal can form a Momentum Index; and Volume together with Sentiment_Compound can form an Activity & Sentiment Index. These summated scales simplify the data structure.

Since the PCA was estimated with scores = TRUE, component scores are available for each observation, representing its position along the three PCs. Because varimax rotation preserves orthogonality, the component scores are uncorrelated and can be used as independent variables in regression models, clustering procedures, or predictive modeling.

Additional Visualization

1. Multidimensional Scaling

# Classical MDS: pairwise distances between the standardized observations
# are projected onto two dimensions.
dist_matrix <- dist(scale(data_final_10))
mds_result <- cmdscale(dist_matrix, k = 2)

# Name the two MDS coordinates so they can be mapped in the plot below.
mds_df <- setNames(as.data.frame(mds_result), c("Dim1", "Dim2"))

library(ggplot2)

# Scatter plot of the observations in the two-dimensional MDS space.
ggplot(mds_df, aes(x = Dim1, y = Dim2)) +
  geom_point() +
  theme_minimal() +
  ggtitle("Multidimensional Scaling (MDS)") +
  xlab("Dimension 1") +
  ylab("Dimension 2")

  • The dataset contains multiple regimes or behavioral patterns, likely reflecting different market states (bullish, bearish, volatile).
  • The MDS plot shows several clusters rather than a single compact group.
  • There is a dense cluster on the right, indicating many observations share similar characteristics.
  • The spread toward the left suggests some distinct or outlier market conditions.

2. Variable Correlation Plot

# Variable plot for the PCA fit; arrows are coloured by each variable's
# contribution along a blue-yellow-red gradient.
fviz_pca_var(
  pca_final10,
  repel = TRUE,
  col.var = "contrib",
  gradient.cols = c("blue", "yellow", "red")
)

  • The data structure is dominated by a price-trend factor (Dim1), while momentum indicators (Dim2) provide additional but smaller variation. Volume and sentiment behave more independently.
  • Dim1 (55.6%) explains most of the variance. It is mainly driven by Close, Open, SMA_10, and SMA_20, which cluster closely together. This suggests these variables are strongly positively correlated and represent the main price trend component.
  • Dim2 (19.9%) captures secondary variation. RSI and MACD load strongly in the upper direction of Dim2. This dimension reflects momentum/technical indicator behavior.
  • Volume and Sentiment_Compound point in different directions from price variables. They have weaker or different relationships compared to core price indicators.
  • BB_Upper is moderately aligned with moving averages, meaning it is related to price trends but contributes less strongly.

Factor Analysis (FA)

Unlike PCA, FA starts from Stage 4 because the objective, design, and assumptions are already done in PCA.

A. Stage 4. Deriving Factors and Assessing Overall Fit

Based on the results of the correlation analysis, the KMO test, and Bartlett’s Test of Sphericity, the dataset satisfies the required assumptions for factor analysis. Therefore, the next step is to perform factor extraction using the Principal Component method with the objective of reducing the dimensionality of the dataset by transforming the original variables into a smaller set of principal components that still retain most of the information contained in the data.

A.1. FA Manual (Eigendecomposition)

# Standardize the variables, then take the covariance matrix of the
# standardized data; this matrix is the input to the eigendecomposition.
scale_final10 <- scale(data_final_10)
varcov_fa <- cov(scale_final10)

# Eigendecomposition: $values holds the eigenvalues and $vectors the
# corresponding eigenvectors (one column per component).
pc_fa <- eigen(varcov_fa)

# Print the eigenvalues, then the eigenvector matrix.
cat("Eigenvalue:\n")
## Eigenvalue:
print(pc_fa$values)
##  [1] 5.5619369902 1.9946199607 1.0115725982 0.9798320048 0.4099887027
##  [6] 0.0206605410 0.0127125825 0.0055062609 0.0025449606 0.0006253987
cat("\nEigenvector:\n")
## 
## Eigenvector:
print(pc_fa$vectors)
##              [,1]        [,2]        [,3]         [,4]         [,5]
##  [1,] -0.41567325  0.11230865  0.01322081 -0.008216611  0.135009310
##  [2,] -0.41567151  0.11172235  0.01425196 -0.008154637  0.149376044
##  [3,] -0.02934564  0.01129368 -0.71270003  0.700735393  0.004723621
##  [4,] -0.40954728  0.17731455  0.01158528 -0.003340026  0.002620419
##  [5,] -0.39417059  0.25795726  0.01812316 -0.001712594 -0.026645321
##  [6,] -0.16598702 -0.55425343 -0.01784897 -0.020443170  0.751209237
##  [7,] -0.27999040 -0.45885484 -0.02703895 -0.027474275 -0.581116171
##  [8,] -0.39592616  0.24058516  0.01784160 -0.003495431 -0.023259891
##  [9,] -0.01232335 -0.06314977  0.69929173  0.711818882 -0.012652764
## [10,] -0.26003200 -0.54514328 -0.02900259 -0.030833601 -0.236659449
##               [,6]         [,7]         [,8]         [,9]         [,10]
##  [1,]  0.443652362  0.009456708 -0.418911018  0.650094684  0.0332612383
##  [2,]  0.387057666 -0.019809996 -0.192860308 -0.719702461  0.2945839203
##  [3,]  0.001637344  0.002169932 -0.002764269 -0.001223511 -0.0010126911
##  [4,] -0.139933855 -0.307371064  0.659439352  0.208223170  0.4565114803
##  [5,] -0.115947586 -0.372945542  0.065006763 -0.125229861 -0.7775639045
##  [6,] -0.292797623 -0.105017783 -0.044923233  0.007997880 -0.0368561027
##  [7,] -0.311543244 -0.315300965 -0.379054355 -0.015009381  0.1774663073
##  [8,] -0.529458209  0.702015435 -0.104925752 -0.007636990  0.0164405845
##  [9,]  0.001149315  0.001139025  0.001447335  0.000576489 -0.0002278188
## [10,]  0.396502182  0.403333208  0.437621837 -0.004057362 -0.2568688653
# Manual loadings for the three retained factors: each eigenvector is scaled
# by the square root of its eigenvalue.
L1_fa <- pc_fa$vectors[, 1] * sqrt(pc_fa$values[1])
L2_fa <- pc_fa$vectors[, 2] * sqrt(pc_fa$values[2])
L3_fa <- pc_fa$vectors[, 3] * sqrt(pc_fa$values[3])

# Assemble the loading matrix: rows are variables, columns are factors.
L_fa <- cbind(L1_fa, L2_fa, L3_fa)
dimnames(L_fa) <- list(colnames(data_final_10), c("F1", "F2", "F3"))

cat("\nFactor Loadings (Manual):\n")
## 
## Factor Loadings (Manual):
print(round(L_fa, digits = 4))
##                         F1      F2      F3
## Open               -0.9803  0.1586  0.0133
## Close              -0.9803  0.1578  0.0143
## Volume             -0.0692  0.0160 -0.7168
## SMA_10             -0.9659  0.2504  0.0117
## SMA_20             -0.9296  0.3643  0.0182
## RSI                -0.3915 -0.7828 -0.0180
## Signal             -0.6603 -0.6480 -0.0272
## BB_Upper           -0.9337  0.3398  0.0179
## Sentiment_Compound -0.0291 -0.0892  0.7033
## MACD               -0.6133 -0.7699 -0.0292
  • The analysis successfully reduced 10 variables into 3 main factors.
  • These three factors explain 85.68% of the total data variance.
  • The factors formed describe three main dimensions of the stock market: Price Trend, Technical Momentum, and Market Activity & Sentiment

A.2. FA with the principal function (Without Rotation)

FA without rotation uses the principal() function from the psych package. This result is comparable to the manual output above.

library(psych)

# Extract three unrotated principal components from the standardized data
# and print the full summary of the fit.
fa_norotate <- principal(
  r = scale_final10,
  nfactors = 3,
  rotate = "none"
)
print(fa_norotate)
## Principal Components Analysis
## Call: principal(r = scale_final10, nfactors = 3, rotate = "none")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                     PC1   PC2   PC3   h2     u2 com
## Open               0.98 -0.16 -0.01 0.99 0.0136 1.1
## Close              0.98 -0.16 -0.01 0.99 0.0139 1.1
## Volume             0.07 -0.02  0.72 0.52 0.4811 1.0
## SMA_10             0.97 -0.25 -0.01 1.00 0.0043 1.1
## SMA_20             0.93 -0.36 -0.02 1.00 0.0028 1.3
## RSI                0.39  0.78  0.02 0.77 0.2337 1.5
## Signal             0.66  0.65  0.03 0.86 0.1433 2.0
## BB_Upper           0.93 -0.34 -0.02 0.99 0.0124 1.3
## Sentiment_Compound 0.03  0.09 -0.70 0.50 0.4965 1.0
## MACD               0.61  0.77  0.03 0.97 0.0303 1.9
## 
##                        PC1  PC2  PC3
## SS loadings           5.56 1.99 1.01
## Proportion Var        0.56 0.20 0.10
## Cumulative Var        0.56 0.76 0.86
## Proportion Explained  0.65 0.23 0.12
## Cumulative Proportion 0.65 0.88 1.00
## 
## Mean item complexity =  1.3
## Test of the hypothesis that 3 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.06 
##  with the empirical chi square  281.61  with prob <  2.9e-49 
## 
## Fit based upon off diagonal values = 0.99

The results are:

  • PCA was performed on scale_final10 data with 3 main components (PC1, PC2, PC3).
  • PC1 represents price trends (Open, Close, SMA_10, SMA_20, BB_Upper), PC2 describes technical momentum (RSI, MACD, Signal), and PC3 relates to volume and market sentiment (Volume, Sentiment_Compound).
  • Most variables have high h2 (≈0.77–1.00), while Volume and Sentiment are only around 0.5, so their explanation by the components is still moderate.
  • PC1 explains 56%, PC2 20%, and PC3 10% of the total data variance.
  • Cumulatively, the three components explain about 86% of the data variance.
  • The Mean item complexity value = 1.3 indicates that the variables generally correlate strongly with one main component.
  • RMSR = 0.06 indicates a relatively small residual value, so the model is quite good.

B. Stage 5: Interpreting The Factors

B. 1. FA with Varimax Rotation

Varimax rotation is applied to clarify the factor structure by maximizing the variance of the squared loadings within each factor, so that each variable has a high loading on one factor and low loadings on the other factors.

# Re-extract the three components with varimax rotation. When printing,
# loadings below the 0.40 cut-off are hidden and variables are sorted by
# the component they load on.
fa_varimax <- principal(
  r = scale_final10,
  nfactors = 3,
  rotate = "varimax"
)
print(fa_varimax, sort = TRUE, cut = 0.40)
## Principal Components Analysis
## Call: principal(r = scale_final10, nfactors = 3, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                    item  RC1  RC2   RC3   h2     u2 com
## SMA_20                5 1.00            1.00 0.0028 1.0
## BB_Upper              8 0.99            0.99 0.0124 1.0
## SMA_10                4 0.98            1.00 0.0043 1.1
## Open                  1 0.96            0.99 0.0136 1.2
## Close                 2 0.96            0.99 0.0139 1.2
## MACD                 10      0.96       0.97 0.0303 1.1
## RSI                   6      0.87       0.77 0.2337 1.0
## Signal                7      0.87       0.86 0.1433 1.3
## Volume                3            0.72 0.52 0.4811 1.0
## Sentiment_Compound    9           -0.71 0.50 0.4965 1.0
## 
##                        RC1  RC2  RC3
## SS loadings           4.92 2.63 1.02
## Proportion Var        0.49 0.26 0.10
## Cumulative Var        0.49 0.76 0.86
## Proportion Explained  0.57 0.31 0.12
## Cumulative Proportion 0.57 0.88 1.00
## 
## Mean item complexity =  1.1
## Test of the hypothesis that 3 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.06 
##  with the empirical chi square  281.61  with prob <  2.9e-49 
## 
## Fit based upon off diagonal values = 0.99

Varimax rotation interpretation:

  • RC1 tends to represent price and technical trend factors (Price & Trend Factor).
  • RC2 tends to represent momentum factors (Momentum Factor).
  • RC3 tends to represent market activity and sentiment (Market Activity & Sentiment Factor), with Volume and Sentiment_Compound loading in opposite directions.
  • After rotation, each variable has a clearer loading on one dominant factor.
  • Varimax rotation makes each variable have a higher loading on one main component, so that the factor structure becomes clearer and easier to interpret.

B. 2. Communalities

# Print the communalities (h2) of the varimax-rotated solution, rounded to
# four decimals.
cat("Communalities (h2) setelah rotasi Varimax:\n")
## Communalities (h2) setelah rotasi Varimax:
print(round(fa_varimax$communality, 4))
##               Open              Close             Volume             SMA_10 
##             0.9864             0.9861             0.5189             0.9957 
##             SMA_20                RSI             Signal           BB_Upper 
##             0.9972             0.7663             0.8567             0.9876 
## Sentiment_Compound               MACD 
##             0.5035             0.9697
# Print the uniqueness (u2) values of the same solution.
# NOTE(review): `$uniqueness` may be resolving through `$` partial matching
# to an element named `uniquenesses` -- confirm against the psych object and
# spell the name out in full if so.
cat("\nUniqueness (u2) setelah rotasi Varimax:\n")
## 
## Uniqueness (u2) setelah rotasi Varimax:
print(round(fa_varimax$uniqueness, 4))
##               Open              Close             Volume             SMA_10 
##             0.0136             0.0139             0.4811             0.0043 
##             SMA_20                RSI             Signal           BB_Upper 
##             0.0028             0.2337             0.1433             0.0124 
## Sentiment_Compound               MACD 
##             0.4965             0.0303
  1. Communalities (h²)
  • Communality (h²) shows the proportion of variance in each variable that can be explained by the three main components of PCA results.
  • Most variables have very high h² values (≈0.85–1.00), such as Open, Close, SMA_10, SMA_20, BB_Upper, and MACD, which means that almost all of the variance in these variables is successfully explained by the components formed.
  2. Uniqueness (u²)
  • Uniqueness (u²) shows the proportion of variable variance that is not explained by the components and is unique variance or error.
  • Variables such as Volume (0.4811) and Sentiment_Compound (0.4965) have relatively higher u² values, which means that almost 50% of their variance is not explained by the components, so their contribution to the factor structure is relatively smaller than other variables.

Interpretation:

  • High communality (> 0.70) indicates that the variable is well represented by the extracted factors.
  • Variables with low communality (< 0.50) are poorly represented and should be considered for exclusion or addition of factors.

B. 3. Explained Variance for Each Factor

# Print the variance-accounted-for table stored in the varimax solution
# (SS loadings plus proportion and cumulative variance per component).
cat("Proporsi Varians yang Dijelaskan:\n")
## Proporsi Varians yang Dijelaskan:
print(fa_varimax$Vaccounted)
##                             RC1       RC2       RC3
## SS loadings           4.9235717 2.6287684 1.0157894
## Proportion Var        0.4923572 0.2628768 0.1015789
## Cumulative Var        0.4923572 0.7552340 0.8568130
## Proportion Explained  0.5746379 0.3068077 0.1185544
## Cumulative Proportion 0.5746379 0.8814456 1.0000000

Proportion of Explained Variance

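  • After rotation, RC1, RC2, and RC3 have sums of squared loadings of 4.92, 2.63, and 1.02, respectively. The corresponding unrotated eigenvalues (5.56, 1.99, and 1.01) are all greater than 1, so all three components are worth keeping.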
  • RC1 explains 49.24%, RC2 26.29%, and RC3 10.16% of the total data variance.
  • Cumulatively, the three components explain 85.68% of the variance, which is sufficient to represent the information from the 10 variables.
  • Of the variance explained by the model, the largest contribution comes from RC1 (57.46%), followed by RC2 (30.68%) and RC3 (11.86%).

B. 4. Factor Scores

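Factor scores are calculated using the regression (Thurstone) method, which multiplies the scaled data by the inverse correlation matrix and the factor loadings. Note that this is not the Anderson-Rubin method, and that the code below uses the unrotated loadings from fa_norotate.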

# Scores = standardized data x inverse correlation matrix x loadings.
# The loadings are taken from the unrotated solution (fa_norotate).
scores_FA_final10 <-
  scale_final10 %*%
  solve(cor(scale_final10)) %*%
  as.matrix(fa_norotate$loadings)
colnames(scores_FA_final10) <- paste0("F", 1:3)

# Show the scores of the first six observations.
cat("Factor Scores (6 first obsevation):\n")
## Factor Scores (6 first obsevation):
head(scores_FA_final10, n = 6)
##              F1         F2          F3
## [1,] -0.1656698 -0.8742466 -0.52352849
## [2,] -0.2721755 -1.5032516  0.91324796
## [3,] -0.3056642 -1.5046164 -0.08347475
## [4,] -0.2412841 -1.3178723  0.47734823
## [5,] -0.3565034 -1.5395865  1.85751097
## [6,] -0.4642799 -1.5612693 -0.64384825
  • The factor score shows the position of each observation relative to the formed factors (F1, F2, F3). Each score is a weighted combination of the standardized original variables.

  • In the first 6 observations, the F2 value tends to be negative, indicating that these observations have a relatively low value on the second factor (momentum factor).

  • The F3 value varies (positive and negative), indicating differences in the characteristics of observations in the market activity/sentiment factor.

library(ggplot2)

# Convert the score matrix to a data frame so its columns can be mapped.
scores_df <- as.data.frame(scores_FA_final10)

# Scatter plot of the first factor against the second.
ggplot(scores_df, aes(x = F1, y = F2)) +
  geom_point(size = 2, alpha = 0.6, color = "steelblue") +
  theme_minimal() +
  ggtitle("Factor Scores: F1 vs F2") +
  xlab("Factor 1 (Price & Trend)") +
  ylab("Factor 2 (Momentum)")

# Scatter plot of the first factor against the third.
ggplot(scores_df, aes(x = F1, y = F3)) +
  geom_point(size = 2, alpha = 0.6, color = "darkorange") +
  theme_minimal() +
  ggtitle("Factor Scores: F1 vs F3") +
  xlab("Factor 1 (Price & Trend)") +
  ylab("Factor 3 (Sentiment)")

C. Stage 6: Validation of Factor Analysis

Based on the obtained FA results:

These characteristics indicate that the three-factor solution is structurally strong and interpretable.

Because the loading structure is clearly differentiated and communalities meet the acceptable threshold, the 3-factor FA model is expected to demonstrate good stability under split-sample validation.

D. Stage 7: Additional Uses of The Exploratory Factor Analysis Results

The Factor Analysis successfully reduced 10 observed variables into 3 interpretable latent factors representing:

  • Price & Trend Factor
  • Momentum Factor
  • Market Activity & Sentiment Factor

These factors explain 85.68% of the total variance, indicating that the extracted structure effectively captures the main information in the dataset and can be confidently used for further statistical or predictive analysis.

Additional Visualization

1. Biplot

# Biplot of observations (points) and variables (arrows coloured by
# contribution).
# The PCA is fitted on data_final_10, the same 10-variable dataset used for
# the factor extraction and for the correlation circle, so the plot shows
# the variables that the factors were actually derived from.
pca_for_fa_viz <- PCA(data_final_10, scale.unit = TRUE, graph = FALSE)
fviz_pca_biplot(pca_for_fa_viz,
                geom.ind = "point",
                addEllipses = TRUE,
                col.var = "contrib",
                gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
                repel = TRUE) +
  ggtitle("Biplot FA: Individu dan Variabel")

  1. Principal Dimensions
  • Dim1 (55.6%) is the main factor explaining most of the data variation.
  • Dim2 (19.9%) is the second factor, so these two dimensions together explain about 75.5% of the data variance.
  2. Distribution of Observations
  • Black dots indicate data observations.
  • Nearby observations have similar characteristics, while distant ones show differences in variable characteristics.
  3. Contributing Variables
  • The variables Open, Close, SMA_10, SMA_20, and BB_Upper have long arrows pointing towards Dim1, indicating a strong contribution to the price/trend factor.
  • The variables RSI, MACD, and Signal point towards Dim2, indicating a contribution to the momentum factor.
  4. Relationships Between Variables
  • Variables with arrows pointing in the same direction have a positive correlation (e.g., Open, Close, SMA).
  • Volume and Sentiment_Compound are closer to the center, so their contribution to the two main dimensions is smaller.

2. Correlation Circle

library(FactoMineR)
library(factoextra)

# Refit the PCA on the 10 retained variables, keeping three components.
pca_for_fa_viz <- PCA(
  data_final_10,
  ncp = 3,
  scale.unit = TRUE,
  graph = FALSE
)

# Correlation circle; variables are coloured by their contribution.
fviz_pca_var(
  pca_for_fa_viz,
  repel = TRUE,
  col.var = "contrib",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
) +
  ggtitle("Kontribusi Variabel terhadap Faktor (FA)")

  1. Meaning of Dimensions
  • Dim1 (55.6%) is the first factor that explains most of the data variance.
  • Dim2 (19.9%) is the second factor that explains additional variation in the data.
  2. Variable Contribution
  • The variables Open, Close, SMA_10, SMA_20, and BB_Upper have high loadings on Dim1, so this factor can be interpreted as a price and technical trend factor.
  • The variables RSI, MACD, and Signal have large loadings on Dim2, which represents the technical momentum factor.
  3. Variables with Smaller Contributions
  • Volume is near the center of the circle, indicating that its contribution to the two main factors is relatively small.
  • Sentiment_Compound is also near the center and tends to be opposite to price variables, indicating a lower contribution to the main factors.
  4. Relationships Between Variables
  • Variables with arrows pointing in the same direction indicate a strong positive correlation (e.g., Open, Close, SMA).
  • Variables that form large angles or point in opposite directions indicate a weak or negative correlation.

Comparison of FA Without Rotation vs. Varimax Rotation

# Print the unrotated loadings, rounded to three decimals, for comparison
# with the varimax-rotated loadings printed afterwards.
cat("FA Without Rotation")
## FA Without Rotation
print(round(fa_norotate$loadings, 3))
## 
## Loadings:
##                    PC1    PC2    PC3   
## Open                0.980 -0.159       
## Close               0.980 -0.158       
## Volume                            0.717
## SMA_10              0.966 -0.250       
## SMA_20              0.930 -0.364       
## RSI                 0.391  0.783       
## Signal              0.660  0.648       
## BB_Upper            0.934 -0.340       
## Sentiment_Compound               -0.703
## MACD                0.613  0.770       
## 
##                  PC1   PC2   PC3
## SS loadings    5.561 1.995 1.011
## Proportion Var 0.556 0.199 0.101
## Cumulative Var 0.556 0.756 0.857
# Print the varimax-rotated loadings, rounded to three decimals, for
# comparison with the unrotated loadings printed before.
cat("FA Varimax Rotation")
## FA Varimax Rotation
print(round(fa_varimax$loadings, 3))
## 
## Loadings:
##                    RC1    RC2    RC3   
## Open                0.956  0.270       
## Close               0.955  0.270       
## Volume                            0.718
## SMA_10              0.981  0.181       
## SMA_20              0.997              
## RSI                        0.875       
## Signal              0.325  0.867       
## BB_Upper            0.990              
## Sentiment_Compound               -0.707
## MACD                0.231  0.957       
## 
##                  RC1   RC2   RC3
## SS loadings    4.924 2.629 1.017
## Proportion Var 0.492 0.263 0.102
## Cumulative Var 0.492 0.755 0.857

Conclusion

This analysis successfully applied dimensionality reduction techniques to the multi-channel stock market dataset to address high dimensionality and multicollinearity. PCA summarized the correlated financial indicators into three principal components, namely Price Trend, Technical Momentum, and Market Activity & Sentiment, which explain 85.68% of the total variance and preserve most of the dataset’s information. Factor Analysis (FA) further identified three latent factors consistent with the PCA components, clarifying the main relationships among price, technical, and sentiment variables. These findings demonstrate that both PCA and FA effectively reduce complexity while capturing key market dynamics.