library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)
library(Hmisc)

## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(tidyr)
library(ggplot2)
library(sjPlot)

## 
## Attaching package: 'sjPlot'
## 
## The following object is masked from 'package:ggplot2':
## 
##     set_theme

library(psych)

## 
## Attaching package: 'psych'
## 
## The following object is masked from 'package:Hmisc':
## 
##     describe
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(GPArotation)

## 
## Attaching package: 'GPArotation'
## 
## The following objects are masked from 'package:psych':
## 
##     equamax, varimin

df <- read.csv('Customer Churn.csv')
head(df)

##   Call..Failure Complains Subscription..Length Charge..Amount Seconds.of.Use
## 1             8         0                   38              0           4370
## 2             0         0                   39              0            318
## 3            10         0                   37              0           2453
## 4            10         0                   38              0           4198
## 5             3         0                   38              0           2393
## 6            11         0                   38              1           3775
##   Frequency.of.use Frequency.of.SMS Distinct.Called.Numbers Age.Group
## 1               71                5                      17         3
## 2                5                7                       4         2
## 3               60              359                      24         3
## 4               66                1                      35         1
## 5               58                2                      33         1
## 6               82               32                      28         3
##   Tariff.Plan Status Age Customer.Value Churn
## 1           1      1  30        197.640     0
## 2           1      2  25         46.035     0
## 3           1      1  30       1536.520     0
## 4           1      1  15        240.020     0
## 5           1      1  15        145.805     0
## 6           1      1  30        282.280     0

summary(df)

##  Call..Failure      Complains       Subscription..Length Charge..Amount   
##  Min.   : 0.000   Min.   :0.00000   Min.   : 3.00        Min.   : 0.0000  
##  1st Qu.: 1.000   1st Qu.:0.00000   1st Qu.:30.00        1st Qu.: 0.0000  
##  Median : 6.000   Median :0.00000   Median :35.00        Median : 0.0000  
##  Mean   : 7.628   Mean   :0.07651   Mean   :32.54        Mean   : 0.9429  
##  3rd Qu.:12.000   3rd Qu.:0.00000   3rd Qu.:38.00        3rd Qu.: 1.0000  
##  Max.   :36.000   Max.   :1.00000   Max.   :47.00        Max.   :10.0000  
##  Seconds.of.Use  Frequency.of.use Frequency.of.SMS Distinct.Called.Numbers
##  Min.   :    0   Min.   :  0.00   Min.   :  0.00   Min.   : 0.00          
##  1st Qu.: 1391   1st Qu.: 27.00   1st Qu.:  6.00   1st Qu.:10.00          
##  Median : 2990   Median : 54.00   Median : 21.00   Median :21.00          
##  Mean   : 4472   Mean   : 69.46   Mean   : 73.17   Mean   :23.51          
##  3rd Qu.: 6478   3rd Qu.: 95.00   3rd Qu.: 87.00   3rd Qu.:34.00          
##  Max.   :17090   Max.   :255.00   Max.   :522.00   Max.   :97.00          
##    Age.Group      Tariff.Plan        Status           Age     Customer.Value  
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :15   Min.   :   0.0  
##  1st Qu.:2.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:25   1st Qu.: 113.8  
##  Median :3.000   Median :1.000   Median :1.000   Median :30   Median : 228.5  
##  Mean   :2.826   Mean   :1.078   Mean   :1.248   Mean   :31   Mean   : 471.0  
##  3rd Qu.:3.000   3rd Qu.:1.000   3rd Qu.:1.000   3rd Qu.:30   3rd Qu.: 788.4  
##  Max.   :5.000   Max.   :2.000   Max.   :2.000   Max.   :55   Max.   :2165.3  
##      Churn       
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.1571  
##  3rd Qu.:0.0000  
##  Max.   :1.0000

From the result above the data as no missing value so we can just go to the next step. We exclude Churn during the PCA and FA calculation since it is the target which could result in data leakage for these unsupervised algorithm.

clean_df <- df |>
  select(-Churn)

Data Characteristics

ggplot(df, aes(x = Churn, fill = as.factor(Churn))) +
  geom_bar() +
  labs(
    title = "Distribution Number of Customer in Churn Category",
    x = "Churn",
    y = "Number of Customers",
    fill = "Category"
  ) +
  scale_fill_manual(
    values = c("red", "green"),
    labels= c("Churn", "Not Churn")
  ) +
  theme_minimal() +
  theme(
      legend.position = "bottom",
      plot.title = element_text(hjust = 0.5, face = "bold")
  )

ggplot(df, aes(x= Churn, y= Customer.Value, fill = as.factor(Churn))) +
  geom_boxplot(alpha = 0.7, outlier.colour = "red", outlier.size = 1) +
  labs(
      title = "Distribution of Customer Value based on the Churn Status",
      fill = "Category"
  ) +
  scale_fill_manual(
    values = c("red", "green"),
    labels= c("Churn", "Not Churn")
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5, face = "bold")
  )

Assumptions

corr_data <- clean_df |>
      as.matrix() |>
        rcorr()

lower_idx <- lower.tri(corr_data$r)
unique_r <- corr_data$r[lower_idx]
unique_p <- corr_data$P[lower_idx]

valid_pairs_count <- sum(abs(unique_r) > 0.30 & unique_p < 0.01, na.rm = TRUE)
total_pairs_count <- length(unique_r)

percentage_valid <- (valid_pairs_count / total_pairs_count) * 100
cat("Percentage of unique pairs with |r| > 0.30 and p < 0.01: ", round(percentage_valid, 2), "%\n")

## Percentage of unique pairs with |r| > 0.30 and p < 0.01:  25.64 %

clean_df |>
      tab_corr()

	Call..Failure	Complains	Subscription..Length	Charge..Amount	Seconds.of.Use	Frequency.of.use	Frequency.of.SMS	Distinct.Called.Numbers	Age.Group	Tariff.Plan	Status	Age	Customer.Value
Call..Failure		0.153***	0.170***	0.589***	0.502***	0.573***	-0.022	0.504***	0.050**	0.192***	-0.115***	0.042*	0.121***
Complains	0.153***		-0.020	-0.034	-0.105***	-0.091***	-0.112***	-0.058**	0.020	0.001	0.271***	0.003	-0.133***
Subscription..Length	0.170***	-0.020		0.079***	0.125***	0.107***	0.076***	0.092***	0.021	-0.160***	0.143***	-0.002	0.110***
Charge..Amount	0.589***	-0.034	0.079***		0.447***	0.379***	0.092***	0.415***	0.280***	0.324***	-0.356***	0.279***	0.169***
Seconds.of.Use	0.502***	-0.105***	0.125***	0.447***		0.946***	0.102***	0.677***	0.020	0.134***	-0.461***	0.021	0.415***
Frequency.of.use	0.573***	-0.091***	0.107***	0.379***	0.946***		0.100***	0.736***	-0.033	0.206***	-0.455***	-0.028	0.402***
Frequency.of.SMS	-0.022	-0.112***	0.076***	0.092***	0.102***	0.100***		0.080***	-0.054**	0.196***	-0.296***	-0.093***	0.925***
Distinct.Called.Numbers	0.504***	-0.058**	0.092***	0.415***	0.677***	0.736***	0.080***		0.021	0.172***	-0.413***	0.051**	0.285***
Age.Group	0.050**	0.020	0.021	0.280***	0.020	-0.033	-0.054**	0.021		-0.151***	0.003	0.961***	-0.183***
Tariff.Plan	0.192***	0.001	-0.160***	0.324***	0.134***	0.206***	0.196***	0.172***	-0.151***		-0.164***	-0.119***	0.252***
Status	-0.115***	0.271***	0.143***	-0.356***	-0.461***	-0.455***	-0.296***	-0.413***	0.003	-0.164***		-0.001	-0.413***
Age	0.042*	0.003	-0.002	0.279***	0.021	-0.028	-0.093***	0.051**	0.961***	-0.119***	-0.001		-0.220***
Customer.Value	0.121***	-0.133***	0.110***	0.169***	0.415***	0.402***	0.925***	0.285***	-0.183***	0.252***	-0.413***	-0.220***
Computed correlation used pearson-method with listwise-deletion.

fa_data <- corr_data$r

kmo_filter <- function(data) {
  repeat {
    kmo_result <- KMO(data)
    
    msa_values <- kmo_result$MSAi
    
    min_msa <- min(msa_values)
    min_var <- names(which.min(msa_values))
    
    if (min_msa >= 0.5) {
      cat("All variables passed KMO threshold (MSA >= 0.5)\n")
      cat("Overall KMO:", kmo_result$MSA, "\n")
      print(kmo_result$MSAi)
      break
    }
    
    cat("Removing variable:", min_var, "| MSA =", round(min_msa, 4), "\n")
    data <- data[, !colnames(data) %in% min_var]
  }
  
  return(data)
}


clean_df <- kmo_filter(clean_df)

## Removing variable: Tariff.Plan | MSA = 0.3601 
## Removing variable: Frequency.of.SMS | MSA = 0.4259 
## Removing variable: Subscription..Length | MSA = 0.4563 
## All variables passed KMO threshold (MSA >= 0.5)
## Overall KMO: 0.6390464 
##           Call..Failure               Complains          Charge..Amount 
##               0.5811866               0.6125565               0.5671869 
##          Seconds.of.Use        Frequency.of.use Distinct.Called.Numbers 
##               0.6575280               0.6175513               0.8801046 
##               Age.Group                  Status                     Age 
##               0.5209310               0.7435941               0.5161150 
##          Customer.Value 
##               0.8560696

cat("\nRemaining variables:", ncol(clean_df), "\n")

## 
## Remaining variables: 10

cat("Variables kept:", colnames(clean_df), "\n")

## Variables kept: Call..Failure Complains Charge..Amount Seconds.of.Use Frequency.of.use Distinct.Called.Numbers Age.Group Status Age Customer.Value

n = nrow(clean_df)
corr_data$r |>
  cortest.bartlett(n=n, diag = TRUE)

## $chisq
## [1] 38219.42
## 
## $p.value
## [1] 0
## 
## $df
## [1] 78

PCA

Calculate PCA using Eigen

scale_df <- scale(clean_df)
cor_matrix <- cov(scale_df)
  
pc <- eigen(cor_matrix)

data.frame(pc$vectors)

##             X1           X2          X3          X4          X5          X6
## 1  -0.34201545 -0.089401352 -0.46734405 -0.26943225  0.25394178 -0.22477570
## 2   0.06800960 -0.077254119 -0.67264766  0.65074639  0.01080933  0.32284678
## 3  -0.33144418 -0.246519579 -0.08669780 -0.17404388  0.67943455  0.01469626
## 4  -0.46165528  0.045662558 -0.01407282  0.02337777 -0.31857031 -0.07346387
## 5  -0.46619067  0.079646973 -0.06565265 -0.01696293 -0.37453129 -0.04834995
## 6  -0.41649198  0.006891662 -0.08036415 -0.08560871 -0.28833643  0.21259943
## 7  -0.03500520 -0.646860892  0.16091280  0.17309366 -0.10476672 -0.10934302
## 8   0.31154664 -0.079943554 -0.43840284 -0.18631649 -0.25693448 -0.67825805
## 9  -0.03621553 -0.649975076  0.16474925  0.12662996 -0.13555420 -0.05789990
## 10 -0.25499360  0.263635176  0.25577166  0.62029881  0.23130167 -0.56280939
##             X7           X8           X9          X10
## 1  -0.01550589  0.667027942 -0.007062854 -0.154277351
## 2  -0.05261369 -0.077869761  0.013816874  0.010538612
## 3  -0.02495864 -0.554538561  0.023607901  0.144998292
## 4  -0.43895767 -0.289599674 -0.122742786 -0.620921236
## 5  -0.28959240  0.007096198  0.151578799  0.722649954
## 6   0.81244159 -0.120903770 -0.065466681 -0.073473310
## 7  -0.01327856  0.106593562 -0.685223780  0.143140419
## 8   0.13609186 -0.350890447  0.008848405  0.043175525
## 9   0.03739861  0.068908344  0.696872802 -0.139775943
## 10  0.19915705  0.061043485  0.040403494 -0.002801393

propvar <- sapply(pc$values, function(x) x/sum(pc$values))*100

cumvar <- data.frame(cbind(pc$values, propvar)) |>
  mutate(cum = cumsum(propvar))

colnames(cumvar)[1] <- "eigen_value"
row.names(cumvar) <- paste0("PC",c(1:ncol(clean_df)))
print(cumvar)

##      eigen_value   propvar       cum
## PC1    3.8172229 38.172229  38.17223
## PC2    2.1587786 21.587786  59.76001
## PC3    1.3056698 13.056698  72.81671
## PC4    0.7536057  7.536057  80.35277
## PC5    0.7185989  7.185989  87.53876
## PC6    0.5452963  5.452963  92.99172
## PC7    0.3561144  3.561144  96.55287
## PC8    0.2731875  2.731875  99.28474
## PC9    0.0369698  0.369698  99.65444
## PC10   0.0345561  0.345561 100.00000

Based on the cumulative variance, the PC1 to PC4 hold 80.35% of data information, thus making them able do be use for representing the data without the sacrifice the essential information.

# Make df for pca with only 4 first components
pca_df <-
  scale_df %*% pc$vectors[, 1:4] |>
     data.frame()

colnames(pca_df) <- paste0("PC", 1:ncol(pca_df))

head(pca_df)

##           PC1         PC2         PC3        PC4
## 1  0.27767731  0.02283714  0.38176054 -0.2633117
## 2  2.80590292  0.81086220 -0.31702271 -0.7837384
## 3 -0.34602642  0.64763010  0.90177075  1.2266190
## 4 -0.07335151  2.57169955 -0.44407340 -0.9785779
## 5  0.61454666  2.57827882 -0.01578183 -0.8297137
## 6 -0.41320037 -0.11980529  0.11169283 -0.4487193

Visualization

In here we use prcomp since the visualization library integrated with it. To ensure the result are similar with the manual calculation, checking being used.

pca_result <- clean_df |>
  prcomp(scale = TRUE)

cat("Are the Manual and Auto PCA is the same?", all.equal(
  abs(scale_df %*% pc$vectors),
  abs(pca_result$x),
  check.attributes = FALSE
))

## Are the Manual and Auto PCA is the same? TRUE

fviz_eig(
   pca_result,
   addlabels = TRUE,
   ncp = ncol(data),
   barfill = "skyblue",
   barcolor = "darkblue",
   linecolor = "red"
)

## Warning in geom_bar(stat = "identity", fill = barfill, color = barcolor, :
## Ignoring empty aesthetic: `width`.

fviz_pca_biplot(
  pca_result, 
  label = "var",
  repel = TRUE,
  invisible = aes("ind")
)

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

plot_data <- data.frame(
  PC1 = pca_result$x[, 1],
  PC2 = pca_result$x[, 2],
  PC3 = pca_result$x[, 3],
  PC4 = pca_result$x[, 4],
  Label = as.factor(df$Churn)
)

ggplot(plot_data, aes(x = PC1, y = PC2, color= Label)) +
  geom_point(alpha = 0.7, size = 2) +
  stat_ellipse(level = 0.95, linetype = "dashed") +
  labs(
    title = "PCA of Churn Dataset (PC1 vs PC2)",
    x = "Principal Component 1",
    y = "Principal Component 2",
    color = "Churn"
  ) +
  scale_color_manual(
    values = c("red", "green"),
    labels= c("Churn", "Not Churn")
  ) +
  theme_minimal()

long_box_data <- plot_data |>
  pivot_longer(
    cols = starts_with("PC"),
    names_to = "Component",
    values_to = "Score"
  )

ggplot(long_box_data, aes(x = Label, y = Score, fill = Label)) +
  geom_boxplot(alpha = 0.8, outlier.size = 1, outlier.alpha = 0.5) +
  facet_wrap(~ Component, scales = "free_y") +
  labs(
    title = "Distribution of Principal Component Scores by Churn",
    x = "Churn",
    y = "Principal Component Score",
    fill = "Category"
  ) +
  scale_fill_manual(
    values = c("red", "green"),
    labels = c("Churn", "Not Churn")
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

FA

loading <- pca_result$rotation[, 1:4]
loading

##                                 PC1          PC2         PC3         PC4
## Call..Failure           -0.34201545 -0.089401352  0.46734405 -0.26943225
## Complains                0.06800960 -0.077254119  0.67264766  0.65074639
## Charge..Amount          -0.33144418 -0.246519579  0.08669780 -0.17404388
## Seconds.of.Use          -0.46165528  0.045662558  0.01407282  0.02337777
## Frequency.of.use        -0.46619067  0.079646973  0.06565265 -0.01696293
## Distinct.Called.Numbers -0.41649198  0.006891662  0.08036415 -0.08560871
## Age.Group               -0.03500520 -0.646860892 -0.16091280  0.17309366
## Status                   0.31154664 -0.079943554  0.43840284 -0.18631649
## Age                     -0.03621553 -0.649975076 -0.16474925  0.12662996
## Customer.Value          -0.25499360  0.263635176 -0.25577166  0.62029881

Determining The Number of Factors

Parallel Analysis

fa.parallel(clean_df, fm = "ml", fa = "fa", main = "Parallel Analysis - FA")

## Parallel analysis suggests that the number of factors =  4  and the number of components =  NA

Kaiser’s criterion

eigen_values <- eigen(cor(clean_df))$values

eigen_table <- data.frame(
  Factor        = paste0("F", 1:length(eigen_values)),
  Eigenvalue    = round(eigen_values, 4),
  Variance_Pct  = round((eigen_values / sum(eigen_values)) * 100, 2),
  Cumulative_Pct = round(cumsum((eigen_values / sum(eigen_values)) * 100), 2),
  Retain        = ifelse(eigen_values > 1, "Yes", "No")
)

print(eigen_table)

##    Factor Eigenvalue Variance_Pct Cumulative_Pct Retain
## 1      F1     3.8172        38.17          38.17    Yes
## 2      F2     2.1588        21.59          59.76    Yes
## 3      F3     1.3057        13.06          72.82    Yes
## 4      F4     0.7536         7.54          80.35     No
## 5      F5     0.7186         7.19          87.54     No
## 6      F6     0.5453         5.45          92.99     No
## 7      F7     0.3561         3.56          96.55     No
## 8      F8     0.2732         2.73          99.28     No
## 9      F9     0.0370         0.37          99.65     No
## 10    F10     0.0346         0.35         100.00     No

kaiser_factors <- sum(eigen_values > 1)
cat("Number of factors (Kaiser's criteria):", kaiser_factors, "\n")

## Number of factors (Kaiser's criteria): 3

fa_result <- fa(fa_data, nfactors = 4, rotate = "varimax", fm = "ml")

print(fa_result, cut = 0.3, sort = TRUE)

## Factor Analysis using method =  ml
## Call: fa(r = fa_data, nfactors = 4, rotate = "varimax", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                         item   ML4   ML1   ML3   ML2    h2     u2 com
## Seconds.of.Use             5  0.98                   0.985 0.0150 1.1
## Frequency.of.use           6  0.92                   0.924 0.0762 1.2
## Distinct.Called.Numbers    8  0.65                   0.513 0.4872 1.4
## Status                    11 -0.45                   0.297 0.7033 1.8
## Frequency.of.SMS           7        1.00             0.994 0.0062 1.0
## Customer.Value            13  0.32  0.93             0.987 0.0127 1.3
## Tariff.Plan               10                         0.096 0.9041 3.2
## Age                       12              0.98       0.979 0.0212 1.0
## Age.Group                  9              0.97       0.945 0.0553 1.0
## Call..Failure              1  0.38              0.92 0.995 0.0050 1.3
## Charge..Amount             4  0.36              0.49 0.452 0.5477 2.6
## Complains                  2                         0.076 0.9240 2.2
## Subscription..Length       3                         0.036 0.9636 2.3
## 
##                        ML4  ML1  ML3  ML2
## SS loadings           2.85 2.06 2.02 1.35
## Proportion Var        0.22 0.16 0.16 0.10
## Cumulative Var        0.22 0.38 0.53 0.64
## Proportion Explained  0.34 0.25 0.24 0.16
## Cumulative Proportion 0.34 0.59 0.84 1.00
## 
## Mean item complexity =  1.6
## Test of the hypothesis that 4 factors are sufficient.
## 
## df null model =  78  with the objective function =  12.16
## df of  the model are 32  and the objective function was  1.01 
## 
## The root mean square of the residuals (RMSR) is  0.04 
## The df corrected root mean square of the residuals is  0.06 
## 
## Fit based upon off diagonal values = 0.98
## Measures of factor score adequacy             
##                                                    ML4  ML1  ML3  ML2
## Correlation of (regression) scores with factors   0.99 1.00 0.99 1.00
## Multiple R square of scores with factors          0.99 1.00 0.98 0.99
## Minimum correlation of possible factor scores     0.97 0.99 0.97 0.98

fa_data <- clean_df[, !colnames(clean_df) %in% 
                        c("Tariff.Plan", "Complains", "Subscription..Length")]

fa_result2 <- fa(fa_data, nfactors = 4, rotate = "varimax", fm = "ml")

print(fa_result2, cut = 0.3, sort = TRUE)

## Factor Analysis using method =  ml
## Call: fa(r = fa_data, nfactors = 4, rotate = "varimax", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                         item   ML1   ML2   ML4   ML3   h2    u2 com
## Frequency.of.use           4  0.92        0.39       1.00 0.005 1.3
## Seconds.of.Use             3  0.91                   0.92 0.077 1.2
## Distinct.Called.Numbers    5  0.65        0.35       0.57 0.434 1.6
## Status                     7 -0.54                   0.37 0.628 1.6
## Customer.Value             9  0.45                   0.26 0.738 1.6
## Age                        8        0.99             1.00 0.005 1.0
## Age.Group                  6        0.96             0.93 0.072 1.0
## Call..Failure              1              0.90       0.93 0.074 1.3
## Charge..Amount             2              0.35  0.87 1.00 0.005 1.7
## 
##                        ML1  ML2  ML4  ML3
## SS loadings           2.73 2.00 1.29 0.94
## Proportion Var        0.30 0.22 0.14 0.10
## Cumulative Var        0.30 0.53 0.67 0.77
## Proportion Explained  0.39 0.29 0.19 0.13
## Cumulative Proportion 0.39 0.68 0.87 1.00
## 
## Mean item complexity =  1.4
## Test of the hypothesis that 4 factors are sufficient.
## 
## df null model =  36  with the objective function =  7.7 with Chi Square =  24230.52
## df of  the model are 6  and the objective function was  0.15 
## 
## The root mean square of the residuals (RMSR) is  0.03 
## The df corrected root mean square of the residuals is  0.07 
## 
## The harmonic n.obs is  3150 with the empirical chi square  83.58  with prob <  6.5e-16 
## The total n.obs was  3150  with Likelihood Chi Square =  479.31  with prob <  2.4e-100 
## 
## Tucker Lewis Index of factoring reliability =  0.883
## RMSEA index =  0.158  and the 90 % confidence intervals are  0.146 0.17
## BIC =  430.98
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                    ML1  ML2  ML4  ML3
## Correlation of (regression) scores with factors   0.99 1.00 0.94 0.99
## Multiple R square of scores with factors          0.97 0.99 0.89 0.98
## Minimum correlation of possible factor scores     0.95 0.99 0.78 0.97

comm_table <- data.frame(
  Variable    = colnames(fa_data),
  Communality = round(fa_result2$communality, 3),
  Uniqueness  = round(fa_result2$uniquenesses, 3)
)
print(comm_table)

##                                        Variable Communality Uniqueness
## Call..Failure                     Call..Failure       0.926      0.074
## Charge..Amount                   Charge..Amount       0.995      0.005
## Seconds.of.Use                   Seconds.of.Use       0.923      0.077
## Frequency.of.use               Frequency.of.use       0.995      0.005
## Distinct.Called.Numbers Distinct.Called.Numbers       0.566      0.434
## Age.Group                             Age.Group       0.928      0.072
## Status                                   Status       0.372      0.628
## Age                                         Age       0.995      0.005
## Customer.Value                   Customer.Value       0.262      0.738

variance_table <- as.data.frame(fa_result2$Vaccounted) |>
  rownames_to_column("Metric") |>
  mutate(across(where(is.numeric), ~ round(., 4)))
print(variance_table)

##                  Metric    ML1    ML2    ML4    ML3
## 1           SS loadings 2.7334 1.9986 1.2923 0.9365
## 2        Proportion Var 0.3037 0.2221 0.1436 0.1041
## 3        Cumulative Var 0.3037 0.5258 0.6694 0.7734
## 4  Proportion Explained 0.3927 0.2871 0.1857 0.1345
## 5 Cumulative Proportion 0.3927 0.6798 0.8655 1.0000

Visualization

fa.diagram(fa_result2,
           main = "Figure. Factor Structure - Iranian Churn Dataset")

loadings_matrix <- as.data.frame(unclass(fa_result2$loadings))

loadings_matrix |>
  rownames_to_column("Variable") |>
  pivot_longer(-Variable, names_to = "Factor", values_to = "Loading") |>
  ggplot(aes(x = Factor, y = Variable, fill = Loading)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(Loading, 2)), size = 3.5) +
  scale_fill_gradient2(low = "red", mid = "white", high = "blue",
                       midpoint = 0, limits = c(-1, 1)) +
  labs(title = "Figure. Factor Loadings Heatmap",
       x = "Factor", y = "Variable") +
  theme_minimal()

factor_scores <- as.data.frame(fa_result2$scores)

factor_scores$Churn <- as.factor(df$Churn)

ggplot(factor_scores, aes(x = ML1, y = ML2, color = Churn)) +
  geom_point(alpha = 0.5, size = 2) +
  stat_ellipse(level = 0.95, linetype = "dashed") +
  labs(
    title = "Factor Scores Plot (ML1 vs ML2)",
    x = "Factor 1 - Usage Intensity",
    y = "Factor 2 - Demographics",
    color = "Churn"
  ) +
  scale_color_manual(
    values = c("0" = "green", "1" = "red"),
    labels = c("0" = "Not Churn", "1" = "Churn")
  ) +
  theme_minimal()

ggplot(factor_scores, aes(x = ML1, y = ML2, color = Churn)) +
  geom_jitter(alpha = 0.4, size = 1.5, width = 0.05, height = 0.05) +
  stat_ellipse(level = 0.95, linetype = "dashed", linewidth = 0.8) +
  
  geom_hline(yintercept = 0, linetype = "dotted", color = "grey50") +
  geom_vline(xintercept = 0, linetype = "dotted", color = "grey50") +
  
  annotate("text", x = 2, y = 2.8,
           label = "Older + Heavy User", size = 3, color = "grey40") +
  annotate("text", x = -1.5, y = 2.8,
           label = "Older + Light User", size = 3, color = "grey40") +
  annotate("text", x = 2, y = -1.8,
           label = "Younger + Heavy User", size = 3, color = "grey40") +
  annotate("text", x = -1.5, y = -1.8,
           label = "Younger + Light User", size = 3, color = "grey40") +
  
  labs(
    title    = "Factor Scores Plot (Usage Intensity vs Demographics)",
    subtitle = "Red = Churned | Green = Retained",
    x        = "Factor 1 — Usage Intensity (low ← → high)",
    y        = "Factor 2 — Demographics (younger ← → older)",
    color    = "Churn Status"
  ) +
  scale_color_manual(
    values = c("0" = "#2ecc71", "1" = "#e74c3c"),
    labels = c("0" = "Not Churn", "1" = "Churn")
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title    = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(color = "grey50"),
    legend.position = "right"
  )

Implementation of Principal Component Analysis (PCA) and Factor Analysis (FA) for Dimensionality Reduction and Pattern Mapping in Electroencephalogram Epilepsy Data

Muhammad Raffi Fahrezi (24031554100), Ketut Shridhara (24031554115)

2026-02-24

Data Characteristics

Assumptions

PCA

Calculate PCA using Eigen

Visualization

FA

Determining The Number of Factors

Parallel Analysis

Kaiser’s criterion

Visualization