library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(Hmisc)
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
library(tidyr)
library(ggplot2)
library(sjPlot)
##
## Attaching package: 'sjPlot'
##
## The following object is masked from 'package:ggplot2':
##
## set_theme
library(psych)
##
## Attaching package: 'psych'
##
## The following object is masked from 'package:Hmisc':
##
## describe
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(GPArotation)
##
## Attaching package: 'GPArotation'
##
## The following objects are masked from 'package:psych':
##
## equamax, varimin
df <- read.csv('Customer Churn.csv')
head(df)
## Call..Failure Complains Subscription..Length Charge..Amount Seconds.of.Use
## 1 8 0 38 0 4370
## 2 0 0 39 0 318
## 3 10 0 37 0 2453
## 4 10 0 38 0 4198
## 5 3 0 38 0 2393
## 6 11 0 38 1 3775
## Frequency.of.use Frequency.of.SMS Distinct.Called.Numbers Age.Group
## 1 71 5 17 3
## 2 5 7 4 2
## 3 60 359 24 3
## 4 66 1 35 1
## 5 58 2 33 1
## 6 82 32 28 3
## Tariff.Plan Status Age Customer.Value Churn
## 1 1 1 30 197.640 0
## 2 1 2 25 46.035 0
## 3 1 1 30 1536.520 0
## 4 1 1 15 240.020 0
## 5 1 1 15 145.805 0
## 6 1 1 30 282.280 0
summary(df)
## Call..Failure Complains Subscription..Length Charge..Amount
## Min. : 0.000 Min. :0.00000 Min. : 3.00 Min. : 0.0000
## 1st Qu.: 1.000 1st Qu.:0.00000 1st Qu.:30.00 1st Qu.: 0.0000
## Median : 6.000 Median :0.00000 Median :35.00 Median : 0.0000
## Mean : 7.628 Mean :0.07651 Mean :32.54 Mean : 0.9429
## 3rd Qu.:12.000 3rd Qu.:0.00000 3rd Qu.:38.00 3rd Qu.: 1.0000
## Max. :36.000 Max. :1.00000 Max. :47.00 Max. :10.0000
## Seconds.of.Use Frequency.of.use Frequency.of.SMS Distinct.Called.Numbers
## Min. : 0 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1391 1st Qu.: 27.00 1st Qu.: 6.00 1st Qu.:10.00
## Median : 2990 Median : 54.00 Median : 21.00 Median :21.00
## Mean : 4472 Mean : 69.46 Mean : 73.17 Mean :23.51
## 3rd Qu.: 6478 3rd Qu.: 95.00 3rd Qu.: 87.00 3rd Qu.:34.00
## Max. :17090 Max. :255.00 Max. :522.00 Max. :97.00
## Age.Group Tariff.Plan Status Age Customer.Value
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :15 Min. : 0.0
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:25 1st Qu.: 113.8
## Median :3.000 Median :1.000 Median :1.000 Median :30 Median : 228.5
## Mean :2.826 Mean :1.078 Mean :1.248 Mean :31 Mean : 471.0
## 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:30 3rd Qu.: 788.4
## Max. :5.000 Max. :2.000 Max. :2.000 Max. :55 Max. :2165.3
## Churn
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1571
## 3rd Qu.:0.0000
## Max. :1.0000
From the result above the data as no missing value so we can just go
to the next step. We exclude Churn during the PCA and FA
calculation since it is the target which could result in data leakage
for these unsupervised algorithm.
clean_df <- df |>
select(-Churn)
ggplot(df, aes(x = Churn, fill = as.factor(Churn))) +
geom_bar() +
labs(
title = "Distribution Number of Customer in Churn Category",
x = "Churn",
y = "Number of Customers",
fill = "Category"
) +
scale_fill_manual(
values = c("red", "green"),
labels= c("Churn", "Not Churn")
) +
theme_minimal() +
theme(
legend.position = "bottom",
plot.title = element_text(hjust = 0.5, face = "bold")
)
ggplot(df, aes(x= Churn, y= Customer.Value, fill = as.factor(Churn))) +
geom_boxplot(alpha = 0.7, outlier.colour = "red", outlier.size = 1) +
labs(
title = "Distribution of Customer Value based on the Churn Status",
fill = "Category"
) +
scale_fill_manual(
values = c("red", "green"),
labels= c("Churn", "Not Churn")
) +
theme_minimal() +
theme(
legend.position = "bottom",
plot.title = element_text(hjust = 0.5, face = "bold")
)
corr_data <- clean_df |>
as.matrix() |>
rcorr()
lower_idx <- lower.tri(corr_data$r)
unique_r <- corr_data$r[lower_idx]
unique_p <- corr_data$P[lower_idx]
valid_pairs_count <- sum(abs(unique_r) > 0.30 & unique_p < 0.01, na.rm = TRUE)
total_pairs_count <- length(unique_r)
percentage_valid <- (valid_pairs_count / total_pairs_count) * 100
cat("Percentage of unique pairs with |r| > 0.30 and p < 0.01: ", round(percentage_valid, 2), "%\n")
## Percentage of unique pairs with |r| > 0.30 and p < 0.01: 25.64 %
clean_df |>
tab_corr()
| Call..Failure | Complains | Subscription..Length | Charge..Amount | Seconds.of.Use | Frequency.of.use | Frequency.of.SMS | Distinct.Called.Numbers | Age.Group | Tariff.Plan | Status | Age | Customer.Value | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Call..Failure | 0.153*** | 0.170*** | 0.589*** | 0.502*** | 0.573*** | -0.022 | 0.504*** | 0.050** | 0.192*** | -0.115*** | 0.042* | 0.121*** | |
| Complains | 0.153*** | -0.020 | -0.034 | -0.105*** | -0.091*** | -0.112*** | -0.058** | 0.020 | 0.001 | 0.271*** | 0.003 | -0.133*** | |
| Subscription..Length | 0.170*** | -0.020 | 0.079*** | 0.125*** | 0.107*** | 0.076*** | 0.092*** | 0.021 | -0.160*** | 0.143*** | -0.002 | 0.110*** | |
| Charge..Amount | 0.589*** | -0.034 | 0.079*** | 0.447*** | 0.379*** | 0.092*** | 0.415*** | 0.280*** | 0.324*** | -0.356*** | 0.279*** | 0.169*** | |
| Seconds.of.Use | 0.502*** | -0.105*** | 0.125*** | 0.447*** | 0.946*** | 0.102*** | 0.677*** | 0.020 | 0.134*** | -0.461*** | 0.021 | 0.415*** | |
| Frequency.of.use | 0.573*** | -0.091*** | 0.107*** | 0.379*** | 0.946*** | 0.100*** | 0.736*** | -0.033 | 0.206*** | -0.455*** | -0.028 | 0.402*** | |
| Frequency.of.SMS | -0.022 | -0.112*** | 0.076*** | 0.092*** | 0.102*** | 0.100*** | 0.080*** | -0.054** | 0.196*** | -0.296*** | -0.093*** | 0.925*** | |
| Distinct.Called.Numbers | 0.504*** | -0.058** | 0.092*** | 0.415*** | 0.677*** | 0.736*** | 0.080*** | 0.021 | 0.172*** | -0.413*** | 0.051** | 0.285*** | |
| Age.Group | 0.050** | 0.020 | 0.021 | 0.280*** | 0.020 | -0.033 | -0.054** | 0.021 | -0.151*** | 0.003 | 0.961*** | -0.183*** | |
| Tariff.Plan | 0.192*** | 0.001 | -0.160*** | 0.324*** | 0.134*** | 0.206*** | 0.196*** | 0.172*** | -0.151*** | -0.164*** | -0.119*** | 0.252*** | |
| Status | -0.115*** | 0.271*** | 0.143*** | -0.356*** | -0.461*** | -0.455*** | -0.296*** | -0.413*** | 0.003 | -0.164*** | -0.001 | -0.413*** | |
| Age | 0.042* | 0.003 | -0.002 | 0.279*** | 0.021 | -0.028 | -0.093*** | 0.051** | 0.961*** | -0.119*** | -0.001 | -0.220*** | |
| Customer.Value | 0.121*** | -0.133*** | 0.110*** | 0.169*** | 0.415*** | 0.402*** | 0.925*** | 0.285*** | -0.183*** | 0.252*** | -0.413*** | -0.220*** | |
| Computed correlation used pearson-method with listwise-deletion. | |||||||||||||
fa_data <- corr_data$r
kmo_filter <- function(data) {
repeat {
kmo_result <- KMO(data)
msa_values <- kmo_result$MSAi
min_msa <- min(msa_values)
min_var <- names(which.min(msa_values))
if (min_msa >= 0.5) {
cat("All variables passed KMO threshold (MSA >= 0.5)\n")
cat("Overall KMO:", kmo_result$MSA, "\n")
print(kmo_result$MSAi)
break
}
cat("Removing variable:", min_var, "| MSA =", round(min_msa, 4), "\n")
data <- data[, !colnames(data) %in% min_var]
}
return(data)
}
clean_df <- kmo_filter(clean_df)
## Removing variable: Tariff.Plan | MSA = 0.3601
## Removing variable: Frequency.of.SMS | MSA = 0.4259
## Removing variable: Subscription..Length | MSA = 0.4563
## All variables passed KMO threshold (MSA >= 0.5)
## Overall KMO: 0.6390464
## Call..Failure Complains Charge..Amount
## 0.5811866 0.6125565 0.5671869
## Seconds.of.Use Frequency.of.use Distinct.Called.Numbers
## 0.6575280 0.6175513 0.8801046
## Age.Group Status Age
## 0.5209310 0.7435941 0.5161150
## Customer.Value
## 0.8560696
cat("\nRemaining variables:", ncol(clean_df), "\n")
##
## Remaining variables: 10
cat("Variables kept:", colnames(clean_df), "\n")
## Variables kept: Call..Failure Complains Charge..Amount Seconds.of.Use Frequency.of.use Distinct.Called.Numbers Age.Group Status Age Customer.Value
n = nrow(clean_df)
corr_data$r |>
cortest.bartlett(n=n, diag = TRUE)
## $chisq
## [1] 38219.42
##
## $p.value
## [1] 0
##
## $df
## [1] 78
scale_df <- scale(clean_df)
cor_matrix <- cov(scale_df)
pc <- eigen(cor_matrix)
data.frame(pc$vectors)
## X1 X2 X3 X4 X5 X6
## 1 -0.34201545 -0.089401352 -0.46734405 -0.26943225 0.25394178 -0.22477570
## 2 0.06800960 -0.077254119 -0.67264766 0.65074639 0.01080933 0.32284678
## 3 -0.33144418 -0.246519579 -0.08669780 -0.17404388 0.67943455 0.01469626
## 4 -0.46165528 0.045662558 -0.01407282 0.02337777 -0.31857031 -0.07346387
## 5 -0.46619067 0.079646973 -0.06565265 -0.01696293 -0.37453129 -0.04834995
## 6 -0.41649198 0.006891662 -0.08036415 -0.08560871 -0.28833643 0.21259943
## 7 -0.03500520 -0.646860892 0.16091280 0.17309366 -0.10476672 -0.10934302
## 8 0.31154664 -0.079943554 -0.43840284 -0.18631649 -0.25693448 -0.67825805
## 9 -0.03621553 -0.649975076 0.16474925 0.12662996 -0.13555420 -0.05789990
## 10 -0.25499360 0.263635176 0.25577166 0.62029881 0.23130167 -0.56280939
## X7 X8 X9 X10
## 1 -0.01550589 0.667027942 -0.007062854 -0.154277351
## 2 -0.05261369 -0.077869761 0.013816874 0.010538612
## 3 -0.02495864 -0.554538561 0.023607901 0.144998292
## 4 -0.43895767 -0.289599674 -0.122742786 -0.620921236
## 5 -0.28959240 0.007096198 0.151578799 0.722649954
## 6 0.81244159 -0.120903770 -0.065466681 -0.073473310
## 7 -0.01327856 0.106593562 -0.685223780 0.143140419
## 8 0.13609186 -0.350890447 0.008848405 0.043175525
## 9 0.03739861 0.068908344 0.696872802 -0.139775943
## 10 0.19915705 0.061043485 0.040403494 -0.002801393
propvar <- sapply(pc$values, function(x) x/sum(pc$values))*100
cumvar <- data.frame(cbind(pc$values, propvar)) |>
mutate(cum = cumsum(propvar))
colnames(cumvar)[1] <- "eigen_value"
row.names(cumvar) <- paste0("PC",c(1:ncol(clean_df)))
print(cumvar)
## eigen_value propvar cum
## PC1 3.8172229 38.172229 38.17223
## PC2 2.1587786 21.587786 59.76001
## PC3 1.3056698 13.056698 72.81671
## PC4 0.7536057 7.536057 80.35277
## PC5 0.7185989 7.185989 87.53876
## PC6 0.5452963 5.452963 92.99172
## PC7 0.3561144 3.561144 96.55287
## PC8 0.2731875 2.731875 99.28474
## PC9 0.0369698 0.369698 99.65444
## PC10 0.0345561 0.345561 100.00000
Based on the cumulative variance, the PC1 to PC4 hold 80.35% of data information, thus making them able do be use for representing the data without the sacrifice the essential information.
# Make df for pca with only 4 first components
pca_df <-
scale_df %*% pc$vectors[, 1:4] |>
data.frame()
colnames(pca_df) <- paste0("PC", 1:ncol(pca_df))
head(pca_df)
## PC1 PC2 PC3 PC4
## 1 0.27767731 0.02283714 0.38176054 -0.2633117
## 2 2.80590292 0.81086220 -0.31702271 -0.7837384
## 3 -0.34602642 0.64763010 0.90177075 1.2266190
## 4 -0.07335151 2.57169955 -0.44407340 -0.9785779
## 5 0.61454666 2.57827882 -0.01578183 -0.8297137
## 6 -0.41320037 -0.11980529 0.11169283 -0.4487193
In here we use prcomp since the visualization library
integrated with it. To ensure the result are similar with the manual
calculation, checking being used.
pca_result <- clean_df |>
prcomp(scale = TRUE)
cat("Are the Manual and Auto PCA is the same?", all.equal(
abs(scale_df %*% pc$vectors),
abs(pca_result$x),
check.attributes = FALSE
))
## Are the Manual and Auto PCA is the same? TRUE
fviz_eig(
pca_result,
addlabels = TRUE,
ncp = ncol(data),
barfill = "skyblue",
barcolor = "darkblue",
linecolor = "red"
)
## Warning in geom_bar(stat = "identity", fill = barfill, color = barcolor, :
## Ignoring empty aesthetic: `width`.
fviz_pca_biplot(
pca_result,
label = "var",
repel = TRUE,
invisible = aes("ind")
)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
plot_data <- data.frame(
PC1 = pca_result$x[, 1],
PC2 = pca_result$x[, 2],
PC3 = pca_result$x[, 3],
PC4 = pca_result$x[, 4],
Label = as.factor(df$Churn)
)
ggplot(plot_data, aes(x = PC1, y = PC2, color= Label)) +
geom_point(alpha = 0.7, size = 2) +
stat_ellipse(level = 0.95, linetype = "dashed") +
labs(
title = "PCA of Churn Dataset (PC1 vs PC2)",
x = "Principal Component 1",
y = "Principal Component 2",
color = "Churn"
) +
scale_color_manual(
values = c("red", "green"),
labels= c("Churn", "Not Churn")
) +
theme_minimal()
long_box_data <- plot_data |>
pivot_longer(
cols = starts_with("PC"),
names_to = "Component",
values_to = "Score"
)
ggplot(long_box_data, aes(x = Label, y = Score, fill = Label)) +
geom_boxplot(alpha = 0.8, outlier.size = 1, outlier.alpha = 0.5) +
facet_wrap(~ Component, scales = "free_y") +
labs(
title = "Distribution of Principal Component Scores by Churn",
x = "Churn",
y = "Principal Component Score",
fill = "Category"
) +
scale_fill_manual(
values = c("red", "green"),
labels = c("Churn", "Not Churn")
) +
theme_minimal() +
theme(legend.position = "bottom")
loading <- pca_result$rotation[, 1:4]
loading
## PC1 PC2 PC3 PC4
## Call..Failure -0.34201545 -0.089401352 0.46734405 -0.26943225
## Complains 0.06800960 -0.077254119 0.67264766 0.65074639
## Charge..Amount -0.33144418 -0.246519579 0.08669780 -0.17404388
## Seconds.of.Use -0.46165528 0.045662558 0.01407282 0.02337777
## Frequency.of.use -0.46619067 0.079646973 0.06565265 -0.01696293
## Distinct.Called.Numbers -0.41649198 0.006891662 0.08036415 -0.08560871
## Age.Group -0.03500520 -0.646860892 -0.16091280 0.17309366
## Status 0.31154664 -0.079943554 0.43840284 -0.18631649
## Age -0.03621553 -0.649975076 -0.16474925 0.12662996
## Customer.Value -0.25499360 0.263635176 -0.25577166 0.62029881
fa.parallel(clean_df, fm = "ml", fa = "fa", main = "Parallel Analysis - FA")
## Parallel analysis suggests that the number of factors = 4 and the number of components = NA
eigen_values <- eigen(cor(clean_df))$values
eigen_table <- data.frame(
Factor = paste0("F", 1:length(eigen_values)),
Eigenvalue = round(eigen_values, 4),
Variance_Pct = round((eigen_values / sum(eigen_values)) * 100, 2),
Cumulative_Pct = round(cumsum((eigen_values / sum(eigen_values)) * 100), 2),
Retain = ifelse(eigen_values > 1, "Yes", "No")
)
print(eigen_table)
## Factor Eigenvalue Variance_Pct Cumulative_Pct Retain
## 1 F1 3.8172 38.17 38.17 Yes
## 2 F2 2.1588 21.59 59.76 Yes
## 3 F3 1.3057 13.06 72.82 Yes
## 4 F4 0.7536 7.54 80.35 No
## 5 F5 0.7186 7.19 87.54 No
## 6 F6 0.5453 5.45 92.99 No
## 7 F7 0.3561 3.56 96.55 No
## 8 F8 0.2732 2.73 99.28 No
## 9 F9 0.0370 0.37 99.65 No
## 10 F10 0.0346 0.35 100.00 No
kaiser_factors <- sum(eigen_values > 1)
cat("Number of factors (Kaiser's criteria):", kaiser_factors, "\n")
## Number of factors (Kaiser's criteria): 3
fa_result <- fa(fa_data, nfactors = 4, rotate = "varimax", fm = "ml")
print(fa_result, cut = 0.3, sort = TRUE)
## Factor Analysis using method = ml
## Call: fa(r = fa_data, nfactors = 4, rotate = "varimax", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
## item ML4 ML1 ML3 ML2 h2 u2 com
## Seconds.of.Use 5 0.98 0.985 0.0150 1.1
## Frequency.of.use 6 0.92 0.924 0.0762 1.2
## Distinct.Called.Numbers 8 0.65 0.513 0.4872 1.4
## Status 11 -0.45 0.297 0.7033 1.8
## Frequency.of.SMS 7 1.00 0.994 0.0062 1.0
## Customer.Value 13 0.32 0.93 0.987 0.0127 1.3
## Tariff.Plan 10 0.096 0.9041 3.2
## Age 12 0.98 0.979 0.0212 1.0
## Age.Group 9 0.97 0.945 0.0553 1.0
## Call..Failure 1 0.38 0.92 0.995 0.0050 1.3
## Charge..Amount 4 0.36 0.49 0.452 0.5477 2.6
## Complains 2 0.076 0.9240 2.2
## Subscription..Length 3 0.036 0.9636 2.3
##
## ML4 ML1 ML3 ML2
## SS loadings 2.85 2.06 2.02 1.35
## Proportion Var 0.22 0.16 0.16 0.10
## Cumulative Var 0.22 0.38 0.53 0.64
## Proportion Explained 0.34 0.25 0.24 0.16
## Cumulative Proportion 0.34 0.59 0.84 1.00
##
## Mean item complexity = 1.6
## Test of the hypothesis that 4 factors are sufficient.
##
## df null model = 78 with the objective function = 12.16
## df of the model are 32 and the objective function was 1.01
##
## The root mean square of the residuals (RMSR) is 0.04
## The df corrected root mean square of the residuals is 0.06
##
## Fit based upon off diagonal values = 0.98
## Measures of factor score adequacy
## ML4 ML1 ML3 ML2
## Correlation of (regression) scores with factors 0.99 1.00 0.99 1.00
## Multiple R square of scores with factors 0.99 1.00 0.98 0.99
## Minimum correlation of possible factor scores 0.97 0.99 0.97 0.98
fa_data <- clean_df[, !colnames(clean_df) %in%
c("Tariff.Plan", "Complains", "Subscription..Length")]
fa_result2 <- fa(fa_data, nfactors = 4, rotate = "varimax", fm = "ml")
print(fa_result2, cut = 0.3, sort = TRUE)
## Factor Analysis using method = ml
## Call: fa(r = fa_data, nfactors = 4, rotate = "varimax", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
## item ML1 ML2 ML4 ML3 h2 u2 com
## Frequency.of.use 4 0.92 0.39 1.00 0.005 1.3
## Seconds.of.Use 3 0.91 0.92 0.077 1.2
## Distinct.Called.Numbers 5 0.65 0.35 0.57 0.434 1.6
## Status 7 -0.54 0.37 0.628 1.6
## Customer.Value 9 0.45 0.26 0.738 1.6
## Age 8 0.99 1.00 0.005 1.0
## Age.Group 6 0.96 0.93 0.072 1.0
## Call..Failure 1 0.90 0.93 0.074 1.3
## Charge..Amount 2 0.35 0.87 1.00 0.005 1.7
##
## ML1 ML2 ML4 ML3
## SS loadings 2.73 2.00 1.29 0.94
## Proportion Var 0.30 0.22 0.14 0.10
## Cumulative Var 0.30 0.53 0.67 0.77
## Proportion Explained 0.39 0.29 0.19 0.13
## Cumulative Proportion 0.39 0.68 0.87 1.00
##
## Mean item complexity = 1.4
## Test of the hypothesis that 4 factors are sufficient.
##
## df null model = 36 with the objective function = 7.7 with Chi Square = 24230.52
## df of the model are 6 and the objective function was 0.15
##
## The root mean square of the residuals (RMSR) is 0.03
## The df corrected root mean square of the residuals is 0.07
##
## The harmonic n.obs is 3150 with the empirical chi square 83.58 with prob < 6.5e-16
## The total n.obs was 3150 with Likelihood Chi Square = 479.31 with prob < 2.4e-100
##
## Tucker Lewis Index of factoring reliability = 0.883
## RMSEA index = 0.158 and the 90 % confidence intervals are 0.146 0.17
## BIC = 430.98
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## ML1 ML2 ML4 ML3
## Correlation of (regression) scores with factors 0.99 1.00 0.94 0.99
## Multiple R square of scores with factors 0.97 0.99 0.89 0.98
## Minimum correlation of possible factor scores 0.95 0.99 0.78 0.97
comm_table <- data.frame(
Variable = colnames(fa_data),
Communality = round(fa_result2$communality, 3),
Uniqueness = round(fa_result2$uniquenesses, 3)
)
print(comm_table)
## Variable Communality Uniqueness
## Call..Failure Call..Failure 0.926 0.074
## Charge..Amount Charge..Amount 0.995 0.005
## Seconds.of.Use Seconds.of.Use 0.923 0.077
## Frequency.of.use Frequency.of.use 0.995 0.005
## Distinct.Called.Numbers Distinct.Called.Numbers 0.566 0.434
## Age.Group Age.Group 0.928 0.072
## Status Status 0.372 0.628
## Age Age 0.995 0.005
## Customer.Value Customer.Value 0.262 0.738
variance_table <- as.data.frame(fa_result2$Vaccounted) |>
rownames_to_column("Metric") |>
mutate(across(where(is.numeric), ~ round(., 4)))
print(variance_table)
## Metric ML1 ML2 ML4 ML3
## 1 SS loadings 2.7334 1.9986 1.2923 0.9365
## 2 Proportion Var 0.3037 0.2221 0.1436 0.1041
## 3 Cumulative Var 0.3037 0.5258 0.6694 0.7734
## 4 Proportion Explained 0.3927 0.2871 0.1857 0.1345
## 5 Cumulative Proportion 0.3927 0.6798 0.8655 1.0000
fa.diagram(fa_result2,
main = "Figure. Factor Structure - Iranian Churn Dataset")
loadings_matrix <- as.data.frame(unclass(fa_result2$loadings))
loadings_matrix |>
rownames_to_column("Variable") |>
pivot_longer(-Variable, names_to = "Factor", values_to = "Loading") |>
ggplot(aes(x = Factor, y = Variable, fill = Loading)) +
geom_tile(color = "white") +
geom_text(aes(label = round(Loading, 2)), size = 3.5) +
scale_fill_gradient2(low = "red", mid = "white", high = "blue",
midpoint = 0, limits = c(-1, 1)) +
labs(title = "Figure. Factor Loadings Heatmap",
x = "Factor", y = "Variable") +
theme_minimal()
factor_scores <- as.data.frame(fa_result2$scores)
factor_scores$Churn <- as.factor(df$Churn)
ggplot(factor_scores, aes(x = ML1, y = ML2, color = Churn)) +
geom_point(alpha = 0.5, size = 2) +
stat_ellipse(level = 0.95, linetype = "dashed") +
labs(
title = "Factor Scores Plot (ML1 vs ML2)",
x = "Factor 1 - Usage Intensity",
y = "Factor 2 - Demographics",
color = "Churn"
) +
scale_color_manual(
values = c("0" = "green", "1" = "red"),
labels = c("0" = "Not Churn", "1" = "Churn")
) +
theme_minimal()
ggplot(factor_scores, aes(x = ML1, y = ML2, color = Churn)) +
geom_jitter(alpha = 0.4, size = 1.5, width = 0.05, height = 0.05) +
stat_ellipse(level = 0.95, linetype = "dashed", linewidth = 0.8) +
geom_hline(yintercept = 0, linetype = "dotted", color = "grey50") +
geom_vline(xintercept = 0, linetype = "dotted", color = "grey50") +
annotate("text", x = 2, y = 2.8,
label = "Older + Heavy User", size = 3, color = "grey40") +
annotate("text", x = -1.5, y = 2.8,
label = "Older + Light User", size = 3, color = "grey40") +
annotate("text", x = 2, y = -1.8,
label = "Younger + Heavy User", size = 3, color = "grey40") +
annotate("text", x = -1.5, y = -1.8,
label = "Younger + Light User", size = 3, color = "grey40") +
labs(
title = "Factor Scores Plot (Usage Intensity vs Demographics)",
subtitle = "Red = Churned | Green = Retained",
x = "Factor 1 — Usage Intensity (low ← → high)",
y = "Factor 2 — Demographics (younger ← → older)",
color = "Churn Status"
) +
scale_color_manual(
values = c("0" = "#2ecc71", "1" = "#e74c3c"),
labels = c("0" = "Not Churn", "1" = "Churn")
) +
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold", size = 14),
plot.subtitle = element_text(color = "grey50"),
legend.position = "right"
)