This analysis applies dimension reduction techniques (PCA, t-SNE, UMAP) to the Apple App Store dataset to visualize high-dimensional app characteristics and validate the clustering structure identified in the companion clustering analysis.
Key Results:
| Metric | Value |
|---|---|
| Original Dimensions | 15 variables |
| Optimal PCs (80% variance) | 7 components |
| PC1 + PC2 Variance | 41.9% |
| Best Structure Preservation | PCA (Mantel r = 0.688) |
Principal Component Interpretation:
| Component | Variance | Interpretation |
|---|---|---|
| PC1 | 27.3% | International Reach & Popularity |
| PC2 | 14.6% | User Engagement & Quality |
Method Comparison:
| Method | Mantel r | Structure Preservation |
|---|---|---|
| PCA | 0.688 | Good (best overall) |
| t-SNE | 0.425 | Fair |
| UMAP | 0.528 | Good |
We load the pre-processed dataset from the clustering analysis, which already has anomalies removed and cluster assignments.
# Load pre-cleaned data with clusters from clustering analysis
clean_file <- "results/appstore_with_clusters.csv"
if (file.exists(clean_file)) {
appstore <- read.csv(clean_file, stringsAsFactors = FALSE)
has_clusters <- "cluster" %in% names(appstore)
cat("Loaded pre-cleaned dataset with cluster assignments\n")
} else {
# Fallback to original data if clustering hasn't been run
appstore <- read.csv("DataSets/AppleStore.csv", stringsAsFactors = FALSE)
has_clusters <- FALSE
cat("Note: Run clustering_report.Rmd first for cluster-colored visualizations\n")
}
## Loaded pre-cleaned dataset with cluster assignments
data.frame(
Metric = c("Total Apps", "Variables", "Has Clusters"),
Value = c(format(nrow(appstore), big.mark = ","), ncol(appstore), ifelse(has_clusters, "Yes (3 clusters)", "No"))
) %>%
kable(caption = "Dataset Overview") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Metric | Value |
|---|---|
| Total Apps | 6,847 |
| Variables | 34 |
| Has Clusters | Yes (3 clusters) |
## Cluster distribution:
##
## 1 2 3
## 2517 2760 1570
## Dataset dimensions: 6847 apps × 34 variables
# Create numeric encoding for categorical variables
appstore$cont_rating_num <- as.numeric(factor(appstore$cont_rating))
appstore$prime_genre_num <- as.numeric(factor(appstore$prime_genre))
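One caveat worth noting: factor() assigns integer codes in alphabetical level order, so these codes are arbitrary labels rather than meaningful magnitudes. A toy illustration (not part of the pipeline):
# Levels sort alphabetically, so "Education" -> 1 and "Games" -> 2
as.numeric(factor(c("Games", "Education", "Games")))  # returns 2 1 2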
# Create price tiers and engineered features
appstore <- appstore %>%
mutate(
price_tier = cut(price,
breaks = c(-Inf, 0, 2.99, 9.99, 29.99, Inf),
labels = c("Free", "Budget", "Mid", "Premium", "Ultra")),
# Log transformations for skewed variables
log_rating_count = log1p(rating_count_tot),
log_size = log1p(size_bytes),
log_price = log1p(price),
# Engagement and quality metrics
engagement_ratio = ifelse(rating_count_tot > 0,
rating_count_ver / rating_count_tot, 0),
quality_delta = user_rating_ver - user_rating,
# Composite indices
international_index = ifelse(size_bytes > 0,
(lang.num * sup_devices.num) / log1p(size_bytes), 0),
richness_score = ipadSc_urls.num + lang.num
)
# Feature summary table
data.frame(
Feature = c("log_size", "log_price", "log_rating_count", "engagement_ratio",
"quality_delta", "international_index", "richness_score", "price_tier"),
Purpose = c("Normalize size distribution", "Normalize price distribution",
"Normalize popularity distribution", "Version engagement rate",
"Rating trend direction", "Global reach potential",
"App comprehensiveness", "Categorical price segments")
) %>%
kable(caption = "Engineered Features") %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Feature | Purpose |
|---|---|
| log_size | Normalize size distribution |
| log_price | Normalize price distribution |
| log_rating_count | Normalize popularity distribution |
| engagement_ratio | Version engagement rate |
| quality_delta | Rating trend direction |
| international_index | Global reach potential |
| richness_score | App comprehensiveness |
| price_tier | Categorical price segments |
# Select numerical variables for dimension reduction
clustering_vars <- c(
"log_price", "log_size", "log_rating_count",
"user_rating", "user_rating_ver",
"cont_rating_num", "prime_genre_num",
"sup_devices.num", "ipadSc_urls.num",
"lang.num", "vpp_lic",
"engagement_ratio", "quality_delta",
"international_index", "richness_score"
)
cat("Selected", length(clustering_vars), "variables for dimension reduction:\n")## Selected 15 variables for dimension reduction:
## [1] "log_price" "log_size" "log_rating_count"
## [4] "user_rating" "user_rating_ver" "cont_rating_num"
## [7] "prime_genre_num" "sup_devices.num" "ipadSc_urls.num"
## [10] "lang.num" "vpp_lic" "engagement_ratio"
## [13] "quality_delta" "international_index" "richness_score"
# Extract and clean the data
data_for_reduction <- appstore[, clustering_vars]
# Handle missing/infinite values
data_for_reduction[is.na(data_for_reduction)] <- 0
data_for_reduction[is.infinite(as.matrix(data_for_reduction))] <- 0
# Standardize the data (Z-score normalization)
data_scaled <- scale(data_for_reduction)
# Handle any NA/NaN values that may arise from zero-variance columns
data_scaled[is.na(data_scaled)] <- 0
data_scaled[is.nan(data_scaled)] <- 0
data_scaled_df <- as.data.frame(data_scaled)
cat("\nScaled data dimensions:", nrow(data_scaled_df), "×", ncol(data_scaled_df))##
## Scaled data dimensions: 6847 × 15
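The NA/NaN handling above guards against zero-variance columns: scale() divides each column by its standard deviation, so a constant column yields NaN throughout. A toy illustration (not part of the pipeline):
# A constant column has sd = 0, so scaling returns NaN for every entry
scale(c(1, 1, 1))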
Before dimension reduction, we remove outliers using Isolation Forest to prevent them from distorting principal components and visualizations.
# Set seed to match clustering analysis (ensures same anomalies removed)
set.seed(123)
# Train Isolation Forest
iso_forest <- isolationForest$new(sample_size = 256, num_trees = 100, max_depth = ceiling(log2(256)))
iso_forest$fit(as.data.frame(data_scaled))
anomaly_scores <- iso_forest$predict(as.data.frame(data_scaled))
# Add scores and identify anomalies (top 5%)
appstore$anomaly_score <- anomaly_scores$anomaly_score
threshold <- quantile(anomaly_scores$anomaly_score, 0.95)
is_anomaly <- anomaly_scores$anomaly_score > threshold
data.frame(
Metric = c("Original dataset", "Anomalies detected", "Clean dataset"),
Value = c(paste(nrow(appstore), "apps"),
paste0(sum(is_anomaly), " (", round(mean(is_anomaly) * 100, 1), "%)"),
paste(sum(!is_anomaly), "apps"))
) %>%
kable(caption = "Anomaly Detection Results") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Metric | Value |
|---|---|
| Original dataset | 6847 apps |
| Anomalies detected | 336 (4.9%) |
| Clean dataset | 6511 apps |
par(mfrow = c(1, 2))
# Histogram of anomaly scores
hist(anomaly_scores$anomaly_score, breaks = 50,
main = "Distribution of Anomaly Scores",
xlab = "Anomaly Score", col = "lightblue", border = "white")
abline(v = threshold, col = "red", lwd = 2, lty = 2)
legend("topright", legend = "95th percentile", col = "red", lty = 2, lwd = 2)
# Anomaly score vs log rating count
plot(appstore$log_rating_count, anomaly_scores$anomaly_score,
main = "Anomaly Score vs Popularity",
xlab = "Log(Rating Count + 1)", ylab = "Anomaly Score", pch = 16,
col = ifelse(is_anomaly, "red", "gray70"), cex = 0.5)
abline(h = threshold, col = "red", lwd = 2, lty = 2)

Anomaly Score Distribution
# Remove anomalies from all datasets
data_scaled <- data_scaled[!is_anomaly, ]
data_scaled_df <- data_scaled_df[!is_anomaly, ]
appstore <- appstore[!is_anomaly, ]
data_for_reduction <- data_for_reduction[!is_anomaly, ]
cat("Clean dataset:", nrow(appstore), "apps (anomalies removed)")## Clean dataset: 6511 apps (anomalies removed)
Understanding correlations between variables justifies the need for dimension reduction and helps interpret principal components.
# Remove cluster column if present for correlation calculation
cor_data <- data_scaled_df[, !names(data_scaled_df) %in% c("cluster")]
# Remove any columns with zero variance (cause NA in correlation)
col_vars <- apply(cor_data, 2, var, na.rm = TRUE)
valid_cols <- col_vars > 1e-10 & !is.na(col_vars)
cor_data <- cor_data[, valid_cols]
cor_matrix <- cor(cor_data, use = "pairwise.complete.obs")
# Replace any remaining NA with 0 for visualization
cor_matrix[is.na(cor_matrix)] <- 0
corrplot(cor_matrix,
method = "color",
type = "upper",
order = "hclust",
tl.col = "black",
tl.srt = 45,
tl.cex = 0.8,
title = "Correlation Matrix - App Store Variables",
mar = c(0, 0, 2, 0))

Correlation Matrix of App Features
# Identify highly correlated pairs
high_cor <- which(abs(cor_matrix) > 0.7 & abs(cor_matrix) < 1, arr.ind = TRUE)
if (nrow(high_cor) > 0) {
high_cor_pairs <- data.frame(
Variable_1 = character(),
Variable_2 = character(),
Correlation = numeric()
)
for (i in 1:nrow(high_cor)) {
if (high_cor[i, 1] < high_cor[i, 2]) {
high_cor_pairs <- rbind(high_cor_pairs, data.frame(
Variable_1 = colnames(cor_matrix)[high_cor[i, 1]],
Variable_2 = colnames(cor_matrix)[high_cor[i, 2]],
Correlation = round(cor_matrix[high_cor[i, 1], high_cor[i, 2]], 3)
))
}
}
high_cor_pairs %>%
arrange(desc(abs(Correlation))) %>%
kable(caption = "Highly Correlated Variable Pairs (|r| > 0.7)") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
}

| Variable_1 | Variable_2 | Correlation |
|---|---|---|
| lang.num | international_index | 0.993 |
| lang.num | richness_score | 0.942 |
| international_index | richness_score | 0.930 |
| user_rating | user_rating_ver | 0.788 |
The strongest redundancies involve richness_score, lang.num, and international_index (expected, as richness_score includes the language count and international_index is built from it), along with the two user-rating variables. This collinearity is exactly what dimension reduction is meant to absorb.
# Run PCA on all 15 scaled variables, retaining every component
pca_result <- PCA(data_scaled_df, graph = FALSE, ncp = ncol(data_scaled_df))
# Extract eigenvalues
eigenvalues <- get_eigenvalue(pca_result)
if (is.matrix(eigenvalues)) {
eigenvalues <- as.data.frame(eigenvalues)
}
# Display first 10 components
eigenvalues %>%
head(10) %>%
mutate(
Component = 1:10,
eigenvalue = round(eigenvalue, 3),
variance.percent = round(variance.percent, 2),
cumulative.variance.percent = round(cumulative.variance.percent, 2)
) %>%
select(Component, eigenvalue, variance.percent, cumulative.variance.percent) %>%
kable(
col.names = c("Component", "Eigenvalue", "Variance %", "Cumulative %"),
caption = "Variance Explained by Principal Components"
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Component | Eigenvalue | Variance % | Cumulative % | |
|---|---|---|---|---|
| Dim.1 | 1 | 3.815 | 27.25 | 27.25 |
| Dim.2 | 2 | 2.047 | 14.62 | 41.87 |
| Dim.3 | 3 | 1.453 | 10.38 | 52.25 |
| Dim.4 | 4 | 1.249 | 8.92 | 61.17 |
| Dim.5 | 5 | 1.094 | 7.82 | 68.99 |
| Dim.6 | 6 | 1.001 | 7.15 | 76.14 |
| Dim.7 | 7 | 0.916 | 6.55 | 82.68 |
| Dim.8 | 8 | 0.785 | 5.60 | 88.29 |
| Dim.9 | 9 | 0.726 | 5.18 | 93.47 |
| Dim.10 | 10 | 0.675 | 4.82 | 98.29 |
cumvar <- eigenvalues$cumulative.variance.percent
optimal_pcs_80 <- which(cumvar >= 80)[1]
optimal_pcs_90 <- which(cumvar >= 90)[1]
cat("Components for 80% variance:", optimal_pcs_80, "\n")## Components for 80% variance: 7
## Components for 90% variance: 9
## Dimension reduction: 53.3 %
fviz_eig(pca_result,
addlabels = TRUE,
ylim = c(0, 30),
main = "Scree Plot - Variance Explained by Components",
barfill = "steelblue",
barcolor = "steelblue",
linecolor = "red")Scree Plot - Variance Explained by Components
plot(1:nrow(eigenvalues), eigenvalues$cumulative.variance.percent,
type = "b", pch = 19, col = "darkblue",
xlab = "Number of Components",
ylab = "Cumulative Variance Explained (%)",
main = "Cumulative Variance Explained by Principal Components",
ylim = c(0, 100))
abline(h = c(80, 90), col = c("red", "orange"), lty = 2)
abline(v = optimal_pcs_80, col = "red", lty = 2)
text(optimal_pcs_80 + 0.5, 50, paste0(optimal_pcs_80, " PCs\n(80% var)"), pos = 4, col = "red")
grid()

Cumulative Variance Explained
# fviz_contrib() returns ggplot objects, so par(mfrow) has no effect here;
# arrange the two panels with grid.arrange() instead
p_contrib1 <- fviz_contrib(pca_result, choice = "var", axes = 1, top = 10,
                           title = "Top 10 Variables Contributing to PC1")
p_contrib2 <- fviz_contrib(pca_result, choice = "var", axes = 2, top = 10,
                           title = "Top 10 Variables Contributing to PC2")
grid.arrange(p_contrib1, p_contrib2, ncol = 2)

Top Contributing Variables to PC1 and PC2
loadings <- get_pca_var(pca_result)$coord
loadings_df <- as.data.frame(loadings)
loadings_df$Variable <- rownames(loadings_df)
# Rename Dim columns to PC for clarity
names(loadings_df) <- gsub("Dim\\.", "PC", names(loadings_df))
# Top PC1 loadings
pc1_top <- loadings_df %>%
arrange(desc(abs(PC1))) %>%
head(5) %>%
select(Variable, PC1) %>%
mutate(PC1 = round(PC1, 3))
# Top PC2 loadings
pc2_top <- loadings_df %>%
arrange(desc(abs(PC2))) %>%
head(5) %>%
select(Variable, PC2) %>%
mutate(PC2 = round(PC2, 3))
cat("PC1 - Top Contributing Variables (International Reach & Popularity):\n")## PC1 - Top Contributing Variables (International Reach & Popularity):
kable(pc1_top, col.names = c("Variable", "PC1 Loading")) %>%
kable_styling(bootstrap_options = c("striped"), full_width = FALSE)| Variable | PC1 Loading | |
|---|---|---|
| richness_score | richness_score | 0.882 |
| lang.num | lang.num | 0.811 |
| international_index | international_index | 0.796 |
| user_rating_ver | user_rating_ver | 0.687 |
| log_rating_count | log_rating_count | 0.652 |
cat("\nPC2 - Top Contributing Variables (Engagement & Quality):\n")
##
## PC2 - Top Contributing Variables (Engagement & Quality):
kable(pc2_top, col.names = c("Variable", "PC2 Loading")) %>%
kable_styling(bootstrap_options = c("striped"), full_width = FALSE)| Variable | PC2 Loading | |
|---|---|---|
| user_rating_ver | user_rating_ver | 0.603 |
| engagement_ratio | engagement_ratio | 0.551 |
| international_index | international_index | -0.548 |
| lang.num | lang.num | -0.534 |
| user_rating | user_rating | 0.481 |
fviz_pca_var(pca_result,
col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE,
title = "PCA - Variable Correlation Circle")PCA Variable Correlation Circle
Reading the circle: richness_score, lang.num, and international_index point along PC1 (the reach dimension), while user_rating_ver, user_rating, and engagement_ratio point along PC2 (the quality and engagement dimension).
We perform K-means clustering (k=3) on the scaled data to visualize how clusters appear in reduced dimensional space.
# Perform K-means clustering (k=3) - same as clustering project
set.seed(123)
k_optimal <- 3
km_result <- kmeans(data_scaled, centers = k_optimal, nstart = 50, iter.max = 300)
# Add cluster assignments
appstore$cluster <- km_result$cluster
data_scaled_df$cluster <- km_result$cluster
cat("K-means clustering (k =", k_optimal, ") completed\n")## K-means clustering (k = 3 ) completed
## Cluster distribution:
##
## 1 2 3
## 1509 3545 1457
# Cluster sizes
data.frame(
Cluster = 1:k_optimal,
Size = as.numeric(table(km_result$cluster)),
Percentage = paste0(round(as.numeric(table(km_result$cluster)) / nrow(appstore) * 100, 1), "%")
) %>%
kable(caption = "K-means Cluster Distribution") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Cluster | Size | Percentage |
|---|---|---|
| 1 | 1509 | 23.2% |
| 2 | 3545 | 54.4% |
| 3 | 1457 | 22.4% |
# Calculate cluster profiles (mean values for each cluster)
cluster_profiles <- appstore %>%
group_by(cluster) %>%
summarise(
n_apps = n(),
avg_price = mean(price, na.rm = TRUE),
avg_rating = mean(user_rating, na.rm = TRUE),
avg_rating_count = mean(rating_count_tot, na.rm = TRUE),
avg_size_mb = mean(size_bytes / 1e6, na.rm = TRUE),
avg_languages = mean(lang.num, na.rm = TRUE),
avg_devices = mean(sup_devices.num, na.rm = TRUE),
pct_free = mean(price == 0, na.rm = TRUE) * 100,
.groups = "drop"
)
# Determine cluster characteristics based on profiles
cluster_labels <- sapply(1:k_optimal, function(i) {
profile <- cluster_profiles[cluster_profiles$cluster == i, ]
# Identify dominant characteristics
chars <- c()
if (profile$pct_free > 70) {
chars <- c(chars, "Free/Low-cost")
} else if (profile$avg_price > 5) {
chars <- c(chars, "Premium-priced")
}
if (profile$avg_rating_count > mean(cluster_profiles$avg_rating_count) * 1.5) {
chars <- c(chars, "High popularity")
} else if (profile$avg_rating_count < mean(cluster_profiles$avg_rating_count) * 0.5) {
chars <- c(chars, "Niche/Low visibility")
}
if (profile$avg_languages > mean(cluster_profiles$avg_languages) * 1.3) {
chars <- c(chars, "International reach")
}
if (profile$avg_rating > 4.0) {
chars <- c(chars, "High quality")
}
if (length(chars) == 0) chars <- "Mainstream apps"
paste(chars, collapse = ", ")
})
# Display cluster interpretation
cluster_interpretation <- data.frame(
Cluster = 1:k_optimal,
Apps = cluster_profiles$n_apps,
Label = cluster_labels,
Avg_Price = paste0("$", round(cluster_profiles$avg_price, 2)),
Avg_Rating = round(cluster_profiles$avg_rating, 2),
Avg_Downloads = format(round(cluster_profiles$avg_rating_count), big.mark = ","),
Avg_Languages = round(cluster_profiles$avg_languages, 1),
Pct_Free = paste0(round(cluster_profiles$pct_free, 1), "%")
)
cluster_interpretation %>%
kable(
col.names = c("Cluster", "# Apps", "Interpretation", "Avg Price", "Avg Rating",
"Avg Rating Count", "Avg Languages", "% Free"),
caption = "Cluster Profiles and Interpretation"
) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Cluster | # Apps | Interpretation | Avg Price | Avg Rating | Avg Rating Count | Avg Languages | % Free |
|---|---|---|---|---|---|---|---|
| 1 | 1509 | Free/Low-cost, Niche/Low visibility | $0.80 | 1.47 | 493 | 1.9 | 72% |
| 2 | 3545 | High quality | $1.81 | 4.11 | 8,841 | 1.7 | 50.1% |
| 3 | 1457 | High popularity, International reach, High quality | $1.58 | 4.27 | 24,598 | 12.5 | 59.7% |
for (i in 1:k_optimal) {
profile <- cluster_profiles[cluster_profiles$cluster == i, ]
cat(paste0("**Cluster ", i, " - ", cluster_labels[i], "**\n\n"))
cat(paste0("- **Size**: ", format(profile$n_apps, big.mark = ","), " apps (",
round(profile$n_apps / nrow(appstore) * 100, 1), "%)\n"))
cat(paste0("- **Average Price**: $", round(profile$avg_price, 2),
" (", round(profile$pct_free, 0), "% free)\n"))
cat(paste0("- **Average Rating**: ", round(profile$avg_rating, 2), " stars\n"))
cat(paste0("- **Average Popularity**: ", format(round(profile$avg_rating_count), big.mark = ","),
" ratings\n"))
cat(paste0("- **International Support**: ", round(profile$avg_languages, 1), " languages\n\n"))
}

Cluster 1 - Free/Low-cost, Niche/Low visibility
Cluster 2 - High quality
Cluster 3 - High popularity, International reach, High quality
The PCA plot displays apps in a 2D space based on their characteristics:
Understanding the Axes:
- PC1 (International Reach & Popularity, 27.3% of variance): driven by richness_score, lang.num, international_index, and log_rating_count; apps further right support more languages and have more ratings.
- PC2 (User Engagement & Quality, 14.6% of variance): driven by user_rating_ver, engagement_ratio, and user_rating; apps higher up are better rated and more actively engaged.
Cluster Positions Explained:
| Cluster | Position on Plot | Why? |
|---|---|---|
| Cluster 1 (Green) | Bottom-left quadrant | Low languages (1.9), low ratings (1.47★), low popularity → Low on both PC1 and PC2 |
| Cluster 2 (Orange) | Top-center area | High quality (4.11★) but average international reach → High PC2, medium PC1 |
| Cluster 3 (Blue) | Right side | Many languages (12.5), high popularity (24,598 ratings) → High PC1, varies on PC2 |
Visual Guide:
High Quality (PC2+)
↑
┌─────┴─────┐
│ Cluster 2 │
│ (Orange) │
│ 4.11★ avg │
Low Reach ←─────┼───────────┼─────→ High Reach (PC1+)
(PC1-) │ │ Many Languages
│ Cluster 1 │ High Popularity
│ (Green) │ Cluster 3 (Blue)
│ 1.47★ avg │ 12.5 languages
└─────┬─────┘
↓
Low Quality (PC2-)
fviz_pca_ind(pca_result,
geom.ind = "point",
col.ind = as.factor(appstore$cluster),
palette = "Set2",
addEllipses = TRUE,
ellipse.type = "convex",
legend.title = "Cluster",
title = "PCA - Apps Colored by K-means Clusters (k=3)",
pointsize = 1.5,
alpha.ind = 0.6)

PCA - Apps Colored by K-means Clusters
fviz_pca_biplot(pca_result,
geom.ind = "point",
pointsize = 0.5,
alpha.ind = 0.3,
col.var = "red",
repel = TRUE,
title = "PCA Biplot - Apps and Variables")PCA Biplot - Apps and Variables Combined
pca_coords <- get_pca_ind(pca_result)$coord[, 1:3]
plot_data <- data.frame(
PC1 = pca_coords[, 1],
PC2 = pca_coords[, 2],
PC3 = pca_coords[, 3],
price_tier = appstore$price_tier,
track_name = appstore$track_name
)
plot_data$cluster <- as.factor(appstore$cluster)
plot_ly(plot_data, x = ~PC1, y = ~PC2, z = ~PC3,
color = ~cluster, colors = "Set2",
type = "scatter3d", mode = "markers",
marker = list(size = 3, opacity = 0.6),
text = ~track_name, hoverinfo = "text") %>%
layout(
title = "3D PCA - Colored by K-means Clusters (k=3)",
scene = list(
xaxis = list(title = paste0("PC1 (", round(eigenvalues$variance.percent[1], 1), "%)")),
yaxis = list(title = paste0("PC2 (", round(eigenvalues$variance.percent[2], 1), "%)")),
zaxis = list(title = paste0("PC3 (", round(eigenvalues$variance.percent[3], 1), "%)"))
)
)

Interactive 3D PCA Visualization
t-SNE (t-Distributed Stochastic Neighbor Embedding) is a non-linear technique that excels at preserving local structure and revealing cluster patterns in high-dimensional data.
# Sample data for t-SNE (computational efficiency)
# Use only the feature columns: the cluster label added above must not
# feed into the embeddings or distance computations
feature_cols <- setdiff(names(data_scaled_df), "cluster")
if (nrow(data_scaled_df) > 5000) {
cat("Sampling 5000 apps for t-SNE analysis...\n")
sample_idx <- sample(1:nrow(data_scaled_df), 5000)
} else {
sample_idx <- 1:nrow(data_scaled_df)
}
data_tsne <- data_scaled_df[sample_idx, feature_cols]
appstore_tsne <- appstore[sample_idx, ]
## Sampling 5000 apps for t-SNE analysis...
## t-SNE sample size: 5000 apps
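One practical caveat not handled above: Rtsne() stops with an error when the input contains duplicate rows, which is plausible here since many apps share identical feature values. A minimal optional guard, assuming we keep the first occurrence of each duplicate and keep the app metadata aligned:
# Optional: drop duplicate feature rows before t-SNE (Rtsne checks for them)
dup_rows <- duplicated(data_tsne)
data_tsne <- data_tsne[!dup_rows, ]
appstore_tsne <- appstore_tsne[!dup_rows, ]
# Alternatively, pass check_duplicates = FALSE to Rtsne() to keep all rows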
Perplexity controls the balance between local and global structure. We compare three values:
cat("Running t-SNE with perplexity = 30...\n")
## Running t-SNE with perplexity = 30...
tsne_30 <- Rtsne(data_tsne, dims = 2, perplexity = 30, verbose = FALSE, max_iter = 500)
cat("Running t-SNE with perplexity = 50...\n")
## Running t-SNE with perplexity = 50...
tsne_50 <- Rtsne(data_tsne, dims = 2, perplexity = 50, verbose = FALSE, max_iter = 500)
cat("Running t-SNE with perplexity = 100...\n")
## Running t-SNE with perplexity = 100...
tsne_100 <- Rtsne(data_tsne, dims = 2, perplexity = 100, verbose = FALSE, max_iter = 500)
tsne_30_df <- data.frame(
tSNE1 = tsne_30$Y[, 1], tSNE2 = tsne_30$Y[, 2],
price_tier = appstore_tsne$price_tier
)
tsne_50_df <- data.frame(
tSNE1 = tsne_50$Y[, 1], tSNE2 = tsne_50$Y[, 2],
price_tier = appstore_tsne$price_tier
)
tsne_100_df <- data.frame(
tSNE1 = tsne_100$Y[, 1], tSNE2 = tsne_100$Y[, 2],
price_tier = appstore_tsne$price_tier
)
# Add cluster assignments
tsne_30_df$cluster <- as.factor(appstore_tsne$cluster)
tsne_50_df$cluster <- as.factor(appstore_tsne$cluster)
tsne_100_df$cluster <- as.factor(appstore_tsne$cluster)

p1 <- ggplot(tsne_30_df, aes(x = tSNE1, y = tSNE2, color = price_tier)) +
geom_point(alpha = 0.6, size = 1.5) +
scale_color_brewer(palette = "Set2") +
theme_minimal() +
labs(title = "Perplexity = 30", color = "Price Tier") +
theme(legend.position = "bottom")
p2 <- ggplot(tsne_50_df, aes(x = tSNE1, y = tSNE2, color = price_tier)) +
geom_point(alpha = 0.6, size = 1.5) +
scale_color_brewer(palette = "Set2") +
theme_minimal() +
labs(title = "Perplexity = 50", color = "Price Tier") +
theme(legend.position = "bottom")
p3 <- ggplot(tsne_100_df, aes(x = tSNE1, y = tSNE2, color = price_tier)) +
geom_point(alpha = 0.6, size = 1.5) +
scale_color_brewer(palette = "Set2") +
theme_minimal() +
labs(title = "Perplexity = 100", color = "Price Tier") +
theme(legend.position = "bottom")
grid.arrange(p1, p2, p3, ncol = 3, top = "t-SNE Perplexity Comparison")t-SNE with Different Perplexity Values
ggplot(tsne_50_df, aes(x = tSNE1, y = tSNE2, color = cluster)) +
geom_point(alpha = 0.6, size = 2) +
scale_color_brewer(palette = "Set2") +
theme_minimal() +
labs(
title = "t-SNE Projection Colored by K-means Clusters (Perplexity = 50)",
subtitle = "Non-linear dimension reduction reveals cluster structure",
color = "Cluster"
) +
theme(
legend.position = "right",
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 10)
)

t-SNE Colored by K-means Clusters (Perplexity = 50)
UMAP (Uniform Manifold Approximation and Projection) is a newer technique that often preserves both local and global structure better than t-SNE, while being faster to compute.
# Run UMAP with default parameters, then with custom neighborhood settings
set.seed(123)  # umap initialization is stochastic; fix the seed for reproducibility
cat("Running UMAP with default parameters...\n")
## Running UMAP with default parameters...
umap_default <- umap(data_tsne)
cat("Running UMAP with n_neighbors=15, min_dist=0.1...\n")
## Running UMAP with n_neighbors=15, min_dist=0.1...
umap_custom <- umap(data_tsne, n_neighbors = 15, min_dist = 0.1)
umap_default_df <- data.frame(
UMAP1 = umap_default$layout[, 1],
UMAP2 = umap_default$layout[, 2],
price_tier = appstore_tsne$price_tier
)
umap_custom_df <- data.frame(
UMAP1 = umap_custom$layout[, 1],
UMAP2 = umap_custom$layout[, 2],
price_tier = appstore_tsne$price_tier
)
# Add cluster assignments
umap_default_df$cluster <- as.factor(appstore_tsne$cluster)
umap_custom_df$cluster <- as.factor(appstore_tsne$cluster)

p1 <- ggplot(umap_default_df, aes(x = UMAP1, y = UMAP2, color = price_tier)) +
geom_point(alpha = 0.6, size = 1.5) +
scale_color_brewer(palette = "Set2") +
theme_minimal() +
labs(title = "UMAP (Default Parameters)", color = "Price Tier") +
theme(legend.position = "bottom")
p2 <- ggplot(umap_custom_df, aes(x = UMAP1, y = UMAP2, color = price_tier)) +
geom_point(alpha = 0.6, size = 1.5) +
scale_color_brewer(palette = "Set2") +
theme_minimal() +
labs(title = "UMAP (n_neighbors=15, min_dist=0.1)", color = "Price Tier") +
theme(legend.position = "bottom")
grid.arrange(p1, p2, ncol = 2, top = "UMAP Parameter Comparison")UMAP with Different Parameters
ggplot(umap_custom_df, aes(x = UMAP1, y = UMAP2, color = cluster)) +
geom_point(alpha = 0.6, size = 2) +
scale_color_brewer(palette = "Set2") +
theme_minimal() +
labs(
title = "UMAP Projection Colored by K-means Clusters",
subtitle = "Preserves both local and global structure",
color = "Cluster"
) +
theme(
legend.position = "right",
plot.title = element_text(size = 14, face = "bold"),
plot.subtitle = element_text(size = 10)
)

UMAP Colored by K-means Clusters
# PCA coordinates for sampled data
pca_coords_sample <- get_pca_ind(pca_result)$coord[sample_idx, 1:2]
pca_comparison_df <- data.frame(
PC1 = pca_coords_sample[, 1],
PC2 = pca_coords_sample[, 2],
price_tier = appstore_tsne$price_tier,
cluster = as.factor(appstore_tsne$cluster)
)
p1 <- ggplot(pca_comparison_df, aes(x = PC1, y = PC2, color = cluster)) +
geom_point(alpha = 0.5, size = 1.5) +
scale_color_brewer(palette = "Set2") +
theme_minimal() +
labs(title = "PCA", color = "Cluster") +
theme(legend.position = "bottom")
p2 <- ggplot(tsne_50_df, aes(x = tSNE1, y = tSNE2, color = cluster)) +
geom_point(alpha = 0.5, size = 1.5) +
scale_color_brewer(palette = "Set2") +
theme_minimal() +
labs(title = "t-SNE (perplexity=50)", color = "Cluster") +
theme(legend.position = "bottom")
p3 <- ggplot(umap_custom_df, aes(x = UMAP1, y = UMAP2, color = cluster)) +
geom_point(alpha = 0.5, size = 1.5) +
scale_color_brewer(palette = "Set2") +
theme_minimal() +
labs(title = "UMAP", color = "Cluster") +
theme(legend.position = "bottom")
grid.arrange(p1, p2, p3, ncol = 3, top = "Dimension Reduction Methods Comparison")Dimension Reduction Methods Comparison
The Mantel test measures how well each method preserves the distance structure from the original high-dimensional space.
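Conceptually, the Mantel statistic is just the Pearson correlation between the paired entries of two distance matrices. A minimal self-contained sketch on toy data (not the app dataset):
# Toy data: 10 points in 3-D, with their first two coordinates as a "2-D embedding"
set.seed(1)
toy_high <- matrix(rnorm(30), ncol = 3)
toy_low <- toy_high[, 1:2]
# Mantel r = correlation of the flattened pairwise-distance vectors;
# vegan::mantel() computes this and adds a permutation test for significance
cor(as.vector(dist(toy_high)), as.vector(dist(toy_low)))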
## Computing distance matrices...
# PCA 2D distances
pca_2d <- get_pca_ind(pca_result)$coord[sample_idx, 1:2]
dist_pca <- dist(pca_2d, method = "euclidean")
# t-SNE 2D distances
dist_tsne <- dist(tsne_50$Y, method = "euclidean")
# UMAP 2D distances
dist_umap <- dist(umap_custom$layout, method = "euclidean")
# Original 15D distances
dist_original <- dist(data_scaled_df[sample_idx, feature_cols], method = "euclidean")
cat("Running Mantel tests (999 permutations)...\n")## Running Mantel tests (999 permutations)...
mantel_pca <- mantel(dist_original, dist_pca, method = "pearson", permutations = 999)
mantel_tsne <- mantel(dist_original, dist_tsne, method = "pearson", permutations = 999)
mantel_umap <- mantel(dist_original, dist_umap, method = "pearson", permutations = 999)

# Helper labellers: significance stars from permutation p-values and a
# qualitative quality band from the Mantel correlation
sig_stars <- function(p) ifelse(p < 0.001, "***",
                                ifelse(p < 0.01, "**",
                                       ifelse(p < 0.05, "*", "ns")))
quality_band <- function(r) ifelse(r > 0.7, "Excellent",
                                   ifelse(r > 0.5, "Good",
                                          ifelse(r > 0.3, "Fair", "Poor")))
mantel_results <- data.frame(
  Method = c("PCA", "t-SNE", "UMAP"),
  Mantel_r = round(c(mantel_pca$statistic, mantel_tsne$statistic, mantel_umap$statistic), 4),
  p_value = c(mantel_pca$signif, mantel_tsne$signif, mantel_umap$signif)
)
mantel_results$Significance <- sig_stars(mantel_results$p_value)
mantel_results$Quality <- quality_band(mantel_results$Mantel_r)
mantel_results %>%
kable(caption = "Mantel Test Results - Structure Preservation") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Method | Mantel_r | p_value | Significance | Quality |
|---|---|---|---|---|
| PCA | 0.6876 | 0.001 | ** | Good |
| t-SNE | 0.4250 | 0.001 | ** | Fair |
| UMAP | 0.5282 | 0.001 | ** | Good |
barplot(mantel_results$Mantel_r,
names.arg = mantel_results$Method,
col = c("steelblue", "coral", "seagreen"),
main = "Mantel Test: Structure Preservation Comparison",
ylab = "Mantel r (correlation)",
ylim = c(-0.1, 1),
border = NA)
abline(h = c(0.3, 0.5, 0.7), col = c("orange", "gold", "darkgreen"), lty = 2, lwd = 1.5)
text(1:3, mantel_results$Mantel_r + 0.05,
sprintf("r = %.3f", mantel_results$Mantel_r), cex = 0.9)
legend("topright",
legend = c("Excellent (>0.7)", "Good (>0.5)", "Fair (>0.3)"),
col = c("darkgreen", "gold", "orange"),
lty = 2, lwd = 2, cex = 0.8)

Mantel Test: Structure Preservation Comparison
data.frame(
Method = c("PCA", "t-SNE", "UMAP"),
Type = c("Linear", "Non-linear", "Non-linear"),
Structure_Preserved = c("Global", "Local", "Local + Global"),
Interpretability = c("High (loadings)", "Low", "Low"),
Speed = c("Fast", "Slow", "Moderate"),
Best_For = c("Feature reduction, interpretability", "Cluster visualization", "Large datasets, balanced view")
) %>%
kable(caption = "Method Comparison Summary") %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Method | Type | Structure_Preserved | Interpretability | Speed | Best_For |
|---|---|---|---|---|---|
| PCA | Linear | Global | High (loadings) | Fast | Feature reduction, interpretability |
| t-SNE | Non-linear | Local | Low | Slow | Cluster visualization |
| UMAP | Non-linear | Local + Global | Low | Moderate | Large datasets, balanced view |
For App Developers:
- International reach (language and device support) is the strongest differentiator among apps (PC1, 27.3% of variance); broader localization moves an app toward the high-reach Cluster 3 profile.
- Quality and reach are largely independent axes: a highly rated app (high PC2) is not automatically an international or popular one (high PC1).
For Further Analysis:
- Use the first 7 principal components (80% of variance) as de-correlated inputs for downstream models.
- Prefer the PCA projection when pairwise distances matter (best Mantel r = 0.688); treat the t-SNE and UMAP views as qualitative cluster visualizations.
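As a hypothetical example of such reuse, a follow-up model could read back the exported component scores (written by the export chunk below) as de-correlated input features:
# Hypothetical downstream step: reload the saved PCA scores as model features
pca_features <- read.csv("results/dimension_reduction/pca_scores.csv")
head(pca_features[, 1:3])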
# Create output directory
dir.create("results/dimension_reduction", showWarnings = FALSE, recursive = TRUE)
# Save PCA scores
pca_scores <- as.data.frame(get_pca_ind(pca_result)$coord[, 1:min(10, ncol(get_pca_ind(pca_result)$coord))])
pca_scores$id <- appstore$id
write.csv(pca_scores, "results/dimension_reduction/pca_scores.csv", row.names = FALSE)
# Save PCA loadings
write.csv(loadings_df, "results/dimension_reduction/pca_loadings.csv", row.names = FALSE)
# Save variance explained
variance_explained <- eigenvalues
variance_explained$component <- 1:nrow(variance_explained)
write.csv(variance_explained, "results/dimension_reduction/pca_variance_explained.csv", row.names = FALSE)
# Save t-SNE coordinates
tsne_output <- data.frame(
id = appstore_tsne$id,
tsne1_perp30 = tsne_30$Y[, 1],
tsne2_perp30 = tsne_30$Y[, 2],
tsne1_perp50 = tsne_50$Y[, 1],
tsne2_perp50 = tsne_50$Y[, 2],
tsne1_perp100 = tsne_100$Y[, 1],
tsne2_perp100 = tsne_100$Y[, 2]
)
write.csv(tsne_output, "results/dimension_reduction/tsne_coordinates.csv", row.names = FALSE)
# Save UMAP coordinates
umap_output <- data.frame(
id = appstore_tsne$id,
umap1_default = umap_default$layout[, 1],
umap2_default = umap_default$layout[, 2],
umap1_custom = umap_custom$layout[, 1],
umap2_custom = umap_custom$layout[, 2]
)
write.csv(umap_output, "results/dimension_reduction/umap_coordinates.csv", row.names = FALSE)
# Save Mantel test results
write.csv(mantel_results, "results/dimension_reduction/mantel_test_results.csv", row.names = FALSE)
# Save summary
summary_report <- data.frame(
Metric = c(
"Original Dimensions",
"Optimal PCs (80% variance)",
"Optimal PCs (90% variance)",
"PC1 Variance Explained (%)",
"PC2 Variance Explained (%)",
"Total Variance (PC1+PC2) (%)",
"Dimension Reduction (%)",
"Sample Size for t-SNE/UMAP"
),
Value = c(
ncol(data_scaled_df),
optimal_pcs_80,
optimal_pcs_90,
round(eigenvalues$variance.percent[1], 2),
round(eigenvalues$variance.percent[2], 2),
round(sum(eigenvalues$variance.percent[1:2]), 2),
round((1 - optimal_pcs_80 / ncol(data_scaled_df)) * 100, 1),
nrow(data_tsne)
)
)
write.csv(summary_report, "results/dimension_reduction/summary.csv", row.names = FALSE)
cat("Results saved to results/dimension_reduction/ directory\n")## Results saved to results/dimension_reduction/ directory
## Files: pca_scores.csv, pca_loadings.csv, pca_variance_explained.csv,
## tsne_coordinates.csv, umap_coordinates.csv, mantel_test_results.csv, summary.csv
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.0
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/Warsaw
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] future_1.69.0 solitude_1.1.3 vegan_2.7-2 permute_0.9-8
## [5] kableExtra_1.4.0 knitr_1.51 reshape2_1.4.5 RColorBrewer_1.1-3
## [9] gridExtra_2.3 plotly_4.12.0 corrplot_0.95 tidyr_1.3.2
## [13] dplyr_1.1.4 umap_0.2.10.0 Rtsne_0.17 factoextra_1.0.7
## [17] ggplot2_4.0.1 FactoMineR_2.13
##
## loaded via a namespace (and not attached):
## [1] rlang_1.1.7 magrittr_2.0.4 otel_0.2.0
## [4] compiler_4.5.2 mgcv_1.9-3 png_0.1-8
## [7] systemfonts_1.3.1 vctrs_0.7.1 stringr_1.6.0
## [10] pkgconfig_2.0.3 fastmap_1.2.0 backports_1.5.0
## [13] labeling_0.4.3 rmarkdown_2.30 purrr_1.2.1
## [16] xfun_0.56 cachem_1.1.0 jsonlite_2.0.0
## [19] flashClust_1.01-2 broom_1.0.12 parallel_4.5.2
## [22] cluster_2.1.8.1 R6_2.6.1 bslib_0.10.0
## [25] stringi_1.8.7 ranger_0.18.0 reticulate_1.44.1
## [28] parallelly_1.46.1 car_3.1-3 jquerylib_0.1.4
## [31] estimability_1.5.1 Rcpp_1.1.1 future.apply_1.20.1
## [34] Matrix_1.7-4 splines_4.5.2 igraph_2.2.1
## [37] tidyselect_1.2.1 rstudioapi_0.18.0 abind_1.4-8
## [40] yaml_2.3.12 codetools_0.2-20 listenv_0.10.0
## [43] lattice_0.22-7 tibble_3.3.1 plyr_1.8.9
## [46] withr_3.0.2 S7_0.2.1 askpass_1.2.1
## [49] evaluate_1.0.5 xml2_1.5.2 pillar_1.11.1
## [52] ggpubr_0.6.2 carData_3.0-6 DT_0.34.0
## [55] generics_0.1.4 scales_1.4.0 globals_0.18.0
## [58] leaps_3.2 glue_1.8.0 emmeans_2.0.1
## [61] scatterplot3d_0.3-44 lazyeval_0.2.2 tools_4.5.2
## [64] data.table_1.18.2.1 RSpectra_0.16-2 ggsignif_0.6.4
## [67] mvtnorm_1.3-3 grid_4.5.2 crosstalk_1.2.2
## [70] nlme_3.1-168 Formula_1.2-5 cli_3.6.5
## [73] textshaping_1.0.4 viridisLite_0.4.2 svglite_2.2.2
## [76] gtable_0.3.6 rstatix_0.7.3 sass_0.4.10
## [79] digest_0.6.39 ggrepel_0.9.6 lgr_0.5.2
## [82] htmlwidgets_1.6.4 farver_2.1.2 htmltools_0.5.9
## [85] lifecycle_1.0.5 httr_1.4.7 multcompView_0.1-10
## [88] openssl_2.3.4 MASS_7.3-65
Report Generated: 2026-02-01
Author: Mariyam Babayeva