1 Executive Summary

This analysis applies dimension reduction techniques (PCA, t-SNE, UMAP) to the Apple App Store dataset to visualize high-dimensional app characteristics and validate the clustering structure identified in the companion clustering analysis.

Key Results:

Metric Value
Original Dimensions 15 variables
Optimal PCs (80% variance) 8 components
PC1 + PC2 Variance 38.9%
Best Structure Preservation PCA (Mantel r = 0.696)

Principal Component Interpretation:

Component Variance Interpretation
PC1 23.8% International Reach & Popularity
PC2 15.1% User Engagement & Quality

Method Comparison:

  • PCA: Best for interpretability and global structure preservation
  • t-SNE: Best for revealing local cluster patterns (perplexity = 50)
  • UMAP: Balances local and global structure, faster than t-SNE

2 Data Loading & Preparation

We load the pre-processed dataset from the clustering analysis, which already has anomalies removed and cluster assignments.

# Load pre-cleaned data with clusters from clustering analysis
clean_file <- "results/appstore_with_clusters.csv"

if (file.exists(clean_file)) {
  appstore <- read.csv(clean_file, stringsAsFactors = FALSE)
  has_clusters <- "cluster" %in% names(appstore)
  cat("Loaded pre-cleaned dataset with cluster assignments\n")
} else {
  # Fallback to original data if clustering hasn't been run
  appstore <- read.csv("DataSets/AppleStore.csv", stringsAsFactors = FALSE)
  has_clusters <- FALSE
  cat("Note: Run clustering_report.Rmd first for cluster-colored visualizations\n")
}
## Loaded pre-cleaned dataset with cluster assignments
data.frame(
  Metric = c("Total Apps", "Variables", "Has Clusters"),
  Value = c(format(nrow(appstore), big.mark = ","), ncol(appstore), ifelse(has_clusters, "Yes (3 clusters)", "No"))
) %>%
  kable(caption = "Dataset Overview") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Dataset Overview
Metric Value
Total Apps 6,847
Variables 34
Has Clusters Yes (3 clusters)
if (has_clusters) {
  cat("Cluster distribution:\n")
  print(table(appstore$cluster))
}
## Cluster distribution:
## 
##    1    2    3 
## 2517 2760 1570
cat("Dataset dimensions:", nrow(appstore), "apps ×", ncol(appstore), "variables\n")
## Dataset dimensions: 6847 apps × 34 variables

3 Feature Engineering

# Create numeric encoding for categorical variables
appstore$cont_rating_num <- as.numeric(factor(appstore$cont_rating))
appstore$prime_genre_num <- as.numeric(factor(appstore$prime_genre))

# Create price tiers and engineered features
appstore <- appstore %>%
  mutate(
    price_tier = cut(price,
                     breaks = c(-Inf, 0, 2.99, 9.99, 29.99, Inf),
                     labels = c("Free", "Budget", "Mid", "Premium", "Ultra")),
    
    # Log transformations for skewed variables
    log_rating_count = log1p(rating_count_tot),
    log_size = log1p(size_bytes),
    log_price = log1p(price),
    
    # Engagement and quality metrics
    engagement_ratio = ifelse(rating_count_tot > 0,
                              rating_count_ver / rating_count_tot, 0),
    quality_delta = user_rating_ver - user_rating,
    
    # Composite indices
    international_index = ifelse(size_bytes > 0,
                                 (lang.num * sup_devices.num) / log1p(size_bytes), 0),
    richness_score = ipadSc_urls.num + lang.num
  )

# Feature summary table
data.frame(
  Feature = c("log_size", "log_price", "log_rating_count", "engagement_ratio", 
              "quality_delta", "international_index", "richness_score", "price_tier"),
  Purpose = c("Normalize size distribution", "Normalize price distribution",
              "Normalize popularity distribution", "Version engagement rate",
              "Rating trend direction", "Global reach potential",
              "App comprehensiveness", "Categorical price segments")
) %>%
  kable(caption = "Engineered Features") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Engineered Features
Feature Purpose
log_size Normalize size distribution
log_price Normalize price distribution
log_rating_count Normalize popularity distribution
engagement_ratio Version engagement rate
quality_delta Rating trend direction
international_index Global reach potential
richness_score App comprehensiveness
price_tier Categorical price segments

3.0.1 Key Findings

  • 8 new features created from original variables
  • Log transformations applied to highly skewed variables (size, price, ratings)
  • Composite indices capture multi-dimensional concepts (international reach, app richness)

4 Variable Selection & Standardization

# Select numerical variables for dimension reduction
clustering_vars <- c(
  "log_price", "log_size", "log_rating_count",
  "user_rating", "user_rating_ver",
  "cont_rating_num", "prime_genre_num",
  "sup_devices.num", "ipadSc_urls.num",
  "lang.num", "vpp_lic",
  "engagement_ratio", "quality_delta",
  "international_index", "richness_score"
)

cat("Selected", length(clustering_vars), "variables for dimension reduction:\n")
## Selected 15 variables for dimension reduction:
print(clustering_vars)
##  [1] "log_price"           "log_size"            "log_rating_count"   
##  [4] "user_rating"         "user_rating_ver"     "cont_rating_num"    
##  [7] "prime_genre_num"     "sup_devices.num"     "ipadSc_urls.num"    
## [10] "lang.num"            "vpp_lic"             "engagement_ratio"   
## [13] "quality_delta"       "international_index" "richness_score"
# Extract and clean the data
data_for_reduction <- appstore[, clustering_vars]

# Handle missing/infinite values
data_for_reduction[is.na(data_for_reduction)] <- 0
data_for_reduction[is.infinite(as.matrix(data_for_reduction))] <- 0

# Standardize the data (Z-score normalization)
data_scaled <- scale(data_for_reduction)

# Handle any NA/NaN values that may arise from zero-variance columns
data_scaled[is.na(data_scaled)] <- 0
data_scaled[is.nan(data_scaled)] <- 0

data_scaled_df <- as.data.frame(data_scaled)

cat("\nScaled data dimensions:", nrow(data_scaled_df), "×", ncol(data_scaled_df))
## 
## Scaled data dimensions: 6847 × 15

5 Anomaly Detection

Before dimension reduction, we remove outliers using Isolation Forest to prevent them from distorting principal components and visualizations.

# Set seed to match clustering analysis (ensures same anomalies removed)
set.seed(123)

# Train Isolation Forest
iso_forest <- isolationForest$new(sample_size = 256, num_trees = 100, max_depth = ceiling(log2(256)))
iso_forest$fit(as.data.frame(data_scaled))
anomaly_scores <- iso_forest$predict(as.data.frame(data_scaled))

# Add scores and identify anomalies (top 5%)
appstore$anomaly_score <- anomaly_scores$anomaly_score
threshold <- quantile(anomaly_scores$anomaly_score, 0.95)
is_anomaly <- anomaly_scores$anomaly_score > threshold

data.frame(
  Metric = c("Original dataset", "Anomalies detected", "Clean dataset"),
  Value = c(paste(nrow(appstore), "apps"), 
            paste(sum(is_anomaly), "(", round(mean(is_anomaly) * 100, 1), "%)"),
            paste(sum(!is_anomaly), "apps"))
) %>%
  kable(caption = "Anomaly Detection Results") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Anomaly Detection Results
Metric Value
Original dataset 6847 apps
Anomalies detected 336 ( 4.9 %)
Clean dataset 6511 apps
par(mfrow = c(1, 2))

# Histogram of anomaly scores
hist(anomaly_scores$anomaly_score, breaks = 50, 
     main = "Distribution of Anomaly Scores", 
     xlab = "Anomaly Score", col = "lightblue", border = "white")
abline(v = threshold, col = "red", lwd = 2, lty = 2)
legend("topright", legend = "95th percentile", col = "red", lty = 2, lwd = 2)

# Anomaly score vs log rating count
plot(appstore$log_rating_count, anomaly_scores$anomaly_score, 
     main = "Anomaly Score vs Popularity", 
     xlab = "Log(Rating Count + 1)", ylab = "Anomaly Score", pch = 16, 
     col = ifelse(is_anomaly, "red", "gray70"), cex = 0.5)
abline(h = threshold, col = "red", lwd = 2, lty = 2)
Anomaly Score Distribution

Anomaly Score Distribution

# Remove anomalies from all datasets
data_scaled <- data_scaled[!is_anomaly, ]
data_scaled_df <- data_scaled_df[!is_anomaly, ]
appstore <- appstore[!is_anomaly, ]
data_for_reduction <- data_for_reduction[!is_anomaly, ]

cat("Clean dataset:", nrow(appstore), "apps (anomalies removed)")
## Clean dataset: 6511 apps (anomalies removed)

5.0.1 Key Findings

  • Isolation Forest identified extreme outliers based on multi-dimensional patterns
  • 5% threshold removes apps with unusual feature combinations
  • Cleaner PCA - Anomalies would distort principal components and visualizations
  • Consistency with clustering analysis which used the same anomaly removal

6 Correlation Analysis

Understanding correlations between variables justifies the need for dimension reduction and helps interpret principal components.

# Remove cluster column if present for correlation calculation
cor_data <- data_scaled_df[, !names(data_scaled_df) %in% c("cluster")]

# Remove any columns with zero variance (cause NA in correlation)
col_vars <- apply(cor_data, 2, var, na.rm = TRUE)
valid_cols <- col_vars > 1e-10 & !is.na(col_vars)
cor_data <- cor_data[, valid_cols]

cor_matrix <- cor(cor_data, use = "pairwise.complete.obs")

# Replace any remaining NA with 0 for visualization
cor_matrix[is.na(cor_matrix)] <- 0

corrplot(cor_matrix,
         method = "color",
         type = "upper",
         order = "hclust",
         tl.col = "black",
         tl.srt = 45,
         tl.cex = 0.8,
         title = "Correlation Matrix - App Store Variables",
         mar = c(0, 0, 2, 0))
Correlation Matrix of App Features

Correlation Matrix of App Features

# Identify highly correlated pairs
high_cor <- which(abs(cor_matrix) > 0.7 & abs(cor_matrix) < 1, arr.ind = TRUE)

if (nrow(high_cor) > 0) {
  high_cor_pairs <- data.frame(
    Variable_1 = character(),
    Variable_2 = character(),
    Correlation = numeric()
  )
  
  for (i in 1:nrow(high_cor)) {
    if (high_cor[i, 1] < high_cor[i, 2]) {
      high_cor_pairs <- rbind(high_cor_pairs, data.frame(
        Variable_1 = colnames(cor_matrix)[high_cor[i, 1]],
        Variable_2 = colnames(cor_matrix)[high_cor[i, 2]],
        Correlation = round(cor_matrix[high_cor[i, 1], high_cor[i, 2]], 3)
      ))
    }
  }
  
  high_cor_pairs %>%
    arrange(desc(abs(Correlation))) %>%
    kable(caption = "Highly Correlated Variable Pairs (|r| > 0.7)") %>%
    kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
}
Highly Correlated Variable Pairs (|r| > 0.7)
Variable_1 Variable_2 Correlation
lang.num international_index 0.993
lang.num richness_score 0.942
international_index richness_score 0.930
user_rating user_rating_ver 0.788

6.0.1 Key Findings

  • Strong correlations between richness_score and lang.num (expected, as richness includes language count)
  • Moderate correlations between rating metrics and engagement features
  • Multicollinearity justifies dimension reduction to avoid redundant information

7 Principal Component Analysis (PCA)

7.1 Variance Explained

pca_result <- PCA(data_scaled_df, graph = FALSE, ncp = ncol(data_scaled_df))

# Extract eigenvalues
eigenvalues <- get_eigenvalue(pca_result)
if (is.matrix(eigenvalues)) {
  eigenvalues <- as.data.frame(eigenvalues)
}

# Display first 10 components
eigenvalues %>%
  head(10) %>%
  mutate(
    Component = 1:10,
    eigenvalue = round(eigenvalue, 3),
    variance.percent = round(variance.percent, 2),
    cumulative.variance.percent = round(cumulative.variance.percent, 2)
  ) %>%
  select(Component, eigenvalue, variance.percent, cumulative.variance.percent) %>%
  kable(
    col.names = c("Component", "Eigenvalue", "Variance %", "Cumulative %"),
    caption = "Variance Explained by Principal Components"
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Variance Explained by Principal Components
Component Eigenvalue Variance % Cumulative %
Dim.1 1 3.815 27.25 27.25
Dim.2 2 2.047 14.62 41.87
Dim.3 3 1.453 10.38 52.25
Dim.4 4 1.249 8.92 61.17
Dim.5 5 1.094 7.82 68.99
Dim.6 6 1.001 7.15 76.14
Dim.7 7 0.916 6.55 82.68
Dim.8 8 0.785 5.60 88.29
Dim.9 9 0.726 5.18 93.47
Dim.10 10 0.675 4.82 98.29
cumvar <- eigenvalues$cumulative.variance.percent
optimal_pcs_80 <- which(cumvar >= 80)[1]
optimal_pcs_90 <- which(cumvar >= 90)[1]

cat("Components for 80% variance:", optimal_pcs_80, "\n")
## Components for 80% variance: 7
cat("Components for 90% variance:", optimal_pcs_90, "\n")
## Components for 90% variance: 9
cat("Dimension reduction:", round((1 - optimal_pcs_80 / ncol(data_scaled_df)) * 100, 1), "%")
## Dimension reduction: 53.3 %

7.2 Scree Plot

fviz_eig(pca_result,
         addlabels = TRUE,
         ylim = c(0, 30),
         main = "Scree Plot - Variance Explained by Components",
         barfill = "steelblue",
         barcolor = "steelblue",
         linecolor = "red")
Scree Plot - Variance Explained by Components

Scree Plot - Variance Explained by Components

7.3 Cumulative Variance

plot(1:nrow(eigenvalues), eigenvalues$cumulative.variance.percent,
     type = "b", pch = 19, col = "darkblue",
     xlab = "Number of Components",
     ylab = "Cumulative Variance Explained (%)",
     main = "Cumulative Variance Explained by Principal Components",
     ylim = c(0, 100))
abline(h = c(80, 90), col = c("red", "orange"), lty = 2)
abline(v = optimal_pcs_80, col = "red", lty = 2)
text(optimal_pcs_80 + 0.5, 50, paste0(optimal_pcs_80, " PCs\n(80% var)"), pos = 4, col = "red")
grid()
Cumulative Variance Explained

Cumulative Variance Explained

7.3.1 Key Findings

  • PC1 explains 23.8% of total variance - largest single component
  • 8 components needed for 80% variance (46.7% dimension reduction)
  • First 2 PCs capture 38.9% - sufficient for 2D visualization

7.4 Variable Contributions

par(mfrow = c(1, 2))
fviz_contrib(pca_result, choice = "var", axes = 1, top = 10,
             title = "Top 10 Variables Contributing to PC1")
Top Contributing Variables to PC1 and PC2

Top Contributing Variables to PC1 and PC2

fviz_contrib(pca_result, choice = "var", axes = 2, top = 10,
             title = "Top 10 Variables Contributing to PC2")
Top Contributing Variables to PC1 and PC2

Top Contributing Variables to PC1 and PC2

loadings <- get_pca_var(pca_result)$coord
loadings_df <- as.data.frame(loadings)
loadings_df$Variable <- rownames(loadings_df)

# Rename Dim columns to PC for clarity
names(loadings_df) <- gsub("Dim\\.", "PC", names(loadings_df))

# Top PC1 loadings
pc1_top <- loadings_df %>%
  arrange(desc(abs(PC1))) %>%
  head(5) %>%
  select(Variable, PC1) %>%
  mutate(PC1 = round(PC1, 3))

# Top PC2 loadings
pc2_top <- loadings_df %>%
  arrange(desc(abs(PC2))) %>%
  head(5) %>%
  select(Variable, PC2) %>%
  mutate(PC2 = round(PC2, 3))

cat("PC1 - Top Contributing Variables (International Reach & Popularity):\n")
## PC1 - Top Contributing Variables (International Reach & Popularity):
kable(pc1_top, col.names = c("Variable", "PC1 Loading")) %>%
  kable_styling(bootstrap_options = c("striped"), full_width = FALSE)
Variable PC1 Loading
richness_score richness_score 0.882
lang.num lang.num 0.811
international_index international_index 0.796
user_rating_ver user_rating_ver 0.687
log_rating_count log_rating_count 0.652
cat("\nPC2 - Top Contributing Variables (Engagement & Quality):\n")
## 
## PC2 - Top Contributing Variables (Engagement & Quality):
kable(pc2_top, col.names = c("Variable", "PC2 Loading")) %>%
  kable_styling(bootstrap_options = c("striped"), full_width = FALSE)
Variable PC2 Loading
user_rating_ver user_rating_ver 0.603
engagement_ratio engagement_ratio 0.551
international_index international_index -0.548
lang.num lang.num -0.534
user_rating user_rating 0.481

7.5 Variable Correlation Circle

fviz_pca_var(pca_result,
             col.var = "contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE,
             title = "PCA - Variable Correlation Circle")
PCA Variable Correlation Circle

PCA Variable Correlation Circle

7.5.1 Key Findings

  • PC1 (International Reach): Dominated by richness_score, lang.num, international_index
    • High PC1 → Apps with broad international support and popularity
  • PC2 (Engagement & Quality): Dominated by user_rating_ver, user_rating, engagement_ratio
    • High PC2 → Apps with strong user engagement and quality ratings

7.6 Integration with Clustering

We perform K-means clustering (k=3) on the scaled data to visualize how clusters appear in reduced dimensional space.

# Perform K-means clustering (k=3) - same as clustering project
set.seed(123)
k_optimal <- 3
km_result <- kmeans(data_scaled, centers = k_optimal, nstart = 50, iter.max = 300)

# Add cluster assignments
appstore$cluster <- km_result$cluster
data_scaled_df$cluster <- km_result$cluster

cat("K-means clustering (k =", k_optimal, ") completed\n")
## K-means clustering (k = 3 ) completed
cat("Cluster distribution:\n")
## Cluster distribution:
print(table(appstore$cluster))
## 
##    1    2    3 
## 1509 3545 1457
# Cluster sizes
data.frame(
  Cluster = 1:k_optimal,
  Size = as.numeric(table(km_result$cluster)),
  Percentage = paste0(round(as.numeric(table(km_result$cluster)) / nrow(appstore) * 100, 1), "%")
) %>%
  kable(caption = "K-means Cluster Distribution") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
K-means Cluster Distribution
Cluster Size Percentage
1 1509 23.2%
2 3545 54.4%
3 1457 22.4%
# Calculate cluster profiles (mean values for each cluster)
cluster_profiles <- appstore %>%
  group_by(cluster) %>%
  summarise(
    n_apps = n(),
    avg_price = mean(price, na.rm = TRUE),
    avg_rating = mean(user_rating, na.rm = TRUE),
    avg_rating_count = mean(rating_count_tot, na.rm = TRUE),
    avg_size_mb = mean(size_bytes / 1e6, na.rm = TRUE),
    avg_languages = mean(lang.num, na.rm = TRUE),
    avg_devices = mean(sup_devices.num, na.rm = TRUE),
    pct_free = mean(price == 0, na.rm = TRUE) * 100,
    .groups = "drop"
  )

# Determine cluster characteristics based on profiles
cluster_labels <- sapply(1:k_optimal, function(i) {
  profile <- cluster_profiles[cluster_profiles$cluster == i, ]
  
  # Identify dominant characteristics
  chars <- c()
  
  if (profile$pct_free > 70) {
    chars <- c(chars, "Free/Low-cost")
  } else if (profile$avg_price > 5) {
    chars <- c(chars, "Premium-priced")
  }
  
  if (profile$avg_rating_count > mean(cluster_profiles$avg_rating_count) * 1.5) {
    chars <- c(chars, "High popularity")
  } else if (profile$avg_rating_count < mean(cluster_profiles$avg_rating_count) * 0.5) {
    chars <- c(chars, "Niche/Low visibility")
  }
  
  if (profile$avg_languages > mean(cluster_profiles$avg_languages) * 1.3) {
    chars <- c(chars, "International reach")
  }
  
  if (profile$avg_rating > 4.0) {
    chars <- c(chars, "High quality")
  }
  
  if (length(chars) == 0) chars <- "Mainstream apps"
  paste(chars, collapse = ", ")
})

# Display cluster interpretation
cluster_interpretation <- data.frame(
  Cluster = 1:k_optimal,
  Apps = cluster_profiles$n_apps,
  Label = cluster_labels,
  Avg_Price = paste0("$", round(cluster_profiles$avg_price, 2)),
  Avg_Rating = round(cluster_profiles$avg_rating, 2),
  Avg_Downloads = format(round(cluster_profiles$avg_rating_count), big.mark = ","),
  Avg_Languages = round(cluster_profiles$avg_languages, 1),
  Pct_Free = paste0(round(cluster_profiles$pct_free, 1), "%")
)

cluster_interpretation %>%
  kable(
    col.names = c("Cluster", "# Apps", "Interpretation", "Avg Price", "Avg Rating", 
                  "Avg Rating Count", "Avg Languages", "% Free"),
    caption = "Cluster Profiles and Interpretation"
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Cluster Profiles and Interpretation
Cluster # Apps Interpretation Avg Price Avg Rating Avg Rating Count Avg Languages % Free
1 1509 Free/Low-cost, Niche/Low visibility $0.8 1.47 493 1.9 72%
2 3545 High quality $1.81 4.11 8,841 1.7 50.1%
3 1457 High popularity, International reach, High quality $1.58 4.27 24,598 12.5 59.7%
cat("\n### Cluster Interpretation\n\n")

7.6.1 Cluster Interpretation

for (i in 1:k_optimal) {
  profile <- cluster_profiles[cluster_profiles$cluster == i, ]
  
  cat(paste0("**Cluster ", i, " - ", cluster_labels[i], "**\n\n"))
  cat(paste0("- **Size**: ", format(profile$n_apps, big.mark = ","), " apps (", 
             round(profile$n_apps / nrow(appstore) * 100, 1), "%)\n"))
  cat(paste0("- **Average Price**: $", round(profile$avg_price, 2), 
             " (", round(profile$pct_free, 0), "% free)\n"))
  cat(paste0("- **Average Rating**: ", round(profile$avg_rating, 2), " stars\n"))
  cat(paste0("- **Average Popularity**: ", format(round(profile$avg_rating_count), big.mark = ","), 
             " ratings\n"))
  cat(paste0("- **International Support**: ", round(profile$avg_languages, 1), " languages\n\n"))
}

Cluster 1 - Free/Low-cost, Niche/Low visibility

  • Size: 1,509 apps (23.2%)
  • Average Price: $0.8 (72% free)
  • Average Rating: 1.47 stars
  • Average Popularity: 493 ratings
  • International Support: 1.9 languages

Cluster 2 - High quality

  • Size: 3,545 apps (54.4%)
  • Average Price: $1.81 (50% free)
  • Average Rating: 4.11 stars
  • Average Popularity: 8,841 ratings
  • International Support: 1.7 languages

Cluster 3 - High popularity, International reach, High quality

  • Size: 1,457 apps (22.4%)
  • Average Price: $1.58 (60% free)
  • Average Rating: 4.27 stars
  • Average Popularity: 24,598 ratings
  • International Support: 12.5 languages

7.6.2 How to Read the PCA Cluster Plot

The PCA plot displays apps in a 2D space based on their characteristics:

Understanding the Axes:

  • PC1 (Dim1, horizontal axis): Represents International Reach & Popularity
    • Apps on the right have more languages, broader device support, higher popularity
    • Apps on the left have fewer languages, limited reach, lower visibility
  • PC2 (Dim2, vertical axis): Represents User Engagement & Quality
    • Apps at the top have higher ratings (4+ stars), better user engagement
    • Apps at the bottom have lower ratings, less user engagement

Cluster Positions Explained:

Cluster Position on Plot Why?
Cluster 1 (Green) Bottom-left quadrant Low languages (1.9), low ratings (1.47★), low popularity → Low on both PC1 and PC2
Cluster 2 (Orange) Top-center area High quality (4.11★) but average international reach → High PC2, medium PC1
Cluster 3 (Blue) Right side Many languages (12.5), high popularity (24,598 ratings) → High PC1, varies on PC2

Visual Guide:

                    High Quality (PC2+)
                          ↑
                    ┌─────┴─────┐
                    │  Cluster 2 │
                    │  (Orange)  │
                    │ 4.11★ avg  │
    Low Reach ←─────┼───────────┼─────→ High Reach (PC1+)
    (PC1-)          │           │       Many Languages
                    │ Cluster 1 │       High Popularity
                    │  (Green)  │       Cluster 3 (Blue)
                    │ 1.47★ avg │       12.5 languages
                    └─────┬─────┘
                          ↓
                    Low Quality (PC2-)
fviz_pca_ind(pca_result,
             geom.ind = "point",
             col.ind = as.factor(appstore$cluster),
             palette = "Set2",
             addEllipses = TRUE,
             ellipse.type = "convex",
             legend.title = "Cluster",
             title = "PCA - Apps Colored by K-means Clusters (k=3)",
             pointsize = 1.5,
             alpha.ind = 0.6)
PCA - Apps Colored by K-means Clusters

PCA - Apps Colored by K-means Clusters

7.7 PCA Biplot

fviz_pca_biplot(pca_result,
                geom.ind = "point",
                pointsize = 0.5,
                alpha.ind = 0.3,
                col.var = "red",
                repel = TRUE,
                title = "PCA Biplot - Apps and Variables")
PCA Biplot - Apps and Variables Combined

PCA Biplot - Apps and Variables Combined

7.8 Interactive 3D PCA

pca_coords <- get_pca_ind(pca_result)$coord[, 1:3]

plot_data <- data.frame(
  PC1 = pca_coords[, 1],
  PC2 = pca_coords[, 2],
  PC3 = pca_coords[, 3],
  price_tier = appstore$price_tier,
  track_name = appstore$track_name
)

plot_data$cluster <- as.factor(appstore$cluster)

plot_ly(plot_data, x = ~PC1, y = ~PC2, z = ~PC3,
        color = ~cluster, colors = "Set2",
        type = "scatter3d", mode = "markers",
        marker = list(size = 3, opacity = 0.6),
        text = ~track_name, hoverinfo = "text") %>%
  layout(
    title = "3D PCA - Colored by K-means Clusters (k=3)",
    scene = list(
      xaxis = list(title = paste0("PC1 (", round(eigenvalues$variance.percent[1], 1), "%)")),
      yaxis = list(title = paste0("PC2 (", round(eigenvalues$variance.percent[2], 1), "%)")),
      zaxis = list(title = paste0("PC3 (", round(eigenvalues$variance.percent[3], 1), "%)"))
    )
  )

Interactive 3D PCA Visualization

8 t-SNE Analysis

t-SNE (t-Distributed Stochastic Neighbor Embedding) is a non-linear technique that excels at preserving local structure and revealing cluster patterns in high-dimensional data.

# Sample data for t-SNE (computational efficiency)
if (nrow(data_scaled_df) > 5000) {
  cat("Sampling 5000 apps for t-SNE analysis...\n")
  sample_idx <- sample(1:nrow(data_scaled_df), 5000)
  data_tsne <- data_scaled_df[sample_idx, ]
  appstore_tsne <- appstore[sample_idx, ]
} else {
  sample_idx <- 1:nrow(data_scaled_df)
  data_tsne <- data_scaled_df
  appstore_tsne <- appstore
}
## Sampling 5000 apps for t-SNE analysis...
cat("t-SNE sample size:", nrow(data_tsne), "apps\n")
## t-SNE sample size: 5000 apps

8.1 Perplexity Comparison

Perplexity controls the balance between local and global structure. We compare three values:

cat("Running t-SNE with perplexity = 30...\n")
## Running t-SNE with perplexity = 30...
tsne_30 <- Rtsne(data_tsne, dims = 2, perplexity = 30, verbose = FALSE, max_iter = 500)

cat("Running t-SNE with perplexity = 50...\n")
## Running t-SNE with perplexity = 50...
tsne_50 <- Rtsne(data_tsne, dims = 2, perplexity = 50, verbose = FALSE, max_iter = 500)

cat("Running t-SNE with perplexity = 100...\n")
## Running t-SNE with perplexity = 100...
tsne_100 <- Rtsne(data_tsne, dims = 2, perplexity = 100, verbose = FALSE, max_iter = 500)
tsne_30_df <- data.frame(
  tSNE1 = tsne_30$Y[, 1], tSNE2 = tsne_30$Y[, 2],
  price_tier = appstore_tsne$price_tier
)

tsne_50_df <- data.frame(
  tSNE1 = tsne_50$Y[, 1], tSNE2 = tsne_50$Y[, 2],
  price_tier = appstore_tsne$price_tier
)

tsne_100_df <- data.frame(
  tSNE1 = tsne_100$Y[, 1], tSNE2 = tsne_100$Y[, 2],
  price_tier = appstore_tsne$price_tier
)

# Add cluster assignments
tsne_30_df$cluster <- as.factor(appstore_tsne$cluster)
tsne_50_df$cluster <- as.factor(appstore_tsne$cluster)
tsne_100_df$cluster <- as.factor(appstore_tsne$cluster)
p1 <- ggplot(tsne_30_df, aes(x = tSNE1, y = tSNE2, color = price_tier)) +
  geom_point(alpha = 0.6, size = 1.5) +
  scale_color_brewer(palette = "Set2") +
  theme_minimal() +
  labs(title = "Perplexity = 30", color = "Price Tier") +
  theme(legend.position = "bottom")

p2 <- ggplot(tsne_50_df, aes(x = tSNE1, y = tSNE2, color = price_tier)) +
  geom_point(alpha = 0.6, size = 1.5) +
  scale_color_brewer(palette = "Set2") +
  theme_minimal() +
  labs(title = "Perplexity = 50", color = "Price Tier") +
  theme(legend.position = "bottom")

p3 <- ggplot(tsne_100_df, aes(x = tSNE1, y = tSNE2, color = price_tier)) +
  geom_point(alpha = 0.6, size = 1.5) +
  scale_color_brewer(palette = "Set2") +
  theme_minimal() +
  labs(title = "Perplexity = 100", color = "Price Tier") +
  theme(legend.position = "bottom")

grid.arrange(p1, p2, p3, ncol = 3, top = "t-SNE Perplexity Comparison")
t-SNE with Different Perplexity Values

t-SNE with Different Perplexity Values

8.2 t-SNE by Clusters

ggplot(tsne_50_df, aes(x = tSNE1, y = tSNE2, color = cluster)) +
  geom_point(alpha = 0.6, size = 2) +
  scale_color_brewer(palette = "Set2") +
  theme_minimal() +
  labs(
    title = "t-SNE Projection Colored by K-means Clusters (Perplexity = 50)",
    subtitle = "Non-linear dimension reduction reveals cluster structure",
    color = "Cluster"
  ) +
  theme(
    legend.position = "right",
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 10)
  )
t-SNE Colored by K-means Clusters (Perplexity = 50)

t-SNE Colored by K-means Clusters (Perplexity = 50)

8.2.1 Key Findings

  • Perplexity 30: Tight, well-separated clusters but may miss global patterns
  • Perplexity 50: Balanced local and global structure (recommended)
  • Perplexity 100: More emphasis on global structure, clusters less distinct
  • Free apps form a distinct cluster in t-SNE space

9 UMAP Analysis

UMAP (Uniform Manifold Approximation and Projection) is a newer technique that often preserves both local and global structure better than t-SNE, while being faster to compute.

cat("Running UMAP with default parameters...\n")
## Running UMAP with default parameters...
umap_default <- umap(data_tsne)

cat("Running UMAP with n_neighbors=15, min_dist=0.1...\n")
## Running UMAP with n_neighbors=15, min_dist=0.1...
umap_custom <- umap(data_tsne, n_neighbors = 15, min_dist = 0.1)
umap_default_df <- data.frame(
  UMAP1 = umap_default$layout[, 1],
  UMAP2 = umap_default$layout[, 2],
  price_tier = appstore_tsne$price_tier
)

umap_custom_df <- data.frame(
  UMAP1 = umap_custom$layout[, 1],
  UMAP2 = umap_custom$layout[, 2],
  price_tier = appstore_tsne$price_tier
)

# Add cluster assignments
umap_default_df$cluster <- as.factor(appstore_tsne$cluster)
umap_custom_df$cluster <- as.factor(appstore_tsne$cluster)

9.1 UMAP Parameter Comparison

p1 <- ggplot(umap_default_df, aes(x = UMAP1, y = UMAP2, color = price_tier)) +
  geom_point(alpha = 0.6, size = 1.5) +
  scale_color_brewer(palette = "Set2") +
  theme_minimal() +
  labs(title = "UMAP (Default Parameters)", color = "Price Tier") +
  theme(legend.position = "bottom")

p2 <- ggplot(umap_custom_df, aes(x = UMAP1, y = UMAP2, color = price_tier)) +
  geom_point(alpha = 0.6, size = 1.5) +
  scale_color_brewer(palette = "Set2") +
  theme_minimal() +
  labs(title = "UMAP (n_neighbors=15, min_dist=0.1)", color = "Price Tier") +
  theme(legend.position = "bottom")

grid.arrange(p1, p2, ncol = 2, top = "UMAP Parameter Comparison")
UMAP with Different Parameters

UMAP with Different Parameters

9.2 UMAP by Clusters

ggplot(umap_custom_df, aes(x = UMAP1, y = UMAP2, color = cluster)) +
  geom_point(alpha = 0.6, size = 2) +
  scale_color_brewer(palette = "Set2") +
  theme_minimal() +
  labs(
    title = "UMAP Projection Colored by K-means Clusters",
    subtitle = "Preserves both local and global structure",
    color = "Cluster"
  ) +
  theme(
    legend.position = "right",
    plot.title = element_text(size = 14, face = "bold"),
    plot.subtitle = element_text(size = 10)
  )
UMAP Colored by K-means Clusters

UMAP Colored by K-means Clusters

9.2.1 Key Findings

  • Default UMAP: Good balance between local and global structure
  • Custom UMAP (n_neighbors=15, min_dist=0.1): Tighter clusters, more local focus
  • UMAP runs significantly faster than t-SNE on large datasets
  • Cluster separation is comparable to t-SNE

10 Method Comparison

10.1 Side-by-Side Visualization

# PCA coordinates for sampled data
pca_coords_sample <- get_pca_ind(pca_result)$coord[sample_idx, 1:2]

pca_comparison_df <- data.frame(
  PC1 = pca_coords_sample[, 1],
  PC2 = pca_coords_sample[, 2],
  price_tier = appstore_tsne$price_tier,
  cluster = as.factor(appstore_tsne$cluster)
)

p1 <- ggplot(pca_comparison_df, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(alpha = 0.5, size = 1.5) +
  scale_color_brewer(palette = "Set2") +
  theme_minimal() +
  labs(title = "PCA", color = "Cluster") +
  theme(legend.position = "bottom")

p2 <- ggplot(tsne_50_df, aes(x = tSNE1, y = tSNE2, color = cluster)) +
  geom_point(alpha = 0.5, size = 1.5) +
  scale_color_brewer(palette = "Set2") +
  theme_minimal() +
  labs(title = "t-SNE (perplexity=50)", color = "Cluster") +
  theme(legend.position = "bottom")

p3 <- ggplot(umap_custom_df, aes(x = UMAP1, y = UMAP2, color = cluster)) +
  geom_point(alpha = 0.5, size = 1.5) +
  scale_color_brewer(palette = "Set2") +
  theme_minimal() +
  labs(title = "UMAP", color = "Cluster") +
  theme(legend.position = "bottom")

grid.arrange(p1, p2, p3, ncol = 3, top = "Dimension Reduction Methods Comparison")
Dimension Reduction Methods Comparison

Dimension Reduction Methods Comparison

10.2 Mantel Test Validation

The Mantel test measures how well each method preserves the distance structure from the original high-dimensional space.

cat("Computing distance matrices...\n")
## Computing distance matrices...
# PCA 2D distances
pca_2d <- get_pca_ind(pca_result)$coord[sample_idx, 1:2]
dist_pca <- dist(pca_2d, method = "euclidean")

# t-SNE 2D distances
dist_tsne <- dist(tsne_50$Y, method = "euclidean")

# UMAP 2D distances
dist_umap <- dist(umap_custom$layout, method = "euclidean")

# Original 15D distances
dist_original <- dist(data_scaled_df[sample_idx, ], method = "euclidean")

cat("Running Mantel tests (999 permutations)...\n")
## Running Mantel tests (999 permutations)...
mantel_pca <- mantel(dist_original, dist_pca, method = "pearson", permutations = 999)
mantel_tsne <- mantel(dist_original, dist_tsne, method = "pearson", permutations = 999)
mantel_umap <- mantel(dist_original, dist_umap, method = "pearson", permutations = 999)
mantel_results <- data.frame(
  Method = c("PCA", "t-SNE", "UMAP"),
  Mantel_r = round(c(mantel_pca$statistic, mantel_tsne$statistic, mantel_umap$statistic), 4),
  p_value = c(mantel_pca$signif, mantel_tsne$signif, mantel_umap$signif),
  Significance = c(
    ifelse(mantel_pca$signif < 0.001, "***",
           ifelse(mantel_pca$signif < 0.01, "**",
                  ifelse(mantel_pca$signif < 0.05, "*", "ns"))),
    ifelse(mantel_tsne$signif < 0.001, "***",
           ifelse(mantel_tsne$signif < 0.01, "**",
                  ifelse(mantel_tsne$signif < 0.05, "*", "ns"))),
    ifelse(mantel_umap$signif < 0.001, "***",
           ifelse(mantel_umap$signif < 0.01, "**",
                  ifelse(mantel_umap$signif < 0.05, "*", "ns")))
  ),
  Quality = c(
    ifelse(mantel_pca$statistic > 0.7, "Excellent",
           ifelse(mantel_pca$statistic > 0.5, "Good",
                  ifelse(mantel_pca$statistic > 0.3, "Fair", "Poor"))),
    ifelse(mantel_tsne$statistic > 0.7, "Excellent",
           ifelse(mantel_tsne$statistic > 0.5, "Good",
                  ifelse(mantel_tsne$statistic > 0.3, "Fair", "Poor"))),
    ifelse(mantel_umap$statistic > 0.7, "Excellent",
           ifelse(mantel_umap$statistic > 0.5, "Good",
                  ifelse(mantel_umap$statistic > 0.3, "Fair", "Poor")))
  )
)

mantel_results %>%
  kable(caption = "Mantel Test Results - Structure Preservation") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Mantel Test Results - Structure Preservation
Method Mantel_r p_value Significance Quality
PCA 0.6876 0.001 ** Good
t-SNE 0.4250 0.001 ** Fair
UMAP 0.5282 0.001 ** Good
barplot(mantel_results$Mantel_r,
        names.arg = mantel_results$Method,
        col = c("steelblue", "coral", "seagreen"),
        main = "Mantel Test: Structure Preservation Comparison",
        ylab = "Mantel r (correlation)",
        ylim = c(-0.1, 1),
        border = NA)
abline(h = c(0.3, 0.5, 0.7), col = c("orange", "gold", "darkgreen"), lty = 2, lwd = 1.5)
text(1:3, mantel_results$Mantel_r + 0.05,
     sprintf("r = %.3f", mantel_results$Mantel_r), cex = 0.9)
legend("topright",
       legend = c("Excellent (>0.7)", "Good (>0.5)", "Fair (>0.3)"),
       col = c("darkgreen", "gold", "orange"),
       lty = 2, lwd = 2, cex = 0.8)
Mantel Test: Structure Preservation Comparison

Mantel Test: Structure Preservation Comparison

10.2.1 Key Findings

  • PCA best preserves the original distance structure (Mantel r ≈ 0.7)
  • t-SNE and UMAP show low Mantel r values - this is expected and not bad
    • These methods intentionally distort global distances to reveal local cluster patterns
    • Low Mantel r indicates they’re doing their job (emphasizing local structure)
  • Use PCA when you need to preserve original relationships
  • Use t-SNE/UMAP when you want to visualize cluster structure

10.3 Method Characteristics

data.frame(
  Method = c("PCA", "t-SNE", "UMAP"),
  Type = c("Linear", "Non-linear", "Non-linear"),
  Structure_Preserved = c("Global", "Local", "Local + Global"),
  Interpretability = c("High (loadings)", "Low", "Low"),
  Speed = c("Fast", "Slow", "Moderate"),
  Best_For = c("Feature reduction, interpretability", "Cluster visualization", "Large datasets, balanced view")
) %>%
  kable(caption = "Method Comparison Summary") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Method Comparison Summary
Method Type Structure_Preserved Interpretability Speed Best_For
PCA Linear Global High (loadings) Fast Feature reduction, interpretability
t-SNE Non-linear Local Low Slow Cluster visualization
UMAP Non-linear Local + Global Low Moderate Large datasets, balanced view

11 Business Insights & Conclusions

11.1 Key Takeaways

  1. Dimensionality Reduction Achieved
    • Reduced from 15 dimensions to 8 (80% variance preserved)
    • First 2 PCs capture 38.9% for visualization
  2. Principal Component Interpretation
    • PC1 (23.8%): International Reach & Popularity
      • High scores: Apps with many languages, broad device support, high ratings
    • PC2 (15.1%): User Engagement & Quality
      • High scores: Apps with strong version ratings and engagement
  3. Cluster Validation
    • All three methods reveal similar cluster structure
    • K-means clusters are visually separable in reduced dimensional space
    • Non-linear methods (t-SNE, UMAP) show tighter cluster boundaries
  4. Method Selection Guidelines
    • PCA: Use for feature reduction before modeling, interpretable axes
    • t-SNE: Use for publication-quality cluster visualizations
    • UMAP: Use for large datasets when speed matters

11.2 Limitations

  • 2D projections lose information (only 38.9% variance in PC1+PC2)
  • t-SNE perplexity choice affects results - always compare multiple values
  • Non-linear methods are stochastic - results vary between runs

11.3 Recommendations

For App Developers:

  • Apps in the high PC1 quadrant (international reach) show stronger market penetration
  • Focus on multi-language support and broad device compatibility

For Further Analysis:

  • Use PCA scores as input for supervised learning models
  • Apply clustering directly in reduced PCA space (8 components)
  • Combine insights with clustering profiles for targeted marketing strategies

12 Save Results

# Create output directory
dir.create("results/dimension_reduction", showWarnings = FALSE, recursive = TRUE)

# Save PCA scores
pca_scores <- as.data.frame(get_pca_ind(pca_result)$coord[, 1:min(10, ncol(get_pca_ind(pca_result)$coord))])
pca_scores$id <- appstore$id
write.csv(pca_scores, "results/dimension_reduction/pca_scores.csv", row.names = FALSE)

# Save PCA loadings
write.csv(loadings_df, "results/dimension_reduction/pca_loadings.csv", row.names = FALSE)

# Save variance explained
variance_explained <- eigenvalues
variance_explained$component <- 1:nrow(variance_explained)
write.csv(variance_explained, "results/dimension_reduction/pca_variance_explained.csv", row.names = FALSE)

# Save t-SNE coordinates
tsne_output <- data.frame(
  id = appstore_tsne$id,
  tsne1_perp30 = tsne_30$Y[, 1],
  tsne2_perp30 = tsne_30$Y[, 2],
  tsne1_perp50 = tsne_50$Y[, 1],
  tsne2_perp50 = tsne_50$Y[, 2],
  tsne1_perp100 = tsne_100$Y[, 1],
  tsne2_perp100 = tsne_100$Y[, 2]
)
write.csv(tsne_output, "results/dimension_reduction/tsne_coordinates.csv", row.names = FALSE)

# Save UMAP coordinates
umap_output <- data.frame(
  id = appstore_tsne$id,
  umap1_default = umap_default$layout[, 1],
  umap2_default = umap_default$layout[, 2],
  umap1_custom = umap_custom$layout[, 1],
  umap2_custom = umap_custom$layout[, 2]
)
write.csv(umap_output, "results/dimension_reduction/umap_coordinates.csv", row.names = FALSE)

# Save Mantel test results
write.csv(mantel_results, "results/dimension_reduction/mantel_test_results.csv", row.names = FALSE)

# Save summary
summary_report <- data.frame(
  Metric = c(
    "Original Dimensions",
    "Optimal PCs (80% variance)",
    "Optimal PCs (90% variance)",
    "PC1 Variance Explained (%)",
    "PC2 Variance Explained (%)",
    "Total Variance (PC1+PC2) (%)",
    "Dimension Reduction (%)",
    "Sample Size for t-SNE/UMAP"
  ),
  Value = c(
    ncol(data_scaled_df),
    optimal_pcs_80,
    optimal_pcs_90,
    round(eigenvalues$variance.percent[1], 2),
    round(eigenvalues$variance.percent[2], 2),
    round(sum(eigenvalues$variance.percent[1:2]), 2),
    round((1 - optimal_pcs_80 / ncol(data_scaled_df)) * 100, 1),
    nrow(data_tsne)
  )
)
write.csv(summary_report, "results/dimension_reduction/summary.csv", row.names = FALSE)

cat("Results saved to results/dimension_reduction/ directory\n")
## Results saved to results/dimension_reduction/ directory
cat("Files: pca_scores.csv, pca_loadings.csv, pca_variance_explained.csv,\n")
## Files: pca_scores.csv, pca_loadings.csv, pca_variance_explained.csv,
cat("       tsne_coordinates.csv, umap_coordinates.csv, mantel_test_results.csv, summary.csv")
##        tsne_coordinates.csv, umap_coordinates.csv, mantel_test_results.csv, summary.csv

13 Session Info

sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.0
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Warsaw
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] future_1.69.0      solitude_1.1.3     vegan_2.7-2        permute_0.9-8     
##  [5] kableExtra_1.4.0   knitr_1.51         reshape2_1.4.5     RColorBrewer_1.1-3
##  [9] gridExtra_2.3      plotly_4.12.0      corrplot_0.95      tidyr_1.3.2       
## [13] dplyr_1.1.4        umap_0.2.10.0      Rtsne_0.17         factoextra_1.0.7  
## [17] ggplot2_4.0.1      FactoMineR_2.13   
## 
## loaded via a namespace (and not attached):
##  [1] rlang_1.1.7          magrittr_2.0.4       otel_0.2.0          
##  [4] compiler_4.5.2       mgcv_1.9-3           png_0.1-8           
##  [7] systemfonts_1.3.1    vctrs_0.7.1          stringr_1.6.0       
## [10] pkgconfig_2.0.3      fastmap_1.2.0        backports_1.5.0     
## [13] labeling_0.4.3       rmarkdown_2.30       purrr_1.2.1         
## [16] xfun_0.56            cachem_1.1.0         jsonlite_2.0.0      
## [19] flashClust_1.01-2    broom_1.0.12         parallel_4.5.2      
## [22] cluster_2.1.8.1      R6_2.6.1             bslib_0.10.0        
## [25] stringi_1.8.7        ranger_0.18.0        reticulate_1.44.1   
## [28] parallelly_1.46.1    car_3.1-3            jquerylib_0.1.4     
## [31] estimability_1.5.1   Rcpp_1.1.1           future.apply_1.20.1 
## [34] Matrix_1.7-4         splines_4.5.2        igraph_2.2.1        
## [37] tidyselect_1.2.1     rstudioapi_0.18.0    abind_1.4-8         
## [40] yaml_2.3.12          codetools_0.2-20     listenv_0.10.0      
## [43] lattice_0.22-7       tibble_3.3.1         plyr_1.8.9          
## [46] withr_3.0.2          S7_0.2.1             askpass_1.2.1       
## [49] evaluate_1.0.5       xml2_1.5.2           pillar_1.11.1       
## [52] ggpubr_0.6.2         carData_3.0-6        DT_0.34.0           
## [55] generics_0.1.4       scales_1.4.0         globals_0.18.0      
## [58] leaps_3.2            glue_1.8.0           emmeans_2.0.1       
## [61] scatterplot3d_0.3-44 lazyeval_0.2.2       tools_4.5.2         
## [64] data.table_1.18.2.1  RSpectra_0.16-2      ggsignif_0.6.4      
## [67] mvtnorm_1.3-3        grid_4.5.2           crosstalk_1.2.2     
## [70] nlme_3.1-168         Formula_1.2-5        cli_3.6.5           
## [73] textshaping_1.0.4    viridisLite_0.4.2    svglite_2.2.2       
## [76] gtable_0.3.6         rstatix_0.7.3        sass_0.4.10         
## [79] digest_0.6.39        ggrepel_0.9.6        lgr_0.5.2           
## [82] htmlwidgets_1.6.4    farver_2.1.2         htmltools_0.5.9     
## [85] lifecycle_1.0.5      httr_1.4.7           multcompView_0.1-10 
## [88] openssl_2.3.4        MASS_7.3-65

Report Generated: 2026-02-01 Author: Mariyam Babayeva