1. Executive Summary

This analysis segments 7,197 Apple App Store applications into distinct market categories. After removing 350 anomalous apps (5%) via Isolation Forest, four clustering methods were compared.

Key Results:

  • 3 distinct segments identified with strong clustering tendency (Hopkins H ≈ 1.00)
  • K-means selected as primary method (Calinski-Harabasz = 1,776.86)
  • Cluster 2 (Free apps) achieves 29% success rate vs 14% for paid apps
Segment Size Profile
Premium Paid 36.8% $3.83 avg, 260MB, established apps
Successful Free 40.3% $0, high engagement (1,527 median ratings)
Low-Traction 22.9% $0.80, 0 median ratings, early-stage apps

2. Data Loading & Exploration

# Load the App Store catalogue.
# The CSV location can be overridden through the APPSTORE_CSV environment
# variable so the report runs on other machines; when the variable is unset
# (or empty) the original absolute path is used.
default_csv_path <- "/Users/mariyam.babayeva/Desktop/2025/AppleStore.csv"
csv_path <- Sys.getenv("APPSTORE_CSV", unset = default_csv_path)
if (!nzchar(csv_path)) {
  csv_path <- default_csv_path
}
if (!file.exists(csv_path)) {
  stop("App Store CSV not found at: ", csv_path, call. = FALSE)
}
appstore <- read.csv(csv_path, stringsAsFactors = FALSE)

# Overview table: number of rows, number of columns, and total NA cells.
# c() coerces all three values to character, so the row count is
# pre-formatted with a thousands separator.
data.frame(
  Metric = c("Total Apps", "Variables", "Missing Values"),
  Value = c(format(nrow(appstore), big.mark = ","), ncol(appstore), sum(is.na(appstore)))
) %>% kable() %>% kable_styling(bootstrap_options = "striped", full_width = FALSE)
Metric Value
Total Apps 7,197
Variables 17
Missing Values 0
# Rank genres by number of apps (largest first), then tabulate the first ten
# together with their share of all loaded apps.
genre_counts <- sort(table(appstore$prime_genre), decreasing = TRUE)
top_idx <- 1:10
top_counts <- as.numeric(genre_counts)[top_idx]
top_share <- round(top_counts / nrow(appstore) * 100, 1)
genre_df <- data.frame(
  Genre = names(genre_counts)[top_idx],
  Count = top_counts,
  Percentage = paste0(top_share, "%")
)
kable(genre_df, caption = "Top 10 Genres") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Top 10 Genres
Genre Count Percentage
Games 3862 53.7%
Entertainment 535 7.4%
Education 453 6.3%
Photo & Video 349 4.8%
Utilities 248 3.4%
Health & Fitness 180 2.5%
Productivity 178 2.5%
Social Networking 167 2.3%
Lifestyle 144 2%
Music 138 1.9%

3. Feature Engineering

# --- Derived features -------------------------------------------------------

# Size in MiB (bytes / 1024^2), used for reporting and plots.
appstore$size_mb <- appstore$size_bytes / (1024^2)

# Free/paid indicator (1 = price is exactly 0).
appstore$is_free <- as.numeric(appstore$price == 0)

# log1p transforms of size, price and total rating count.
appstore$log_size <- log1p(appstore$size_bytes)
appstore$log_price <- log1p(appstore$price)
appstore$log_rating_count <- log1p(appstore$rating_count_tot)

# Share of all ratings that belong to the current version; 0 when the app has
# no ratings at all, and capped at 1.
version_share <- ifelse(
  appstore$rating_count_tot > 0,
  appstore$rating_count_ver / appstore$rating_count_tot,
  0
)
appstore$engagement_ratio <- pmin(version_share, 1)

# Current-version rating minus overall rating.
appstore$quality_delta <- appstore$user_rating_ver - appstore$user_rating

# (languages x supported devices) per 1e6 bytes of app size; infinite values
# (division by a zero size) are reset to 0.
size_per_million_bytes <- appstore$size_bytes / 1e6
reach <- (appstore$lang.num * appstore$sup_devices.num) / size_per_million_bytes
reach[is.infinite(reach)] <- 0
appstore$international_index <- reach

# Flag for apps with at least five languages, and a simple additive score of
# iPad screenshot count plus language count.
appstore$is_multilingual <- as.numeric(appstore$lang.num >= 5)
appstore$richness_score <- appstore$ipadSc_urls.num + appstore$lang.num

# Glossary of the engineered features, rendered as a two-column table.
feature_purposes <- c(
  log_size = "Normalize size distribution",
  log_price = "Normalize price distribution",
  engagement_ratio = "Version engagement rate",
  quality_delta = "Rating trend direction",
  international_index = "Global reach potential",
  richness_score = "App comprehensiveness"
)
# unname() keeps the feature names out of the data frame's row names.
data.frame(
  Feature = names(feature_purposes),
  Purpose = unname(feature_purposes)
) %>%
  kable() %>%
  kable_styling(bootstrap_options = "striped")
Feature Purpose
log_size Normalize size distribution
log_price Normalize price distribution
engagement_ratio Version engagement rate
quality_delta Rating trend direction
international_index Global reach potential
richness_score App comprehensiveness

4. Variable Selection & Standardization

# Columns fed to every clustering method below.
cluster_vars <- c(
  "log_size", "log_price", "log_rating_count", "rating_count_ver",
  "user_rating", "user_rating_ver", "sup_devices.num", "ipadSc_urls.num",
  "lang.num", "vpp_lic", "engagement_ratio", "quality_delta",
  "international_index", "richness_score", "is_free"
)

data_for_clustering <- appstore[, cluster_vars]

# Replace missing cells first, then infinite cells, with 0 so that scale()
# and the distance computations receive finite values only.
missing_cells <- is.na(data_for_clustering)
data_for_clustering[missing_cells] <- 0
infinite_cells <- sapply(data_for_clustering, is.infinite)
data_for_clustering[infinite_cells] <- 0

# Column-wise centering and scaling (scale() defaults).
data_scaled <- scale(data_for_clustering)
cat("Variables:", length(cluster_vars), "| Observations:", nrow(data_scaled))
## Variables: 15 | Observations: 7197
# Pairwise correlations of the (unscaled) clustering features, drawn as a
# colour matrix with variables ordered by hierarchical clustering.
cor_matrix <- cor(data_for_clustering)
corrplot(
  cor_matrix,
  method = "color",
  order = "hclust",
  tl.cex = 0.7,
  tl.col = "black",
  title = "Correlation Matrix of App Features",
  mar = c(0, 0, 2, 0)
)
Feature Correlation Matrix

Feature Correlation Matrix

5. Anomaly Detection

# Fit an isolation forest on the scaled features and score every app.
forest_input <- as.data.frame(data_scaled)
iso_forest <- isolationForest$new(
  sample_size = 256,
  num_trees = 100,
  max_depth = ceiling(log2(256))
)
iso_forest$fit(forest_input)
anomaly_scores <- iso_forest$predict(forest_input)

# Keep the score on the app table; apps scoring strictly above the 95th
# percentile are flagged as anomalies.
appstore$anomaly_score <- anomaly_scores$anomaly_score
threshold <- quantile(anomaly_scores$anomaly_score, 0.95)
is_anomaly <- anomaly_scores$anomaly_score > threshold

# Before/after counts for the report.
n_total <- nrow(appstore)
n_flagged <- sum(is_anomaly)
n_kept <- sum(!is_anomaly)
data.frame(
  Metric = c("Original dataset", "Anomalies detected", "Clean dataset"),
  Value = c(paste(n_total, "apps"), paste(n_flagged, "(5%)"), paste(n_kept, "apps"))
) %>%
  kable() %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Metric Value
Original dataset 7197 apps
Anomalies detected 350 (5%)
Clean dataset 6847 apps
# 2 x 2 diagnostic panel: score histogram plus score against price,
# popularity and size. The dashed red line marks the anomaly threshold.
par(mfrow = c(2, 2))

hist(anomaly_scores$anomaly_score, breaks = 50,
     main = "Distribution of Anomaly Scores", xlab = "Anomaly Score",
     col = "lightblue", border = "white")
abline(v = threshold, col = "red", lwd = 2, lty = 2)

# Scatter of anomaly score against one app attribute; flagged apps are red.
# Extra arguments (e.g. xlim) are forwarded to plot().
plot_score_against <- function(x, title, x_label, ...) {
  plot(x, anomaly_scores$anomaly_score, main = title,
       xlab = x_label, ylab = "Anomaly Score", pch = 16,
       col = ifelse(is_anomaly, "red", "gray70"), cex = 0.5, ...)
  abline(h = threshold, col = "red", lwd = 2, lty = 2)
}

plot_score_against(appstore$price, "Anomaly Score vs Price", "Price ($)")
plot_score_against(appstore$log_rating_count, "Anomaly Score vs Popularity",
                   "Log(Rating Count + 1)")
# The x axis is clipped at the 95th percentile of app size.
plot_score_against(appstore$size_mb, "Anomaly Score vs App Size", "Size (MB)",
                   xlim = c(0, quantile(appstore$size_mb, 0.95)))
Anomaly Score Distribution

Anomaly Score Distribution

# Drop the flagged apps from all three aligned objects so their rows keep
# matching one-to-one.
keep_rows <- !is_anomaly
appstore <- appstore[keep_rows, ]
data_for_clustering <- data_for_clustering[keep_rows, ]
data_scaled <- data_scaled[keep_rows, ]
cat("Clean dataset:", nrow(appstore), "apps")
## Clean dataset: 6847 apps

6. Clustering Tendency

6.1 Hopkins Statistic

# Repeat the Hopkins statistic on random subsamples (at most 500 points each)
# and summarise the replicates by mean, SD and the 2.5%-97.5% quantile range.
set.seed(42)
n_iterations <- 100
hopkins_m <- min(nrow(data_scaled) - 1, 500)
hopkins_values <- vapply(
  seq_len(n_iterations),
  function(iter) hopkins(data_scaled, m = hopkins_m),
  numeric(1)
)

hopkins_ci <- round(quantile(hopkins_values, c(0.025, 0.975)), 4)
data.frame(
  Metric = c("Mean H", "SD", "95% CI"),
  Value = c(
    round(mean(hopkins_values), 4),
    round(sd(hopkins_values), 4),
    paste(hopkins_ci[1], "-", hopkins_ci[2])
  )
) %>%
  kable() %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Metric Value
Mean H 1
SD 0
95% CI 1 - 1

Interpretation: H = 1 indicates strong clustering tendency.

6.2 VAT Plot

# Draw at most 500 rows at random and run the clustering-tendency check on
# that subsample, including the ordered dissimilarity plot (graph = TRUE).
vat_sample_size <- min(500, nrow(data_scaled))
sample_idx <- sample(seq_len(nrow(data_scaled)), vat_sample_size)
get_clust_tendency(data_scaled[sample_idx, ], n = 50, graph = TRUE)
## $hopkins_stat
## [1] 0.8009825
## 
## $plot
VAT Plot

VAT Plot

7. Optimal Number of Clusters

7.1 Elbow Method

# Total within-cluster sum of squares for k-means, evaluated up to k = 15.
elbow_plot <- fviz_nbclust(data_scaled, kmeans, method = "wss", k.max = 15)
elbow_plot + labs(title = "Elbow Method for Optimal k")
Elbow Method

Elbow Method

7.2 Silhouette Method

# Average silhouette width for k-means, evaluated up to k = 15.
silhouette_plot <- fviz_nbclust(data_scaled, kmeans, method = "silhouette", k.max = 15)
silhouette_plot + labs(title = "Silhouette Method for Optimal k")
Silhouette Method

Silhouette Method

7.3 Gap Statistic

# Gap statistic for k-means (25 random starts per k, k up to 10, 50 bootstrap
# reference sets). The seed is fixed immediately before the call.
set.seed(123)
gap_stat <- clusGap(
  data_scaled,
  FUN = kmeans,
  nstart = 25,
  K.max = 10,
  B = 50
)
gap_plot <- fviz_gap_stat(gap_stat)
gap_plot + labs(title = "Gap Statistic Method")
Gap Statistic

Gap Statistic

7.4 Selection Rationale

Selected: k = 3 based on elbow inflection and business interpretability.

# Number of clusters used by every method below (k-means, PAM, CLARA, Ward).
k_optimal <- 3

8. Clustering Methods

8.1 K-means

# Final k-means fit: 50 random starts, up to 300 iterations, fixed seed.
set.seed(123)
km_final <- kmeans(data_scaled, centers = k_optimal, nstart = 50, iter.max = 300)

# Cluster sizes and their share of the cleaned dataset.
cluster_sizes <- as.numeric(table(km_final$cluster))
cluster_share <- round(cluster_sizes / nrow(data_scaled) * 100, 1)
data.frame(
  Cluster = seq_len(k_optimal),
  Size = cluster_sizes,
  Pct = paste0(cluster_share, "%")
) %>%
  kable() %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Cluster Size Pct
1 2517 36.8%
2 2760 40.3%
3 1570 22.9%
# Total within-cluster sum of squares of the k-means solution.
km_total_wss <- round(km_final$tot.withinss, 2)
cat("Within-cluster SS:", km_total_wss, "\n")
## Within-cluster SS: 50017.22
# Share of total variance explained by the partition (between SS / total SS).
km_between_pct <- round(km_final$betweenss / km_final$totss * 100, 2)
cat("Between/Total SS ratio:", km_between_pct, "%")
## Between/Total SS ratio: 34.18 %
# Scatter plot of the k-means partition with convex hulls per cluster.
km_plot_title <- paste("K-means (k =", k_optimal, ")")
fviz_cluster(
  km_final,
  data = data_scaled,
  geom = "point",
  ellipse.type = "convex",
  palette = "jco",
  ggtheme = theme_minimal(),
  main = km_plot_title
)
K-means Clusters

K-means Clusters

8.2 PAM (Partitioning Around Medoids)

# PAM with Euclidean distance; each cluster is represented by a real app
# (its medoid), looked up in the app table by row index.
pam_final <- pam(data_scaled, k = k_optimal, metric = "euclidean")
medoid_apps <- appstore[pam_final$id.med, ]

medoid_summary <- data.frame(
  Cluster = seq_len(k_optimal),
  Medoid_App = medoid_apps$track_name,
  Price = paste0("$", medoid_apps$price),
  Rating = medoid_apps$user_rating
)
medoid_summary %>%
  kable() %>%
  kable_styling(bootstrap_options = "striped")
Cluster Medoid_App Price Rating
1 Tsuro - The Game of the Path $2.99 4.5
2 Battleborn® Tap $0 4.5
3 Reversi REAL - Multiplayer Board game $0 0.0
# Scatter plot of the PAM partition with convex hulls per cluster.
pam_plot_title <- paste("PAM (k =", k_optimal, ")")
fviz_cluster(
  pam_final,
  data = data_scaled,
  geom = "point",
  ellipse.type = "convex",
  palette = "jco",
  ggtheme = theme_minimal(),
  main = pam_plot_title
)
PAM Clusters

PAM Clusters

8.3 CLARA

# CLARA: 100 subsamples of 1,500 apps each; report the average silhouette
# width stored on the fitted object.
clara_final <- clara(
  data_scaled,
  k = k_optimal,
  samples = 100,
  sampsize = 1500
)
clara_avg_width <- round(clara_final$silinfo$avg.width, 4)
cat("CLARA Silhouette:", clara_avg_width)
## CLARA Silhouette: 0.2321

8.4 Hierarchical Clustering

# Ward (ward.D2) agglomerative clustering on Euclidean distances, cut into
# k_optimal groups. `d` is reused by the validation section.
d <- dist(data_scaled, method = "euclidean")
hc_ward <- hclust(d, method = "ward.D2")
hc_clusters <- cutree(hc_ward, k = k_optimal)

hc_sizes <- table(hc_clusters)
cat("Hierarchical cluster sizes:", hc_sizes)
## Hierarchical cluster sizes: 2769 2700 1378
# Dendrogram of the Ward tree with rectangles around the k_optimal clusters.
fviz_dend(
  hc_ward,
  k = k_optimal,
  rect = TRUE,
  cex = 0.4,
  main = "Hierarchical Clustering (Ward's Method)",
  palette = "jco"
)
Ward's Dendrogram

Ward’s Dendrogram

9. Validation & Comparison

9.1 Internal Validation Metrics

# Cluster labels produced by each method, keyed by method name.
methods <- list(kmeans = km_final$cluster, pam = pam_final$clustering,
                clara = clara_final$clustering, hierarchical = hc_clusters)

# cluster.stats() on the full distance matrix is expensive, so compute it once
# per method and read both the CH index and the within-cluster SS from the
# same result.
method_stats <- lapply(methods, function(labels) cluster.stats(d, labels))

# row.names = NULL stops the method names carried by the vapply() results from
# becoming row names, which would print the method label twice in the table.
validation_table <- data.frame(
  Method = names(methods),
  Calinski_Harabasz = round(vapply(method_stats, function(s) s$ch, numeric(1)), 2),
  Avg_Silhouette = round(
    vapply(methods, function(labels) mean(silhouette(labels, d)[, 3]), numeric(1)),
    4
  ),
  Within_SS = round(vapply(method_stats, function(s) sum(s$within.cluster.ss), numeric(1)), 2),
  row.names = NULL
)
kable(validation_table, caption = "Validation Metrics") %>% kable_styling(bootstrap_options = "striped")
Validation Metrics
Method Calinski_Harabasz Avg_Silhouette Within_SS
kmeans 1776.86 0.2309 50017.22
pam 1677.89 0.2319 50987.89
clara 1677.89 0.2319 50987.89
hierarchical 1363.27 0.1793 54340.16

9.2 Cross-Method Agreement (ARI)

# Pairwise Adjusted Rand Index between the label vectors in `methods`.
# The matrix size follows length(methods) instead of a hard-coded 4, so adding
# or removing a method does not require touching this block.
n_methods <- length(methods)
ari_matrix <- matrix(NA_real_, n_methods, n_methods,
                     dimnames = list(names(methods), names(methods)))
for (i in seq_len(n_methods)) {
  for (j in seq_len(n_methods)) {
    ari_matrix[i, j] <- adj.rand.index(methods[[i]], methods[[j]])
  }
}

# NOTE(review): recent corrplot releases appear to have renamed `cl.lim` to
# `col.lim` -- confirm against the installed version before changing it.
corrplot(ari_matrix, method = "color", is.corr = FALSE, addCoef.col = "black", 
         tl.col = "black", title = "Adjusted Rand Index", mar = c(0,0,2,0), cl.lim = c(0, 1))
Cross-Method Agreement (ARI)

Cross-Method Agreement (ARI)

9.3 Bootstrap Stability

# Bootstrap stability of the k-means solution: 100 resamples, mean Jaccard
# similarity per cluster, labelled with the report's 0.75 / 0.85 cut-offs.
km_boot <- clusterboot(
  data_scaled,
  B = 100,
  bootmethod = "boot",
  clustermethod = kmeansCBI,
  k = k_optimal,
  seed = 123
)

mean_jaccard <- km_boot$bootmean
# Right-closed bins: <= 0.75 Moderate, (0.75, 0.85] Stable, > 0.85 Highly stable.
stability_label <- as.character(cut(
  mean_jaccard,
  breaks = c(-Inf, 0.75, 0.85, Inf),
  labels = c("Moderate", "Stable", "Highly stable")
))

data.frame(
  Cluster = seq_len(k_optimal),
  Jaccard = round(mean_jaccard, 4),
  Stability = stability_label
) %>%
  kable(caption = "Bootstrap Stability") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Bootstrap Stability
Cluster Jaccard Stability
1 0.8348 Stable
2 0.8170 Stable
3 0.8730 Highly stable

9.4 Statistical Significance

# Attach the k-means labels to the app table (rows are aligned with
# data_scaled after anomaly removal).
appstore$cluster <- km_final$cluster

# Kruskal-Wallis test of each attribute across clusters.
kw_p_values <- c(
  Price = kruskal.test(price ~ cluster, data = appstore)$p.value,
  Rating = kruskal.test(user_rating ~ cluster, data = appstore)$p.value,
  Size = kruskal.test(size_mb ~ cluster, data = appstore)$p.value,
  Languages = kruskal.test(lang.num ~ cluster, data = appstore)$p.value
)

# Significance is derived from the p-values rather than hard-coded, so the
# table cannot claim "Yes" for a test that did not reach the threshold.
kw_alpha <- 0.05
kw_results <- data.frame(
  Variable = names(kw_p_values),
  # Format each p-value on its own, as before, so the display of one value
  # does not depend on the others.
  p_value = unname(vapply(kw_p_values, format.pval, character(1))),
  Significant = unname(ifelse(kw_p_values < kw_alpha, "Yes", "No")),
  row.names = NULL
)
kable(kw_results, caption = "Kruskal-Wallis Tests") %>% kable_styling(bootstrap_options = "striped", full_width = FALSE)
Kruskal-Wallis Tests
Variable p_value Significant
Price < 2.22e-16 Yes
Rating < 2.22e-16 Yes
Size < 2.22e-16 Yes
Languages < 2.22e-16 Yes

10. Cluster Profiles

10.1 Summary Statistics

# One row per cluster: size, pricing, rating, popularity, app size and
# language support.
cluster_profiles <- appstore %>%
  group_by(cluster) %>%
  summarise(
    n_apps = n(),
    avg_price = round(mean(price), 2),
    pct_free = round(mean(price == 0) * 100, 1),
    avg_rating = round(mean(user_rating, na.rm = TRUE), 2),
    median_ratings = median(rating_count_tot),
    avg_size_MB = round(mean(size_mb), 1),
    avg_languages = round(mean(lang.num), 2)
  )

profile_headers <- c(
  "Cluster", "Apps", "Avg Price", "% Free", "Avg Rating",
  "Median Ratings", "Size (MB)", "Languages"
)
kable(cluster_profiles, col.names = profile_headers, caption = "Cluster Profiles") %>%
  kable_styling(bootstrap_options = "striped")
Cluster Profiles
Cluster Apps Avg Price % Free Avg Rating Median Ratings Size (MB) Languages
1 2517 3.83 0 4.13 313.0 259.9 4.95
2 2760 0.00 100 4.16 1526.5 160.8 6.16
3 1570 0.80 72 1.53 0.0 102.2 2.20

10.2 Segment Interpretation

# Emit a markdown profile for every cluster: size, pricing, engagement and the
# most common genres.
for (i in seq_len(k_optimal)) {
  cluster_data <- subset(appstore, cluster == i)
  
  cat("\n### Cluster", i, "\n\n")
  cat("**Size:**", nrow(cluster_data), "apps (", round(nrow(cluster_data)/nrow(appstore)*100, 1), "%)\n\n")
  cat("**Pricing:** $", round(mean(cluster_data$price), 2), " average, ",
      round(mean(cluster_data$price == 0) * 100, 1), "% free\n\n", sep="")
  cat("**Engagement:**", round(mean(cluster_data$user_rating, na.rm = TRUE), 2), 
      " rating,", median(cluster_data$rating_count_tot), "median ratings\n\n")
  
  # Take at most five genres: indexing with a fixed 1:5 would print "NA: NA%"
  # bullets for a cluster that contains fewer than five genres.
  genre_ranking <- sort(table(cluster_data$prime_genre), decreasing = TRUE)
  top_genres <- genre_ranking[seq_len(min(5, length(genre_ranking)))]
  cat("**Top Genres:**\n\n")
  for (g in seq_along(top_genres)) {
    cat("- ", names(top_genres)[g], ": ", round(top_genres[g]/nrow(cluster_data)*100, 1), "%\n", sep="")
  }
  cat("\n---\n")
}

Cluster 1

Size: 2517 apps ( 36.8 %)

Pricing: $3.83 average, 0% free

Engagement: 4.13 rating, 313 median ratings

Top Genres:

  • Games: 55.5%
  • Education: 8.9%
  • Photo & Video: 6%
  • Entertainment: 5.9%
  • Utilities: 3.8%

Cluster 2

Size: 2760 apps ( 40.3 %)

Pricing: $0 average, 100% free

Engagement: 4.16 rating, 1526.5 median ratings

Top Genres:

  • Games: 62.5%
  • Entertainment: 7.9%
  • Photo & Video: 4.2%
  • Education: 3.1%
  • Social Networking: 2.7%

Cluster 3

Size: 1570 apps ( 22.9 %)

Pricing: $0.8 average, 72% free

Engagement: 1.53 rating, 0 median ratings

Top Genres:

  • Games: 39%
  • Entertainment: 9.6%
  • Education: 7.4%
  • Lifestyle: 4.3%
  • Utilities: 4.2%

10.3 Visualizations

# Box plot of log1p(total rating count) per cluster.
ggplot(
  appstore,
  aes(x = factor(cluster), y = log1p(rating_count_tot), fill = factor(cluster))
) +
  geom_boxplot() +
  theme_minimal() +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Rating Count Distribution",
    x = "Cluster",
    y = "Log(Ratings + 1)",
    fill = "Cluster"
  )
Rating Count Distribution by Cluster

Rating Count Distribution by Cluster

# Rating against price on a log10 x axis; 0.1 is added to the price so that
# free apps (price 0) remain plottable on the log scale.
ggplot(
  appstore,
  aes(x = price + 0.1, y = user_rating, color = factor(cluster))
) +
  geom_point(alpha = 0.5, size = 1.5) +
  scale_x_log10() +
  theme_minimal() +
  scale_color_brewer(palette = "Set2") +
  labs(
    title = "Price vs Rating by Cluster",
    x = "Price (log)",
    y = "Rating",
    color = "Cluster"
  )
Price vs Rating by Cluster

Price vs Rating by Cluster

# Box plot of the number of supported languages per cluster.
ggplot(
  appstore,
  aes(x = factor(cluster), y = lang.num, fill = factor(cluster))
) +
  geom_boxplot() +
  theme_minimal() +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Language Support by Cluster",
    x = "Cluster",
    y = "Languages",
    fill = "Cluster"
  )
Language Support by Cluster

Language Support by Cluster

# App counts per genre, one dodged bar per cluster. The axis-text tweak is
# applied after theme_minimal() so the complete theme does not override it.
ggplot(appstore, aes(x = prime_genre, fill = factor(cluster))) +
  geom_bar(position = "dodge") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  ) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Cluster Distribution by Genre",
    x = "Genre",
    y = "Count",
    fill = "Cluster"
  )
Cluster Distribution by Genre

Cluster Distribution by Genre

10.4 Success Factor Analysis

# An app counts as "successful" when it is rated at least 4.5 AND its total
# rating count is in the top quartile of the cleaned dataset.
success_cutoff <- quantile(appstore$rating_count_tot, 0.75)
appstore$successful <- (appstore$user_rating >= 4.5 &
                        appstore$rating_count_tot >= success_cutoff)

# summarise() evaluates its expressions in order and later ones see earlier
# results. Naming the count `successful` therefore shadowed the logical column,
# and mean(successful) returned the count itself (e.g. 345 -> "34500%").
# The count gets its own name so the rate is the mean of the logical flag.
appstore %>% group_by(cluster) %>%
  summarise(total = n(),
            n_successful = sum(successful),
            rate = paste0(round(mean(successful) * 100, 1), "%")) %>%
  kable(col.names = c("Cluster", "Total", "Successful", "Success Rate"), caption = "Success by Cluster") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
Success by Cluster
Cluster Total Successful Success Rate
1 2517 345 13.7%
2 2760 802 29.1%
3 1570 14 0.9%

11. Conclusions

11.1 Key Findings

  1. Hopkins H = 1 confirms strong clustering tendency
  2. K-means selected: highest Calinski-Harabasz (1776.86)
  3. Free apps achieve higher success rates than paid apps
  4. All clusters stable (Jaccard > 0.75)
  5. International reach correlates with engagement

11.2 Limitations

  • Modest silhouette (~0.23): clusters overlap in feature space
  • Games genre overrepresented (54%)
  • Cross-sectional data only

11.3 Recommendations

For Developers:

  • Freemium model with international support correlates with success
  • Paid apps require larger feature sets to compete

For Further Analysis:

  • Apply dimension reduction (PCA, t-SNE) for visualization
  • Genre-stratified clustering to reduce Games dominance

12. Save Results

# Per-app cluster membership (k-means labels) with identifying columns.
cluster_assignments <- data.frame(
  app_id = appstore$id,
  track_name = appstore$track_name,
  prime_genre = appstore$prime_genre,
  cluster = km_final$cluster
)

# Write all result tables under results/, creating the folder if needed
# (no warning when it already exists).
dir.create("results", showWarnings = FALSE)
write.csv(validation_table, "results/validation_metrics.csv", row.names = FALSE)
write.csv(cluster_profiles, "results/cluster_profiles.csv", row.names = FALSE)
write.csv(cluster_assignments, "results/cluster_assignments.csv", row.names = FALSE)
# Full app table, including the engineered features and cluster labels.
write.csv(appstore, "results/appstore_with_clusters.csv", row.names = FALSE)

cat("Results saved to results/ directory")
## Results saved to results/ directory