This analysis segments 7,197 Apple App Store applications into distinct market categories. After removing 350 anomalous apps (5%) via Isolation Forest, four clustering methods were compared.
Key Results:
| Segment | Size | Profile |
|---|---|---|
| Premium Paid | 36.8% | $3.83 avg, 260MB, established apps |
| Successful Free | 40.3% | $0, high engagement (1,527 median ratings) |
| Low-Traction | 22.9% | $0.80, 0 median ratings, early-stage apps |
# Load the raw App Store listing into `appstore`; text columns are kept as
# character vectors rather than being converted to factors.
# NOTE(review): the path is absolute and points into one user's Desktop, so the
# script only runs unchanged on a machine that has the file at this location.
appstore <- read.csv("/Users/mariyam.babayeva/Desktop/2025/AppleStore.csv", stringsAsFactors = FALSE)
data.frame(
Metric = c("Total Apps", "Variables", "Missing Values"),
Value = c(format(nrow(appstore), big.mark = ","), ncol(appstore), sum(is.na(appstore)))
) %>% kable() %>% kable_styling(bootstrap_options = "striped", full_width = FALSE)| Metric | Value |
|---|---|
| Total Apps | 7,197 |
| Variables | 17 |
| Missing Values | 0 |
# Frequency of each primary genre, most common first.
genre_counts <- sort(table(appstore$prime_genre), decreasing = TRUE)

# Keep at most the ten most common genres. head() returns fewer rows when the
# data has fewer than ten genres, whereas `[1:10]` would pad with NA entries.
genre_top <- head(genre_counts, 10)

# Top-genre table: name, absolute count and share of all apps.
# sprintf("%.1f%%") always prints one decimal place, so a share of 2.0 renders
# as "2.0%" instead of the "2%" produced by round() + paste0().
genre_df <- data.frame(
  Genre = names(genre_top),
  Count = as.numeric(genre_top),
  Percentage = sprintf("%.1f%%", as.numeric(genre_top) / nrow(appstore) * 100)
)
kable(genre_df, caption = "Top 10 Genres") %>% kable_styling(bootstrap_options = "striped", full_width = FALSE)| Genre | Count | Percentage |
|---|---|---|
| Games | 3862 | 53.7% |
| Entertainment | 535 | 7.4% |
| Education | 453 | 6.3% |
| Photo & Video | 349 | 4.8% |
| Utilities | 248 | 3.4% |
| Health & Fitness | 180 | 2.5% |
| Productivity | 178 | 2.5% |
| Social Networking | 167 | 2.3% |
| Lifestyle | 144 | 2% |
| Music | 138 | 1.9% |
# ---- Feature engineering ----
# Derived columns are appended to `appstore` in the order below.

# Pricing and scale: free flag, log1p-transformed size / price / rating count,
# and the download size expressed in MiB.
appstore[["is_free"]] <- as.numeric(appstore[["price"]] == 0)
appstore[["log_size"]] <- log1p(appstore[["size_bytes"]])
appstore[["log_price"]] <- log1p(appstore[["price"]])
appstore[["log_rating_count"]] <- log1p(appstore[["rating_count_tot"]])
appstore[["size_mb"]] <- appstore[["size_bytes"]] / (1024^2)

# Share of all ratings that belong to the current version: 0 for apps without
# any ratings, and capped at 1.
appstore[["engagement_ratio"]] <- with(
  appstore,
  pmin(ifelse(rating_count_tot > 0, rating_count_ver / rating_count_tot, 0), 1)
)

# Current-version rating minus the all-version rating.
appstore[["quality_delta"]] <- with(appstore, user_rating_ver - user_rating)

# Languages x supported devices per (decimal) megabyte; infinite values
# (division by a zero size) are reset to 0.
appstore[["international_index"]] <- with(appstore, {
  reach <- (lang.num * sup_devices.num) / (size_bytes / 1e6)
  replace(reach, is.infinite(reach), 0)
})

# Flag for apps offering at least five languages.
appstore[["is_multilingual"]] <- as.numeric(appstore[["lang.num"]] >= 5)

# Screenshot count plus language count.
appstore[["richness_score"]] <- with(appstore, ipadSc_urls.num + lang.num)
data.frame(
Feature = c("log_size", "log_price", "engagement_ratio", "quality_delta", "international_index", "richness_score"),
Purpose = c("Normalize size distribution", "Normalize price distribution", "Version engagement rate",
"Rating trend direction", "Global reach potential", "App comprehensiveness")
) %>% kable() %>% kable_styling(bootstrap_options = "striped")| Feature | Purpose |
|---|---|
| log_size | Normalize size distribution |
| log_price | Normalize price distribution |
| engagement_ratio | Version engagement rate |
| quality_delta | Rating trend direction |
| international_index | Global reach potential |
| richness_score | App comprehensiveness |
# ---- Clustering input ----
# Columns fed to every clustering method, in the order used for scaling.
cluster_vars <- c(
  "log_size", "log_price", "log_rating_count", "rating_count_ver",
  "user_rating", "user_rating_ver", "sup_devices.num", "ipadSc_urls.num",
  "lang.num", "vpp_lic", "engagement_ratio", "quality_delta",
  "international_index", "richness_score", "is_free"
)

# Pull the selected columns and zero out every missing or infinite entry,
# column by column; columns without such entries are left untouched.
data_for_clustering <- appstore[cluster_vars]
data_for_clustering[] <- lapply(data_for_clustering, function(feature) {
  non_finite <- !is.finite(feature)
  if (any(non_finite)) {
    feature[non_finite] <- 0
  }
  feature
})

# Centre and scale each column so no single feature dominates the distances.
data_scaled <- scale(data_for_clustering)
cat("Variables:", length(cluster_vars), "| Observations:", nrow(data_scaled))## Variables: 15 | Observations: 7197
cor_matrix <- cor(data_for_clustering)
corrplot(cor_matrix, method = "color", order = "hclust", tl.cex = 0.7, tl.col = "black",
title = "Correlation Matrix of App Features", mar = c(0,0,2,0))Feature Correlation Matrix
# Anomaly detection: fit an isolation forest on the scaled feature matrix and
# score every app with it.
# Settings: 256 rows sampled per tree, 100 trees, depth capped at
# ceiling(log2(256)) = 8.
# NOTE(review): `isolationForest` is presumably the R6 class from the
# `solitude` package, and no seed is passed here — confirm where it is loaded
# and how reproducibility is ensured.
iso_forest <- isolationForest$new(sample_size = 256, num_trees = 100, max_depth = ceiling(log2(256)))
iso_forest$fit(as.data.frame(data_scaled))
anomaly_scores <- iso_forest$predict(as.data.frame(data_scaled))
# Keep the score next to the raw data so it is available for later inspection.
appstore$anomaly_score <- anomaly_scores$anomaly_score
# Flag apps whose score lies strictly above the 95th percentile of all scores.
threshold <- quantile(anomaly_scores$anomaly_score, 0.95)
is_anomaly <- anomaly_scores$anomaly_score > threshold
data.frame(
Metric = c("Original dataset", "Anomalies detected", "Clean dataset"),
Value = c(paste(nrow(appstore), "apps"), paste(sum(is_anomaly), "(5%)"), paste(sum(!is_anomaly), "apps"))
) %>% kable() %>% kable_styling(bootstrap_options = "striped", full_width = FALSE)| Metric | Value |
|---|---|
| Original dataset | 7197 apps |
| Anomalies detected | 350 (5%) |
| Clean dataset | 6847 apps |
par(mfrow = c(2, 2))
hist(anomaly_scores$anomaly_score, breaks = 50, main = "Distribution of Anomaly Scores",
xlab = "Anomaly Score", col = "lightblue", border = "white")
abline(v = threshold, col = "red", lwd = 2, lty = 2)
plot(appstore$price, anomaly_scores$anomaly_score, main = "Anomaly Score vs Price",
xlab = "Price ($)", ylab = "Anomaly Score", pch = 16,
col = ifelse(is_anomaly, "red", "gray70"), cex = 0.5)
abline(h = threshold, col = "red", lwd = 2, lty = 2)
plot(appstore$log_rating_count, anomaly_scores$anomaly_score, main = "Anomaly Score vs Popularity",
xlab = "Log(Rating Count + 1)", ylab = "Anomaly Score", pch = 16,
col = ifelse(is_anomaly, "red", "gray70"), cex = 0.5)
abline(h = threshold, col = "red", lwd = 2, lty = 2)
plot(appstore$size_mb, anomaly_scores$anomaly_score, main = "Anomaly Score vs App Size",
xlab = "Size (MB)", ylab = "Anomaly Score", pch = 16,
col = ifelse(is_anomaly, "red", "gray70"), cex = 0.5,
xlim = c(0, quantile(appstore$size_mb, 0.95)))
abline(h = threshold, col = "red", lwd = 2, lty = 2)Anomaly Score Distribution
# Drop the flagged apps from the scaled matrix, the raw data frame and the
# unscaled feature frame using the same mask, so the three objects stay
# row-aligned for the clustering steps that follow.
# NOTE: these assignments overwrite their inputs; re-running them without
# recomputing `is_anomaly` would apply a mask of the original length to
# already-filtered objects.
data_scaled <- data_scaled[!is_anomaly, ]
appstore <- appstore[!is_anomaly, ]
data_for_clustering <- data_for_clustering[!is_anomaly, ]
cat("Clean dataset:", nrow(appstore), "apps")## Clean dataset: 6847 apps
# Clustering tendency: repeat the Hopkins statistic on random subsamples of at
# most 500 rows (always fewer than the number of rows available) and collect
# the values for summary statistics.
# NOTE(review): the rendered run reports H = 1 with SD = 0 for every
# iteration; verify that the statistic is not simply saturating on this
# 15-column input.
set.seed(42)
n_iterations <- 100
hopkins_values <- vapply(
  seq_len(n_iterations),
  function(iteration) {
    hopkins(data_scaled, m = min(nrow(data_scaled) - 1, 500))
  },
  numeric(1)
)
data.frame(
Metric = c("Mean H", "SD", "95% CI"),
Value = c(round(mean(hopkins_values), 4), round(sd(hopkins_values), 4),
paste(round(quantile(hopkins_values, 0.025), 4), "-", round(quantile(hopkins_values, 0.975), 4)))
) %>% kable() %>% kable_styling(bootstrap_options = "striped", full_width = FALSE)| Metric | Value |
|---|---|
| Mean H | 1 |
| SD | 0 |
| 95% CI | 1 - 1 |
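Interpretation: H close to 1 indicates a strong clustering tendency, whereas H near 0.5 would indicate spatially random data. Note that H is exactly 1 with zero variance across all 100 runs, which suggests the statistic has saturated on this 15-dimensional input; treat it as a qualitative indication rather than a precise measurement.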
fviz_nbclust(data_scaled, kmeans, method = "wss", k.max = 15) +
labs(title = "Elbow Method for Optimal k")Elbow Method
fviz_nbclust(data_scaled, kmeans, method = "silhouette", k.max = 15) +
labs(title = "Silhouette Method for Optimal k")Silhouette Method
# Final k-means fit on the scaled, anomaly-free features.
# The seed is fixed so the 50 random starts (nstart) are reproducible; each
# start runs for at most 300 iterations.
# NOTE(review): `k_optimal` is not defined in this section — presumably it is
# set from the elbow / silhouette plots above; confirm where it is assigned.
set.seed(123)
km_final <- kmeans(data_scaled, centers = k_optimal, nstart = 50, iter.max = 300)
data.frame(Cluster = 1:k_optimal, Size = as.numeric(table(km_final$cluster)),
Pct = paste0(round(as.numeric(table(km_final$cluster)) / nrow(data_scaled) * 100, 1), "%")
) %>% kable() %>% kable_styling(bootstrap_options = "striped", full_width = FALSE)| Cluster | Size | Pct |
|---|---|---|
| 1 | 2517 | 36.8% |
| 2 | 2760 | 40.3% |
| 3 | 1570 | 22.9% |
## Within-cluster SS: 50017.22
## Between/Total SS ratio: 34.18 %
fviz_cluster(km_final, data = data_scaled, geom = "point", ellipse.type = "convex",
palette = "jco", ggtheme = theme_minimal(), main = paste("K-means (k =", k_optimal, ")"))K-means Clusters
pam_final <- pam(data_scaled, k = k_optimal, metric = "euclidean")
medoid_apps <- appstore[pam_final$id.med, ]
data.frame(Cluster = 1:k_optimal, Medoid_App = medoid_apps$track_name,
Price = paste0("$", medoid_apps$price), Rating = medoid_apps$user_rating
) %>% kable() %>% kable_styling(bootstrap_options = "striped")| Cluster | Medoid_App | Price | Rating |
|---|---|---|---|
| 1 | Tsuro - The Game of the Path | $2.99 | 4.5 |
| 2 | Battleborn® Tap | $0 | 4.5 |
| 3 | Reversi REAL - Multiplayer Board game | $0 | 0.0 |
fviz_cluster(pam_final, data = data_scaled, geom = "point", ellipse.type = "convex",
palette = "jco", ggtheme = theme_minimal(), main = paste("PAM (k =", k_optimal, ")"))PAM Clusters
clara_final <- clara(data_scaled, k = k_optimal, samples = 100, sampsize = 1500)
cat("CLARA Silhouette:", round(clara_final$silinfo$avg.width, 4))## CLARA Silhouette: 0.2321
# Hierarchical clustering: Euclidean distances between all pairs of apps on the
# scaled features, Ward linkage ("ward.D2"), then cut the tree into `k_optimal`
# groups. `d` is also reused by the validation metrics further down.
d <- dist(data_scaled, method = "euclidean")
hc_ward <- hclust(d, method = "ward.D2")
hc_clusters <- cutree(hc_ward, k = k_optimal)
cat("Hierarchical cluster sizes:", table(hc_clusters))## Hierarchical cluster sizes: 2769 2700 1378
fviz_dend(hc_ward, k = k_optimal, rect = TRUE, cex = 0.4,
main = "Hierarchical Clustering (Ward's Method)", palette = "jco")Ward’s Dendrogram
# Cluster labels produced by each method, keyed by method name.
methods <- list(
  kmeans = km_final$cluster,
  pam = pam_final$clustering,
  clara = clara_final$clustering,
  hierarchical = hc_clusters
)

# cluster.stats() is expensive on the full distance matrix, so run it once per
# method and read both metrics from the same result.
method_stats <- lapply(methods, function(labels) cluster.stats(d, labels))

# Internal validation metrics, one row per method.
# row.names = NULL matters: the metric vectors carry the method names, which
# data.frame() would otherwise turn into row names, and kable() would then
# print them as an extra, duplicated leading column.
validation_table <- data.frame(
  Method = names(methods),
  Calinski_Harabasz = round(
    vapply(method_stats, function(stats) stats$ch, numeric(1)), 2
  ),
  Avg_Silhouette = round(
    vapply(methods, function(labels) mean(silhouette(labels, d)[, 3]), numeric(1)), 4
  ),
  Within_SS = round(
    vapply(method_stats, function(stats) sum(stats$within.cluster.ss), numeric(1)), 2
  ),
  row.names = NULL
)
kable(validation_table, caption = "Validation Metrics") %>% kable_styling(bootstrap_options = "striped")| Method | Calinski_Harabasz | Avg_Silhouette | Within_SS | |
|---|---|---|---|---|
| kmeans | kmeans | 1776.86 | 0.2309 | 50017.22 |
| pam | pam | 1677.89 | 0.2319 | 50987.89 |
| clara | clara | 1677.89 | 0.2319 | 50987.89 |
| hierarchical | hierarchical | 1363.27 | 0.1793 | 54340.16 |
# Pairwise Adjusted Rand Index between the label vectors in `methods`.
# The matrix is sized from `methods` itself, so adding or removing a method
# does not require touching this code.
n_methods <- length(methods)
ari_matrix <- matrix(
  NA_real_, n_methods, n_methods,
  dimnames = list(names(methods), names(methods))
)
for (i in seq_len(n_methods)) {
  for (j in seq_len(n_methods)) {
    ari_matrix[i, j] <- adj.rand.index(methods[[i]], methods[[j]])
  }
}
corrplot(ari_matrix, method = "color", is.corr = FALSE, addCoef.col = "black",
tl.col = "black", title = "Adjusted Rand Index", mar = c(0,0,2,0), cl.lim = c(0, 1))Cross-Method Agreement (ARI)
# Bootstrap stability of the k-means solution: 100 bootstrap resamples, each
# re-clustered with k-means, compared to the full-data clusters by Jaccard
# similarity.
# NOTE(review): clusterboot() runs its own k-means fit, so its cluster numbers
# may not line up with `km_final$cluster` — confirm before matching rows of
# this table to the segment profiles.
km_boot <- clusterboot(
  data_scaled,
  B = 100, bootmethod = "boot",
  clustermethod = kmeansCBI, k = k_optimal, seed = 123
)

# Mean Jaccard similarity per cluster with a verbal stability label
# (> 0.85 highly stable, > 0.75 stable, otherwise moderate).
data.frame(
  Cluster = seq_len(k_optimal),
  Jaccard = round(km_boot$bootmean, 4),
  Stability = ifelse(
    km_boot$bootmean > 0.85, "Highly stable",
    ifelse(km_boot$bootmean > 0.75, "Stable", "Moderate")
  )
) %>%
  kable(caption = "Bootstrap Stability") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
| Cluster | Jaccard | Stability |
|---|---|---|
| 1 | 0.8348 | Stable |
| 2 | 0.8170 | Stable |
| 3 | 0.8730 | Highly stable |
# Attach the k-means labels to the app-level data for profiling.
appstore$cluster <- km_final$cluster

# Kruskal-Wallis test of each variable across clusters.
# Display label -> column name of the variables under test.
kw_tested <- c(
  Price = "price", Rating = "user_rating",
  Size = "size_mb", Languages = "lang.num"
)
kw_p <- vapply(
  kw_tested,
  function(column) kruskal.test(appstore[[column]], appstore$cluster)$p.value,
  numeric(1)
)

# The "Significant" column is derived from the p-values at the 5% level rather
# than being hard-coded, so the table cannot claim significance the tests do
# not support. unname() keeps the labels from becoming row names.
kw_results <- data.frame(
  Variable = names(kw_tested),
  p_value = unname(vapply(kw_p, format.pval, character(1))),
  Significant = unname(ifelse(kw_p < 0.05, "Yes", "No"))
)
kable(kw_results, caption = "Kruskal-Wallis Tests") %>% kable_styling(bootstrap_options = "striped", full_width = FALSE)| Variable | p_value | Significant |
|---|---|---|
| Price | < 2.22e-16 | Yes |
| Rating | < 2.22e-16 | Yes |
| Size | < 2.22e-16 | Yes |
| Languages | < 2.22e-16 | Yes |
# Per-cluster profile: size, pricing, rating level, popularity, download size
# and language coverage, one row per cluster.
cluster_profiles <- appstore %>%
  group_by(cluster) %>%
  summarise(
    n_apps = n(),
    avg_price = round(mean(price), digits = 2),
    pct_free = round(100 * mean(price == 0), digits = 1),
    avg_rating = round(mean(user_rating, na.rm = TRUE), digits = 2),
    median_ratings = median(rating_count_tot),
    avg_size_MB = round(mean(size_mb), digits = 1),
    avg_languages = round(mean(lang.num), digits = 2)
  )
kable(cluster_profiles, col.names = c("Cluster", "Apps", "Avg Price", "% Free", "Avg Rating",
"Median Ratings", "Size (MB)", "Languages"),
caption = "Cluster Profiles") %>% kable_styling(bootstrap_options = "striped")| Cluster | Apps | Avg Price | % Free | Avg Rating | Median Ratings | Size (MB) | Languages |
|---|---|---|---|---|---|---|---|
| 1 | 2517 | 3.83 | 0 | 4.13 | 313.0 | 259.9 | 4.95 |
| 2 | 2760 | 0.00 | 100 | 4.16 | 1526.5 | 160.8 | 6.16 |
| 3 | 1570 | 0.80 | 72 | 1.53 | 0.0 | 102.2 | 2.20 |
# Emit a markdown summary for every cluster: size, pricing, engagement and the
# most common genres.
for (i in seq_len(k_optimal)) {
  cluster_data <- subset(appstore, cluster == i)
  n_cluster <- nrow(cluster_data)

  cat("\n### Cluster", i, "\n\n")
  cat("**Size:**", n_cluster, "apps (", round(n_cluster / nrow(appstore) * 100, 1), "%)\n\n")
  cat("**Pricing:** $", round(mean(cluster_data$price), 2), " average, ",
      round(mean(cluster_data$price == 0) * 100, 1), "% free\n\n", sep = "")
  cat("**Engagement:**", round(mean(cluster_data$user_rating, na.rm = TRUE), 2),
      " rating,", median(cluster_data$rating_count_tot), "median ratings\n\n")

  # head() keeps at most five genres; `[1:5]` would add NA entries (printed as
  # "NA: NA%") for a cluster that contains fewer than five genres.
  top_genres <- head(sort(table(cluster_data$prime_genre), decreasing = TRUE), 5)
  cat("**Top Genres:**\n\n")
  for (g in seq_along(top_genres)) {
    cat("- ", names(top_genres)[g], ": ", round(top_genres[g] / n_cluster * 100, 1), "%\n", sep = "")
  }
  cat("\n---\n")
}
Size: 2517 apps ( 36.8 %)
Pricing: $3.83 average, 0% free
Engagement: 4.13 rating, 313 median ratings
Top Genres:
Size: 2760 apps ( 40.3 %)
Pricing: $0 average, 100% free
Engagement: 4.16 rating, 1526.5 median ratings
Top Genres:
Size: 1570 apps ( 22.9 %)
Pricing: $0.8 average, 72% free
Engagement: 1.53 rating, 0 median ratings
Top Genres:
ggplot(appstore, aes(x = factor(cluster), y = log1p(rating_count_tot), fill = factor(cluster))) +
geom_boxplot() + theme_minimal() + scale_fill_brewer(palette = "Set2") +
labs(title = "Rating Count Distribution", x = "Cluster", y = "Log(Ratings + 1)", fill = "Cluster")Rating Count Distribution by Cluster
ggplot(appstore, aes(x = price + 0.1, y = user_rating, color = factor(cluster))) +
geom_point(alpha = 0.5, size = 1.5) + scale_x_log10() + theme_minimal() +
scale_color_brewer(palette = "Set2") +
labs(title = "Price vs Rating by Cluster", x = "Price (log)", y = "Rating", color = "Cluster")Price vs Rating by Cluster
ggplot(appstore, aes(x = factor(cluster), y = lang.num, fill = factor(cluster))) +
geom_boxplot() + theme_minimal() + scale_fill_brewer(palette = "Set2") +
labs(title = "Language Support by Cluster", x = "Cluster", y = "Languages", fill = "Cluster")Language Support by Cluster
ggplot(appstore, aes(x = prime_genre, fill = factor(cluster))) +
geom_bar(position = "dodge") + theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
scale_fill_brewer(palette = "Set2") +
labs(title = "Cluster Distribution by Genre", x = "Genre", y = "Count", fill = "Cluster")Cluster Distribution by Genre
# An app counts as successful when it is rated at least 4.5 and its total
# rating count is in the top quartile of the (anomaly-free) dataset.
appstore$successful <- (appstore$user_rating >= 4.5 &
  appstore$rating_count_tot >= quantile(appstore$rating_count_tot, 0.75))

# Success count and rate per cluster.
# The count gets its own name (`n_successful`): summarise() evaluates its
# arguments in order, so reusing the name `successful` would make the following
# mean(successful) average the count instead of the per-app flag and report
# rates such as 34500%.
appstore %>%
  group_by(cluster) %>%
  summarise(
    total = n(),
    n_successful = sum(successful),
    rate = paste0(round(mean(successful) * 100, 1), "%")
  ) %>%
  kable(col.names = c("Cluster", "Total", "Successful", "Success Rate"), caption = "Success by Cluster") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
| Cluster | Total | Successful | Success Rate |
|---|---|---|---|
| 1 | 2517 | 345 | 13.7% |
| 2 | 2760 | 802 | 29.1% |
| 3 | 1570 | 14 | 0.9% |
For Developers:
For Further Analysis:
# ---- Export ----
# Make sure the output folder exists (no warning if it already does).
dir.create("results", showWarnings = FALSE)

# App-level cluster membership for downstream use.
cluster_assignments <- data.frame(
  app_id = appstore$id,
  track_name = appstore$track_name,
  prime_genre = appstore$prime_genre,
  cluster = km_final$cluster
)

# Write each result table to its CSV file, without row names, in a fixed order.
invisible(Map(
  function(result_table, csv_path) {
    write.csv(result_table, csv_path, row.names = FALSE)
  },
  list(validation_table, cluster_profiles, cluster_assignments, appstore),
  c(
    "results/validation_metrics.csv",
    "results/cluster_profiles.csv",
    "results/cluster_assignments.csv",
    "results/appstore_with_clusters.csv"
  )
))
cat("Results saved to results/ directory")## Results saved to results/ directory