1. Executive Summary

This analysis segments 7,197 Apple App Store applications into distinct market categories. After removing 350 anomalous apps (5%) via Isolation Forest, four clustering methods were compared.

Key Results:

3 distinct segments identified with strong clustering tendency (Hopkins H = 0.999)
K-means selected as primary method (Calinski-Harabasz = 1,776.86)
Cluster 2 (Free apps) achieves 29% success rate vs 14% for paid apps

Segment	Size	Profile
Premium Paid	36.8%	$3.83 avg, 260MB, established apps
Successful Free	40.3%	$0, high engagement (1,527 median ratings)
Low-Traction	22.9%	$0.80, 0 median ratings, early-stage apps

2. Data Loading & Exploration

# Load the App Store dataset. The path defaults to the author's local copy but
# can be overridden through the APPSTORE_CSV environment variable, so the
# analysis can run on another machine without editing the source.
appstore_path <- Sys.getenv(
  "APPSTORE_CSV",
  unset = "/Users/mariyam.babayeva/Desktop/2025/AppleStore.csv"
)
# Fail early with a clear message instead of read.csv()'s connection error.
if (!file.exists(appstore_path)) {
  stop("App Store CSV not found: ", appstore_path, call. = FALSE)
}
appstore <- read.csv(appstore_path, stringsAsFactors = FALSE)

# Headline dataset figures: row count, column count, and number of NA cells.
# c() coerces all three values to character because the first is formatted.
overview_values <- c(
  format(nrow(appstore), big.mark = ","),
  ncol(appstore),
  sum(is.na(appstore))
)
overview_tbl <- data.frame(
  Metric = c("Total Apps", "Variables", "Missing Values"),
  Value = overview_values
)
kable_styling(
  kable(overview_tbl),
  bootstrap_options = "striped",
  full_width = FALSE
)

Metric	Value
Total Apps	7,197
Variables	17
Missing Values	0

# Rank genres by number of apps and tabulate the ten largest.
# head() replaces [1:10] so that a dataset with fewer than ten genres yields a
# shorter table instead of NA-padded rows.
genre_counts <- sort(table(appstore$prime_genre), decreasing = TRUE)
top10_genres <- head(genre_counts, 10)
top10_counts <- as.numeric(top10_genres)

genre_df <- data.frame(
  Genre = names(top10_genres),
  Count = top10_counts,
  # Share of all apps, rounded to one decimal place.
  Percentage = paste0(round(top10_counts / nrow(appstore) * 100, 1), "%")
)
kable(genre_df, caption = "Top 10 Genres") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Top 10 Genres
Genre	Count	Percentage
Games	3862	53.7%
Entertainment	535	7.4%
Education	453	6.3%
Photo & Video	349	4.8%
Utilities	248	3.4%
Health & Fitness	180	2.5%
Productivity	178	2.5%
Social Networking	167	2.3%
Lifestyle	144	2%
Music	138	1.9%

3. Feature Engineering

# Derived features, appended to `appstore` in the order they are computed.

# Free/paid flag, log1p transforms, and size in MiB (bytes / 1024^2).
appstore$is_free <- with(appstore, as.numeric(price == 0))
appstore$log_size <- with(appstore, log1p(size_bytes))
appstore$log_price <- with(appstore, log1p(price))
appstore$log_rating_count <- with(appstore, log1p(rating_count_tot))
appstore$size_mb <- with(appstore, size_bytes / (1024^2))

# Current-version ratings as a share of all ratings: 0 when an app has no
# ratings at all, and capped at 1.
appstore$engagement_ratio <- with(
  appstore,
  pmin(ifelse(rating_count_tot > 0, rating_count_ver / rating_count_tot, 0), 1)
)

# Current-version rating minus overall rating.
appstore$quality_delta <- with(appstore, user_rating_ver - user_rating)

# (languages x supported devices) per 1e6 bytes of app size; a zero size
# produces Inf, which is replaced by 0.
intl_index <- with(appstore, (lang.num * sup_devices.num) / (size_bytes / 1e6))
intl_index[is.infinite(intl_index)] <- 0
appstore$international_index <- intl_index

# Flag for apps with five or more languages, and a simple additive score of
# iPad screenshot count plus language count.
appstore$is_multilingual <- with(appstore, as.numeric(lang.num >= 5))
appstore$richness_score <- with(appstore, ipadSc_urls.num + lang.num)

# Lookup of engineered feature -> stated purpose, rendered as a table.
feature_purposes <- c(
  log_size = "Normalize size distribution",
  log_price = "Normalize price distribution",
  engagement_ratio = "Version engagement rate",
  quality_delta = "Rating trend direction",
  international_index = "Global reach potential",
  richness_score = "App comprehensiveness"
)
feature_tbl <- data.frame(
  Feature = names(feature_purposes),
  # unname() keeps the default 1..n row names so no extra column is rendered.
  Purpose = unname(feature_purposes)
)
kable_styling(kable(feature_tbl), bootstrap_options = "striped")

Feature	Purpose
log_size	Normalize size distribution
log_price	Normalize price distribution
engagement_ratio	Version engagement rate
quality_delta	Rating trend direction
international_index	Global reach potential
richness_score	App comprehensiveness

4. Variable Selection & Standardization

# Features used for clustering: raw, log-transformed, and engineered.
cluster_vars <- c(
  "log_size", "log_price", "log_rating_count", "rating_count_ver",
  "user_rating", "user_rating_ver", "sup_devices.num", "ipadSc_urls.num",
  "lang.num", "vpp_lic", "engagement_ratio", "quality_delta",
  "international_index", "richness_score", "is_free"
)

data_for_clustering <- appstore[cluster_vars]

# Replace NA, NaN and +/-Inf with 0, column by column. Columns with nothing to
# replace are left untouched.
data_for_clustering[] <- lapply(data_for_clustering, function(column) {
  not_finite <- !is.finite(column)
  if (any(not_finite)) {
    column[not_finite] <- 0
  }
  column
})

# Standardize each column (scale() defaults: centre and scale).
data_scaled <- scale(data_for_clustering)
n_scaled_obs <- nrow(data_scaled)
cat("Variables:", length(cluster_vars), "| Observations:", n_scaled_obs)

## Variables: 15 | Observations: 7197

# Pairwise correlations of the unscaled clustering features, drawn as a
# colour-coded matrix with variables ordered by hierarchical clustering.
cor_matrix <- cor(data_for_clustering)
corrplot(
  cor_matrix,
  method = "color",
  order = "hclust",
  tl.cex = 0.7,
  tl.col = "black",
  title = "Correlation Matrix of App Features",
  mar = c(0, 0, 2, 0)
)

Feature Correlation Matrix

5. Anomaly Detection

# Fit an isolation forest on the standardized features and score every app.
# The depth limit is ceiling(log2(sample size)).
iso_sample_size <- 256
iso_input <- as.data.frame(data_scaled)

iso_forest <- isolationForest$new(
  sample_size = iso_sample_size,
  num_trees = 100,
  max_depth = ceiling(log2(iso_sample_size))
)
iso_forest$fit(iso_input)
anomaly_scores <- iso_forest$predict(iso_input)

# Apps scoring above the 95th percentile are flagged as anomalies.
appstore$anomaly_score <- anomaly_scores$anomaly_score
threshold <- quantile(anomaly_scores$anomaly_score, 0.95)
is_anomaly <- anomaly_scores$anomaly_score > threshold

# Before/after counts for the report.
anomaly_summary <- data.frame(
  Metric = c("Original dataset", "Anomalies detected", "Clean dataset"),
  Value = c(
    paste(nrow(appstore), "apps"),
    paste(sum(is_anomaly), "(5%)"),
    paste(sum(!is_anomaly), "apps")
  )
)
kable_styling(
  kable(anomaly_summary),
  bootstrap_options = "striped",
  full_width = FALSE
)

Metric	Value
Original dataset	7197 apps
Anomalies detected	350 (5%)
Clean dataset	6847 apps

# 2x2 diagnostic panel: the score distribution plus the score plotted against
# price, popularity and app size. The previous par() settings are saved and
# restored so the 2x2 layout does not leak into later plots.
old_par <- par(mfrow = c(2, 2))
anomaly_cols <- ifelse(is_anomaly, "red", "gray70")

# Scatter of anomaly score against one app attribute, with the anomaly cutoff
# drawn as a dashed horizontal line. Extra arguments (e.g. xlim) go to plot().
plot_score_against <- function(x, main, xlab, ...) {
  plot(x, anomaly_scores$anomaly_score, main = main, xlab = xlab,
       ylab = "Anomaly Score", pch = 16, col = anomaly_cols, cex = 0.5, ...)
  abline(h = threshold, col = "red", lwd = 2, lty = 2)
}

hist(anomaly_scores$anomaly_score, breaks = 50,
     main = "Distribution of Anomaly Scores", xlab = "Anomaly Score",
     col = "lightblue", border = "white")
abline(v = threshold, col = "red", lwd = 2, lty = 2)

plot_score_against(appstore$price, "Anomaly Score vs Price", "Price ($)")
plot_score_against(appstore$log_rating_count, "Anomaly Score vs Popularity",
                   "Log(Rating Count + 1)")
# The x-axis is clipped at the 95th percentile of size so that a few very
# large apps do not compress the rest of the plot.
plot_score_against(appstore$size_mb, "Anomaly Score vs App Size", "Size (MB)",
                   xlim = c(0, quantile(appstore$size_mb, 0.95)))

par(old_par)

Anomaly Score Distribution

# Drop flagged anomalies from all three parallel objects so their rows stay
# aligned. drop = FALSE keeps the matrix/data-frame shape even if only a
# single row survives the filter.
keep_rows <- !is_anomaly
data_scaled <- data_scaled[keep_rows, , drop = FALSE]
appstore <- appstore[keep_rows, ]
data_for_clustering <- data_for_clustering[keep_rows, , drop = FALSE]
cat("Clean dataset:", nrow(appstore), "apps")

## Clean dataset: 6847 apps

6. Clustering Tendency

6.1 Hopkins Statistic

# Repeat the Hopkins statistic on random subsamples to gauge its variability.
# Each run uses m = min(n - 1, 500) sampled points.
# NOTE(review): the rendered result is H = 1 with SD = 0 on 15 standardized
# features; confirm the statistic is not saturating before relying on it.
set.seed(42)
n_iterations <- 100
hopkins_m <- min(nrow(data_scaled) - 1, 500)

hopkins_values <- vapply(
  seq_len(n_iterations),
  function(iteration) hopkins(data_scaled, m = hopkins_m),
  numeric(1)
)

# Mean, spread, and the empirical 2.5%-97.5% range across iterations.
hopkins_range <- paste(
  round(quantile(hopkins_values, 0.025), 4),
  "-",
  round(quantile(hopkins_values, 0.975), 4)
)
hopkins_tbl <- data.frame(
  Metric = c("Mean H", "SD", "95% CI"),
  Value = c(
    round(mean(hopkins_values), 4),
    round(sd(hopkins_values), 4),
    hopkins_range
  )
)
kable_styling(
  kable(hopkins_tbl),
  bootstrap_options = "striped",
  full_width = FALSE
)

Metric	Value
Mean H	1
SD	0
95% CI	1 - 1

Interpretation: H ≈ 1 (rounded to four decimals) indicates strong clustering tendency; values near 0.5 would indicate spatially random data.

6.2 VAT Plot

# Draw at most 500 rows at random and assess clustering tendency on that
# subsample, with the dissimilarity image plotted (graph = TRUE).
n_obs <- nrow(data_scaled)
sample_idx <- sample(seq_len(n_obs), min(500, n_obs))
vat_sample <- data_scaled[sample_idx, ]
get_clust_tendency(vat_sample, n = 50, graph = TRUE)

## $hopkins_stat
## [1] 0.8009825
## 
## $plot

VAT Plot

7. Optimal Number of Clusters

7.1 Elbow Method

# Total within-cluster sum of squares for k-means across k up to 15.
elbow_plot <- fviz_nbclust(data_scaled, kmeans, method = "wss", k.max = 15)
elbow_plot + labs(title = "Elbow Method for Optimal k")

Elbow Method

7.2 Silhouette Method

# Average silhouette width for k-means across k up to 15.
silhouette_plot <- fviz_nbclust(
  data_scaled,
  kmeans,
  method = "silhouette",
  k.max = 15
)
silhouette_plot + labs(title = "Silhouette Method for Optimal k")

Silhouette Method

7.3 Gap Statistic

# Gap statistic for k-means (25 random starts per k) over k = 1..10, using
# 50 reference data sets. Seeded because the reference sets are simulated.
set.seed(123)
gap_stat <- clusGap(
  data_scaled,
  FUN = kmeans,
  nstart = 25,
  K.max = 10,
  B = 50
)
gap_plot <- fviz_gap_stat(gap_stat)
gap_plot + labs(title = "Gap Statistic Method")

Gap Statistic

7.4 Selection Rationale

Selected: k = 3 based on elbow inflection and business interpretability.

# Number of clusters used by every clustering method below (k-means, PAM,
# CLARA, and the hierarchical tree cut).
k_optimal <- 3

8. Clustering Methods

8.1 K-means

# Final k-means fit: 50 random starts, up to 300 iterations each.
set.seed(123)
km_final <- kmeans(
  data_scaled,
  centers = k_optimal,
  nstart = 50,
  iter.max = 300
)

# Cluster sizes as counts and as a share of all clustered apps.
km_sizes <- as.numeric(table(km_final$cluster))
km_size_tbl <- data.frame(
  Cluster = seq_len(k_optimal),
  Size = km_sizes,
  Pct = paste0(round(km_sizes / nrow(data_scaled) * 100, 1), "%")
)
kable_styling(
  kable(km_size_tbl),
  bootstrap_options = "striped",
  full_width = FALSE
)

Cluster	Size	Pct
1	2517	36.8%
2	2760	40.3%
3	1570	22.9%

# Total within-cluster sum of squares of the fitted k-means solution.
total_wss <- round(km_final$tot.withinss, 2)
cat("Within-cluster SS:", total_wss, "\n")

## Within-cluster SS: 50017.22

# Between-cluster sum of squares as a percentage of the total.
between_share <- round(km_final$betweenss / km_final$totss * 100, 2)
cat("Between/Total SS ratio:", between_share, "%")

## Between/Total SS ratio: 34.18 %

# K-means clusters drawn as points with convex hulls.
km_plot_title <- paste("K-means (k =", k_optimal, ")")
fviz_cluster(
  km_final,
  data = data_scaled,
  geom = "point",
  ellipse.type = "convex",
  palette = "jco",
  ggtheme = theme_minimal(),
  main = km_plot_title
)

K-means Clusters

8.2 PAM (Partitioning Around Medoids)

# PAM with Euclidean distance. Each cluster is represented by a medoid, i.e.
# an actual app, looked up in `appstore` by the medoid row indices.
pam_final <- pam(data_scaled, k = k_optimal, metric = "euclidean")
medoid_apps <- appstore[pam_final$id.med, ]

medoid_tbl <- data.frame(
  Cluster = seq_len(k_optimal),
  Medoid_App = medoid_apps$track_name,
  Price = paste0("$", medoid_apps$price),
  Rating = medoid_apps$user_rating
)
kable_styling(kable(medoid_tbl), bootstrap_options = "striped")

Cluster	Medoid_App	Price	Rating
1	Tsuro - The Game of the Path	$2.99	4.5
2	Battleborn® Tap	$0	4.5
3	Reversi REAL - Multiplayer Board game	$0	0.0

# PAM clusters drawn as points with convex hulls.
pam_plot_title <- paste("PAM (k =", k_optimal, ")")
fviz_cluster(
  pam_final,
  data = data_scaled,
  geom = "point",
  ellipse.type = "convex",
  palette = "jco",
  ggtheme = theme_minimal(),
  main = pam_plot_title
)

PAM Clusters

8.3 CLARA

# CLARA: 100 subsamples of 1,500 apps each; report the average silhouette
# width stored on the fitted object.
clara_final <- clara(
  data_scaled,
  k = k_optimal,
  samples = 100,
  sampsize = 1500
)
clara_sil <- round(clara_final$silinfo$avg.width, 4)
cat("CLARA Silhouette:", clara_sil)

## CLARA Silhouette: 0.2321

8.4 Hierarchical Clustering

# Ward (ward.D2) agglomerative clustering on Euclidean distances, with the
# tree cut into k_optimal groups. `d` is reused by the validation section.
d <- dist(data_scaled, method = "euclidean")
hc_ward <- hclust(d, method = "ward.D2")
hc_clusters <- cutree(hc_ward, k = k_optimal)

hc_sizes <- table(hc_clusters)
cat("Hierarchical cluster sizes:", hc_sizes)

## Hierarchical cluster sizes: 2769 2700 1378

# Dendrogram of the Ward tree with a rectangle around each of the k clusters.
fviz_dend(
  hc_ward,
  k = k_optimal,
  rect = TRUE,
  cex = 0.4,
  main = "Hierarchical Clustering (Ward's Method)",
  palette = "jco"
)

Ward’s Dendrogram

9. Validation & Comparison

9.1 Internal Validation Metrics

# Cluster labels from each method, keyed by method name.
methods <- list(
  kmeans = km_final$cluster,
  pam = pam_final$clustering,
  clara = clara_final$clustering,
  hierarchical = hc_clusters
)

# cluster.stats() works on the full pairwise distance object, so it is run
# once per method and the result reused for both CH and within-cluster SS.
method_stats <- lapply(methods, function(labels) cluster.stats(d, labels))

# vapply() pins each metric to one numeric value per method. The names it
# carries over become the row names of the table, as before.
validation_table <- data.frame(
  Method = names(methods),
  Calinski_Harabasz = round(
    vapply(method_stats, function(s) s$ch, numeric(1)), 2
  ),
  Avg_Silhouette = round(
    vapply(methods, function(labels) mean(silhouette(labels, d)[, 3]),
           numeric(1)),
    4
  ),
  Within_SS = round(
    vapply(method_stats, function(s) sum(s$within.cluster.ss), numeric(1)), 2
  )
)
kable(validation_table, caption = "Validation Metrics") %>%
  kable_styling(bootstrap_options = "striped")

Validation Metrics
	Method	Calinski_Harabasz	Avg_Silhouette	Within_SS
kmeans	kmeans	1776.86	0.2309	50017.22
pam	pam	1677.89	0.2319	50987.89
clara	clara	1677.89	0.2319	50987.89
hierarchical	hierarchical	1363.27	0.1793	54340.16

9.2 Cross-Method Agreement (ARI)

# Pairwise Adjusted Rand Index between the label vectors in `methods`.
# The matrix is sized from `methods`, so adding or removing a method needs no
# change here.
n_methods <- length(methods)
ari_matrix <- matrix(
  NA_real_, n_methods, n_methods,
  dimnames = list(names(methods), names(methods))
)
for (i in seq_len(n_methods)) {
  for (j in seq_len(n_methods)) {
    ari_matrix[i, j] <- adj.rand.index(methods[[i]], methods[[j]])
  }
}

# is.corr = FALSE because the values are not correlations.
# NOTE(review): newer corrplot releases appear to have renamed `cl.lim` to
# `col.lim`; confirm against the installed version before changing it.
corrplot(ari_matrix, method = "color", is.corr = FALSE,
         addCoef.col = "black", tl.col = "black",
         title = "Adjusted Rand Index", mar = c(0, 0, 2, 0), cl.lim = c(0, 1))

Cross-Method Agreement (ARI)

9.3 Bootstrap Stability

# Bootstrap the k-means solution 100 times and report each cluster's mean
# Jaccard similarity.
km_boot <- clusterboot(
  data_scaled,
  B = 100,
  bootmethod = "boot",
  clustermethod = kmeansCBI,
  k = k_optimal,
  seed = 123
)

# Label bands: > 0.85 "Highly stable", > 0.75 "Stable", otherwise "Moderate".
# cut() uses right-closed intervals, which matches those strict inequalities.
jaccard_means <- km_boot$bootmean
stability_label <- as.character(cut(
  jaccard_means,
  breaks = c(-Inf, 0.75, 0.85, Inf),
  labels = c("Moderate", "Stable", "Highly stable")
))

stability_tbl <- data.frame(
  Cluster = seq_len(k_optimal),
  Jaccard = round(jaccard_means, 4),
  Stability = stability_label
)
kable_styling(
  kable(stability_tbl, caption = "Bootstrap Stability"),
  bootstrap_options = "striped",
  full_width = FALSE
)

Bootstrap Stability
Cluster	Jaccard	Stability
1	0.8348	Stable
2	0.8170	Stable
3	0.8730	Highly stable

9.4 Statistical Significance

# Attach the k-means labels to the app table (rows are aligned because both
# objects were filtered with the same anomaly mask).
appstore$cluster <- km_final$cluster

# Kruskal-Wallis test of each variable across clusters.
# Display label -> column name in `appstore`.
kw_vars <- c(Price = "price", Rating = "user_rating",
             Size = "size_mb", Languages = "lang.num")
kw_p <- vapply(
  kw_vars,
  function(v) kruskal.test(appstore[[v]], appstore$cluster)$p.value,
  numeric(1)
)

# "Significant" is derived from the p-values (alpha = 0.05) rather than being
# hard-coded to "Yes". Each p-value is formatted on its own, and vectors are
# unnamed so the table keeps default row names.
kw_results <- data.frame(
  Variable = names(kw_vars),
  p_value = unname(vapply(kw_p, format.pval, character(1))),
  Significant = ifelse(unname(kw_p) < 0.05, "Yes", "No")
)
kable(kw_results, caption = "Kruskal-Wallis Tests") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Kruskal-Wallis Tests
Variable	p_value	Significant
Price	< 2.22e-16	Yes
Rating	< 2.22e-16	Yes
Size	< 2.22e-16	Yes
Languages	< 2.22e-16	Yes

10. Cluster Profiles

10.1 Summary Statistics

# Per-cluster profile: size, pricing, rating, popularity, app size, languages.
apps_by_cluster <- group_by(appstore, cluster)
cluster_profiles <- summarise(
  apps_by_cluster,
  n_apps = n(),
  avg_price = round(mean(price), 2),
  pct_free = round(mean(price == 0) * 100, 1),
  avg_rating = round(mean(user_rating, na.rm = TRUE), 2),
  median_ratings = median(rating_count_tot),
  avg_size_MB = round(mean(size_mb), 1),
  avg_languages = round(mean(lang.num), 2)
)

profile_headers <- c(
  "Cluster", "Apps", "Avg Price", "% Free", "Avg Rating",
  "Median Ratings", "Size (MB)", "Languages"
)
kable_styling(
  kable(cluster_profiles, col.names = profile_headers,
        caption = "Cluster Profiles"),
  bootstrap_options = "striped"
)

Cluster Profiles
Cluster	Apps	Avg Price	% Free	Avg Rating	Median Ratings	Size (MB)	Languages
1	2517	3.83	0	4.13	313.0	259.9	4.95
2	2760	0.00	100	4.16	1526.5	160.8	6.16
3	1570	0.80	72	1.53	0.0	102.2	2.20

10.2 Segment Interpretation

# Print a markdown summary for each cluster: size, pricing, engagement, and
# its most common genres.
for (i in seq_len(k_optimal)) {
  cluster_data <- subset(appstore, cluster == i)
  n_cluster <- nrow(cluster_data)

  cat("\n### Cluster", i, "\n\n")
  cat("**Size:**", n_cluster, "apps (", round(n_cluster/nrow(appstore)*100, 1), "%)\n\n")
  cat("**Pricing:** $", round(mean(cluster_data$price), 2), " average, ",
      round(mean(cluster_data$price == 0) * 100, 1), "% free\n\n", sep="")
  cat("**Engagement:**", round(mean(cluster_data$user_rating, na.rm = TRUE), 2),
      " rating,", median(cluster_data$rating_count_tot), "median ratings\n\n")

  # head() returns at most five genres; [1:5] would print NA entries for a
  # cluster that contains fewer than five genres.
  top_genres <- head(sort(table(cluster_data$prime_genre), decreasing = TRUE), 5)
  cat("**Top Genres:**\n\n")
  for (g in seq_along(top_genres)) {
    cat("- ", names(top_genres)[g], ": ", round(top_genres[g]/n_cluster*100, 1), "%\n", sep="")
  }
  cat("\n---\n")
}

Cluster 1

Size: 2517 apps ( 36.8 %)

Pricing: $3.83 average, 0% free

Engagement: 4.13 rating, 313 median ratings

Top Genres:

Games: 55.5%
Education: 8.9%
Photo & Video: 6%
Entertainment: 5.9%
Utilities: 3.8%

Cluster 2

Size: 2760 apps ( 40.3 %)

Pricing: $0 average, 100% free

Engagement: 4.16 rating, 1526.5 median ratings

Top Genres:

Games: 62.5%
Entertainment: 7.9%
Photo & Video: 4.2%
Education: 3.1%
Social Networking: 2.7%

Cluster 3

Size: 1570 apps ( 22.9 %)

Pricing: $0.8 average, 72% free

Engagement: 1.53 rating, 0 median ratings

Top Genres:

Games: 39%
Entertainment: 9.6%
Education: 7.4%
Lifestyle: 4.3%
Utilities: 4.2%

10.3 Visualizations

# Box plot of log1p(total rating count) for each cluster.
ggplot(
  appstore,
  aes(x = factor(cluster), y = log1p(rating_count_tot), fill = factor(cluster))
) +
  geom_boxplot() +
  theme_minimal() +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Rating Count Distribution",
    x = "Cluster",
    y = "Log(Ratings + 1)",
    fill = "Cluster"
  )

Rating Count Distribution by Cluster

# Price vs rating, coloured by cluster. Price is shifted by 0.1 so that free
# apps (price 0) can be placed on the log10 x-axis.
ggplot(
  appstore,
  aes(x = price + 0.1, y = user_rating, color = factor(cluster))
) +
  geom_point(alpha = 0.5, size = 1.5) +
  scale_x_log10() +
  theme_minimal() +
  scale_color_brewer(palette = "Set2") +
  labs(
    title = "Price vs Rating by Cluster",
    x = "Price (log)",
    y = "Rating",
    color = "Cluster"
  )

Price vs Rating by Cluster

# Box plot of the number of supported languages for each cluster.
ggplot(
  appstore,
  aes(x = factor(cluster), y = lang.num, fill = factor(cluster))
) +
  geom_boxplot() +
  theme_minimal() +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Language Support by Cluster",
    x = "Cluster",
    y = "Languages",
    fill = "Cluster"
  )

Language Support by Cluster

# App counts per genre, with one dodged bar per cluster.
genre_axis_text <- element_text(angle = 90, hjust = 1, vjust = 0.5)

ggplot(appstore, aes(x = prime_genre, fill = factor(cluster))) +
  geom_bar(position = "dodge") +
  theme_minimal() +
  # Applied after theme_minimal() so the rotated labels are not overwritten.
  theme(axis.text.x = genre_axis_text) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Cluster Distribution by Genre",
    x = "Genre",
    y = "Count",
    fill = "Cluster"
  )

Cluster Distribution by Genre

10.4 Success Factor Analysis

# An app counts as "successful" when it is rated at least 4.5 and its total
# rating count is in the top quartile of the cleaned dataset.
appstore$successful <- (appstore$user_rating >= 4.5 &
                        appstore$rating_count_tot >= quantile(appstore$rating_count_tot, 0.75))

# The count is stored as `n_successful`, not `successful`: summarise() makes
# each new column visible to later expressions, so reusing the name made
# mean(successful) average the count itself (the old "34500%" output).
appstore %>%
  group_by(cluster) %>%
  summarise(
    total = n(),
    n_successful = sum(successful),
    rate = paste0(round(mean(successful) * 100, 1), "%")
  ) %>%
  kable(col.names = c("Cluster", "Total", "Successful", "Success Rate"),
        caption = "Success by Cluster") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Success by Cluster
Cluster	Total	Successful	Success Rate
1	2517	345	13.7%
2	2760	802	29.1%
3	1570	14	0.9%

11. Conclusions

11.1 Key Findings

Hopkins H = 1 confirms strong clustering tendency
K-means selected: highest Calinski-Harabasz (1776.86)
Free apps achieve higher success rates than paid apps
All clusters stable (Jaccard > 0.75)
International reach correlates with engagement

11.2 Limitations

Modest silhouette (~0.23): clusters overlap in feature space
Games genre overrepresented (54%)
Cross-sectional data only

11.3 Recommendations

For Developers:

Freemium model with international support correlates with success
Paid apps require larger feature sets to compete

For Further Analysis:

Apply dimension reduction (PCA, t-SNE) for visualization
Genre-stratified clustering to reduce Games dominance

12. Save Results

# Write all result tables to results/ as CSV files without row names.
dir.create("results", showWarnings = FALSE)

# One row per app with its k-means cluster label.
cluster_assignments <- data.frame(
  app_id = appstore$id,
  track_name = appstore$track_name,
  prime_genre = appstore$prime_genre,
  cluster = km_final$cluster
)

# Output path -> table, written in this order.
result_tables <- list(
  "results/validation_metrics.csv" = validation_table,
  "results/cluster_profiles.csv" = cluster_profiles,
  "results/cluster_assignments.csv" = cluster_assignments,
  "results/appstore_with_clusters.csv" = appstore
)
for (out_path in names(result_tables)) {
  write.csv(result_tables[[out_path]], out_path, row.names = FALSE)
}

cat("Results saved to results/ directory")

## Results saved to results/ directory

Apple App Store Market Segmentation: Clustering Analysis

Mariyam Babayeva

31.01.2026

1. Executive Summary

2. Data Loading & Exploration

3. Feature Engineering

4. Variable Selection & Standardization

5. Anomaly Detection

6. Clustering Tendency

6.1 Hopkins Statistic

6.2 VAT Plot

7. Optimal Number of Clusters

7.1 Elbow Method

7.2 Silhouette Method

7.3 Gap Statistic

7.4 Selection Rationale

8. Clustering Methods

8.1 K-means

8.2 PAM (Partitioning Around Medoids)

8.3 CLARA

8.4 Hierarchical Clustering

9. Validation & Comparison

9.1 Internal Validation Metrics

9.2 Cross-Method Agreement (ARI)

9.3 Bootstrap Stability

9.4 Statistical Significance

10. Cluster Profiles

10.1 Summary Statistics

10.2 Segment Interpretation

Cluster 1

Cluster 2

Cluster 3

10.3 Visualizations

10.4 Success Factor Analysis

11. Conclusions

11.1 Key Findings

11.2 Limitations

11.3 Recommendations

12. Save Results