Apple Store App Data

Exploratory Analysis of App Ratings, Pricing, and Category Trends

Victor Shamanovsky

Sunday, August 31, 2025 at 06:27 PM EDT

🌙 Dark Mode

# Load all required libraries
library(readr)
library(dplyr)
library(ggplot2)
library(scales)
library(psych)
library(treemapify)
library(reshape2)

# Import the App Store dataset; show_col_types = FALSE silences readr's
# column-specification message
apple_data <- readr::read_csv(
  file = "apple_store_data.csv",
  show_col_types = FALSE
)

# List the column names of the imported table
colnames(apple_data)

## [1] "...1"              "id"                "app_name"         
## [4] "size_bytes"        "price"             "number_of_ratings"
## [7] "user_rating"       "genre"

1 Summary Statistics

# Per-column overview: min/quartiles/mean/max for numeric columns,
# length/class/mode for character columns
apple_data %>%
  summary()

##       ...1             id              app_name           size_bytes       
##  Min.   :    1   Min.   :2.817e+08   Length:7197        Min.   :5.898e+05  
##  1st Qu.: 2090   1st Qu.:6.001e+08   Class :character   1st Qu.:4.692e+07  
##  Median : 4380   Median :9.781e+08   Mode  :character   Median :9.715e+07  
##  Mean   : 4759   Mean   :8.631e+08                      Mean   :1.991e+08  
##  3rd Qu.: 7223   3rd Qu.:1.082e+09                      3rd Qu.:1.819e+08  
##  Max.   :11097   Max.   :1.188e+09                      Max.   :4.026e+09  
##      price         number_of_ratings  user_rating       genre          
##  Min.   :  0.000   Min.   :      0   Min.   :0.000   Length:7197       
##  1st Qu.:  0.000   1st Qu.:     28   1st Qu.:3.500   Class :character  
##  Median :  0.000   Median :    300   Median :4.000   Mode  :character  
##  Mean   :  1.726   Mean   :  12893   Mean   :3.527                     
##  3rd Qu.:  1.990   3rd Qu.:   2793   3rd Qu.:4.500                     
##  Max.   :299.990   Max.   :2974676   Max.   :5.000

2 Basic Visualizations

2.1 Boxplot (User Rating)

library(ggplot2)

# Boxplot of user ratings, grouped into five equal-width bins of the rating
# itself. The bins are computed once (in a temporary column that is not written
# back to apple_data) and shared by the x axis and the fill, so the two
# mappings cannot drift apart.
apple_data %>%
  mutate(rating_bin = cut(user_rating, breaks = 5)) %>%
  ggplot(aes(x = rating_bin, y = user_rating, fill = rating_bin)) +
  geom_boxplot() +
  scale_fill_manual(values = rainbow(5)) +
  labs(
    title = "User Ratings by Rating Tier",
    x = "Rating Tier",
    y = "User Rating",
    # Without an explicit label the legend is titled with the raw mapping
    # expression instead of a readable name
    fill = "Rating Tier"
  ) +
  theme_minimal()

2.2 Scatterplot (Price vs. Number of Ratings)

library(ggplot2)
library(scales)  # for label_comma()

# Price against rating volume, one point per app; point colour encodes the
# app's user rating on a seven-colour rainbow gradient
ggplot(data = apple_data) +
  geom_point(
    mapping = aes(x = price, y = number_of_ratings, colour = user_rating),
    size = 2,
    alpha = 0.6
  ) +
  scale_colour_gradientn(colours = rainbow(7)) +
  # Thousands separators on the y axis
  scale_y_continuous(labels = scales::label_comma()) +
  ggtitle("Price vs. Number of Ratings (Colored by User Rating)") +
  xlab("Price (USD)") +
  ylab("Number of Ratings") +
  labs(colour = "User Rating") +
  theme_minimal()

2.3 Histogram (App Size in MB)

library(dplyr)
library(ggplot2)

# Express app size in megabytes (bytes / 1024^2) so the axis is readable;
# the new column is stored on apple_data
apple_data <- mutate(apple_data, size_mb = size_bytes / 1024^2)

# 30-bin histogram of app size; each bar is shaded by its own count
ggplot(apple_data, aes(size_mb)) +
  geom_histogram(
    aes(fill = after_stat(count)),
    bins = 30,
    colour = "black"
  ) +
  scale_fill_gradientn(colours = rainbow(7)) +
  ggtitle("Distribution of App Size") +
  xlab("Size (MB)") +
  ylab("Count") +
  labs(fill = "Frequency") +
  theme_minimal()

2.4 Lollipop Chart (Top 10 Apps by Number of Ratings)

library(dplyr)
library(ggplot2)

# The ten most-rated apps. Factor levels are the reversed ranking so that,
# once the axes are flipped, the most-rated app is drawn at the top.
top_apps <- apple_data %>%
  arrange(desc(number_of_ratings)) %>%
  head(10) %>%
  mutate(app_name = factor(app_name, levels = rev(app_name)))

# Lollipop chart: a grey stem from zero to the rating count, capped by a
# coloured dot (the colour legend is hidden because the axis already names
# each app)
ggplot(top_apps, aes(x = app_name, y = number_of_ratings)) +
  geom_segment(
    aes(xend = app_name, y = 0, yend = number_of_ratings),
    colour = "gray70",
    linewidth = 1.2
  ) +
  geom_point(aes(colour = app_name), size = 5) +
  scale_colour_manual(values = rainbow(10)) +
  scale_y_continuous(labels = scales::label_comma()) +
  coord_flip() +
  ggtitle("Top 10 Apps by Number of Ratings") +
  xlab("App Name") +
  ylab("Number of Ratings") +
  labs(colour = "App") +
  theme_minimal() +
  theme(legend.position = "none")

2.5 Treemap of App Genres

library(dplyr)
library(ggplot2)
library(treemapify)

# Number of apps per genre, largest genre first
genre_counts <- apple_data %>%
  count(genre, name = "count", sort = TRUE)

# Treemap: tile area follows the genre's app count; one rainbow colour per
# genre, with the genre name fitted inside its tile
ggplot(genre_counts, aes(area = count, fill = genre, label = genre)) +
  geom_treemap() +
  geom_treemap_text(
    reflow = TRUE,
    grow = TRUE,
    place = "centre",
    colour = "white"
  ) +
  scale_fill_manual(values = rainbow(nrow(genre_counts))) +
  ggtitle("App Distribution by Genre") +
  labs(fill = "Genre") +
  theme_minimal()

2.6 Bar Plot of Top App Categories

library(dplyr)
library(ggplot2)

# The ten largest genres by app count. Levels are the reversed ranking so the
# largest genre ends up at the top after the axes are flipped.
top_genres <- apple_data %>%
  count(genre, name = "count", sort = TRUE) %>%
  head(10) %>%
  mutate(genre = factor(genre, levels = rev(genre)))

# Horizontal bar plot, one colour per genre (legend hidden; the axis labels
# already identify each bar)
ggplot(top_genres, aes(genre, count, fill = genre)) +
  geom_col() +
  scale_fill_manual(values = rainbow(10)) +
  coord_flip() +
  ggtitle("Top 10 App Categories by Count") +
  xlab("Category") +
  ylab("Number of Apps") +
  labs(fill = "Genre") +
  theme_minimal() +
  theme(legend.position = "none")

2.7 Correlation Heatmap

library(dplyr)
library(ggplot2)
library(reshape2)
library(RColorBrewer)

# Select the numeric measurement columns for correlation.
# Excluded on purpose:
#   - `...1` and `id`: numeric, but they appear to be a row index and an app
#     identifier rather than measurements (NOTE(review): confirm against the
#     CSV), so correlating them with anything is not meaningful
#   - `size_mb`: a unit conversion of `size_bytes`, which would only add a
#     duplicate row/column with correlation 1
# any_of() keeps this chunk working when one of these columns is absent.
numeric_data <- apple_data %>%
  select(where(is.numeric), -any_of(c("...1", "id", "size_mb")))

# Pairwise correlations over rows with no missing values, rounded to two
# decimals for the tile labels
cor_matrix <- round(cor(numeric_data, use = "complete.obs"), 2)

# Melt the matrix to long form (Var1, Var2, value) for ggplot
cor_melted <- melt(cor_matrix)

# Heatmap with the coefficient printed on each tile; the fill scale is pinned
# to the full [-1, 1] correlation range
ggplot(cor_melted, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradientn(colors = rainbow(7), limits = c(-1, 1)) +
  geom_text(aes(label = value), size = 4, color = "black") +
  labs(
    title = "Correlation Heatmap of Numeric Variables",
    x = "",
    y = "",
    fill = "Correlation"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

3 Category-Level Insights

3.1 Stacked Bar Chart (Genre by Rating Tier)

library(dplyr)
library(ggplot2)

# Bin user ratings into four ordered tiers:
# [0, 2] Low, (2, 3.5] Medium, (3.5, 4.5] High, (4.5, 5] Excellent
apple_data <- apple_data %>%
  mutate(
    rating_tier = cut(
      user_rating,
      breaks = c(0, 2, 3.5, 4.5, 5),
      labels = c("Low", "Medium", "High", "Excellent"),
      include.lowest = TRUE
    )
  )

# One row per (genre, tier) pair with its app count in `n`
genre_rating_counts <- count(apple_data, genre, rating_tier)

# Stacked bars: total bar height is the genre's app count, split by tier
ggplot(genre_rating_counts, aes(genre, n, fill = rating_tier)) +
  geom_col() +
  scale_fill_manual(
    values = rainbow(n_distinct(genre_rating_counts$rating_tier))
  ) +
  ggtitle("App Count by Genre and Rating Tier") +
  xlab("Genre") +
  ylab("Number of Apps") +
  labs(fill = "Rating Tier") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

3.2 Grouped Bar Chart (Average Rating by Genre and Tier)

library(dplyr)
library(ggplot2)

# Bin user ratings into four ordered tiers:
# [0, 2] Low, (2, 3.5] Medium, (3.5, 4.5] High, (4.5, 5] Excellent
apple_data$rating_tier <- cut(
  apple_data$user_rating,
  breaks = c(0, 2, 3.5, 4.5, 5),
  labels = c("Low", "Medium", "High", "Excellent"),
  include.lowest = TRUE
)

# Mean rating within every (genre, tier) pair; pairs without a tier are
# dropped afterwards
avg_rating_by_genre <- apple_data %>%
  group_by(genre, rating_tier) %>%
  summarise(
    avg_rating = mean(user_rating, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(!is.na(rating_tier))

# Genre becomes a factor whose levels are the reverse of first appearance
avg_rating_by_genre$genre <- factor(
  avg_rating_by_genre$genre,
  levels = rev(unique(avg_rating_by_genre$genre))
)

# Grouped (side-by-side) bars: one bar per tier within each genre
ggplot(avg_rating_by_genre, aes(genre, avg_rating, fill = rating_tier)) +
  geom_col(position = "dodge") +
  scale_fill_manual(
    values = rainbow(n_distinct(avg_rating_by_genre$rating_tier))
  ) +
  ggtitle("Average User Rating by Genre and Rating Tier") +
  xlab("Genre") +
  ylab("Average Rating") +
  labs(fill = "Rating Tier") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

4 Trend & Distribution Visuals

4.1 Density Plot of User Ratings

# Overlaid, semi-transparent rating densities, one curve and colour per genre
ggplot(apple_data, aes(user_rating)) +
  geom_density(aes(fill = genre), alpha = 0.6) +
  scale_fill_manual(values = rainbow(n_distinct(apple_data$genre))) +
  ggtitle("Density of User Ratings by Genre") +
  xlab("User Rating") +
  ylab("Density") +
  labs(fill = "Genre") +
  theme_minimal()

4.2 Violin Plot of User Ratings by Genre

library(ggplot2)

# Rating distribution per genre as violins. scale = "width" gives every
# violin the same maximum width; trim = FALSE lets the tails extend past the
# observed range.
ggplot(apple_data, aes(genre, user_rating)) +
  geom_violin(aes(fill = genre), scale = "width", trim = FALSE) +
  scale_fill_manual(values = rainbow(n_distinct(apple_data$genre))) +
  ggtitle("Distribution of User Ratings by Genre") +
  xlab("Genre") +
  ylab("User Rating") +
  labs(fill = "Genre") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

5 Relationship & Ranking Visuals

5.1 Bubble Chart (Price vs. Rating, Sized by Number of Ratings)

library(ggplot2)
library(scales)

# Bubble chart: position is price vs. rating, bubble size is the number of
# ratings (with thousands separators in the legend), colour is the genre
ggplot(apple_data, aes(price, user_rating)) +
  geom_point(
    aes(size = number_of_ratings, colour = genre),
    alpha = 0.6
  ) +
  scale_size_continuous(labels = scales::label_comma(), range = c(2, 12)) +
  scale_colour_manual(values = rainbow(n_distinct(apple_data$genre))) +
  ggtitle(
    "Bubble Chart: Price vs. User Rating (Bubble Size = Number of Ratings)"
  ) +
  xlab("Price (USD)") +
  ylab("User Rating") +
  labs(size = "Number of Ratings", colour = "Genre") +
  theme_minimal()

5.2 Pareto Chart of Rating Tiers

library(dplyr)
library(ggplot2)
library(scales)

# Bin user ratings into four ordered tiers:
# [0, 2] Low, (2, 3.5] Medium, (3.5, 4.5] High, (4.5, 5] Excellent
apple_data$rating_tier <- cut(
  apple_data$user_rating,
  breaks = c(0, 2, 3.5, 4.5, 5),
  labels = c("Low", "Medium", "High", "Excellent"),
  include.lowest = TRUE
)

# Tier counts from largest to smallest, with the running share of all apps.
# Re-levelling the factor in row order keeps the bars in descending order.
pareto_ratings <- apple_data %>%
  count(rating_tier, name = "count", sort = TRUE) %>%
  mutate(
    cum_pct = cumsum(count) / sum(count),
    rating_tier = factor(rating_tier, levels = rating_tier)
  )

# Pareto chart: bars show counts; the dark red line shows the cumulative
# share, stretched by the largest count so it fits the primary axis. The
# secondary axis divides by that same largest count to read back as a percent.
ggplot(pareto_ratings, aes(rating_tier)) +
  geom_col(aes(y = count, fill = rating_tier)) +
  geom_line(
    aes(y = cum_pct * max(count)),
    group = 1,
    colour = "darkred",
    linewidth = 1.2
  ) +
  geom_point(aes(y = cum_pct * max(count)), colour = "darkred", size = 2) +
  scale_fill_manual(
    values = rainbow(n_distinct(pareto_ratings$rating_tier))
  ) +
  scale_y_continuous(
    name = "App Count",
    sec.axis = sec_axis(
      ~ . / max(pareto_ratings$count),
      name = "Cumulative %",
      labels = percent
    )
  ) +
  ggtitle("Pareto Chart of App Ratings") +
  xlab("Rating Tier") +
  labs(fill = "Rating Tier") +
  theme_minimal()

6 Advanced & Interactive Ideas

6.1 Faceted Scatterplot by Genre

library(ggplot2)

# Price vs. rating volume in one panel per genre; point colour encodes the
# user rating on a seven-colour rainbow gradient
ggplot(apple_data, aes(price, number_of_ratings)) +
  geom_point(aes(colour = user_rating), size = 1.8, alpha = 0.6) +
  scale_colour_gradientn(colours = rainbow(7)) +
  facet_wrap(vars(genre)) +
  ggtitle("Price vs. Number of Ratings by Genre") +
  xlab("Price (USD)") +
  ylab("Number of Ratings") +
  labs(colour = "User Rating") +
  theme_minimal()

6.2 Faceted Histogram of App Size by Rating Tier

library(dplyr)
library(ggplot2)

# Bin user ratings into four ordered tiers:
# [0, 2] Low, (2, 3.5] Medium, (3.5, 4.5] High, (4.5, 5] Excellent
apple_data$rating_tier <- cut(
  apple_data$user_rating,
  breaks = c(0, 2, 3.5, 4.5, 5),
  labels = c("Low", "Medium", "High", "Excellent"),
  include.lowest = TRUE
)

# App size in megabytes (bytes / 1024^2)
apple_data$size_mb <- apple_data$size_bytes / 1024^2

# 30-bin size histogram, drawn in a separate panel (and colour) per tier
ggplot(apple_data, aes(size_mb)) +
  geom_histogram(aes(fill = rating_tier), bins = 30, colour = "black") +
  scale_fill_manual(values = rainbow(4)) +
  facet_wrap(vars(rating_tier)) +
  ggtitle("App Size Distribution by Rating Tier") +
  xlab("Size (MB)") +
  ylab("Count") +
  labs(fill = "Rating Tier") +
  theme_minimal()

7 Executive KPIs

# Load libraries
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(plotly))

# Headline figures: row count, plus mean rating and mean price rounded to
# two decimals (missing values ignored)
total_apps <- nrow(apple_data)
avg_rating <- apple_data %>%
  pull(user_rating) %>%
  mean(na.rm = TRUE) %>%
  round(2)
avg_price <- apple_data %>%
  pull(price) %>%
  mean(na.rm = TRUE) %>%
  round(2)

# Genre with the most apps
top_category <- apple_data %>%
  count(genre, sort = TRUE) %>%
  slice_head(n = 1) %>%
  pull(genre)

# Fraction of all apps that belong to that genre
top_category_share <- apple_data %>%
  count(genre) %>%
  transmute(genre, share = n / sum(n)) %>%
  filter(genre == top_category) %>%
  pull(share)

# Narrative for the average rating: below 3.5 is low, below 4.5 is moderate,
# anything else is excellent
rating_comment <- if (avg_rating < 3.5) {
  "User satisfaction is low—consider investigating app quality."
} else if (avg_rating < 4.5) {
  "User satisfaction is moderate."
} else {
  "User satisfaction is excellent."
}

# Narrative for the average price: anything above 5 is flagged as high
price_comment <- if (avg_price <= 5) {
  "Pricing is within expected range."
} else {
  "Average price is high—review pricing strategy."
}

# Sentence stating the top genre's share as a percentage with one decimal
category_comment <- paste0(
  "Top category '",
  top_category,
  "' accounts for ",
  round(top_category_share * 100, 1),
  "% of all apps."
)

# Build a single KPI "value box" as a ggplot object.
#
# title:      caption drawn in the upper half of the box
# value:      headline figure drawn in the lower half of the box
# fill_color: background colour of the box
#
# Returns a ggplot with a 4 x 2 white-bordered rectangle, both texts in bold
# white centred horizontally, and no axes or background (theme_void).
kpi_box <- function(title, value, fill_color) {
  tile <- geom_rect(
    aes(xmin = 0, xmax = 4, ymin = 0, ymax = 2),
    fill = fill_color,
    color = "white"
  )
  caption <- annotate(
    "text",
    x = 2, y = 1.4,
    label = title,
    size = 5.5, fontface = "bold", color = "white"
  )
  figure <- annotate(
    "text",
    x = 2, y = 0.6,
    label = value,
    size = 6.5, fontface = "bold", color = "white"
  )

  ggplot() + tile + caption + figure + theme_void()
}

# One value box per KPI, each with its own background colour
box1 <- kpi_box(
  title = "Total Apps",
  value = format(total_apps, big.mark = ","),
  fill_color = "#1E88E5"
)
box2 <- kpi_box(
  title = "Avg Rating",
  value = avg_rating,
  fill_color = "#43A047"
)
box3 <- kpi_box(
  title = "Top Category",
  value = top_category,
  fill_color = "#FB8C00"
)
box4 <- kpi_box(
  title = "Avg Price",
  value = paste0("$", avg_price),
  fill_color = "#8E24AA"
)

# Lay the four boxes out side by side in a single row
grid.arrange(grobs = list(box1, box2, box3, box4), ncol = 4)

# HTML summary used both as the visible label and as the hover text: the
# individual lines are joined with <br> line breaks
tooltip_text <- paste(
  c(
    "<b>📊 KPI Summary</b>",
    paste0("Total Apps: <b>", format(total_apps, big.mark = ","), "</b>"),
    paste0("Avg Rating: <b>", avg_rating, "</b>"),
    paste0("Top Category: <b>", top_category, "</b>"),
    paste0("Avg Price: <b>$", avg_price, "</b>"),
    paste0("<i>", rating_comment, "</i>"),
    paste0("<i>", price_comment, "</i>"),
    paste0("<i>", category_comment, "</i>")
  ),
  collapse = "<br>"
)

# Interactive KPI panel: a single text-mode point at (1, 1) carrying the
# summary, on a solid blue canvas with grid lines, zero lines and tick labels
# hidden on both axes
plotly_kpi <- plotly::layout(
  plot_ly(
    x = 1,
    y = 1,
    type = "scatter",
    mode = "text",
    text = tooltip_text,       # summary rendered directly on the canvas
    hovertext = tooltip_text,  # same summary shown on hover
    hoverinfo = "text",
    textfont = list(size = 16, color = "white"),
    hoverlabel = list(
      bgcolor = "#263238",
      font = list(color = "white", size = 14)
    )
  ),
  paper_bgcolor = "#1E88E5",
  plot_bgcolor = "#1E88E5",
  xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
  yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
  margin = list(t = 40, b = 40)
)

# Display the interactive KPI panel
plotly_kpi