# Victor Shamanovsky
# Sunday, August 31, 2025 at 06:27 PM EDT
# Load all required libraries
library(readr)
library(dplyr)
library(ggplot2)
library(scales)
library(psych)
library(treemapify)
library(reshape2)
# Read the CSV into `apple_data`; show_col_types = FALSE keeps readr from
# printing the guessed column specification.
apple_data <- read_csv("apple_store_data.csv", show_col_types = FALSE)
# Show data columns. The call and its console output are kept on separate
# lines so the statement parses; output lines stay as `##` comments.
names(apple_data)
## [1] "...1" "id" "app_name"
## [4] "size_bytes" "price" "number_of_ratings"
## [7] "user_rating" "genre"
## ...1 id app_name size_bytes
## Min. : 1 Min. :2.817e+08 Length:7197 Min. :5.898e+05
## 1st Qu.: 2090 1st Qu.:6.001e+08 Class :character 1st Qu.:4.692e+07
## Median : 4380 Median :9.781e+08 Mode :character Median :9.715e+07
## Mean : 4759 Mean :8.631e+08 Mean :1.991e+08
## 3rd Qu.: 7223 3rd Qu.:1.082e+09 3rd Qu.:1.819e+08
## Max. :11097 Max. :1.188e+09 Max. :4.026e+09
## price number_of_ratings user_rating genre
## Min. : 0.000 Min. : 0 Min. :0.000 Length:7197
## 1st Qu.: 0.000 1st Qu.: 28 1st Qu.:3.500 Class :character
## Median : 0.000 Median : 300 Median :4.000 Mode :character
## Mean : 1.726 Mean : 12893 Mean :3.527
## 3rd Qu.: 1.990 3rd Qu.: 2793 3rd Qu.:4.500
## Max. :299.990 Max. :2974676 Max. :5.000
library(ggplot2)
# Boxplot of user ratings grouped into five equal-width bins of the rating
# range (cut() with breaks = 5). The bin factor is computed once and shared by
# the x and fill aesthetics instead of calling cut() twice.
apple_data %>%
  mutate(rating_bin = cut(user_rating, breaks = 5)) %>%
  ggplot(aes(x = rating_bin, y = user_rating, fill = rating_bin)) +
  geom_boxplot() +
  scale_fill_manual(values = rainbow(5)) +
  labs(
    title = "User Ratings by Rating Tier",
    x = "Rating Tier",
    y = "User Rating",
    # Explicit legend title; otherwise the mapped expression is shown verbatim.
    fill = "Rating Tier"
  ) +
  theme_minimal()
library(ggplot2)
library(scales) # for label_comma()
# Scatter plot of price against number of ratings. Point colour encodes the
# user rating on a seven-colour rainbow gradient; the y axis uses
# comma-separated labels.
ggplot(apple_data, aes(x = price, y = number_of_ratings, color = user_rating)) +
  geom_point(alpha = 0.6, size = 2) +
  scale_color_gradientn(colors = rainbow(7)) +
  scale_y_continuous(labels = label_comma()) +
  labs(
    title = "Price vs. Number of Ratings (Colored by User Rating)",
    x = "Price (USD)",
    y = "Number of Ratings",
    color = "User Rating"
  ) +
  theme_minimal()
library(dplyr)
library(ggplot2)
# Convert size from bytes to megabytes for readability
apple_data <- apple_data %>%
  mutate(size_mb = size_bytes / (1024^2))
# Histogram of app size (30 bins). after_stat(count) fills each bar by its own
# bin count, so taller bars sit further along the rainbow gradient.
ggplot(apple_data, aes(x = size_mb, fill = after_stat(count))) +
  geom_histogram(bins = 30, color = "black") +
  scale_fill_gradientn(colors = rainbow(7)) +
  labs(
    title = "Distribution of App Size",
    x = "Size (MB)",
    y = "Count",
    fill = "Frequency"
  ) +
  theme_minimal()
library(dplyr)
library(ggplot2)
# Prepare top 10 apps by number of ratings
top_apps <- apple_data %>%
  arrange(desc(number_of_ratings)) %>%
  slice_head(n = 10) %>%
  # Reverse the level order so the most-rated app ends up at the top after
  # coord_flip(). unique() guards against factor() failing on duplicate levels
  # when two of the selected apps share a name.
  mutate(app_name = factor(app_name, levels = rev(unique(app_name))))
# Lollipop chart: a grey stem from 0 to the rating count plus a coloured dot.
ggplot(top_apps, aes(x = app_name, y = number_of_ratings)) +
  # x is inherited from the plot-level mapping; only the stem's start, end and
  # xend need to be given here.
  geom_segment(
    aes(xend = app_name, y = 0, yend = number_of_ratings),
    color = "gray70", linewidth = 1.2
  ) +
  geom_point(aes(color = app_name), size = 5) +
  scale_color_manual(values = rainbow(10)) +
  scale_y_continuous(labels = scales::label_comma()) +
  coord_flip() +
  labs(
    title = "Top 10 Apps by Number of Ratings",
    x = "App Name",
    y = "Number of Ratings",
    color = "App"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
library(dplyr)
library(ggplot2)
library(treemapify)
# Count apps by genre, largest genre first
genre_counts <- apple_data %>%
  count(genre, name = "count") %>%
  arrange(desc(count))
# Treemap: tile area is proportional to the number of apps in the genre, with
# one rainbow colour per genre row.
ggplot(genre_counts, aes(area = count, fill = genre, label = genre)) +
  geom_treemap() +
  geom_treemap_text(
    colour = "white",
    place = "centre",
    grow = TRUE,
    reflow = TRUE
  ) +
  scale_fill_manual(values = rainbow(nrow(genre_counts))) +
  labs(
    title = "App Distribution by Genre",
    fill = "Genre"
  ) +
  theme_minimal()
library(dplyr)
library(ggplot2)
# Count apps by genre and select top 10
top_genres <- apple_data %>%
  count(genre, name = "count") %>%
  arrange(desc(count)) %>%
  slice_head(n = 10) %>%
  # Reverse the level order so the largest genre is drawn at the top after
  # coord_flip().
  mutate(genre = factor(genre, levels = rev(genre)))
# Horizontal bar plot; geom_col() draws bars whose height is the y value.
ggplot(top_genres, aes(x = genre, y = count, fill = genre)) +
  geom_col() +
  scale_fill_manual(values = rainbow(10)) +
  coord_flip() +
  labs(
    title = "Top 10 App Categories by Count",
    x = "Category",
    y = "Number of Apps",
    fill = "Genre"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
library(dplyr)
library(ggplot2)
library(reshape2)
library(RColorBrewer)
# Select numeric columns for correlation.
# `...1` (the unnamed first CSV column) and `id` are identifiers, and `size_mb`
# is just `size_bytes` rescaled, so they are left out: they would add
# meaningless or trivially perfect correlations. any_of() ignores names that
# are not present in the data.
numeric_data <- apple_data %>%
  select(where(is.numeric), -any_of(c("...1", "id", "size_mb")))
# Compute correlation matrix on rows with no missing values, rounded for display
cor_matrix <- round(cor(numeric_data, use = "complete.obs"), 2)
# Melt the matrix into long form (Var1, Var2, value) for ggplot
cor_melted <- melt(cor_matrix)
# Heatmap with the coefficient printed in each tile
ggplot(cor_melted, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradientn(colors = rainbow(7), limits = c(-1, 1)) +
  geom_text(aes(label = value), size = 4, color = "black") +
  labs(
    title = "Correlation Heatmap of Numeric Variables",
    x = "",
    y = "",
    fill = "Correlation"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(dplyr)
library(ggplot2)
# Create rating tiers: [0, 2] Low, (2, 3.5] Medium, (3.5, 4.5] High,
# (4.5, 5] Excellent. include.lowest = TRUE keeps ratings of exactly 0 in "Low".
apple_data <- apple_data %>%
  mutate(rating_tier = cut(
    user_rating,
    breaks = c(0, 2, 3.5, 4.5, 5),
    labels = c("Low", "Medium", "High", "Excellent"),
    include.lowest = TRUE
  ))
# Count apps by genre and rating tier
genre_rating_counts <- apple_data %>%
  count(genre, rating_tier)
# Stacked bar chart: one bar per genre, segments coloured by rating tier
ggplot(genre_rating_counts, aes(x = genre, y = n, fill = rating_tier)) +
  geom_col() +
  scale_fill_manual(
    values = rainbow(length(unique(genre_rating_counts$rating_tier)))
  ) +
  labs(
    title = "App Count by Genre and Rating Tier",
    x = "Genre",
    y = "Number of Apps",
    fill = "Rating Tier"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(dplyr)
library(ggplot2)
# Create rating tiers: [0, 2] Low, (2, 3.5] Medium, (3.5, 4.5] High,
# (4.5, 5] Excellent.
apple_data <- apple_data %>%
  mutate(rating_tier = cut(
    user_rating,
    breaks = c(0, 2, 3.5, 4.5, 5),
    labels = c("Low", "Medium", "High", "Excellent"),
    include.lowest = TRUE
  ))
# Calculate average rating per genre and tier
avg_rating_by_genre <- apple_data %>%
  group_by(genre, rating_tier) %>%
  summarise(avg_rating = mean(user_rating, na.rm = TRUE), .groups = "drop") %>%
  # Drop groups whose rating fell outside every tier
  filter(!is.na(rating_tier)) %>%
  mutate(genre = factor(genre, levels = rev(unique(genre))))
# Grouped bar chart: tiers side by side within each genre
ggplot(avg_rating_by_genre, aes(x = genre, y = avg_rating, fill = rating_tier)) +
  geom_col(position = "dodge") +
  scale_fill_manual(
    values = rainbow(length(unique(avg_rating_by_genre$rating_tier)))
  ) +
  labs(
    title = "Average User Rating by Genre and Rating Tier",
    x = "Genre",
    y = "Average Rating",
    fill = "Rating Tier"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Overlaid density curves of user rating, one semi-transparent curve per genre
ggplot(apple_data, aes(x = user_rating, fill = genre)) +
  geom_density(alpha = 0.6) +
  scale_fill_manual(values = rainbow(length(unique(apple_data$genre)))) +
  labs(
    title = "Density of User Ratings by Genre",
    x = "User Rating",
    y = "Density",
    fill = "Genre"
  ) +
  theme_minimal()
library(ggplot2)
# Violin plot of user ratings per genre. scale = "width" gives every violin the
# same maximum width; trim = FALSE lets the tails extend past the data range.
ggplot(apple_data, aes(x = genre, y = user_rating, fill = genre)) +
  geom_violin(trim = FALSE, scale = "width") +
  scale_fill_manual(values = rainbow(length(unique(apple_data$genre)))) +
  labs(
    title = "Distribution of User Ratings by Genre",
    x = "Genre",
    y = "User Rating",
    fill = "Genre"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(ggplot2)
library(scales)
# Bubble chart: price against user rating, bubble size scaled by number of
# ratings and colour by genre.
ggplot(
  apple_data,
  aes(x = price, y = user_rating, size = number_of_ratings, color = genre)
) +
  geom_point(alpha = 0.6) +
  scale_size_continuous(range = c(2, 12), labels = label_comma()) +
  scale_color_manual(values = rainbow(length(unique(apple_data$genre)))) +
  labs(
    title = "Bubble Chart: Price vs. User Rating (Bubble Size = Number of Ratings)",
    x = "Price (USD)",
    y = "User Rating",
    size = "Number of Ratings",
    color = "Genre"
  ) +
  theme_minimal()
library(dplyr)
library(ggplot2)
library(scales)
# Create rating tiers: [0, 2] Low, (2, 3.5] Medium, (3.5, 4.5] High,
# (4.5, 5] Excellent.
apple_data <- apple_data %>%
  mutate(rating_tier = cut(
    user_rating,
    breaks = c(0, 2, 3.5, 4.5, 5),
    labels = c("Low", "Medium", "High", "Excellent"),
    include.lowest = TRUE
  ))
# Count apps by rating tier and calculate cumulative percentage
pareto_ratings <- apple_data %>%
  count(rating_tier, name = "count") %>%
  arrange(desc(count)) %>%
  mutate(
    cum_pct = cumsum(count) / sum(count),
    # Re-level in descending-count order so the bars are drawn in that order
    rating_tier = factor(rating_tier, levels = rating_tier)
  )
# Pareto chart. The cumulative share is multiplied by max(count) to draw it on
# the count axis; the secondary axis divides by the same factor so it reads as
# a percentage.
ggplot(pareto_ratings, aes(x = rating_tier)) +
  geom_col(aes(y = count, fill = rating_tier)) +
  geom_line(
    aes(y = cum_pct * max(count)),
    group = 1, color = "darkred", linewidth = 1.2
  ) +
  geom_point(aes(y = cum_pct * max(count)), color = "darkred", size = 2) +
  scale_fill_manual(
    values = rainbow(length(unique(pareto_ratings$rating_tier)))
  ) +
  scale_y_continuous(
    name = "App Count",
    sec.axis = sec_axis(
      ~ . / max(pareto_ratings$count),
      name = "Cumulative %",
      labels = percent
    )
  ) +
  labs(
    title = "Pareto Chart of App Ratings",
    x = "Rating Tier",
    fill = "Rating Tier"
  ) +
  theme_minimal()
library(ggplot2)
# Price vs. number of ratings, one facet panel per genre; point colour encodes
# the user rating.
ggplot(apple_data, aes(x = price, y = number_of_ratings, color = user_rating)) +
  geom_point(alpha = 0.6, size = 1.8) +
  scale_color_gradientn(colors = rainbow(7)) +
  facet_wrap(~ genre) +
  labs(
    title = "Price vs. Number of Ratings by Genre",
    x = "Price (USD)",
    y = "Number of Ratings",
    color = "User Rating"
  ) +
  theme_minimal()
library(dplyr)
library(ggplot2)
# Derive the two helper columns in one pass: the rating tier
# ([0, 2] Low, (2, 3.5] Medium, (3.5, 4.5] High, (4.5, 5] Excellent) and the
# app size converted from bytes to megabytes.
apple_data <- mutate(
  apple_data,
  rating_tier = cut(
    user_rating,
    breaks = c(0, 2, 3.5, 4.5, 5),
    labels = c("Low", "Medium", "High", "Excellent"),
    include.lowest = TRUE
  ),
  size_mb = size_bytes / (1024^2)
)
# One 30-bin size histogram per rating tier, each panel in its own colour
ggplot(apple_data, aes(x = size_mb, fill = rating_tier)) +
  geom_histogram(bins = 30, color = "black") +
  scale_fill_manual(values = rainbow(4)) +
  facet_wrap(~ rating_tier) +
  labs(
    title = "App Size Distribution by Rating Tier",
    x = "Size (MB)",
    y = "Count",
    fill = "Rating Tier"
  ) +
  theme_minimal()
# Load libraries
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(plotly))
# Compute KPIs
total_apps <- nrow(apple_data)
avg_rating <- round(mean(apple_data$user_rating, na.rm = TRUE), 2)
avg_price <- round(mean(apple_data$price, na.rm = TRUE), 2)
# Tabulate genres once and keep the row with the most apps. Both the top
# category and its share are read from that single row, so the share is still
# found when the top genre is NA (a `genre == top_category` lookup would match
# nothing in that case).
top_genre_row <- apple_data %>%
  count(genre) %>%
  mutate(share = n / sum(n)) %>%
  arrange(desc(n)) %>%
  slice(1)
top_category <- pull(top_genre_row, genre)
top_category_share <- pull(top_genre_row, share)
# Conditional insights (thresholds are applied to the rounded averages)
rating_comment <- if (avg_rating >= 4.5) {
  "User satisfaction is excellent."
} else if (avg_rating >= 3.5) {
  "User satisfaction is moderate."
} else {
  "User satisfaction is low—consider investigating app quality."
}
price_comment <- if (avg_price > 5) {
  "Average price is high—review pricing strategy."
} else {
  "Pricing is within expected range."
}
category_comment <- paste0(
  "Top category '", top_category, "' accounts for ",
  round(top_category_share * 100, 1), "% of all apps."
)
#' Build a single KPI value box
#'
#' Draws a 4 x 2 filled rectangle with a white border and two centred, bold,
#' white text labels on a blank (`theme_void()`) canvas.
#'
#' @param title Label drawn in the upper half of the box.
#' @param value Label drawn in the lower half of the box.
#' @param fill_color Fill colour of the rectangle.
#' @return A ggplot object.
kpi_box <- function(title, value, fill_color) {
  background <- geom_rect(
    aes(xmin = 0, xmax = 4, ymin = 0, ymax = 2),
    fill = fill_color,
    color = "white"
  )
  title_label <- annotate(
    "text",
    x = 2, y = 1.4, label = title,
    size = 5.5, fontface = "bold", color = "white"
  )
  value_label <- annotate(
    "text",
    x = 2, y = 0.6, label = value,
    size = 6.5, fontface = "bold", color = "white"
  )
  ggplot() + background + title_label + value_label + theme_void()
}
# One value box per KPI, each with its own background colour
box1 <- kpi_box(
  title = "Total Apps",
  value = format(total_apps, big.mark = ","),
  fill_color = "#1E88E5"
)
box2 <- kpi_box(title = "Avg Rating", value = avg_rating, fill_color = "#43A047")
box3 <- kpi_box(title = "Top Category", value = top_category, fill_color = "#FB8C00")
box4 <- kpi_box(
  title = "Avg Price",
  value = paste0("$", avg_price),
  fill_color = "#8E24AA"
)
# Lay the four boxes out side by side in a single row
grid.arrange(box1, box2, box3, box4, ncol = 4)
# Interactive KPI panel with visible and hoverable summary
# HTML-formatted KPI summary; the same string is used as the visible label and
# as the hover text of the trace below.
tooltip_text <- paste0(
  "<b>📊 KPI Summary</b><br>",
  "Total Apps: <b>", format(total_apps, big.mark = ","), "</b><br>",
  "Avg Rating: <b>", avg_rating, "</b><br>",
  "Top Category: <b>", top_category, "</b><br>",
  "Avg Price: <b>$", avg_price, "</b><br>",
  "<i>", rating_comment, "</i><br>",
  "<i>", price_comment, "</i><br>",
  "<i>", category_comment, "</i>"
)
# A single text-mode scatter point at (1, 1) carries the summary; the layout
# paints the background blue and hides grid lines, zero lines and tick labels
# on both axes.
plotly_kpi <- layout(
  plot_ly(
    type = "scatter",
    mode = "text",
    x = 1,
    y = 1,
    text = tooltip_text,
    hoverinfo = "text",
    hovertext = tooltip_text,
    textfont = list(size = 16, color = "white"),
    hoverlabel = list(
      bgcolor = "#263238",
      font = list(color = "white", size = 14)
    )
  ),
  paper_bgcolor = "#1E88E5",
  plot_bgcolor = "#1E88E5",
  xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
  yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
  margin = list(t = 40, b = 40)
)
# Display interactive KPI (auto-prints the plotly widget at top level)
plotly_kpi

# Report generated by Victor Shamanovsky
# Apple Store App Data Analysis
# © 2025 All rights reserved.