With millions of apps available today, the following data set has become key to understanding app performance across the different categories of the Apple iOS App Store. The data set contains details of nearly 7,200 mobile apps; it was collected in July 2017 and extracted from the iTunes Search API on the Apple Inc. website.
Besides building an understanding of app performance, this analysis aims to investigate the relationship between app details and user rating.
# Import libraries
library(ggplot2)
library(tidyr)
library(scales)
library(RColorBrewer)# Read data
# Load both input CSV files; the paths are relative to the working directory
apps <- read.csv("data_input/AppleStore.csv")
appd <- read.csv("data_input/appleStore_description.csv")
There are two CSV files, Apple Store and Apple Store Description, and this section is divided accordingly.
This data frame contains each application's ID, size (in bytes), currency, price, rating counts (all and current versions), user rating value (all and current versions), latest version code, content rating, app genre or category, number of supported devices, number of screenshots shown for display, number of supported languages, and whether VPP licensing was enabled.
# Dimensions (rows, columns) of the Apple Store data frame
dim(apps)
#> [1] 7197 17
# Column names
names(apps)
#> [1] "X" "id" "track_name" "size_bytes"
#> [5] "currency" "price" "rating_count_tot" "rating_count_ver"
#> [9] "user_rating" "user_rating_ver" "ver" "cont_rating"
#> [13] "prime_genre" "sup_devices.num" "ipadSc_urls.num" "lang.num"
#> [17] "vpp_lic"
# Preview the first and last rows (each call on its own line so the
# script parses)
head(apps)
tail(apps)
# Check missing values
anyNA(apps)
#> [1] FALSE
This data frame contains the ID, memory size (in Bytes) and description of each application.
# Dimensions (rows, columns) of the description data frame
dim(appd)
#> [1] 7197 4
# Column names
names(appd)
#> [1] "id" "track_name" "size_bytes" "app_desc"
# Preview the first and last rows (each call on its own line so the
# script parses)
head(appd)
tail(appd)
# Check missing values
anyNA(appd)
#> [1] FALSE
For the purpose of this analysis, only Apple Store data will be used as the other contains no additional information aside from the descriptions of all 7197 apps listed on the App Store.
# Check data structure
# str() prints one line per column with its type and first few values
str(apps)
#> 'data.frame': 7197 obs. of 17 variables:
#> $ X : int 1 2 3 4 5 6 7 8 9 10 ...
#> $ id : int 281656475 281796108 281940292 282614216 282935706 283619399 283646709 284035177 284666222 284736660 ...
#> $ track_name : chr "PAC-MAN Premium" "Evernote - stay organized" "WeatherBug - Local Weather, Radar, Maps, Alerts" "eBay: Best App to Buy, Sell, Save! Online Shopping" ...
#> $ size_bytes : num 1.01e+08 1.59e+08 1.01e+08 1.29e+08 9.28e+07 ...
#> $ currency : chr "USD" "USD" "USD" "USD" ...
#> $ price : num 3.99 0 0 0 0 0.99 0 0 9.99 3.99 ...
#> $ rating_count_tot: int 21292 161065 188583 262241 985920 8253 119487 1126879 1117 7885 ...
#> $ rating_count_ver: int 26 26 2822 649 5320 5516 879 3594 4 40 ...
#> $ user_rating : num 4 4 3.5 4 4.5 4 4 4 4.5 4 ...
#> $ user_rating_ver : num 4.5 3.5 4.5 4.5 5 4 4.5 4.5 5 4 ...
#> $ ver : chr "6.3.5" "8.2.2" "5.0.0" "5.10.0" ...
#> $ cont_rating : chr "4+" "4+" "4+" "12+" ...
#> $ prime_genre : chr "Games" "Productivity" "Weather" "Shopping" ...
#> $ sup_devices.num : int 38 37 37 37 37 47 37 37 37 38 ...
#> $ ipadSc_urls.num : int 5 5 5 5 5 5 0 4 5 0 ...
#> $ lang.num : int 10 23 3 9 45 1 19 1 1 10 ...
#> $ vpp_lic : int 1 1 1 1 1 1 1 1 1 1 ...
# Recast columns: the two identifier columns become character strings and
# the categorical columns become factors
id_cols <- c("X", "id")
factor_cols <- c(
  "currency", "user_rating", "user_rating_ver",
  "cont_rating", "prime_genre", "vpp_lic"
)
apps[id_cols] <- lapply(apps[id_cols], as.character)
apps[factor_cols] <- lapply(apps[factor_cols], as.factor)
str(apps)
#> 'data.frame': 7197 obs. of 17 variables:
#> $ X : chr "1" "2" "3" "4" ...
#> $ id : chr "281656475" "281796108" "281940292" "282614216" ...
#> $ track_name : chr "PAC-MAN Premium" "Evernote - stay organized" "WeatherBug - Local Weather, Radar, Maps, Alerts" "eBay: Best App to Buy, Sell, Save! Online Shopping" ...
#> $ size_bytes : num 1.01e+08 1.59e+08 1.01e+08 1.29e+08 9.28e+07 ...
#> $ currency : Factor w/ 1 level "USD": 1 1 1 1 1 1 1 1 1 1 ...
#> $ price : num 3.99 0 0 0 0 0.99 0 0 9.99 3.99 ...
#> $ rating_count_tot: int 21292 161065 188583 262241 985920 8253 119487 1126879 1117 7885 ...
#> $ rating_count_ver: int 26 26 2822 649 5320 5516 879 3594 4 40 ...
#> $ user_rating : Factor w/ 10 levels "0","1","1.5",..: 8 8 7 8 9 8 8 8 9 8 ...
#> $ user_rating_ver : Factor w/ 10 levels "0","1","1.5",..: 9 7 9 9 10 8 9 9 10 8 ...
#> $ ver : chr "6.3.5" "8.2.2" "5.0.0" "5.10.0" ...
#> $ cont_rating : Factor w/ 4 levels "12+","17+","4+",..: 3 3 3 1 3 3 3 1 3 3 ...
#> $ prime_genre : Factor w/ 23 levels "Book","Business",..: 8 16 23 18 17 8 6 12 22 8 ...
#> $ sup_devices.num : int 38 37 37 37 37 47 37 37 37 38 ...
#> $ ipadSc_urls.num : int 5 5 5 5 5 5 0 4 5 0 ...
#> $ lang.num : int 10 23 3 9 45 1 19 1 1 10 ...
#> $ vpp_lic : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
Analysing app size in Bytes might be confusing and so, converting it to MB may be more convenient.
# Add the app size in megabytes, taking 1 MB as 1e6 bytes
apps[["size_mb"]] <- apps[["size_bytes"]] / 1e6
str(apps)
#> 'data.frame': 7197 obs. of 18 variables:
#> $ X : chr "1" "2" "3" "4" ...
#> $ id : chr "281656475" "281796108" "281940292" "282614216" ...
#> $ track_name : chr "PAC-MAN Premium" "Evernote - stay organized" "WeatherBug - Local Weather, Radar, Maps, Alerts" "eBay: Best App to Buy, Sell, Save! Online Shopping" ...
#> $ size_bytes : num 1.01e+08 1.59e+08 1.01e+08 1.29e+08 9.28e+07 ...
#> $ currency : Factor w/ 1 level "USD": 1 1 1 1 1 1 1 1 1 1 ...
#> $ price : num 3.99 0 0 0 0 0.99 0 0 9.99 3.99 ...
#> $ rating_count_tot: int 21292 161065 188583 262241 985920 8253 119487 1126879 1117 7885 ...
#> $ rating_count_ver: int 26 26 2822 649 5320 5516 879 3594 4 40 ...
#> $ user_rating : Factor w/ 10 levels "0","1","1.5",..: 8 8 7 8 9 8 8 8 9 8 ...
#> $ user_rating_ver : Factor w/ 10 levels "0","1","1.5",..: 9 7 9 9 10 8 9 9 10 8 ...
#> $ ver : chr "6.3.5" "8.2.2" "5.0.0" "5.10.0" ...
#> $ cont_rating : Factor w/ 4 levels "12+","17+","4+",..: 3 3 3 1 3 3 3 1 3 3 ...
#> $ prime_genre : Factor w/ 23 levels "Book","Business",..: 8 16 23 18 17 8 6 12 22 8 ...
#> $ sup_devices.num : int 38 37 37 37 37 47 37 37 37 38 ...
#> $ ipadSc_urls.num : int 5 5 5 5 5 5 0 4 5 0 ...
#> $ lang.num : int 10 23 3 9 45 1 19 1 1 10 ...
#> $ vpp_lic : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
#> $ size_mb : num 100.8 158.6 100.5 128.5 92.8 ...
It may also be more convenient to be able to quickly see whether the app was free or paid.
# Label each app FREE when its price is exactly 0 and PAID otherwise.
# The levels are spelled out so both always exist, in this order, even if
# a subset of the data happens to contain only one price type.
apps$price_type <- factor(
  ifelse(apps$price == 0, "FREE", "PAID"),
  levels = c("FREE", "PAID")
)
# Preview and summarise the enriched data frame (each call on its own line
# so the script parses)
head(apps)
summary(apps)
#> X id track_name size_bytes
#> Length:7197 Length:7197 Length:7197 Min. :5.898e+05
#> Class :character Class :character Class :character 1st Qu.:4.692e+07
#> Mode :character Mode :character Mode :character Median :9.715e+07
#> Mean :1.991e+08
#> 3rd Qu.:1.819e+08
#> Max. :4.026e+09
#>
#> currency price rating_count_tot rating_count_ver
#> USD:7197 Min. : 0.000 Min. : 0 Min. : 0.0
#> 1st Qu.: 0.000 1st Qu.: 28 1st Qu.: 1.0
#> Median : 0.000 Median : 300 Median : 23.0
#> Mean : 1.726 Mean : 12893 Mean : 460.4
#> 3rd Qu.: 1.990 3rd Qu.: 2793 3rd Qu.: 140.0
#> Max. :299.990 Max. :2974676 Max. :177050.0
#>
#> user_rating user_rating_ver ver cont_rating
#> 4.5 :2663 4.5 :2205 Length:7197 12+:1155
#> 4 :1626 0 :1443 Class :character 17+: 622
#> 0 : 929 4 :1237 Mode :character 4+ :4433
#> 3.5 : 702 5 : 964 9+ : 987
#> 5 : 492 3.5 : 533
#> 3 : 383 3 : 304
#> (Other): 402 (Other): 511
#> prime_genre sup_devices.num ipadSc_urls.num lang.num
#> Games :3862 Min. : 9.00 Min. :0.000 Min. : 0.000
#> Entertainment : 535 1st Qu.:37.00 1st Qu.:3.000 1st Qu.: 1.000
#> Education : 453 Median :37.00 Median :5.000 Median : 1.000
#> Photo & Video : 349 Mean :37.36 Mean :3.707 Mean : 5.435
#> Utilities : 248 3rd Qu.:38.00 3rd Qu.:5.000 3rd Qu.: 8.000
#> Health & Fitness: 180 Max. :47.00 Max. :5.000 Max. :75.000
#> (Other) :1570
#> vpp_lic size_mb price_type
#> 0: 50 Min. : 0.59 FREE:4056
#> 1:7147 1st Qu.: 46.92 PAID:3141
#> Median : 97.15
#> Mean : 199.13
#> 3rd Qu.: 181.93
#> Max. :4025.97
#>
# Genres
# Distinct genre values, listed in order of first appearance in the data
unique(apps$prime_genre)
#> [1] Games Productivity Weather Shopping
#> [5] Reference Finance Music Utilities
#> [9] Travel Social Networking Sports Business
#> [13] Health & Fitness Entertainment Photo & Video Navigation
#> [17] Education Lifestyle Food & Drink News
#> [21] Book Medical Catalogs
#> 23 Levels: Book Business Catalogs Education Entertainment ... Weather
# The number of free and paid apps per genre
# Two-way contingency table: one row per genre, one column per price type
table(apps$prime_genre, apps$price_type)
#>
#> FREE PAID
#> Book 66 46
#> Business 20 37
#> Catalogs 9 1
#> Education 132 321
#> Entertainment 334 201
#> Finance 84 20
#> Food & Drink 43 20
#> Games 2257 1605
#> Health & Fitness 76 104
#> Lifestyle 94 50
#> Medical 8 15
#> Music 67 71
#> Navigation 20 26
#> News 58 17
#> Photo & Video 167 182
#> Productivity 62 116
#> Reference 20 44
#> Shopping 121 1
#> Social Networking 143 24
#> Sports 79 35
#> Travel 56 25
#> Utilities 109 139
#> Weather 31 41
Some useful insights based on this summary:
The total number of apps, both free and paid, per App Store category
# Count apps per genre; as.data.frame() on the table yields the columns
# Var1 (genre) and Freq (number of apps)
top_gen <- as.data.frame(table(apps$prime_genre))
# Sort from the most to the least common genre. TRUE is spelled out
# because T is an ordinary variable that can be reassigned.
top_gen <- top_gen[order(top_gen$Freq, decreasing = TRUE), ]
head(top_gen)
# Plotting (All)
# Bar chart of the app count for every genre. The first row of top_gen is
# drawn a second time in blue on top of the default bars to highlight it.
ggplot(top_gen, aes(x = Var1, y = Freq)) +
  geom_col() +
  geom_col(data = head(top_gen, 1), fill = "#0484EB") +
  guides(x = guide_axis(angle = 45)) +
  ggtitle("App Store Categories", subtitle = "in 2017") +
  xlab("Categories") +
  ylab("Count") +
  theme_minimal()
The total number of Games (blue) was significantly higher than that of any other category.
The top 10 general app categories are shown below:
# Plotting (General)
# Rows 2 to 11 of top_gen, i.e. the ten entries that follow the first row.
# Bars are ordered and shaded by their count.
ggplot(
  data = top_gen[2:11, ],
  mapping = aes(x = Freq, y = reorder(Var1, Freq))
) +
  # The fill legend is hidden, so no fill label is set in labs() below
  geom_col(aes(fill = Freq), show.legend = FALSE) +
  geom_label(aes(label = Freq)) +
  scale_fill_gradient(low = "#E3F4FE", high = "#0484EB") +
  labs(
    title = "Top 10 App Store General Categories",
    subtitle = "excluding Games",
    y = NULL,
    x = "Count"
  ) +
  theme_minimal()
# Based on Content Rating
# Cross-tabulate genre against content rating; as.data.frame() yields the
# columns Var1 (genre), Var2 (content rating) and Freq (number of apps)
prop_cont <- as.data.frame(table(apps$prime_genre, apps$cont_rating))
ggplot(data = prop_cont, aes(x = Freq, y = reorder(Var1, Freq))) +
  # position = "fill" stacks each genre's bar and rescales it to a total
  # of 1, so the x axis shows a proportion rather than a count
  geom_col(aes(fill = Var2), position = "fill") +
  scale_fill_brewer(palette = "Paired") +
  labs(
    title = "Proportion of App Store Categories",
    subtitle = "Category vs Content Rating",
    # The x axis is the share of apps; content rating is the fill colour,
    # so that name belongs on the legend instead of the axis
    x = "Proportion of apps",
    y = NULL,
    fill = "Content rating"
  ) +
  theme(
    legend.position = "top",
    plot.title.position = "plot"
  )
Insight:
# Average user rating vs number of languages supported
# For each user-rating value, draw the mean number of supported languages;
# group = 1 joins the per-rating means into a single line.
ggplot(data = apps, aes(x = user_rating, y = lang.num, group = 1)) +
  # The colour is a fixed setting, so it goes outside aes(); inside aes()
  # the hex string would be treated as data and the line would get a
  # default palette colour instead.
  geom_line(stat = "summary", fun = mean, colour = "#CC6666") +
  labs(
    title = "The relation between number of languages supported and app quality",
    subtitle = "Average user rating vs number of languages supported",
    x = "Average user rating",
    y = "Mean number of languages supported"
  ) +
  # theme_minimal() is a complete theme and replaces earlier theme()
  # settings, so the title-position tweak has to come after it
  theme_minimal() +
  theme(plot.title.position = "plot")
summary(apps$lang.num)
#> Min. 1st Qu. Median Mean 3rd Qu. Max.
#> 0.000 1.000 1.000 5.435 8.000 75.000
Insight: