Introduction

With millions of apps available today, the following data set has become key to understanding app performance in different categories within the Apple iOS App Store. This data set contains details of nearly 7,200 mobile apps, collected in July 2017 from the iTunes Search API on the Apple Inc. website.

Besides building an understanding of app performance, this analysis aims to investigate the relationship between app details and user ratings.

Data Pre-processing

# Import libraries
library(ggplot2)
library(tidyr)
library(scales)
library(RColorBrewer)

Data Inspection

# Load the two CSV exports: app metadata and app descriptions
apps <- read.csv(file = "data_input/AppleStore.csv")
appd <- read.csv(file = "data_input/appleStore_description.csv")

There are two CSV files, Apple Store and Apple Store Description, and this section is divided accordingly.

Apple Store

This data frame contains each application’s ID, name, size (in bytes), currency, price, rating counts (all and current versions), user rating value (all and current versions), latest version code, content rating, app genre or category, number of supported devices, number of screenshots shown for display, number of supported languages, and whether VPP licensing was enabled.

# Number of rows (apps) and columns (variables)
dim(apps)

#> [1] 7197   17

# Column names
names(apps)

#>  [1] "X"                "id"               "track_name"       "size_bytes"      
#>  [5] "currency"         "price"            "rating_count_tot" "rating_count_ver"
#>  [9] "user_rating"      "user_rating_ver"  "ver"              "cont_rating"     
#> [13] "prime_genre"      "sup_devices.num"  "ipadSc_urls.num"  "lang.num"        
#> [17] "vpp_lic"

# Preview the first rows
head(apps)

# Preview the last rows
tail(apps)

# Check missing values: TRUE if any value in the data frame is NA
anyNA(apps)

#> [1] FALSE

Apple Store Description

This data frame contains the ID, name, size (in bytes) and description of each application.

# Number of rows (apps) and columns (variables)
dim(appd)

#> [1] 7197    4

# Column names
names(appd)

#> [1] "id"         "track_name" "size_bytes" "app_desc"

# Preview the first rows
head(appd)

# Preview the last rows
tail(appd)

# Check missing values: TRUE if any value in the data frame is NA
anyNA(appd)

#> [1] FALSE

Data Wrangling

For the purpose of this analysis, only Apple Store data will be used as the other contains no additional information aside from the descriptions of all 7197 apps listed on the App Store.

# Check data structure: the type of each column and a sample of its values
str(apps)

#> 'data.frame':    7197 obs. of  17 variables:
#>  $ X               : int  1 2 3 4 5 6 7 8 9 10 ...
#>  $ id              : int  281656475 281796108 281940292 282614216 282935706 283619399 283646709 284035177 284666222 284736660 ...
#>  $ track_name      : chr  "PAC-MAN Premium" "Evernote - stay organized" "WeatherBug - Local Weather, Radar, Maps, Alerts" "eBay: Best App to Buy, Sell, Save! Online Shopping" ...
#>  $ size_bytes      : num  1.01e+08 1.59e+08 1.01e+08 1.29e+08 9.28e+07 ...
#>  $ currency        : chr  "USD" "USD" "USD" "USD" ...
#>  $ price           : num  3.99 0 0 0 0 0.99 0 0 9.99 3.99 ...
#>  $ rating_count_tot: int  21292 161065 188583 262241 985920 8253 119487 1126879 1117 7885 ...
#>  $ rating_count_ver: int  26 26 2822 649 5320 5516 879 3594 4 40 ...
#>  $ user_rating     : num  4 4 3.5 4 4.5 4 4 4 4.5 4 ...
#>  $ user_rating_ver : num  4.5 3.5 4.5 4.5 5 4 4.5 4.5 5 4 ...
#>  $ ver             : chr  "6.3.5" "8.2.2" "5.0.0" "5.10.0" ...
#>  $ cont_rating     : chr  "4+" "4+" "4+" "12+" ...
#>  $ prime_genre     : chr  "Games" "Productivity" "Weather" "Shopping" ...
#>  $ sup_devices.num : int  38 37 37 37 37 47 37 37 37 38 ...
#>  $ ipadSc_urls.num : int  5 5 5 5 5 5 0 4 5 0 ...
#>  $ lang.num        : int  10 23 3 9 45 1 19 1 1 10 ...
#>  $ vpp_lic         : int  1 1 1 1 1 1 1 1 1 1 ...

# Change data type
# Identifier columns are labels rather than quantities, so store them as text
id_cols <- c("X", "id")
apps[id_cols] <- lapply(apps[id_cols], as.character)

# Columns that hold a limited set of discrete values become factors
factor_cols <- c(
  "currency", "user_rating", "user_rating_ver",
  "cont_rating", "prime_genre", "vpp_lic"
)
apps[factor_cols] <- lapply(apps[factor_cols], as.factor)

str(apps)

#> 'data.frame':    7197 obs. of  17 variables:
#>  $ X               : chr  "1" "2" "3" "4" ...
#>  $ id              : chr  "281656475" "281796108" "281940292" "282614216" ...
#>  $ track_name      : chr  "PAC-MAN Premium" "Evernote - stay organized" "WeatherBug - Local Weather, Radar, Maps, Alerts" "eBay: Best App to Buy, Sell, Save! Online Shopping" ...
#>  $ size_bytes      : num  1.01e+08 1.59e+08 1.01e+08 1.29e+08 9.28e+07 ...
#>  $ currency        : Factor w/ 1 level "USD": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ price           : num  3.99 0 0 0 0 0.99 0 0 9.99 3.99 ...
#>  $ rating_count_tot: int  21292 161065 188583 262241 985920 8253 119487 1126879 1117 7885 ...
#>  $ rating_count_ver: int  26 26 2822 649 5320 5516 879 3594 4 40 ...
#>  $ user_rating     : Factor w/ 10 levels "0","1","1.5",..: 8 8 7 8 9 8 8 8 9 8 ...
#>  $ user_rating_ver : Factor w/ 10 levels "0","1","1.5",..: 9 7 9 9 10 8 9 9 10 8 ...
#>  $ ver             : chr  "6.3.5" "8.2.2" "5.0.0" "5.10.0" ...
#>  $ cont_rating     : Factor w/ 4 levels "12+","17+","4+",..: 3 3 3 1 3 3 3 1 3 3 ...
#>  $ prime_genre     : Factor w/ 23 levels "Book","Business",..: 8 16 23 18 17 8 6 12 22 8 ...
#>  $ sup_devices.num : int  38 37 37 37 37 47 37 37 37 38 ...
#>  $ ipadSc_urls.num : int  5 5 5 5 5 5 0 4 5 0 ...
#>  $ lang.num        : int  10 23 3 9 45 1 19 1 1 10 ...
#>  $ vpp_lic         : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...

Analysing app size in bytes can be confusing, so converting it to MB is more convenient.

# Add the app size in megabytes (1 MB = 1e6 bytes) as a new column
apps[["size_mb"]] <- apps[["size_bytes"]] / 1e6

str(apps)

#> 'data.frame':    7197 obs. of  18 variables:
#>  $ X               : chr  "1" "2" "3" "4" ...
#>  $ id              : chr  "281656475" "281796108" "281940292" "282614216" ...
#>  $ track_name      : chr  "PAC-MAN Premium" "Evernote - stay organized" "WeatherBug - Local Weather, Radar, Maps, Alerts" "eBay: Best App to Buy, Sell, Save! Online Shopping" ...
#>  $ size_bytes      : num  1.01e+08 1.59e+08 1.01e+08 1.29e+08 9.28e+07 ...
#>  $ currency        : Factor w/ 1 level "USD": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ price           : num  3.99 0 0 0 0 0.99 0 0 9.99 3.99 ...
#>  $ rating_count_tot: int  21292 161065 188583 262241 985920 8253 119487 1126879 1117 7885 ...
#>  $ rating_count_ver: int  26 26 2822 649 5320 5516 879 3594 4 40 ...
#>  $ user_rating     : Factor w/ 10 levels "0","1","1.5",..: 8 8 7 8 9 8 8 8 9 8 ...
#>  $ user_rating_ver : Factor w/ 10 levels "0","1","1.5",..: 9 7 9 9 10 8 9 9 10 8 ...
#>  $ ver             : chr  "6.3.5" "8.2.2" "5.0.0" "5.10.0" ...
#>  $ cont_rating     : Factor w/ 4 levels "12+","17+","4+",..: 3 3 3 1 3 3 3 1 3 3 ...
#>  $ prime_genre     : Factor w/ 23 levels "Book","Business",..: 8 16 23 18 17 8 6 12 22 8 ...
#>  $ sup_devices.num : int  38 37 37 37 37 47 37 37 37 38 ...
#>  $ ipadSc_urls.num : int  5 5 5 5 5 5 0 4 5 0 ...
#>  $ lang.num        : int  10 23 3 9 45 1 19 1 1 10 ...
#>  $ vpp_lic         : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
#>  $ size_mb         : num  100.8 158.6 100.5 128.5 92.8 ...

It may also be more convenient to be able to quickly see whether the app was free or paid.

# Label each app as FREE (price of exactly 0) or PAID (any other price).
# The levels are given explicitly so that both categories always exist, in
# this order, even if a subset of the data contains only one of them.
apps$price_type <- factor(
  ifelse(apps$price == 0, "FREE", "PAID"),
  levels = c("FREE", "PAID")
)

head(apps)

Data Summary

# Per-column summary: quantiles for numeric columns, level counts for factors
summary(apps)

#>       X                  id             track_name          size_bytes       
#>  Length:7197        Length:7197        Length:7197        Min.   :5.898e+05  
#>  Class :character   Class :character   Class :character   1st Qu.:4.692e+07  
#>  Mode  :character   Mode  :character   Mode  :character   Median :9.715e+07  
#>                                                           Mean   :1.991e+08  
#>                                                           3rd Qu.:1.819e+08  
#>                                                           Max.   :4.026e+09  
#>                                                                              
#>  currency       price         rating_count_tot  rating_count_ver  
#>  USD:7197   Min.   :  0.000   Min.   :      0   Min.   :     0.0  
#>             1st Qu.:  0.000   1st Qu.:     28   1st Qu.:     1.0  
#>             Median :  0.000   Median :    300   Median :    23.0  
#>             Mean   :  1.726   Mean   :  12893   Mean   :   460.4  
#>             3rd Qu.:  1.990   3rd Qu.:   2793   3rd Qu.:   140.0  
#>             Max.   :299.990   Max.   :2974676   Max.   :177050.0  
#>                                                                   
#>   user_rating   user_rating_ver     ver            cont_rating
#>  4.5    :2663   4.5    :2205    Length:7197        12+:1155   
#>  4      :1626   0      :1443    Class :character   17+: 622   
#>  0      : 929   4      :1237    Mode  :character   4+ :4433   
#>  3.5    : 702   5      : 964                       9+ : 987   
#>  5      : 492   3.5    : 533                                  
#>  3      : 383   3      : 304                                  
#>  (Other): 402   (Other): 511                                  
#>            prime_genre   sup_devices.num ipadSc_urls.num    lang.num     
#>  Games           :3862   Min.   : 9.00   Min.   :0.000   Min.   : 0.000  
#>  Entertainment   : 535   1st Qu.:37.00   1st Qu.:3.000   1st Qu.: 1.000  
#>  Education       : 453   Median :37.00   Median :5.000   Median : 1.000  
#>  Photo & Video   : 349   Mean   :37.36   Mean   :3.707   Mean   : 5.435  
#>  Utilities       : 248   3rd Qu.:38.00   3rd Qu.:5.000   3rd Qu.: 8.000  
#>  Health & Fitness: 180   Max.   :47.00   Max.   :5.000   Max.   :75.000  
#>  (Other)         :1570                                                   
#>  vpp_lic     size_mb        price_type 
#>  0:  50   Min.   :   0.59   FREE:4056  
#>  1:7147   1st Qu.:  46.92   PAID:3141  
#>           Median :  97.15              
#>           Mean   : 199.13              
#>           3rd Qu.: 181.93              
#>           Max.   :4025.97              
#>

# Genres, listed in order of first appearance in the data
unique(apps$prime_genre)

#>  [1] Games             Productivity      Weather           Shopping         
#>  [5] Reference         Finance           Music             Utilities        
#>  [9] Travel            Social Networking Sports            Business         
#> [13] Health & Fitness  Entertainment     Photo & Video     Navigation       
#> [17] Education         Lifestyle         Food & Drink      News             
#> [21] Book              Medical           Catalogs         
#> 23 Levels: Book Business Catalogs Education Entertainment ... Weather

# The number of free and paid apps per genre (rows: genre, columns: price type)
table(apps$prime_genre, apps$price_type)

#>                    
#>                     FREE PAID
#>   Book                66   46
#>   Business            20   37
#>   Catalogs             9    1
#>   Education          132  321
#>   Entertainment      334  201
#>   Finance             84   20
#>   Food & Drink        43   20
#>   Games             2257 1605
#>   Health & Fitness    76  104
#>   Lifestyle           94   50
#>   Medical              8   15
#>   Music               67   71
#>   Navigation          20   26
#>   News                58   17
#>   Photo & Video      167  182
#>   Productivity        62  116
#>   Reference           20   44
#>   Shopping           121    1
#>   Social Networking  143   24
#>   Sports              79   35
#>   Travel              56   25
#>   Utilities          109  139
#>   Weather             31   41

Some useful insights based on this summary:

There were 7197 apps in 23 different genres or categories, of which Games was the most popular, followed by Entertainment, Education, and Photo & Video.
There were more free apps than paid apps. Average price was at 1.7 USD and the most expensive app was priced at 299.99 USD.
The majority of apps (4,400+) were rated 4+, indicating that they were generally suitable for all users. But almost 1200 apps required users to be at least 12 years old before they could download them.
On the scale of 0 to 5 stars, the most common rating was 4.5 stars, and more than 900 apps were unrated.
The average file size was 199MB. Apps typically supported 37 devices up to a maximum of 47 Apple devices.

Data Processing and Plotting

App Store categories

The total number of apps, both free and paid, per App Store category

# Count apps per genre. as.data.frame() on a one-way table yields the columns
# Var1 (genre) and Freq (number of apps).
top_gen <- as.data.frame(table(apps$prime_genre))

# Sort genres from most to least frequent. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
top_gen <- top_gen[order(top_gen$Freq, decreasing = TRUE), ]

head(top_gen)

# Plotting (All): bar chart of the number of apps in every genre.
# top_gen is sorted by descending count, so its first row is the largest
# genre; that bar is drawn again in blue on top of the grey one.
ggplot(top_gen, aes(x = Var1, y = Freq)) +
  geom_col() +
  geom_col(data = top_gen[1, ], fill = "#0484EB") +
  labs(
    x = "Categories",
    y = "Count",
    title = "App Store Categories",
    subtitle = "in 2017"
  ) +
  # Tilt the genre names so they do not overlap
  guides(x = guide_axis(angle = 45)) +
  theme_minimal()

The total number of Games (blue) was significantly higher than that of any other category.

The top 10 general app categories are shown below:

# Plotting (General): the ten largest categories excluding Games.
# top_gen is sorted by descending count, so rows 2 to 11 skip the first
# (largest) row and keep the next ten.
ggplot(
  data = top_gen[2:11, ],
  mapping = aes(x = Freq, y = reorder(Var1, Freq))
) +
  # Bar shade follows the count; the legend is hidden because the labels
  # already show the exact values.
  geom_col(aes(fill = Freq), show.legend = FALSE) +
  geom_label(aes(label = Freq)) +
  scale_fill_gradient(low = "#E3F4FE", high = "#0484EB") +
  labs(
    title = "Top 10 App Store General Categories",
    subtitle = "excluding Games",
    y = NULL,
    x = "Count"
  ) +
  theme_minimal()

Proportion of App Categories

# Based on Content Rating: count apps for every genre / content-rating pair.
# as.data.frame() on the two-way table yields Var1 (genre), Var2 (content
# rating) and Freq (number of apps).
prop_cont <- as.data.frame(table(apps$prime_genre, apps$cont_rating))

# position = "fill" rescales every genre's bar to a total of 1, so the x axis
# shows the share of apps in each content rating, and the fill colour is what
# identifies the content rating.
ggplot(data = prop_cont, aes(x = Freq, y = reorder(Var1, Freq))) +
  geom_col(aes(fill = Var2), position = "fill") +
  scale_fill_brewer(palette = "Paired") +
  labs(
    title = "Proportion of App Store Categories",
    subtitle = "Category vs Content Rating",
    x = "Proportion of apps",
    y = NULL,
    fill = "Content Rating"
  ) +
  theme(
    legend.position = "top",
    plot.title.position = "plot"
  )

🔎 Insight:

A large proportion of categories were rated 4+, which suggests that they were generally suitable for all users.
The category with the highest proportion of apps rated 4+ was Education, and the one with the highest proportion rated 17+ was Social Networking.

The relation between app details

# Average user rating vs number of languages supported.
# stat = "summary" with fun = mean reduces lang.num to one mean per
# user_rating level; group = 1 joins those means into a single line, which is
# needed because user_rating is a factor.
ggplot(data = apps, aes(x = user_rating, y = lang.num, group = 1)) +
  # The colour is a fixed value, so it is set outside aes(). Inside aes() the
  # string is treated as data and drawn in a default palette colour rather
  # than #CC6666 (and it creates a legend that then has to be hidden).
  geom_line(stat = "summary", fun = mean, colour = "#CC6666") +
  labs(
    title = "The relation between number of languages supported and app quality",
    subtitle = "Average user rating vs number of languages supported",
    x = "Average user rating",
    y = "Number of language supported"
  ) +
  # theme_minimal() is a complete theme and replaces any theme() settings added
  # before it, so the title-position tweak has to come afterwards.
  theme_minimal() +
  theme(plot.title.position = "plot")

# Distribution of the number of supported languages across all apps
summary(apps$lang.num)

#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>   0.000   1.000   1.000   5.435   8.000  75.000