# Clear environment of variables and functions
rm(list = ls(all = TRUE)) 

# Clear environmet of packages
if(is.null(sessionInfo()$otherPkgs) == FALSE)lapply(paste("package:", names(sessionInfo()$otherPkgs), sep=""), detach, character.only = TRUE, unload = TRUE)

1 Load Package and Data

1.1 Load Package

#load package 
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(janitor) # for tyble

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(lmPerm)  # for ANOVA
library(formattable)# For table formatting and table formatting functions
library(htmltools)
library(Hmisc)

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(MultinomialCI) # To calculate multinomial confidence intervals for factor variables 


library(flexdashboard)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:Hmisc':
## 
##     subplot

## The following object is masked from 'package:formattable':
## 
##     style

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(dygraphs)
library(xts)  # to convert date data to xts data, xts is time series class

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## Attaching package: 'xts'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

library(gganimate)

## No renderer backend detected. gganimate will default to writing frames to separate files
## Consider installing:
## - the `gifski` package for gif output
## - the `av` package for video output
## and restarting the R session

library(inspectdf)# Load auto EDA packages

1.2 Data Loading and Cleaning

#Apple iOS app store data
#Data Loading 
my_data <- read.csv("AppleStore.csv")

#add new data cloumn for paid, app size and revenue
my_data <- my_data %>% mutate(paid = as.factor(ifelse(price %in% 0 , "Free","Paid")))
my_data <- my_data %>% mutate(size_bytes_MB = size_bytes/(1024*1024))     
my_data <- my_data %>% mutate(revenue = rating_count_tot * price)     
head(my_data)

##          id                                         track_name size_bytes
## 1 281656475                                    PAC-MAN Premium  100788224
## 2 281796108                          Evernote - stay organized  158578688
## 3 281940292    WeatherBug - Local Weather, Radar, Maps, Alerts  100524032
## 4 282614216 eBay: Best App to Buy, Sell, Save! Online Shopping  128512000
## 5 282935706                                              Bible   92774400
## 6 283619399                                   Shanghai Mahjong   10485713
##   currency price rating_count_tot rating_count_ver user_rating user_rating_ver
## 1      USD  3.99            21292               26         4.0             4.5
## 2      USD  0.00           161065               26         4.0             3.5
## 3      USD  0.00           188583             2822         3.5             4.5
## 4      USD  0.00           262241              649         4.0             4.5
## 5      USD  0.00           985920             5320         4.5             5.0
## 6      USD  0.99             8253             5516         4.0             4.0
##      ver cont_rating  prime_genre sup_devices.num ipadSc_urls.num lang.num
## 1  6.3.5          4+        Games              38               5       10
## 2  8.2.2          4+ Productivity              37               5       23
## 3  5.0.0          4+      Weather              37               5        3
## 4 5.10.0         12+     Shopping              37               5        9
## 5  7.5.1          4+    Reference              37               5       45
## 6    1.8          4+        Games              47               5        1
##   vpp_lic paid size_bytes_MB  revenue
## 1       1 Paid     96.119141 84955.08
## 2       1 Free    151.232422     0.00
## 3       1 Free     95.867188     0.00
## 4       1 Free    122.558594     0.00
## 5       1 Free     88.476562     0.00
## 6       1 Paid      9.999955  8170.47

Column name introdution

“id” : App ID

“track_name”: App Name

“size_bytes”: Size (in Bytes)

“currency”: Currency Type

“price”: Price amount

“rating_count_tot”: User Rating counts (for all version)

“rating_count_ver”: User Rating counts (for current version)

“user_rating” : Average User Rating value (for all version)

“user_rating_ver”: Average User Rating value (for current version)

“ver” : Latest version code

“cont_rating”: Content Rating

“prime_genre”: Primary Genre

“sup_devices.num”: Number of supporting devices

“ipadSc_urls.num”: Number of screenshots showed for display

“lang.num”: Number of supported languages

“vpp_lic”: Vpp Device Based Licensing Enabled

2 Base EDA

2.1 Uni-variable Non-Graphic

summary(my_data)

##        id             track_name          size_bytes          currency        
##  Min.   :2.817e+08   Length:7197        Min.   :5.898e+05   Length:7197       
##  1st Qu.:6.001e+08   Class :character   1st Qu.:4.692e+07   Class :character  
##  Median :9.781e+08   Mode  :character   Median :9.715e+07   Mode  :character  
##  Mean   :8.631e+08                      Mean   :1.991e+08                     
##  3rd Qu.:1.082e+09                      3rd Qu.:1.819e+08                     
##  Max.   :1.188e+09                      Max.   :4.026e+09                     
##      price         rating_count_tot  rating_count_ver    user_rating   
##  Min.   :  0.000   Min.   :      0   Min.   :     0.0   Min.   :0.000  
##  1st Qu.:  0.000   1st Qu.:     28   1st Qu.:     1.0   1st Qu.:3.500  
##  Median :  0.000   Median :    300   Median :    23.0   Median :4.000  
##  Mean   :  1.726   Mean   :  12893   Mean   :   460.4   Mean   :3.527  
##  3rd Qu.:  1.990   3rd Qu.:   2793   3rd Qu.:   140.0   3rd Qu.:4.500  
##  Max.   :299.990   Max.   :2974676   Max.   :177050.0   Max.   :5.000  
##  user_rating_ver     ver            cont_rating        prime_genre       
##  Min.   :0.000   Length:7197        Length:7197        Length:7197       
##  1st Qu.:2.500   Class :character   Class :character   Class :character  
##  Median :4.000   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :3.254                                                           
##  3rd Qu.:4.500                                                           
##  Max.   :5.000                                                           
##  sup_devices.num ipadSc_urls.num    lang.num         vpp_lic         paid     
##  Min.   : 9.00   Min.   :0.000   Min.   : 0.000   Min.   :0.0000   Free:4056  
##  1st Qu.:37.00   1st Qu.:3.000   1st Qu.: 1.000   1st Qu.:1.0000   Paid:3141  
##  Median :37.00   Median :5.000   Median : 1.000   Median :1.0000              
##  Mean   :37.36   Mean   :3.707   Mean   : 5.435   Mean   :0.9931              
##  3rd Qu.:38.00   3rd Qu.:5.000   3rd Qu.: 8.000   3rd Qu.:1.0000              
##  Max.   :47.00   Max.   :5.000   Max.   :75.000   Max.   :1.0000              
##  size_bytes_MB         revenue       
##  Min.   :   0.562   Min.   :      0  
##  1st Qu.:  44.749   1st Qu.:      0  
##  Median :  92.652   Median :      0  
##  Mean   : 189.909   Mean   :   5009  
##  3rd Qu.: 173.497   3rd Qu.:    340  
##  Max.   :3839.464   Max.   :3648864

str(my_data)

## 'data.frame':    7197 obs. of  19 variables:
##  $ id              : int  281656475 281796108 281940292 282614216 282935706 283619399 283646709 284035177 284666222 284736660 ...
##  $ track_name      : chr  "PAC-MAN Premium" "Evernote - stay organized" "WeatherBug - Local Weather, Radar, Maps, Alerts" "eBay: Best App to Buy, Sell, Save! Online Shopping" ...
##  $ size_bytes      : num  1.01e+08 1.59e+08 1.01e+08 1.29e+08 9.28e+07 ...
##  $ currency        : chr  "USD" "USD" "USD" "USD" ...
##  $ price           : num  3.99 0 0 0 0 0.99 0 0 9.99 3.99 ...
##  $ rating_count_tot: int  21292 161065 188583 262241 985920 8253 119487 1126879 1117 7885 ...
##  $ rating_count_ver: int  26 26 2822 649 5320 5516 879 3594 4 40 ...
##  $ user_rating     : num  4 4 3.5 4 4.5 4 4 4 4.5 4 ...
##  $ user_rating_ver : num  4.5 3.5 4.5 4.5 5 4 4.5 4.5 5 4 ...
##  $ ver             : chr  "6.3.5" "8.2.2" "5.0.0" "5.10.0" ...
##  $ cont_rating     : chr  "4+" "4+" "4+" "12+" ...
##  $ prime_genre     : chr  "Games" "Productivity" "Weather" "Shopping" ...
##  $ sup_devices.num : int  38 37 37 37 37 47 37 37 37 38 ...
##  $ ipadSc_urls.num : int  5 5 5 5 5 5 0 4 5 0 ...
##  $ lang.num        : int  10 23 3 9 45 1 19 1 1 10 ...
##  $ vpp_lic         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ paid            : Factor w/ 2 levels "Free","Paid": 2 1 1 1 1 2 1 1 2 2 ...
##  $ size_bytes_MB   : num  96.1 151.2 95.9 122.6 88.5 ...
##  $ revenue         : num  84955 0 0 0 0 ...

table(my_data$price)

## 
##      0   0.99   1.99   2.99   3.99   4.99   5.99   6.99   7.99   8.99   9.99 
##   4056    728    621    683    277    394     52    166     33      9     81 
##  11.99  12.99  13.99  14.99  15.99  16.99  17.99  18.99  19.99  20.99  21.99 
##      6      5      6     21      4      2      3      1     13      2      1 
##  22.99  23.99  24.99  27.99  29.99  34.99  39.99  47.99  49.99  59.99  74.99 
##      2      2      8      2      6      1      2      1      2      3      1 
##  99.99 249.99 299.99 
##      1      1      1

table(my_data$prime_genre)

## 
##              Book          Business          Catalogs         Education 
##               112                57                10               453 
##     Entertainment           Finance      Food & Drink             Games 
##               535               104                63              3862 
##  Health & Fitness         Lifestyle           Medical             Music 
##               180               144                23               138 
##        Navigation              News     Photo & Video      Productivity 
##                46                75               349               178 
##         Reference          Shopping Social Networking            Sports 
##                64               122               167               114 
##            Travel         Utilities           Weather 
##                81               248                72

table(my_data$sup_devices.num)

## 
##    9   11   12   13   15   16   23   24   25   26   33   35   36   37   38   39 
##    1    3    1    7    2    8    1  270   67   42    2   24    7 3263 1912   40 
##   40   43   45   47 
## 1142  371    8   26

table(my_data$lang.num)

## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
##   41 3767  675  217  154  207  143  133  145  138  168  266  179  130   89   86 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31 
##  114   46   71   30   21   35   32   24   16   14   22    7    8   20   28   53 
##   32   33   34   35   36   37   39   40   41   42   43   45   46   47   50   54 
##   17   30   13    2    4    2    2    1    2    3    2    9    4    1    1    2 
##   55   56   58   59   63   68   69   74   75 
##    2    1   12    1    1    1    3    1    1

print(paste("Number of track_name: ",nrow(my_data %>% group_by(track_name)%>%summarise(n()))))

## `summarise()` ungrouping output (override with `.groups` argument)

## [1] "Number of track_name:  7195"

print(paste("Number of id: ",nrow(my_data %>% group_by(id)%>%summarise(n()))))

## `summarise()` ungrouping output (override with `.groups` argument)

## [1] "Number of id:  7197"

print(paste("Number of currency: ",nrow(my_data %>% group_by(currency)%>%summarise(n()))))

## `summarise()` ungrouping output (override with `.groups` argument)

## [1] "Number of currency:  1"

print(paste("Number of prime_genre: " ,nrow(my_data %>% group_by(prime_genre)%>%summarise(n()))))

## `summarise()` ungrouping output (override with `.groups` argument)

## [1] "Number of prime_genre:  23"

Comments:

track_name: number of Apps (7195 kinds)

id: number of App ids (7197 kinds) (2 Apps have duplicate name)

currency: only US dollar

prime_genre: type of App (23 types)

2.2 Uni-variable Graphic

# Variable types in a data set
my_data %>% 
  inspect_types() %>% 
  show_plot()

Comments:

Attributes type Overview:

Interger: 7

Factor: 6

Numeric: 6

2.2.1 Overview of categorical variable

library(inspectdf)# Load auto EDA packages
test_data <- my_data
  test_data$user_rating <- as.factor(test_data$user_rating) 
  test_data$user_rating_ver <- as.factor(test_data$user_rating_ver) 
  test_data$ipadSc_urls.num <- as.factor(test_data$ipadSc_urls.num) 
  
  test_data %>%
  inspect_cat() %>% 
  show_plot()

# Load auto EDA packages
library(DataExplorer)

# Distribution of categorical variable 
my_data %>% 
  plot_bar()

## 2 columns ignored with more than 50 categories.
## track_name: 7195 categories
## ver: 1546 categories

Attributes Analysis

Content rating has 4+ the most.
Free Apps are more than Paid Apps.

2.2.2 Overview of Numeric variable

# Distribution of numeric variable
my_data %>% 
  plot_histogram()

Finding:

Most of variables are skew, it’s better to use median for each variable in the following analysis.

2.2.3 Detail of Variable Distribution

paid_data <- 
  my_data %>%
  filter(price != 0 & price != 299.99 & price != 249.99) 
price_plot <- 
  paid_data %>%
  ggplot(aes(x = price)) + 
  geom_bar() + 
  labs(title="Price Distribution for paid App")

user_rating_plot <- 
  ggplot(my_data,aes(x = user_rating)) + 
  geom_bar() + 
  geom_text(stat = "count", aes(label = ..count..),vjust = -0.5,size=2)
  labs(title="User Rating Distribution")

## $title
## [1] "User Rating Distribution"
## 
## attr(,"class")
## [1] "labels"

sup_devices_plot <- 
  ggplot(my_data,aes(x = sup_devices.num)) + 
  geom_bar() + 
  geom_text(stat = "count", aes(label = ..count..),vjust = -0.5,size=2) +
  labs(title="Number of Devices Support Distribution")

lang_num_plot <- 
  ggplot(my_data,aes(x = as.factor(lang.num))) + 
  geom_bar() + 
  geom_text(stat = "count", aes(label = ..count..),vjust = -0.5,size=2) +
  theme(axis.text.x = element_text(size=7,angle=45))+
  labs(title="Number of Language Support Distribution", x="The Number of Language Support")

price_plot

user_rating_plot

sup_devices_plot

lang_num_plot

Findings:

There are an outlier 249.99 and 299.99 in price.
Price distribution is right-skewed, so it’s better to use median for analysis in the following.
Almost 60% of Apps are rated between 4 and 4.5.
Most of Apps support 37 to 38 devices.
Most of Apps only support one language.

Questions:

There are 41 Apps don’t support any language??

grid.arrange(
  ggplot(my_data,aes(x = prime_genre))+ 
  geom_bar() + 
  geom_text(stat = "count", aes(label = ..count..),vjust = -0.2,size=3) +
  theme(axis.text.x = element_text(size=7,angle=45, hjust = 1.0))+
  labs(title="Number of Apps Genre", x="Apps Genre"),

  ggplot(my_data,aes(x = cont_rating))+ 
  geom_bar() + 
  geom_text(stat = "count", aes(label = ..count..),vjust = 1.5,size=4,color = "white") +
  theme(axis.text.x = element_text(size=7,angle=45, hjust = 1.0))+
  labs(title="Number of Content Rating", x="Content Rating")
  
  ,ncol=1)

Findings:

Over half of Apps are from Game genre in the App Store market.
Top 5 number of Apps genre is Games, Entertaiment, Education, Photo & Video and Utilities.

2.3 Multi-variable Non-Graphic

library(Hmisc)
my_data %>% 
  select_if(is.numeric) %>% 
  as.matrix() %>% 
  rcorr()

##                     id size_bytes price rating_count_tot rating_count_ver
## id                1.00       0.08 -0.08            -0.20            -0.07
## size_bytes        0.08       1.00  0.18             0.00             0.01
## price            -0.08       0.18  1.00            -0.04            -0.02
## rating_count_tot -0.20       0.00 -0.04             1.00             0.16
## rating_count_ver -0.07       0.01 -0.02             0.16             1.00
## user_rating      -0.19       0.07  0.05             0.08             0.07
## user_rating_ver  -0.11       0.09  0.03             0.09             0.08
## sup_devices.num   0.03      -0.12 -0.12             0.01             0.04
## ipadSc_urls.num   0.05       0.15  0.07             0.02             0.02
## lang.num         -0.13       0.00 -0.01             0.14             0.01
## vpp_lic           0.02      -0.15 -0.03             0.00             0.01
## size_bytes_MB     0.08       1.00  0.18             0.00             0.01
## revenue          -0.12       0.02  0.08             0.16             0.09
##                  user_rating user_rating_ver sup_devices.num ipadSc_urls.num
## id                     -0.19           -0.11            0.03            0.05
## size_bytes              0.07            0.09           -0.12            0.15
## price                   0.05            0.03           -0.12            0.07
## rating_count_tot        0.08            0.09            0.01            0.02
## rating_count_ver        0.07            0.08            0.04            0.02
## user_rating             1.00            0.77           -0.04            0.27
## user_rating_ver         0.77            1.00           -0.02            0.28
## sup_devices.num        -0.04           -0.02            1.00           -0.04
## ipadSc_urls.num         0.27            0.28           -0.04            1.00
## lang.num                0.17            0.18           -0.04            0.09
## vpp_lic                 0.07            0.05           -0.04            0.07
## size_bytes_MB           0.07            0.09           -0.12            0.15
## revenue                 0.05            0.05            0.00           -0.01
##                  lang.num vpp_lic size_bytes_MB revenue
## id                  -0.13    0.02          0.08   -0.12
## size_bytes           0.00   -0.15          1.00    0.02
## price               -0.01   -0.03          0.18    0.08
## rating_count_tot     0.14    0.00          0.00    0.16
## rating_count_ver     0.01    0.01          0.01    0.09
## user_rating          0.17    0.07          0.07    0.05
## user_rating_ver      0.18    0.05          0.09    0.05
## sup_devices.num     -0.04   -0.04         -0.12    0.00
## ipadSc_urls.num      0.09    0.07          0.15   -0.01
## lang.num             1.00    0.03          0.00    0.02
## vpp_lic              0.03    1.00         -0.15    0.01
## size_bytes_MB        0.00   -0.15          1.00    0.02
## revenue              0.02    0.01          0.02    1.00
## 
## n= 7197 
## 
## 
## P
##                  id     size_bytes price  rating_count_tot rating_count_ver
## id                      0.0000     0.0000 0.0000           0.0000          
## size_bytes       0.0000            0.0000 0.7035           0.5909          
## price            0.0000 0.0000            0.0009           0.1265          
## rating_count_tot 0.0000 0.7035     0.0009                  0.0000          
## rating_count_ver 0.0000 0.5909     0.1265 0.0000                           
## user_rating      0.0000 0.0000     0.0000 0.0000           0.0000          
## user_rating_ver  0.0000 0.0000     0.0327 0.0000           0.0000          
## sup_devices.num  0.0044 0.0000     0.0000 0.4538           0.0013          
## ipadSc_urls.num  0.0000 0.0000     0.0000 0.1820           0.0390          
## lang.num         0.0000 0.6955     0.5691 0.0000           0.2597          
## vpp_lic          0.1323 0.0000     0.0111 0.9336           0.5837          
## size_bytes_MB    0.0000 0.0000     0.0000 0.7035           0.5909          
## revenue          0.0000 0.0408     0.0000 0.0000           0.0000          
##                  user_rating user_rating_ver sup_devices.num ipadSc_urls.num
## id               0.0000      0.0000          0.0044          0.0000         
## size_bytes       0.0000      0.0000          0.0000          0.0000         
## price            0.0000      0.0327          0.0000          0.0000         
## rating_count_tot 0.0000      0.0000          0.4538          0.1820         
## rating_count_ver 0.0000      0.0000          0.0013          0.0390         
## user_rating                  0.0000          0.0003          0.0000         
## user_rating_ver  0.0000                      0.1089          0.0000         
## sup_devices.num  0.0003      0.1089                          0.0014         
## ipadSc_urls.num  0.0000      0.0000          0.0014                         
## lang.num         0.0000      0.0000          0.0004          0.0000         
## vpp_lic          0.0000      0.0000          0.0016          0.0000         
## size_bytes_MB    0.0000      0.0000          0.0000          0.0000         
## revenue          0.0000      0.0000          0.6972          0.6316         
##                  lang.num vpp_lic size_bytes_MB revenue
## id               0.0000   0.1323  0.0000        0.0000 
## size_bytes       0.6955   0.0000  0.0000        0.0408 
## price            0.5691   0.0111  0.0000        0.0000 
## rating_count_tot 0.0000   0.9336  0.7035        0.0000 
## rating_count_ver 0.2597   0.5837  0.5909        0.0000 
## user_rating      0.0000   0.0000  0.0000        0.0000 
## user_rating_ver  0.0000   0.0000  0.0000        0.0000 
## sup_devices.num  0.0004   0.0016  0.0000        0.6972 
## ipadSc_urls.num  0.0000   0.0000  0.0000        0.6316 
## lang.num                  0.0059  0.6955        0.0739 
## vpp_lic          0.0059           0.0000        0.6655 
## size_bytes_MB    0.6955   0.0000                0.0408 
## revenue          0.0739   0.6655  0.0408

Findings:

Price has high correlation with size_bytes, user_rating, sup_devices.num and the number of screenshots showed for display (ipadSc_urls.num).
user_rating has high correlation with user_rating_ver (Average User Rating value (for current version)), the number of screenshots showed for display (ipadSc_urls.num), and the number of languages support.

my_data %>% tabyl(price,prime_genre)

##   price Book Business Catalogs Education Entertainment Finance Food & Drink
##    0.00   66       20        9       132           334      84           43
##    0.99    4        3        0        27            63      11            4
##    1.99    7        4        0        37            65       2            3
##    2.99    9        4        0       149            41       2            2
##    3.99   13        3        0        43            12       3            3
##    4.99    1       10        0        34            12       1            4
##    5.99    9        3        0         5             5       1            1
##    6.99    1        0        0         4             2       0            1
##    7.99    0        3        1         4             0       0            0
##    8.99    0        0        0         1             0       0            1
##    9.99    1        4        0         6             1       0            0
##   11.99    0        0        0         2             0       0            0
##   12.99    0        0        0         1             0       0            0
##   13.99    0        0        0         0             0       0            0
##   14.99    0        1        0         1             0       0            0
##   15.99    0        0        0         2             0       0            0
##   16.99    0        0        0         0             0       0            0
##   17.99    0        0        0         0             0       0            0
##   18.99    0        0        0         0             0       0            0
##   19.99    0        0        0         0             0       0            0
##   20.99    0        0        0         0             0       0            0
##   21.99    0        0        0         0             0       0            0
##   22.99    0        0        0         0             0       0            0
##   23.99    0        0        0         0             0       0            0
##   24.99    0        0        0         1             0       0            0
##   27.99    1        0        0         0             0       0            1
##   29.99    0        0        0         0             0       0            0
##   34.99    0        0        0         0             0       0            0
##   39.99    0        0        0         0             0       0            0
##   47.99    0        0        0         0             0       0            0
##   49.99    0        1        0         0             0       0            0
##   59.99    0        1        0         2             0       0            0
##   74.99    0        0        0         0             0       0            0
##   99.99    0        0        0         0             0       0            0
##  249.99    0        0        0         1             0       0            0
##  299.99    0        0        0         1             0       0            0
##  Games Health & Fitness Lifestyle Medical Music Navigation News Photo & Video
##   2257               76        94       8    67         20   58           167
##    435               11        14       1     4          6    7            48
##    274               20        16       0     9          4    2            62
##    317               37         6       2    13          4    4            29
##    120               12         6       2     8          2    4            12
##    226               17         8       1     8          6    0            19
##     18                1         0       0     2          0    0             3
##    135                5         0       0     3          0    0             3
##     14                0         0       1     0          0    0             1
##      3                0         0       0     1          0    0             1
##     34                1         0       1     5          1    0             2
##      2                0         0       0     1          0    0             0
##      2                0         0       0     1          0    0             0
##      2                0         0       1     1          0    0             0
##     13                0         0       0     2          0    0             1
##      2                0         0       0     0          0    0             0
##      1                0         0       0     0          0    0             0
##      2                0         0       0     0          0    0             0
##      0                0         0       0     1          0    0             0
##      2                0         0       2     4          1    0             0
##      1                0         0       0     0          1    0             0
##      0                0         0       0     0          0    0             0
##      0                0         0       0     0          0    0             1
##      0                0         0       0     0          0    0             0
##      1                0         0       3     1          0    0             0
##      0                0         0       0     0          0    0             0
##      1                0         0       0     4          0    0             0
##      0                0         0       1     0          0    0             0
##      0                0         0       0     2          0    0             0
##      0                0         0       0     0          0    0             0
##      0                0         0       0     1          0    0             0
##      0                0         0       0     0          0    0             0
##      0                0         0       0     0          1    0             0
##      0                0         0       0     0          0    0             0
##      0                0         0       0     0          0    0             0
##      0                0         0       0     0          0    0             0
##  Productivity Reference Shopping Social Networking Sports Travel Utilities
##            62        20      121               143     79     56       109
##            15         4        0                13      9      8        35
##            21        10        1                 1     15      2        54
##            12        11        0                 6      3      2        16
##             7         2        0                 1      3      4        11
##            18         6        0                 2      1      5        13
##             2         0        0                 0      0      0         2
##             6         1        0                 0      2      2         1
##             7         1        0                 0      0      1         0
##             1         0        0                 0      0      0         1
##            17         1        0                 1      1      1         3
##             1         0        0                 0      0      0         0
##             0         0        0                 0      0      0         1
##             0         1        0                 0      0      0         1
##             2         1        0                 0      0      0         0
##             0         0        0                 0      0      0         0
##             1         0        0                 0      0      0         0
##             1         0        0                 0      0      0         0
##             0         0        0                 0      0      0         0
##             2         1        0                 0      1      0         0
##             0         0        0                 0      0      0         0
##             0         1        0                 0      0      0         0
##             0         1        0                 0      0      0         0
##             0         2        0                 0      0      0         0
##             1         0        0                 0      0      0         1
##             0         0        0                 0      0      0         0
##             1         0        0                 0      0      0         0
##             0         0        0                 0      0      0         0
##             0         0        0                 0      0      0         0
##             0         1        0                 0      0      0         0
##             0         0        0                 0      0      0         0
##             0         0        0                 0      0      0         0
##             0         0        0                 0      0      0         0
##             1         0        0                 0      0      0         0
##             0         0        0                 0      0      0         0
##             0         0        0                 0      0      0         0
##  Weather
##       31
##        6
##       12
##       14
##        6
##        2
##        0
##        0
##        0
##        0
##        1
##        0
##        0
##        0
##        0
##        0
##        0
##        0
##        0
##        0
##        0
##        0
##        0
##        0
##        0
##        0
##        0
##        0
##        0
##        0
##        0
##        0
##        0
##        0
##        0
##        0

2.4 Multi-variable Graphic

2.4.1 Overview of App Store market correlation

my_data %>% 
  inspect_num() %>% 
  show_plot()

# Distribution of numeric variable 
my_data %>% 
  select(size_bytes,price,rating_count_tot,user_rating,ipadSc_urls.num, lang.num) %>% 
  inspect_cor() %>% 
  show_plot()

Finding:

ipadSc_urls.num and user_rating has strong correlation.

my_data %>% select(size_bytes,price,rating_count_tot,user_rating,ipadSc_urls.num, lang.num,cont_rating) %>%  
  ggpairs()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Finding:

User rating have strong correlation with the number of screen shot.

2.4.2 Apps vs Total rating count

my_data %>%
  arrange(desc(rating_count_tot)) %>%
  slice(1:10) %>%
  ggplot(aes(x = reorder(track_name,-rating_count_tot), y = rating_count_tot, fill = prime_genre)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Top 10 High Total Rating Count Apps by Genre" , x= "App Name") +
  scale_fill_discrete(name = "Genre") +
  theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0))

Findings:

In top 10 rating count, 4 of 10 are Game genre, 2 of 10 are Social Networking, 2 of 10 are music.
Top 1 Facebook has a gap with other Apps.

2.4.3 Apps vs Total rating count (current ver.)

my_data %>%
  arrange(desc(rating_count_ver)) %>%
  slice(1:10) %>%
  ggplot(aes(x = reorder(track_name,-rating_count_ver), y = rating_count_ver, fill = prime_genre)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Top 10 high total rating count Apps (Current Version) by Genre" , x= "App Name") +
  scale_fill_discrete(name = "Genre") +
  theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0))

2.4.4 Apps vs Download Size

my_data %>%
  arrange(desc(size_bytes_MB)) %>%
  slice(1:10) %>%
  ggplot(aes(x = reorder(track_name,-size_bytes_MB), y = size_bytes_MB, fill = prime_genre)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Top 10 high Apps Size (MByte) by Genre" , x= "App Name") +
  scale_fill_discrete(name = "Genre") +
  theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0))

Findings:

All top 10 Apps are from Games genre.

2.4.5 Apps vs Price

my_data %>%
  arrange(desc(price)) %>%
  slice(1:10) %>%
  ggplot(aes(x = reorder(track_name,-price), y = price, fill = prime_genre)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Top 10 high price of Apps by Genre" , x= "App Name") +
  scale_fill_discrete(name = "Genre") +
  theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0))

Findings:

In top 10 rating count, 4 of 10 are Education genre, 2 of 10 are Business.
Top 2 Educations App has huge gap with other Apps.

2.4.6 App vs Tentitive revenue

my_data %>%
  #revenue = price * rating_count_tot
  arrange(desc(revenue)) %>%
  slice(1:10) %>%
  ggplot(aes(x = reorder(track_name,-revenue), y = revenue, fill = prime_genre)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Top 10 Revenue Ranking by Genre" , x= "App Name") +
  scale_fill_discrete(name = "Genre") +
  theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0))

Comments:

We assumed that all the rating counts are from user who already download the Apps. The real revenue will must be higher than this data, because not everyone download the App will also rating it.

2.4.7 Genre Anaylsis

grid.arrange(

my_data %>%
  group_by(prime_genre) %>%
  summarise(med_rating_count_tot = median(rating_count_tot)) %>%
  ggplot(aes(x = reorder(prime_genre,-med_rating_count_tot), y = med_rating_count_tot)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Median of Total rating count across Genre" , x= "App Genre") +
  theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0)),

  
my_data %>%
  group_by(prime_genre,size_bytes_MB) %>%
  summarise(avg_app_size = mean(size_bytes_MB)) %>%
  ggplot(aes(x = reorder(prime_genre,-avg_app_size), y = avg_app_size)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Average App Size of Download across Genre" , x= "App Genre") +
  theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0)),

ncol =1
)

## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` regrouping output by 'prime_genre' (override with `.groups` argument)

grid.arrange(
my_data %>%
  arrange(desc(user_rating)) %>%
  group_by(prime_genre) %>%
  summarise(avg_rate = mean(user_rating)) %>%
  ggplot(aes(x = reorder(prime_genre,-avg_rate), y = avg_rate)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Average User Rating by Genre" , x= "App Genre") +
  theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0)),

my_data %>%
  group_by(prime_genre) %>%
  summarise(med_lang.num = median(lang.num))%>%
  ggplot(aes(x = reorder(prime_genre,-med_lang.num), y = med_lang.num)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "The median number of language supported by Genre" , x= "App Genre") +
  theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0)),

ncol = 1)

## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

2.4.8 User rating analysis

grid.arrange(
my_data%>%
ggplot(my_data, mapping = aes(x=user_rating , y= lang.num))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE)+
labs(title = "The Correlation of number of language \nsupport and Total Rating Count" , x= "User Rating"),


my_data%>%
ggplot(my_data, mapping = aes(x=user_rating , y=ipadSc_urls.num ))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE)+
labs(title = "The Correlation of the number of \nscreenshots showed for display and \nTotal Rating Count" , x= "User Rating"),

my_data%>%
ggplot(my_data, mapping = aes(x=user_rating , y=rating_count_tot ))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE)+
labs(title = "The Correlation of the User Rating \nand Total Rating Count" , x= "User Rating" ),

my_data%>%
filter(price != 0) %>%
ggplot(my_data, mapping = aes(x= user_rating, y=price ))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE) +
labs(title = "The Correlation of the Price and \nTotal Rating Count" , x= "User Rating")

, ncol = 2)

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

Findings:

The number of screenshots showed in the display has strong impact on the User rating.
The number of language supported, total rating count and price only slighly change the trend for user rating.

2.4.9 Total rating count analysis

grid.arrange(
my_data%>%
ggplot(my_data, mapping = aes(x=rating_count_tot , y= lang.num))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE)+
labs(title = "The Correlation of number of language \nsupport and Total Rating Count" , x= "Total rating count"),


my_data%>%
ggplot(my_data, mapping = aes(x=rating_count_tot , y=ipadSc_urls.num ))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE)+
labs(title = "The Correlation of the number of \nscreenshots showed for display and \nTotal Rating Count" , x= "Total rating count"),

my_data%>%
ggplot(my_data, mapping = aes(x=rating_count_tot , y=user_rating ))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE)+
labs(title = "The Correlation of the User Rating and \nTotal Rating Count" , x= "Total rating count" ),

my_data%>%
filter(price != 0) %>%
ggplot(my_data, mapping = aes(x=rating_count_tot , y=price ))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE)+
labs(title = "The Correlation of the Price and Total \nRating Count" , x= "Total rating count"),

ncol = 2)

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

Findings:

The number of language support, screenshot showed, and user rating have positive correlation with Total Rating Count. But Price slightly drop dowm when the Total Rating Count getting more.

2.4.10 Device suport vs. App size

my_data%>%
ggplot(my_data, mapping = aes(x=size_bytes_MB , y=sup_devices.num ))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE)+
labs(title = "The Correlation of the number of device support and App Download Size" , x= "App Download Size")

## `geom_smooth()` using formula 'y ~ x'

Finding:

Ther more download size, the more number of devices supports

2.4.11 Paid Apps Analysis

my_data %>%
  filter(price != 0) %>%
  arrange(desc(price)) %>%
  group_by(prime_genre) %>%
  summarise(median_price = median(price)) %>%
  ggplot(aes(x = reorder(prime_genre,-median_price), y = median_price)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Paid App Median Price by Genre" , x= "App Genre") +
  theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0))

## `summarise()` ungrouping output (override with `.groups` argument)

Findings:

Medical Apps have the highest median price among all the genre.

grid.arrange(
my_data%>%
  filter(price !=0 )%>% 
ggplot(my_data, mapping = aes(x=price , y= lang.num))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE)+
labs(title = "The Correlation of number of language \nsupport and Price" , x= "Price"),


my_data%>%
  filter(price !=0 )%>% 
ggplot(my_data, mapping = aes(x=price , y=ipadSc_urls.num ))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE)+
labs(title = "The Correlation of the number of \nscreenshots showed for display and \nPrice" , x= "Price"),

my_data%>%
  filter(price !=0 )%>% 
ggplot(my_data, mapping = aes(x=price , y=sup_devices.num ))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE)+
labs(title = "The Correlation of the number of device support and Price" , x= "Price"),

ncol = 2)

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

Finding:

Price with the number of language support and screeshot show have positive correlation, but with the number of devices support have negative correlation.

3 Detail EDA

3.1 Paid Vs. Non-Paid App

paid_title <- c('Free', 'Paid')
paid_value <- c(4065,3141)
paid_table <- data.frame(paid_title,paid_value)

paid_table

##   paid_title paid_value
## 1       Free       4065
## 2       Paid       3141

grid.arrange(
my_data %>%
 ggplot(my_data,mapping = aes(x = paid, fill = paid)) + 
  geom_bar() + 
  geom_text(stat = "count", aes(label = ..count..), vjust = 2.0,size=4) +
  labs(title="Number of Paid & Free Apps"),


my_data %>%
ggplot(my_data, mapping = aes(x=paid,fill = paid)) + 
    geom_bar() +
    geom_text(aes(label=round((..count..)/sum(..count..)*100,2),  vjust = 2.0),
     stat='count',nudge_y=0.125)+
    labs(y = "Percent") +
    scale_y_continuous(labels = scales::percent) +
    theme(axis.title.y = element_blank()) +
    theme_classic()+
    theme(legend.position = "None") 

,ncol = 2)

ggplot(my_data, aes(x = user_rating))+
geom_density(aes(fill = paid), alpha = 0.4) +
  scale_color_manual(values = c("#868686FF", "#EFC000FF"))+
  scale_fill_manual(values = c("#868686FF", "#EFC000FF"))

Findings:

The higher rate, the larger gap shown between free and paid Apps.
At 0 rating, free Apps are more than paid Apps.

my_data %>%
  group_by(paid) %>%
  summarise(avg_rate = mean(user_rating)) %>%
  arrange(desc(avg_rate)) %>%
  ggplot(aes(x = paid, y = avg_rate, fill = "paid")) +
  geom_bar(stat = "identity", position = "dodge",fill=c("#C4961A","#FFDB6D")) +
  geom_text(stat = "identity", aes(label = round(avg_rate,2)),position = position_dodge(width = 1), vjust = 2.0,color = "black") +
  labs(title = "Average User Rating by Free Apps and Paid Apps" , x= "",y="Average User Rating") +
  theme(axis.text.x = element_text(size=10)) +
  theme_classic() +
  scale_fill_manual(values = c("#C4961A", "#FFDB6D")) +
  theme(plot.title = element_text(size=16,face = "bold")) +
  theme(axis.text.x = element_text(size = 13 ,face = "bold")) +
  theme(axis.ticks.y = element_blank(),axis.ticks.x = element_blank()) +
  theme(legend.title = element_blank())

## `summarise()` ungrouping output (override with `.groups` argument)

genre_list <- c("Games","Entertainment","Education","Photo & Video")

group_data <- my_data %>% mutate(prime_genre = ifelse(prime_genre != genre_list, "Others", ifelse(prime_genre == "Games", "Games", ifelse(prime_genre == "Entertainment", "Entertainment", ifelse(prime_genre == "Education", "Education",  "Photo & Video")))))

## Warning: Problem with `mutate()` input `prime_genre`.
## ℹ longer object length is not a multiple of shorter object length
## ℹ Input `prime_genre` is `ifelse(...)`.

## Warning in prime_genre != genre_list: longer object length is not a multiple of
## shorter object length

head(group_data)

##          id                                         track_name size_bytes
## 1 281656475                                    PAC-MAN Premium  100788224
## 2 281796108                          Evernote - stay organized  158578688
## 3 281940292    WeatherBug - Local Weather, Radar, Maps, Alerts  100524032
## 4 282614216 eBay: Best App to Buy, Sell, Save! Online Shopping  128512000
## 5 282935706                                              Bible   92774400
## 6 283619399                                   Shanghai Mahjong   10485713
##   currency price rating_count_tot rating_count_ver user_rating user_rating_ver
## 1      USD  3.99            21292               26         4.0             4.5
## 2      USD  0.00           161065               26         4.0             3.5
## 3      USD  0.00           188583             2822         3.5             4.5
## 4      USD  0.00           262241              649         4.0             4.5
## 5      USD  0.00           985920             5320         4.5             5.0
## 6      USD  0.99             8253             5516         4.0             4.0
##      ver cont_rating prime_genre sup_devices.num ipadSc_urls.num lang.num
## 1  6.3.5          4+       Games              38               5       10
## 2  8.2.2          4+      Others              37               5       23
## 3  5.0.0          4+      Others              37               5        3
## 4 5.10.0         12+      Others              37               5        9
## 5  7.5.1          4+      Others              37               5       45
## 6    1.8          4+      Others              47               5        1
##   vpp_lic paid size_bytes_MB  revenue
## 1       1 Paid     96.119141 84955.08
## 2       1 Free    151.232422     0.00
## 3       1 Free     95.867188     0.00
## 4       1 Free    122.558594     0.00
## 5       1 Free     88.476562     0.00
## 6       1 Paid      9.999955  8170.47

 my_data %>%
  group_by(prime_genre,paid) %>%
  ggplot(aes(prime_genre, fill = paid)) +
  geom_bar(position = "fill")+
  coord_flip() +
  labs(title = "Paid and Free App Comparision Accross Different Genre ") +
  geom_hline(yintercept=0.5, color= "black") +
  theme_classic()

Findings:

Apps from Shopping genre are almost free.
The highest percentage of paid apps is from Education genre.

group_data %>%
  group_by(prime_genre,paid) %>%
  summarise(avg_rate = mean(user_rating)) %>%
  arrange(desc(avg_rate)) %>%
  ggplot(aes(x = reorder(prime_genre,-avg_rate), y = avg_rate, fill = paid)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Average User Rating by Genre" , x= "App Genre") +
  theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0))

## `summarise()` regrouping output by 'prime_genre' (override with `.groups` argument)

Finding:

We only focus on the top 4 common Apps in the market, the Education Genre is the one that paid App average rating lower than free App.

my_data %>%
  group_by(paid) %>%
  summarise(median_rate_tot = median(rating_count_tot)) %>%
  arrange(desc(median_rate_tot)) %>%
  ggplot(mapping = aes(x = paid, y = median_rate_tot)) +
  geom_bar(stat = "identity", position = "dodge",fill=c("#C4961A","#FFDB6D")) +
  geom_text(stat = "identity", aes(label = round(median_rate_tot,2)),position = position_dodge(width = 1), vjust = 2.0, color = "black") +
  labs(title = "Median of Total Review Count by Free Apps and Paid Apps" , x= "", y="Median of Total Review Count") +
  theme(axis.text.x = element_text(size=10)) +
  theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0)) +
  theme_classic() +
  theme(plot.title = element_text(size=16,face = "bold")) +
  theme(axis.text.x = element_text(size = 13 ,face = "bold")) +
  theme(axis.ticks.y = element_blank(),axis.ticks.x = element_blank()) +
  theme(legend.title = element_blank())

## `summarise()` ungrouping output (override with `.groups` argument)

group_data %>%
  group_by(prime_genre,paid) %>%
  summarise(avg_rat_tot = median(rating_count_tot)) %>%
  arrange(desc(avg_rat_tot)) %>%
  ggplot(aes(x = reorder(prime_genre,-avg_rat_tot), y = avg_rat_tot, fill = paid)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Median of Total Rating Count by Genre" , x= "App Genre",y= "Median of Total Rating Count") +
  theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0))

## `summarise()` regrouping output by 'prime_genre' (override with `.groups` argument)

group_data %>% 
  group_by(paid, prime_genre) %>% 
  ggplot(mapping = aes(x = user_rating, y = rating_count_tot)) +
  geom_point() +
  facet_grid(prime_genre ~ paid)

3.2 Popularity Analysis

#prepare popularity data
pop_data <- my_data %>% mutate(popularity = rating_count_tot * user_rating)
head(pop_data)

##          id                                         track_name size_bytes
## 1 281656475                                    PAC-MAN Premium  100788224
## 2 281796108                          Evernote - stay organized  158578688
## 3 281940292    WeatherBug - Local Weather, Radar, Maps, Alerts  100524032
## 4 282614216 eBay: Best App to Buy, Sell, Save! Online Shopping  128512000
## 5 282935706                                              Bible   92774400
## 6 283619399                                   Shanghai Mahjong   10485713
##   currency price rating_count_tot rating_count_ver user_rating user_rating_ver
## 1      USD  3.99            21292               26         4.0             4.5
## 2      USD  0.00           161065               26         4.0             3.5
## 3      USD  0.00           188583             2822         3.5             4.5
## 4      USD  0.00           262241              649         4.0             4.5
## 5      USD  0.00           985920             5320         4.5             5.0
## 6      USD  0.99             8253             5516         4.0             4.0
##      ver cont_rating  prime_genre sup_devices.num ipadSc_urls.num lang.num
## 1  6.3.5          4+        Games              38               5       10
## 2  8.2.2          4+ Productivity              37               5       23
## 3  5.0.0          4+      Weather              37               5        3
## 4 5.10.0         12+     Shopping              37               5        9
## 5  7.5.1          4+    Reference              37               5       45
## 6    1.8          4+        Games              47               5        1
##   vpp_lic paid size_bytes_MB  revenue popularity
## 1       1 Paid     96.119141 84955.08    85168.0
## 2       1 Free    151.232422     0.00   644260.0
## 3       1 Free     95.867188     0.00   660040.5
## 4       1 Free    122.558594     0.00  1048964.0
## 5       1 Free     88.476562     0.00  4436640.0
## 6       1 Paid      9.999955  8170.47    33012.0

3.2.1 App Popularity

pop_data %>%
  arrange(desc(popularity)) %>%
  slice(1:10) %>%
  ggplot(aes(x = reorder(track_name,popularity), y = popularity)) +
  geom_bar(stat = "identity", position = "dodge",fill = "#C3D7A4") +
  labs(title = "Top 10 High Popularity of Apps in the Market" ,x= "",y="Popularity", caption = "Popularity = Total Review Count * User Rating") +
  scale_fill_discrete(name = "Genre") +
  coord_flip() +
  theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0)) +
  theme_classic() +
  theme(plot.title = element_text(size=16,face = "bold")) +
  theme(axis.text.y = element_text(size = 13 ,face = "bold")) +
  theme(axis.ticks.y = element_blank(),axis.ticks.x = element_blank()) +
  theme(legend.title = element_blank())

Findings:

The order of Popularity across Apps is similar as the the order of Total Rating Count, but Pinterest is behind Pandora - Music & Radio in the Total Rating Count ranking, now is ahead Pandora - Music & Radio.

3.2.2 Genre Popularity

pop_data %>%
  group_by(prime_genre) %>%
  summarise(med_pop = median(popularity))%>%
  ggplot(aes(x = reorder(prime_genre,med_pop), y = med_pop)) +
  geom_bar(stat = "identity", position = "dodge",fill = "#C3D7A4") +
  labs(title = "The Median of Popularity Across Genre" , x= "",y="Median of Popularity", caption = "Popularity = Total Review Count * User Rating" ) +
  theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0)) +
  coord_flip() +
  theme_classic() +
  theme(plot.title = element_text(size=16,face = "bold")) +
  theme(axis.text.y = element_text(size = 11 ,face = "bold")) +
  theme(axis.ticks.y = element_blank(),axis.ticks.x = element_blank()) +
  theme(legend.title = element_blank())

## `summarise()` ungrouping output (override with `.groups` argument)

Finding:

The order of the mean Popularity is completly diffenrent from

3.3 Growth Rate

#set up the growth rate by formula: (current - previous)/previous)
grow_rate_data <- my_data %>% mutate(rating_count_previous = rating_count_tot- rating_count_ver)
grow_rate_data <- grow_rate_data %>% mutate(rating_count_growth = ifelse(rating_count_previous == 0 | rating_count_ver == 0, 0, ((rating_count_ver - rating_count_previous)/rating_count_previous)))
head(grow_rate_data)

##          id                                         track_name size_bytes
## 1 281656475                                    PAC-MAN Premium  100788224
## 2 281796108                          Evernote - stay organized  158578688
## 3 281940292    WeatherBug - Local Weather, Radar, Maps, Alerts  100524032
## 4 282614216 eBay: Best App to Buy, Sell, Save! Online Shopping  128512000
## 5 282935706                                              Bible   92774400
## 6 283619399                                   Shanghai Mahjong   10485713
##   currency price rating_count_tot rating_count_ver user_rating user_rating_ver
## 1      USD  3.99            21292               26         4.0             4.5
## 2      USD  0.00           161065               26         4.0             3.5
## 3      USD  0.00           188583             2822         3.5             4.5
## 4      USD  0.00           262241              649         4.0             4.5
## 5      USD  0.00           985920             5320         4.5             5.0
## 6      USD  0.99             8253             5516         4.0             4.0
##      ver cont_rating  prime_genre sup_devices.num ipadSc_urls.num lang.num
## 1  6.3.5          4+        Games              38               5       10
## 2  8.2.2          4+ Productivity              37               5       23
## 3  5.0.0          4+      Weather              37               5        3
## 4 5.10.0         12+     Shopping              37               5        9
## 5  7.5.1          4+    Reference              37               5       45
## 6    1.8          4+        Games              47               5        1
##   vpp_lic paid size_bytes_MB  revenue rating_count_previous rating_count_growth
## 1       1 Paid     96.119141 84955.08                 21266          -0.9987774
## 2       1 Free    151.232422     0.00                161039          -0.9998385
## 3       1 Free     95.867188     0.00                185761          -0.9848084
## 4       1 Free    122.558594     0.00                261592          -0.9975190
## 5       1 Free     88.476562     0.00                980600          -0.9945748
## 6       1 Paid      9.999955  8170.47                  2737           1.0153453

#plot the growth rate by genre
grow_rate_data %>%
  group_by(prime_genre) %>%
  summarise(avg_growth = mean(rating_count_growth)) %>%
  #arrange(desc(avg_growth)) %>%
  ggplot(aes(x = reorder(prime_genre,-avg_growth), y = avg_growth)) +  
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Average of Total Rating Count Growth Rate by Genre", x= "App Genre") +
  theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0))+
  scale_y_continuous(limits=c(-1, 7))

## `summarise()` ungrouping output (override with `.groups` argument)

Finding:

Utilites, Games, Reference genre have positive growth rate base on the current version of the apps.

3.4 Statistics Test

3.4.1 Regression test for Total Rating Count

app_lm <- my_data %>% mutate_if(is.integer, as.factor)

mod <- glm(rating_count_tot ~ price + paid + ipadSc_urls.num + size_bytes_MB + lang.num , 
           family = binomial(link='logit'), 
           data = app_lm)

summary(mod)

## 
## Call:
## glm(formula = rating_count_tot ~ price + paid + ipadSc_urls.num + 
##     size_bytes_MB + lang.num, family = binomial(link = "logit"), 
##     data = app_lm)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.3550   0.1176   0.3713   0.5212   1.7317  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -1.258e+00  3.317e-01  -3.792 0.000150 ***
## price            -3.653e-03  6.421e-03  -0.569 0.569450    
## paidPaid          7.065e-01  8.676e-02   8.144 3.84e-16 ***
## ipadSc_urls.num1  1.062e+00  2.809e-01   3.780 0.000157 ***
## ipadSc_urls.num2  4.846e-01  2.270e-01   2.135 0.032793 *  
## ipadSc_urls.num3  5.582e-01  1.808e-01   3.087 0.002021 ** 
## ipadSc_urls.num4  7.790e-01  1.308e-01   5.953 2.63e-09 ***
## ipadSc_urls.num5  1.228e+00  8.758e-02  14.022  < 2e-16 ***
## size_bytes_MB    -2.041e-04  1.243e-04  -1.642 0.100672    
## lang.num1         1.998e+00  3.282e-01   6.088 1.15e-09 ***
## lang.num2         9.897e-01  3.357e-01   2.948 0.003196 ** 
## lang.num3         2.223e+00  3.876e-01   5.736 9.68e-09 ***
## lang.num4         2.798e+00  4.651e-01   6.015 1.80e-09 ***
## lang.num5         2.623e+00  4.248e-01   6.173 6.69e-10 ***
## lang.num6         3.405e+00  5.617e-01   6.062 1.35e-09 ***
## lang.num7         3.826e+00  6.701e-01   5.710 1.13e-08 ***
## lang.num8         4.483e+00  7.850e-01   5.711 1.12e-08 ***
## lang.num9         2.985e+00  4.917e-01   6.069 1.28e-09 ***
## lang.num10        5.161e+00  1.055e+00   4.891 1.00e-06 ***
## lang.num11        5.657e+00  1.053e+00   5.370 7.88e-08 ***
## lang.num12        3.631e+00  5.588e-01   6.497 8.17e-11 ***
## lang.num13        3.204e+00  5.617e-01   5.704 1.17e-08 ***
## lang.num14        4.595e+00  1.059e+00   4.341 1.42e-05 ***
## lang.num15        1.774e+01  4.149e+02   0.043 0.965906    
## lang.num16        2.665e+00  4.927e-01   5.409 6.32e-08 ***
## lang.num17        1.773e+01  5.663e+02   0.031 0.975016    
## lang.num18        4.319e+00  1.062e+00   4.066 4.77e-05 ***
## lang.num19        3.579e+00  1.075e+00   3.331 0.000866 ***
## lang.num20        1.759e+01  8.528e+02   0.021 0.983542    
## lang.num21        1.777e+01  6.475e+02   0.027 0.978102    
## lang.num22        1.783e+01  6.759e+02   0.026 0.978956    
## lang.num23        1.768e+01  7.963e+02   0.022 0.982282    
## lang.num24        2.830e+00  1.091e+00   2.593 0.009501 ** 
## lang.num25        1.806e+01  1.024e+03   0.018 0.985931    
## lang.num26        1.760e+01  8.249e+02   0.021 0.982974    
## lang.num27        1.810e+01  1.443e+03   0.013 0.989994    
## lang.num28        2.077e+00  1.152e+00   1.804 0.071296 .  
## lang.num29        1.802e+01  8.507e+02   0.021 0.983103    
## lang.num30        1.780e+01  7.212e+02   0.025 0.980307    
## lang.num31        2.031e+00  5.071e-01   4.006 6.18e-05 ***
## lang.num32        1.798e+01  9.368e+02   0.019 0.984685    
## lang.num33        3.725e+00  1.074e+00   3.468 0.000525 ***
## lang.num34        2.780e+00  1.095e+00   2.539 0.011131 *  
## lang.num35        1.850e+01  2.752e+03   0.007 0.994638    
## lang.num36        1.803e+01  1.923e+03   0.009 0.992519    
## lang.num37        1.762e+01  2.797e+03   0.006 0.994974    
## lang.num39        1.761e+01  2.797e+03   0.006 0.994976    
## lang.num40        1.762e+01  3.956e+03   0.004 0.996447    
## lang.num41        1.807e+01  2.797e+03   0.006 0.994845    
## lang.num42        3.588e-02  1.287e+00   0.028 0.977751    
## lang.num43        4.734e-01  1.470e+00   0.322 0.747509    
## lang.num45        1.775e+01  1.313e+03   0.014 0.989216    
## lang.num46        1.762e+01  1.978e+03   0.009 0.992892    
## lang.num47        1.763e+01  3.956e+03   0.004 0.996444    
## lang.num50        1.761e+01  3.956e+03   0.004 0.996448    
## lang.num54        1.691e+01  2.797e+03   0.006 0.995177    
## lang.num55        1.762e+01  2.797e+03   0.006 0.994975    
## lang.num56        1.764e+01  3.956e+03   0.004 0.996442    
## lang.num58        1.710e+01  1.135e+03   0.015 0.987979    
## lang.num59        1.761e+01  3.956e+03   0.004 0.996449    
## lang.num63        1.809e+01  3.956e+03   0.005 0.996352    
## lang.num68        1.886e+01  3.956e+03   0.005 0.996197    
## lang.num69        1.804e+01  2.270e+03   0.008 0.993658    
## lang.num74        1.883e+01  3.956e+03   0.005 0.996202    
## lang.num75        1.763e+01  3.956e+03   0.004 0.996445    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5536.5  on 7196  degrees of freedom
## Residual deviance: 4586.1  on 7132  degrees of freedom
## AIC: 4716.1
## 
## Number of Fisher Scoring iterations: 16

# plot residuals to check for patterns
par(mfrow = c(1, 1))
plot(group_data$price, mod$residuals)

par(mfrow = c(1, 1))
plot(group_data$size_bytes_MB, mod$residuals)

par(mfrow = c(1, 1))
plot(group_data$sup_devices.num, mod$residuals)

4 Finding 1 - Does paid App has better performance in iOS App Store Market?

pop_data %>%
  group_by(prime_genre,paid) %>%
  summarise(med_popular = median(popularity)) %>%
  arrange(desc(med_popular)) %>%
  ggplot(aes(x = reorder(prime_genre,-med_popular), y = med_popular, fill = paid)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Median Popularity by Genre", x= "App Genre") +
  theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0))

## `summarise()` regrouping output by 'prime_genre' (override with `.groups` argument)

pop_data_grow <- pop_data %>% mutate(prime_genre = ifelse(prime_genre != genre_list, "Others", ifelse(prime_genre == "Games", "Games", ifelse(prime_genre == "Entertainment", "Entertainment", ifelse(prime_genre == "Education", "Education",  "Photo & Video")))))

## Warning: Problem with `mutate()` input `prime_genre`.
## ℹ longer object length is not a multiple of shorter object length
## ℹ Input `prime_genre` is `ifelse(...)`.

## Warning in prime_genre != genre_list: longer object length is not a multiple of
## shorter object length

pop_genre_pic <- pop_data_grow %>%
  group_by(prime_genre,paid) %>%
  summarise(med_popular = median(popularity)) %>%
  #arrange(desc(med_popular)) %>%
  ggplot(aes(x = reorder(prime_genre,desc(prime_genre)), y = med_popular, fill = paid)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Popularity Comparison Across Different Genres by Free \n& Paid Apps", subtitle = "Most free apps have higher median of popularity than paid apps  ", x= "", y="Median of Popularity", caption = "Popularity = Total Review Count * User Rating") +
  scale_fill_manual(values = c("#C4961A", "#FFDB6D")) +
  theme_classic() +
  theme(plot.title = element_text(size=16,face = "bold")) +
  theme(axis.text.x = element_text(size = 13 ,face = "bold")) +
  theme(legend.title = element_blank())

## `summarise()` regrouping output by 'prime_genre' (override with `.groups` argument)

pop_genre_pic

Finding:

When I only focus on top 4 market share of Apps, the results show Free App is more popular than Paid App. However, does paid app really perform worse than free App? It will be dig out more in the following.

grow_group_rate <- grow_rate_data %>% mutate(prime_genre = ifelse(prime_genre != genre_list, "Others", ifelse(prime_genre == "Games", "Games", ifelse(prime_genre == "Entertainment", "Entertainment", ifelse(prime_genre == "Education", "Education",  "Photo & Video")))))

## Warning: Problem with `mutate()` input `prime_genre`.
## ℹ longer object length is not a multiple of shorter object length
## ℹ Input `prime_genre` is `ifelse(...)`.

## Warning in prime_genre != genre_list: longer object length is not a multiple of
## shorter object length

grow_rate_pic <- grow_group_rate %>%
  group_by(prime_genre,paid) %>%
  summarise(avg_rat_tot_grow = mean(rating_count_growth)) %>%
  ggplot(aes(x = reorder(prime_genre,desc(prime_genre)), y = avg_rat_tot_grow, fill = paid)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Average Growth Rate of Total Reviews Count Across Different \nGenres by Free & Paid Apps" , x="",y="Average Growth Rate of Total Reviews", subtitle = "Growth rate of free apps are better than paid apps in Games genre", caption = "Growth Rate = (Total reviews count of current version - past version) / past version ") + 
  theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0)) +
  scale_fill_manual(values = c("#C4961A", "#FFDB6D")) +
  theme_classic() +
  theme(plot.title = element_text(size=16,face = "bold")) +
  theme(axis.text.x = element_text(size = 13 ,face = "bold")) +
  theme(legend.title = element_blank())

## `summarise()` regrouping output by 'prime_genre' (override with `.groups` argument)

grow_rate_pic

Findings:

Overall, the rating_count_tot growth rate of paid App are better than free App. However, in the Games Genre, free Apps actully have better growth rate than paid Apps.

#save to png for memo finding 1
ggsave(filename = "pop_grow_pic.png", width = 9, height = 6, plot = pop_genre_pic)
ggsave(filename = "grow_rate_pic.png", width = 9, height = 6, plot = grow_rate_pic)

5 Finding 2 - What’s the popularity of keywords that App’s title are using?

#load data for 
library(ggwordcloud)
library(dplyr)
library(tm)

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

library(stringi)
library(ggwordcloud)

library (wordcloud)

## Loading required package: RColorBrewer

library (RColorBrewer)
library (SnowballC)

5.1 Keyword for whole Apps market

myCorpus <- Corpus(VectorSource(my_data$track_name))
# Convert the text to lower case
myCorpus <- tm_map(myCorpus, content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(tolower)):
## transformation drops documents

myCorpus <- tm_map(myCorpus, removeNumbers)

## Warning in tm_map.SimpleCorpus(myCorpus, removeNumbers): transformation drops
## documents

# Remove english common stopwords
myCorpus <- tm_map(myCorpus, removeWords, stopwords('english'))

## Warning in tm_map.SimpleCorpus(myCorpus, removeWords, stopwords("english")):
## transformation drops documents

myCorpus <- tm_map(myCorpus, removePunctuation)

## Warning in tm_map.SimpleCorpus(myCorpus, removePunctuation): transformation
## drops documents

toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
myCorpus <- tm_map(myCorpus, toSpace,"\",")

## Warning in tm_map.SimpleCorpus(myCorpus, toSpace, "\","): transformation drops
## documents

myCorpus <- tm_map(myCorpus, function(x) iconv(enc2utf8(x), sub = "byte"))

## Warning in tm_map.SimpleCorpus(myCorpus, function(x) iconv(enc2utf8(x), :
## transformation drops documents

myCorpus <- tm_map(myCorpus, tolower)

## Warning in tm_map.SimpleCorpus(myCorpus, tolower): transformation drops
## documents

myCorpus <- stri_trans_general(myCorpus, "latin-ascii")

## Warning in stri_trans_general(myCorpus, "latin-ascii"): argument is not an
## atomic vector; coercing

myCorpus <- Corpus(VectorSource(myCorpus))

tdm <- TermDocumentMatrix(myCorpus)

tdmatrix <- as.matrix(tdm)

wordFreq <- sort(rowSums(tdmatrix), decreasing = TRUE)

wordFrame <- as.data.frame(as.table(wordFreq))

wordFrame$Var1=str_replace(wordFrame$Var1, "\",","")
wordFrame$Var1=str_replace(wordFrame$Var1, "\"","")
head(wordFrame,30)

##         Var1 Freq
## 1       game  179
## 2       game  149
## 3       full  123
## 4     hidden  103
## 5        pro  102
## 6     object   93
## 7      video   87
## 8       free   81
## 9         hd   78
## 10     photo   76
## 11   mystery   72
## 12 simulator   66
## 13     games   65
## 14     music   60
## 15       app   59
## 16      live   59
## 17 minecraft   58
## 18      ipad   55
## 19       pro   55
## 20 simulator   54
## 21       app   53
## 22     world   53
## 23     games   49
## 24   edition   48
## 25       car   45
## 26    puzzle   44
## 27 adventure   41
## 28      free   41
## 29      best   39
## 30    pocket   38

word_freq <- wordcloud(words = wordFrame$Var1, freq = wordFrame$Freq, min.freq = 1,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

word_freq

## NULL

5.2 Keyword for Games Genre

word_data <- my_data %>% filter(prime_genre == "Games")
                                                                   
myCorpus <- Corpus(VectorSource(word_data$track_name))
myCorpus <- tm_map(myCorpus, content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(tolower)):
## transformation drops documents

myCorpus <- tm_map(myCorpus, removeNumbers)

## Warning in tm_map.SimpleCorpus(myCorpus, removeNumbers): transformation drops
## documents

myCorpus <- tm_map(myCorpus, removeWords, stopwords('english'))

## Warning in tm_map.SimpleCorpus(myCorpus, removeWords, stopwords("english")):
## transformation drops documents

myCorpus <- tm_map(myCorpus, removePunctuation)

## Warning in tm_map.SimpleCorpus(myCorpus, removePunctuation): transformation
## drops documents

toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
myCorpus <- tm_map(myCorpus, toSpace,"\",")

## Warning in tm_map.SimpleCorpus(myCorpus, toSpace, "\","): transformation drops
## documents

myCorpus <- tm_map(myCorpus, function(x) iconv(enc2utf8(x), sub = "byte"))

## Warning in tm_map.SimpleCorpus(myCorpus, function(x) iconv(enc2utf8(x), :
## transformation drops documents

myCorpus <- tm_map(myCorpus, tolower)

## Warning in tm_map.SimpleCorpus(myCorpus, tolower): transformation drops
## documents

myCorpus <- stri_trans_general(myCorpus, "latin-ascii")

## Warning in stri_trans_general(myCorpus, "latin-ascii"): argument is not an
## atomic vector; coercing

myCorpus <- Corpus(VectorSource(myCorpus))

tdm <- TermDocumentMatrix(myCorpus)

tdmatrix <- as.matrix(tdm)

wordFreq <- sort(rowSums(tdmatrix), decreasing = TRUE)

wordFrame <- as.data.frame(as.table(wordFreq))

wordFrame$Var1=str_replace(wordFrame$Var1, "\",","")
head(wordFrame,20)

##         Var1 Freq
## 1       game  172
## 2       game  133
## 3       full  123
## 4     hidden  103
## 5     object   93
## 6    mystery   71
## 7  simulator   61
## 8      games   56
## 9  simulator   54
## 10        hd   50
## 11    puzzle   44
## 12     world   41
## 13 adventure   39
## 14       car   39
## 15      free   39
## 16    racing   34
## 17   endless   32
## 18 adventure   30
## 19   "escape   29
## 20     games   29

wordFrame %>%
  filter(Freq > 20) %>%
  ggplot(mapping = aes(label = Var1 , size = Freq)) +
  #geom_text_wordcloud(area_corr = TRUE) +
  geom_text_wordcloud(na.rm = TRUE, eccentricity = 0.5) +
  #geom_text_wordcloud_area(rm_outside = TRUE)
  #scale_size_area(max_size = 10) +
  scale_radius(range = c(0, 15), limits = c(0, NA)) +
  theme_minimal()

Games <- c("game","full", "simulator", "hidden","object")

5.3 Keyword for Entertainment Genre

word_data <- my_data %>% filter(prime_genre == "Entertainment")
                                                                   
myCorpus <- Corpus(VectorSource(word_data$track_name))
myCorpus <- tm_map(myCorpus, content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(tolower)):
## transformation drops documents

myCorpus <- tm_map(myCorpus, removeNumbers)

## Warning in tm_map.SimpleCorpus(myCorpus, removeNumbers): transformation drops
## documents

myCorpus <- tm_map(myCorpus, removeWords, stopwords('english'))

## Warning in tm_map.SimpleCorpus(myCorpus, removeWords, stopwords("english")):
## transformation drops documents

myCorpus <- tm_map(myCorpus, removePunctuation)

## Warning in tm_map.SimpleCorpus(myCorpus, removePunctuation): transformation
## drops documents

myCorpus <- tm_map(myCorpus, function(x) iconv(enc2utf8(x), sub = "byte"))

## Warning in tm_map.SimpleCorpus(myCorpus, function(x) iconv(enc2utf8(x), :
## transformation drops documents

myCorpus <- tm_map(myCorpus, tolower)

## Warning in tm_map.SimpleCorpus(myCorpus, tolower): transformation drops
## documents

myCorpus <- stri_trans_general(myCorpus, "latin-ascii")

## Warning in stri_trans_general(myCorpus, "latin-ascii"): argument is not an
## atomic vector; coercing

myCorpus <- Corpus(VectorSource(myCorpus))

tdm <- TermDocumentMatrix(myCorpus)

tdmatrix <- as.matrix(tdm)

wordFreq <- sort(rowSums(tdmatrix), decreasing = TRUE)

wordFrame <- as.data.frame(as.table(wordFreq))
wordFrame$Var1=str_replace(wordFrame$Var1, "\",","")
wordFrame$Var1=str_replace(wordFrame$Var1, "\"","")
head(wordFrame,25)

##          Var1 Freq
## 1   minecraft   25
## 2        live   15
## 3      pocket   14
## 4       watch   14
## 5         app   13
## 6      movies   13
## 7     edition   12
## 8          tv   12
## 9       santa   11
## 10     disney   10
## 11    talking   10
## 12       best   10
## 13      emoji   10
## 14       full   10
## 15      color    9
## 16       ipad    9
## 17       maps    9
## 18     stream    9
## 19        app    8
## 20       town    8
## 21      video    8
## 22   coloring    7
## 23   episodes    7
## 24   official    7
## 25 wallpapers    7

wordFrame %>%
  filter(Freq > 5) %>%
  ggplot(mapping = aes(label = Var1 , size = Freq)) +
  #geom_text_wordcloud(area_corr = TRUE) +
  geom_text_wordcloud(na.rm = TRUE, eccentricity = 0.5) +
  #geom_text_wordcloud_area(rm_outside = TRUE)
  #scale_size_area(max_size = 10) +
  scale_radius(range = c(0, 15), limits = c(0, NA)) +
  theme_minimal()

Entertainment <- c("minecraft","app","live","pocket", "watch")

5.4 Keyword for Education Genre

word_data <- my_data %>% filter(prime_genre == "Education")
                                                                   
myCorpus <- Corpus(VectorSource(word_data$track_name))
myCorpus <- tm_map(myCorpus, content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(tolower)):
## transformation drops documents

myCorpus <- tm_map(myCorpus, removeNumbers)

## Warning in tm_map.SimpleCorpus(myCorpus, removeNumbers): transformation drops
## documents

myCorpus <- tm_map(myCorpus, removeWords, stopwords('english'))

## Warning in tm_map.SimpleCorpus(myCorpus, removeWords, stopwords("english")):
## transformation drops documents

myCorpus <- tm_map(myCorpus, removePunctuation)

## Warning in tm_map.SimpleCorpus(myCorpus, removePunctuation): transformation
## drops documents

myCorpus <- tm_map(myCorpus, function(x) iconv(enc2utf8(x), sub = "byte"))

## Warning in tm_map.SimpleCorpus(myCorpus, function(x) iconv(enc2utf8(x), :
## transformation drops documents

myCorpus <- tm_map(myCorpus, tolower)

## Warning in tm_map.SimpleCorpus(myCorpus, tolower): transformation drops
## documents

myCorpus <- stri_trans_general(myCorpus, "latin-ascii")

## Warning in stri_trans_general(myCorpus, "latin-ascii"): argument is not an
## atomic vector; coercing

myCorpus <- Corpus(VectorSource(myCorpus))

tdm <- TermDocumentMatrix(myCorpus)

tdmatrix <- as.matrix(tdm)
#tdmatrix <- tm_map(replace, "\",","")
#tdmatrix = str_replace_all(tdmatrix$"1", "game\",", "game")
#head(tdmatrix,20)
wordFreq <- sort(rowSums(tdmatrix), decreasing = TRUE)

wordFrame <- as.data.frame(as.table(wordFreq))

wordFrame$Var1=str_replace(wordFrame$Var1, "\"","")
wordFrame$Var1=str_replace(wordFrame$Var1, "\",","")
wordFrame$Var1=str_replace(wordFrame$Var1, ",","")
head(wordFrame,30)

##        Var1 Freq
## 1      toca   28
## 2        dr   24
## 3     learn   24
## 4     panda   20
## 5      mini   17
## 6      sago   16
## 7      kids   15
## 8      kids   15
## 9      test   13
## 10     math   11
## 11  tinybop   11
## 12 learning   10
## 13       hd    9
## 14    brain    8
## 15     game    8
## 16  spanish    8
## 17      paw    7
## 18     life    7
## 19   pandas    7
## 20   patrol    7
## 21     play    7
## 22   school    7
## 23     math    6
## 24     star    6
## 25      app    6
## 26      fun    6
## 27    games    6
## 28    games    6
## 29  monster    6
## 30      sky    6

wordFrame %>%
  filter(Freq > 5) %>%
  ggplot(mapping = aes(label = Var1 , size = Freq)) +
  #geom_text_wordcloud(area_corr = TRUE) +
  geom_text_wordcloud(na.rm = TRUE, eccentricity = 0.5) +
  #geom_text_wordcloud_area(rm_outside = TRUE)
  #scale_size_area(max_size = 10) +
  scale_radius(range = c(0, 15), limits = c(0, NA)) +
  theme_minimal()

Education <- c("kids","toca","learn","math","pandas")

5.5 Keyword for Photo & Video Genre

word_data <- my_data %>% filter(prime_genre == "Photo & Video")
                                                                   
myCorpus <- Corpus(VectorSource(word_data$track_name))
myCorpus <- tm_map(myCorpus, content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(tolower)):
## transformation drops documents

myCorpus <- tm_map(myCorpus, removeNumbers)

## Warning in tm_map.SimpleCorpus(myCorpus, removeNumbers): transformation drops
## documents

myCorpus <- tm_map(myCorpus, removeWords, stopwords('english'))

## Warning in tm_map.SimpleCorpus(myCorpus, removeWords, stopwords("english")):
## transformation drops documents

myCorpus <- tm_map(myCorpus, removePunctuation)

## Warning in tm_map.SimpleCorpus(myCorpus, removePunctuation): transformation
## drops documents

myCorpus <- tm_map(myCorpus, function(x) iconv(enc2utf8(x), sub = "byte"))

## Warning in tm_map.SimpleCorpus(myCorpus, function(x) iconv(enc2utf8(x), :
## transformation drops documents

myCorpus <- tm_map(myCorpus, tolower)

## Warning in tm_map.SimpleCorpus(myCorpus, tolower): transformation drops
## documents

myCorpus <- stri_trans_general(myCorpus, "latin-ascii")

## Warning in stri_trans_general(myCorpus, "latin-ascii"): argument is not an
## atomic vector; coercing

myCorpus <- Corpus(VectorSource(myCorpus))

tdm <- TermDocumentMatrix(myCorpus)

tdmatrix <- as.matrix(tdm)

wordFreq <- sort(rowSums(tdmatrix), decreasing = TRUE)

wordFrame <- as.data.frame(as.table(wordFreq))

wordFrame$Var1=str_replace(wordFrame$Var1, "\"","")

head(wordFrame,30)

##        Var1 Freq
## 1     photo   68
## 2     video   42
## 3    editor   33
## 4   collage   23
## 5   editor,   23
## 6    photos   22
## 7    camera   20
## 8   camera,   20
## 9     maker   15
## 10   maker,   15
## 11      pro   15
## 12  effects   13
## 13  photos,   12
## 14     face   10
## 15   selfie   10
## 16    photo    9
## 17      add    8
## 18  filters    8
## 19     live    8
## 20   videos    8
## 21 effects,    7
## 22    movie    7
## 23      pic    7
## 24  picture    7
## 25   video,    7
## 26    video    6
## 27 collage,    6
## 28   design    6
## 29  editing    6
## 30 filters,    6

wordFrame %>%
  filter(Freq > 3) %>%
  ggplot(mapping = aes(label = Var1 , size = Freq)) +
  #geom_text_wordcloud(area_corr = TRUE) +
  geom_text_wordcloud(na.rm = TRUE, eccentricity = 0.5) +
  #geom_text_wordcloud_area(rm_outside = TRUE)
  #scale_size_area(max_size = 10) +
  scale_radius(range = c(0, 15), limits = c(0, NA)) +
  theme_minimal()

Photo_Video <- c("Photo","video","editor","camera","maker")

5.6 Keyword for Utilities Genre

word_data <- my_data %>% filter(prime_genre == "Utilities")
                                                                   
myCorpus <- Corpus(VectorSource(word_data$track_name))
myCorpus <- tm_map(myCorpus, content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(tolower)):
## transformation drops documents

myCorpus <- tm_map(myCorpus, removeNumbers)

## Warning in tm_map.SimpleCorpus(myCorpus, removeNumbers): transformation drops
## documents

myCorpus <- tm_map(myCorpus, removeWords, stopwords('english'))

## Warning in tm_map.SimpleCorpus(myCorpus, removeWords, stopwords("english")):
## transformation drops documents

myCorpus <- tm_map(myCorpus, removePunctuation)

## Warning in tm_map.SimpleCorpus(myCorpus, removePunctuation): transformation
## drops documents

myCorpus <- tm_map(myCorpus, function(x) iconv(enc2utf8(x), sub = "byte"))

## Warning in tm_map.SimpleCorpus(myCorpus, function(x) iconv(enc2utf8(x), :
## transformation drops documents

myCorpus <- tm_map(myCorpus, tolower)

## Warning in tm_map.SimpleCorpus(myCorpus, tolower): transformation drops
## documents

myCorpus <- stri_trans_general(myCorpus, "latin-ascii")

## Warning in stri_trans_general(myCorpus, "latin-ascii"): argument is not an
## atomic vector; coercing

myCorpus <- Corpus(VectorSource(myCorpus))

tdm <- TermDocumentMatrix(myCorpus)

tdmatrix <- as.matrix(tdm)

wordFreq <- sort(rowSums(tdmatrix), decreasing = TRUE)

wordFrame <- as.data.frame(as.table(wordFreq))

wordFrame$Var1=str_replace(wordFrame$Var1, "\"","")
wordFrame$Var1=str_replace(wordFrame$Var1, "\",","")
wordFrame$Var1=str_replace(wordFrame$Var1, ",","")
head(wordFrame,30)

##          Var1 Freq
## 1    keyboard   16
## 2         pro   12
## 3   minecraft   11
## 4    keyboard   10
## 5         web   10
## 6       emoji    9
## 7     browser    8
## 8       emoji    7
## 9  calculator    6
## 10    browser    6
## 11         go    6
## 12 calculator    5
## 13 calculator    5
## 14      clock    5
## 15    creator    5
## 16     custom    5
## 17    edition    5
## 18       file    5
## 19      flash    5
## 20       free    5
## 21     iphone    5
## 22         pe    5
## 23     pocket    5
## 24    pokemon    5
## 25        pro    5
## 26     remote    5
## 27      smart    5
## 28     themes    5
## 29       code    4
## 30     emojis    4

wordFrame %>%
  filter(Freq > 3) %>%
  ggplot(mapping = aes(label = Var1 , size = Freq)) +
  #geom_text_wordcloud(area_corr = TRUE) +
  geom_text_wordcloud(na.rm = TRUE, eccentricity = 0.5) +
  #geom_text_wordcloud_area(rm_outside = TRUE)
  #scale_size_area(max_size = 10) +
  scale_radius(range = c(0, 15), limits = c(0, NA)) +
  theme_minimal()

Utilities <- c("keyboard","emoji","pro","calculator","browser")

# build up the table for high frequent keywords
Ranking <- c("1","2","3","4","5")
freq_word_list <- data.frame(Ranking,Games, Entertainment, Education,Photo_Video,Utilities)
formattable(freq_word_list)

Ranking	Games	Entertainment	Education	Photo_Video	Utilities
1	game	minecraft	kids	Photo	keyboard
2	full	app	toca	video	emoji
3	simulator	live	learn	editor	pro
4	hidden	pocket	math	camera	calculator
5	object	watch	pandas	maker	browser

library(knitr)
library(kableExtra)

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

kable(freq_word_list) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  add_header_above(c(" ", "High-Frequency Keywords List by Genre" = 5))

	High-Frequency Keywords List by Genre
Ranking	Games	Entertainment	Education	Photo_Video	Utilities
1	game	minecraft	kids	Photo	keyboard
2	full	app	toca	video	emoji
3	simulator	live	learn	editor	pro
4	hidden	pocket	math	camera	calculator
5	object	watch	pandas	maker	browser

6 Conclusion

After analyzing, I summarized several findings as follow:

Top 5 app based on total reviews count are Facebook, Instagram, Clash of Clans, Temple Run, and Pandora - Music & Radio.
Almost 60% of Apps are rated between 4 and 4.5.
More than 50% of Apps market share are in Games genre and follow by Entertainment, Education, and Photo & Video.
56% of Apps are free in the market, but overall the paid apps have better user ratings than free apps.
Free apps can bring more reviews and popularity than free apps do.
The top 3 expensive genre are from Medical, catalogs, and business.
The top 3 popular genre are from Shopping, Business, and Photo & Video.
The top frequent words that used as app name are “Game”, “pro”, “free”, full“, and”simulator".

Final Project-TA iOS Mobile App Store Market Analysis

Gloria Chen

12/02/2019

1 Load Package and Data

1.1 Load Package

1.2 Data Loading and Cleaning

2 Base EDA

2.1 Uni-variable Non-Graphic

2.2 Uni-variable Graphic

2.2.1 Overview of categorical variable

2.2.2 Overview of Numeric variable

2.2.3 Detail of Variable Distribution

2.3 Multi-variable Non-Graphic

2.4 Multi-variable Graphic

2.4.1 Overview of App Store market correlation

2.4.2 Apps vs Total rating count

2.4.3 Apps vs Total rating count (current ver.)

2.4.4 Apps vs Download Size

2.4.5 Apps vs Price

2.4.6 App vs Tentitive revenue

2.4.7 Genre Anaylsis

2.4.8 User rating analysis

2.4.9 Total rating count analysis

2.4.10 Device suport vs. App size

2.4.11 Paid Apps Analysis

3 Detail EDA

3.1 Paid Vs. Non-Paid App

3.2 Popularity Analysis

3.2.1 App Popularity

3.2.2 Genre Popularity

3.3 Growth Rate

3.4 Statistics Test

3.4.1 Regression test for Total Rating Count

4 Finding 1 - Does paid App has better performance in iOS App Store Market?

5 Finding 2 - What’s the popularity of keywords that App’s title are using?

5.1 Keyword for whole Apps market

5.2 Keyword for Games Genre

5.3 Keyword for Entertainment Genre

5.4 Keyword for Education Genre

5.5 Keyword for Photo & Video Genre

5.6 Keyword for Utilities Genre

6 Conclusion