# Clear environment of variables and functions
rm(list = ls(all = TRUE))
# Clear environmet of packages
if(is.null(sessionInfo()$otherPkgs) == FALSE)lapply(paste("package:", names(sessionInfo()$otherPkgs), sep=""), detach, character.only = TRUE, unload = TRUE)
#load package
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(janitor) # for tyble
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(lmPerm) # for ANOVA
library(formattable)# For table formatting and table formatting functions
library(htmltools)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(MultinomialCI) # To calculate multinomial confidence intervals for factor variables
library(flexdashboard)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:Hmisc':
##
## subplot
## The following object is masked from 'package:formattable':
##
## style
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(dygraphs)
library(xts) # to convert date data to xts data, xts is time series class
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
library(gganimate)
## No renderer backend detected. gganimate will default to writing frames to separate files
## Consider installing:
## - the `gifski` package for gif output
## - the `av` package for video output
## and restarting the R session
library(inspectdf)# Load auto EDA packages
#Apple iOS app store data
#Data Loading
my_data <- read.csv("AppleStore.csv")
#add new data cloumn for paid, app size and revenue
my_data <- my_data %>% mutate(paid = as.factor(ifelse(price %in% 0 , "Free","Paid")))
my_data <- my_data %>% mutate(size_bytes_MB = size_bytes/(1024*1024))
my_data <- my_data %>% mutate(revenue = rating_count_tot * price)
head(my_data)
## id track_name size_bytes
## 1 281656475 PAC-MAN Premium 100788224
## 2 281796108 Evernote - stay organized 158578688
## 3 281940292 WeatherBug - Local Weather, Radar, Maps, Alerts 100524032
## 4 282614216 eBay: Best App to Buy, Sell, Save! Online Shopping 128512000
## 5 282935706 Bible 92774400
## 6 283619399 Shanghai Mahjong 10485713
## currency price rating_count_tot rating_count_ver user_rating user_rating_ver
## 1 USD 3.99 21292 26 4.0 4.5
## 2 USD 0.00 161065 26 4.0 3.5
## 3 USD 0.00 188583 2822 3.5 4.5
## 4 USD 0.00 262241 649 4.0 4.5
## 5 USD 0.00 985920 5320 4.5 5.0
## 6 USD 0.99 8253 5516 4.0 4.0
## ver cont_rating prime_genre sup_devices.num ipadSc_urls.num lang.num
## 1 6.3.5 4+ Games 38 5 10
## 2 8.2.2 4+ Productivity 37 5 23
## 3 5.0.0 4+ Weather 37 5 3
## 4 5.10.0 12+ Shopping 37 5 9
## 5 7.5.1 4+ Reference 37 5 45
## 6 1.8 4+ Games 47 5 1
## vpp_lic paid size_bytes_MB revenue
## 1 1 Paid 96.119141 84955.08
## 2 1 Free 151.232422 0.00
## 3 1 Free 95.867188 0.00
## 4 1 Free 122.558594 0.00
## 5 1 Free 88.476562 0.00
## 6 1 Paid 9.999955 8170.47
Column name introdution
“id” : App ID
“track_name”: App Name
“size_bytes”: Size (in Bytes)
“currency”: Currency Type
“price”: Price amount
“rating_count_tot”: User Rating counts (for all version)
“rating_count_ver”: User Rating counts (for current version)
“user_rating” : Average User Rating value (for all version)
“user_rating_ver”: Average User Rating value (for current version)
“ver” : Latest version code
“cont_rating”: Content Rating
“prime_genre”: Primary Genre
“sup_devices.num”: Number of supporting devices
“ipadSc_urls.num”: Number of screenshots showed for display
“lang.num”: Number of supported languages
“vpp_lic”: Vpp Device Based Licensing Enabled
summary(my_data)
## id track_name size_bytes currency
## Min. :2.817e+08 Length:7197 Min. :5.898e+05 Length:7197
## 1st Qu.:6.001e+08 Class :character 1st Qu.:4.692e+07 Class :character
## Median :9.781e+08 Mode :character Median :9.715e+07 Mode :character
## Mean :8.631e+08 Mean :1.991e+08
## 3rd Qu.:1.082e+09 3rd Qu.:1.819e+08
## Max. :1.188e+09 Max. :4.026e+09
## price rating_count_tot rating_count_ver user_rating
## Min. : 0.000 Min. : 0 Min. : 0.0 Min. :0.000
## 1st Qu.: 0.000 1st Qu.: 28 1st Qu.: 1.0 1st Qu.:3.500
## Median : 0.000 Median : 300 Median : 23.0 Median :4.000
## Mean : 1.726 Mean : 12893 Mean : 460.4 Mean :3.527
## 3rd Qu.: 1.990 3rd Qu.: 2793 3rd Qu.: 140.0 3rd Qu.:4.500
## Max. :299.990 Max. :2974676 Max. :177050.0 Max. :5.000
## user_rating_ver ver cont_rating prime_genre
## Min. :0.000 Length:7197 Length:7197 Length:7197
## 1st Qu.:2.500 Class :character Class :character Class :character
## Median :4.000 Mode :character Mode :character Mode :character
## Mean :3.254
## 3rd Qu.:4.500
## Max. :5.000
## sup_devices.num ipadSc_urls.num lang.num vpp_lic paid
## Min. : 9.00 Min. :0.000 Min. : 0.000 Min. :0.0000 Free:4056
## 1st Qu.:37.00 1st Qu.:3.000 1st Qu.: 1.000 1st Qu.:1.0000 Paid:3141
## Median :37.00 Median :5.000 Median : 1.000 Median :1.0000
## Mean :37.36 Mean :3.707 Mean : 5.435 Mean :0.9931
## 3rd Qu.:38.00 3rd Qu.:5.000 3rd Qu.: 8.000 3rd Qu.:1.0000
## Max. :47.00 Max. :5.000 Max. :75.000 Max. :1.0000
## size_bytes_MB revenue
## Min. : 0.562 Min. : 0
## 1st Qu.: 44.749 1st Qu.: 0
## Median : 92.652 Median : 0
## Mean : 189.909 Mean : 5009
## 3rd Qu.: 173.497 3rd Qu.: 340
## Max. :3839.464 Max. :3648864
str(my_data)
## 'data.frame': 7197 obs. of 19 variables:
## $ id : int 281656475 281796108 281940292 282614216 282935706 283619399 283646709 284035177 284666222 284736660 ...
## $ track_name : chr "PAC-MAN Premium" "Evernote - stay organized" "WeatherBug - Local Weather, Radar, Maps, Alerts" "eBay: Best App to Buy, Sell, Save! Online Shopping" ...
## $ size_bytes : num 1.01e+08 1.59e+08 1.01e+08 1.29e+08 9.28e+07 ...
## $ currency : chr "USD" "USD" "USD" "USD" ...
## $ price : num 3.99 0 0 0 0 0.99 0 0 9.99 3.99 ...
## $ rating_count_tot: int 21292 161065 188583 262241 985920 8253 119487 1126879 1117 7885 ...
## $ rating_count_ver: int 26 26 2822 649 5320 5516 879 3594 4 40 ...
## $ user_rating : num 4 4 3.5 4 4.5 4 4 4 4.5 4 ...
## $ user_rating_ver : num 4.5 3.5 4.5 4.5 5 4 4.5 4.5 5 4 ...
## $ ver : chr "6.3.5" "8.2.2" "5.0.0" "5.10.0" ...
## $ cont_rating : chr "4+" "4+" "4+" "12+" ...
## $ prime_genre : chr "Games" "Productivity" "Weather" "Shopping" ...
## $ sup_devices.num : int 38 37 37 37 37 47 37 37 37 38 ...
## $ ipadSc_urls.num : int 5 5 5 5 5 5 0 4 5 0 ...
## $ lang.num : int 10 23 3 9 45 1 19 1 1 10 ...
## $ vpp_lic : int 1 1 1 1 1 1 1 1 1 1 ...
## $ paid : Factor w/ 2 levels "Free","Paid": 2 1 1 1 1 2 1 1 2 2 ...
## $ size_bytes_MB : num 96.1 151.2 95.9 122.6 88.5 ...
## $ revenue : num 84955 0 0 0 0 ...
table(my_data$price)
##
## 0 0.99 1.99 2.99 3.99 4.99 5.99 6.99 7.99 8.99 9.99
## 4056 728 621 683 277 394 52 166 33 9 81
## 11.99 12.99 13.99 14.99 15.99 16.99 17.99 18.99 19.99 20.99 21.99
## 6 5 6 21 4 2 3 1 13 2 1
## 22.99 23.99 24.99 27.99 29.99 34.99 39.99 47.99 49.99 59.99 74.99
## 2 2 8 2 6 1 2 1 2 3 1
## 99.99 249.99 299.99
## 1 1 1
table(my_data$prime_genre)
##
## Book Business Catalogs Education
## 112 57 10 453
## Entertainment Finance Food & Drink Games
## 535 104 63 3862
## Health & Fitness Lifestyle Medical Music
## 180 144 23 138
## Navigation News Photo & Video Productivity
## 46 75 349 178
## Reference Shopping Social Networking Sports
## 64 122 167 114
## Travel Utilities Weather
## 81 248 72
table(my_data$sup_devices.num)
##
## 9 11 12 13 15 16 23 24 25 26 33 35 36 37 38 39
## 1 3 1 7 2 8 1 270 67 42 2 24 7 3263 1912 40
## 40 43 45 47
## 1142 371 8 26
table(my_data$lang.num)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 41 3767 675 217 154 207 143 133 145 138 168 266 179 130 89 86
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
## 114 46 71 30 21 35 32 24 16 14 22 7 8 20 28 53
## 32 33 34 35 36 37 39 40 41 42 43 45 46 47 50 54
## 17 30 13 2 4 2 2 1 2 3 2 9 4 1 1 2
## 55 56 58 59 63 68 69 74 75
## 2 1 12 1 1 1 3 1 1
print(paste("Number of track_name: ",nrow(my_data %>% group_by(track_name)%>%summarise(n()))))
## `summarise()` ungrouping output (override with `.groups` argument)
## [1] "Number of track_name: 7195"
print(paste("Number of id: ",nrow(my_data %>% group_by(id)%>%summarise(n()))))
## `summarise()` ungrouping output (override with `.groups` argument)
## [1] "Number of id: 7197"
print(paste("Number of currency: ",nrow(my_data %>% group_by(currency)%>%summarise(n()))))
## `summarise()` ungrouping output (override with `.groups` argument)
## [1] "Number of currency: 1"
print(paste("Number of prime_genre: " ,nrow(my_data %>% group_by(prime_genre)%>%summarise(n()))))
## `summarise()` ungrouping output (override with `.groups` argument)
## [1] "Number of prime_genre: 23"
Comments:
track_name: number of Apps (7195 kinds)
id: number of App ids (7197 kinds) (2 Apps have duplicate name)
currency: only US dollar
prime_genre: type of App (23 types)
# Variable types in a data set
my_data %>%
inspect_types() %>%
show_plot()
Comments:
Attributes type Overview:
Interger: 7
Factor: 6
Numeric: 6
library(inspectdf)# Load auto EDA packages
test_data <- my_data
test_data$user_rating <- as.factor(test_data$user_rating)
test_data$user_rating_ver <- as.factor(test_data$user_rating_ver)
test_data$ipadSc_urls.num <- as.factor(test_data$ipadSc_urls.num)
test_data %>%
inspect_cat() %>%
show_plot()
# Load auto EDA packages
library(DataExplorer)
# Distribution of categorical variable
my_data %>%
plot_bar()
## 2 columns ignored with more than 50 categories.
## track_name: 7195 categories
## ver: 1546 categories
Attributes Analysis
Content rating has 4+ the most.
Free Apps are more than Paid Apps.
# Distribution of numeric variable
my_data %>%
plot_histogram()
Finding:
paid_data <-
my_data %>%
filter(price != 0 & price != 299.99 & price != 249.99)
price_plot <-
paid_data %>%
ggplot(aes(x = price)) +
geom_bar() +
labs(title="Price Distribution for paid App")
user_rating_plot <-
ggplot(my_data,aes(x = user_rating)) +
geom_bar() +
geom_text(stat = "count", aes(label = ..count..),vjust = -0.5,size=2)
labs(title="User Rating Distribution")
## $title
## [1] "User Rating Distribution"
##
## attr(,"class")
## [1] "labels"
sup_devices_plot <-
ggplot(my_data,aes(x = sup_devices.num)) +
geom_bar() +
geom_text(stat = "count", aes(label = ..count..),vjust = -0.5,size=2) +
labs(title="Number of Devices Support Distribution")
lang_num_plot <-
ggplot(my_data,aes(x = as.factor(lang.num))) +
geom_bar() +
geom_text(stat = "count", aes(label = ..count..),vjust = -0.5,size=2) +
theme(axis.text.x = element_text(size=7,angle=45))+
labs(title="Number of Language Support Distribution", x="The Number of Language Support")
price_plot
user_rating_plot
sup_devices_plot
lang_num_plot
Findings:
There are an outlier 249.99 and 299.99 in price.
Price distribution is right-skewed, so it’s better to use median for analysis in the following.
Almost 60% of Apps are rated between 4 and 4.5.
Most of Apps support 37 to 38 devices.
Most of Apps only support one language.
Questions:
grid.arrange(
ggplot(my_data,aes(x = prime_genre))+
geom_bar() +
geom_text(stat = "count", aes(label = ..count..),vjust = -0.2,size=3) +
theme(axis.text.x = element_text(size=7,angle=45, hjust = 1.0))+
labs(title="Number of Apps Genre", x="Apps Genre"),
ggplot(my_data,aes(x = cont_rating))+
geom_bar() +
geom_text(stat = "count", aes(label = ..count..),vjust = 1.5,size=4,color = "white") +
theme(axis.text.x = element_text(size=7,angle=45, hjust = 1.0))+
labs(title="Number of Content Rating", x="Content Rating")
,ncol=1)
Findings:
Over half of Apps are from Game genre in the App Store market.
Top 5 number of Apps genre is Games, Entertaiment, Education, Photo & Video and Utilities.
library(Hmisc)
my_data %>%
select_if(is.numeric) %>%
as.matrix() %>%
rcorr()
## id size_bytes price rating_count_tot rating_count_ver
## id 1.00 0.08 -0.08 -0.20 -0.07
## size_bytes 0.08 1.00 0.18 0.00 0.01
## price -0.08 0.18 1.00 -0.04 -0.02
## rating_count_tot -0.20 0.00 -0.04 1.00 0.16
## rating_count_ver -0.07 0.01 -0.02 0.16 1.00
## user_rating -0.19 0.07 0.05 0.08 0.07
## user_rating_ver -0.11 0.09 0.03 0.09 0.08
## sup_devices.num 0.03 -0.12 -0.12 0.01 0.04
## ipadSc_urls.num 0.05 0.15 0.07 0.02 0.02
## lang.num -0.13 0.00 -0.01 0.14 0.01
## vpp_lic 0.02 -0.15 -0.03 0.00 0.01
## size_bytes_MB 0.08 1.00 0.18 0.00 0.01
## revenue -0.12 0.02 0.08 0.16 0.09
## user_rating user_rating_ver sup_devices.num ipadSc_urls.num
## id -0.19 -0.11 0.03 0.05
## size_bytes 0.07 0.09 -0.12 0.15
## price 0.05 0.03 -0.12 0.07
## rating_count_tot 0.08 0.09 0.01 0.02
## rating_count_ver 0.07 0.08 0.04 0.02
## user_rating 1.00 0.77 -0.04 0.27
## user_rating_ver 0.77 1.00 -0.02 0.28
## sup_devices.num -0.04 -0.02 1.00 -0.04
## ipadSc_urls.num 0.27 0.28 -0.04 1.00
## lang.num 0.17 0.18 -0.04 0.09
## vpp_lic 0.07 0.05 -0.04 0.07
## size_bytes_MB 0.07 0.09 -0.12 0.15
## revenue 0.05 0.05 0.00 -0.01
## lang.num vpp_lic size_bytes_MB revenue
## id -0.13 0.02 0.08 -0.12
## size_bytes 0.00 -0.15 1.00 0.02
## price -0.01 -0.03 0.18 0.08
## rating_count_tot 0.14 0.00 0.00 0.16
## rating_count_ver 0.01 0.01 0.01 0.09
## user_rating 0.17 0.07 0.07 0.05
## user_rating_ver 0.18 0.05 0.09 0.05
## sup_devices.num -0.04 -0.04 -0.12 0.00
## ipadSc_urls.num 0.09 0.07 0.15 -0.01
## lang.num 1.00 0.03 0.00 0.02
## vpp_lic 0.03 1.00 -0.15 0.01
## size_bytes_MB 0.00 -0.15 1.00 0.02
## revenue 0.02 0.01 0.02 1.00
##
## n= 7197
##
##
## P
## id size_bytes price rating_count_tot rating_count_ver
## id 0.0000 0.0000 0.0000 0.0000
## size_bytes 0.0000 0.0000 0.7035 0.5909
## price 0.0000 0.0000 0.0009 0.1265
## rating_count_tot 0.0000 0.7035 0.0009 0.0000
## rating_count_ver 0.0000 0.5909 0.1265 0.0000
## user_rating 0.0000 0.0000 0.0000 0.0000 0.0000
## user_rating_ver 0.0000 0.0000 0.0327 0.0000 0.0000
## sup_devices.num 0.0044 0.0000 0.0000 0.4538 0.0013
## ipadSc_urls.num 0.0000 0.0000 0.0000 0.1820 0.0390
## lang.num 0.0000 0.6955 0.5691 0.0000 0.2597
## vpp_lic 0.1323 0.0000 0.0111 0.9336 0.5837
## size_bytes_MB 0.0000 0.0000 0.0000 0.7035 0.5909
## revenue 0.0000 0.0408 0.0000 0.0000 0.0000
## user_rating user_rating_ver sup_devices.num ipadSc_urls.num
## id 0.0000 0.0000 0.0044 0.0000
## size_bytes 0.0000 0.0000 0.0000 0.0000
## price 0.0000 0.0327 0.0000 0.0000
## rating_count_tot 0.0000 0.0000 0.4538 0.1820
## rating_count_ver 0.0000 0.0000 0.0013 0.0390
## user_rating 0.0000 0.0003 0.0000
## user_rating_ver 0.0000 0.1089 0.0000
## sup_devices.num 0.0003 0.1089 0.0014
## ipadSc_urls.num 0.0000 0.0000 0.0014
## lang.num 0.0000 0.0000 0.0004 0.0000
## vpp_lic 0.0000 0.0000 0.0016 0.0000
## size_bytes_MB 0.0000 0.0000 0.0000 0.0000
## revenue 0.0000 0.0000 0.6972 0.6316
## lang.num vpp_lic size_bytes_MB revenue
## id 0.0000 0.1323 0.0000 0.0000
## size_bytes 0.6955 0.0000 0.0000 0.0408
## price 0.5691 0.0111 0.0000 0.0000
## rating_count_tot 0.0000 0.9336 0.7035 0.0000
## rating_count_ver 0.2597 0.5837 0.5909 0.0000
## user_rating 0.0000 0.0000 0.0000 0.0000
## user_rating_ver 0.0000 0.0000 0.0000 0.0000
## sup_devices.num 0.0004 0.0016 0.0000 0.6972
## ipadSc_urls.num 0.0000 0.0000 0.0000 0.6316
## lang.num 0.0059 0.6955 0.0739
## vpp_lic 0.0059 0.0000 0.6655
## size_bytes_MB 0.6955 0.0000 0.0408
## revenue 0.0739 0.6655 0.0408
Findings:
Price has high correlation with size_bytes, user_rating, sup_devices.num and the number of screenshots showed for display (ipadSc_urls.num).
user_rating has high correlation with user_rating_ver (Average User Rating value (for current version)), the number of screenshots showed for display (ipadSc_urls.num), and the number of languages support.
my_data %>% tabyl(price,prime_genre)
## price Book Business Catalogs Education Entertainment Finance Food & Drink
## 0.00 66 20 9 132 334 84 43
## 0.99 4 3 0 27 63 11 4
## 1.99 7 4 0 37 65 2 3
## 2.99 9 4 0 149 41 2 2
## 3.99 13 3 0 43 12 3 3
## 4.99 1 10 0 34 12 1 4
## 5.99 9 3 0 5 5 1 1
## 6.99 1 0 0 4 2 0 1
## 7.99 0 3 1 4 0 0 0
## 8.99 0 0 0 1 0 0 1
## 9.99 1 4 0 6 1 0 0
## 11.99 0 0 0 2 0 0 0
## 12.99 0 0 0 1 0 0 0
## 13.99 0 0 0 0 0 0 0
## 14.99 0 1 0 1 0 0 0
## 15.99 0 0 0 2 0 0 0
## 16.99 0 0 0 0 0 0 0
## 17.99 0 0 0 0 0 0 0
## 18.99 0 0 0 0 0 0 0
## 19.99 0 0 0 0 0 0 0
## 20.99 0 0 0 0 0 0 0
## 21.99 0 0 0 0 0 0 0
## 22.99 0 0 0 0 0 0 0
## 23.99 0 0 0 0 0 0 0
## 24.99 0 0 0 1 0 0 0
## 27.99 1 0 0 0 0 0 1
## 29.99 0 0 0 0 0 0 0
## 34.99 0 0 0 0 0 0 0
## 39.99 0 0 0 0 0 0 0
## 47.99 0 0 0 0 0 0 0
## 49.99 0 1 0 0 0 0 0
## 59.99 0 1 0 2 0 0 0
## 74.99 0 0 0 0 0 0 0
## 99.99 0 0 0 0 0 0 0
## 249.99 0 0 0 1 0 0 0
## 299.99 0 0 0 1 0 0 0
## Games Health & Fitness Lifestyle Medical Music Navigation News Photo & Video
## 2257 76 94 8 67 20 58 167
## 435 11 14 1 4 6 7 48
## 274 20 16 0 9 4 2 62
## 317 37 6 2 13 4 4 29
## 120 12 6 2 8 2 4 12
## 226 17 8 1 8 6 0 19
## 18 1 0 0 2 0 0 3
## 135 5 0 0 3 0 0 3
## 14 0 0 1 0 0 0 1
## 3 0 0 0 1 0 0 1
## 34 1 0 1 5 1 0 2
## 2 0 0 0 1 0 0 0
## 2 0 0 0 1 0 0 0
## 2 0 0 1 1 0 0 0
## 13 0 0 0 2 0 0 1
## 2 0 0 0 0 0 0 0
## 1 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0
## 0 0 0 0 1 0 0 0
## 2 0 0 2 4 1 0 0
## 1 0 0 0 0 1 0 0
## 0 0 0 0 0 0 0 0
## 0 0 0 0 0 0 0 1
## 0 0 0 0 0 0 0 0
## 1 0 0 3 1 0 0 0
## 0 0 0 0 0 0 0 0
## 1 0 0 0 4 0 0 0
## 0 0 0 1 0 0 0 0
## 0 0 0 0 2 0 0 0
## 0 0 0 0 0 0 0 0
## 0 0 0 0 1 0 0 0
## 0 0 0 0 0 0 0 0
## 0 0 0 0 0 1 0 0
## 0 0 0 0 0 0 0 0
## 0 0 0 0 0 0 0 0
## 0 0 0 0 0 0 0 0
## Productivity Reference Shopping Social Networking Sports Travel Utilities
## 62 20 121 143 79 56 109
## 15 4 0 13 9 8 35
## 21 10 1 1 15 2 54
## 12 11 0 6 3 2 16
## 7 2 0 1 3 4 11
## 18 6 0 2 1 5 13
## 2 0 0 0 0 0 2
## 6 1 0 0 2 2 1
## 7 1 0 0 0 1 0
## 1 0 0 0 0 0 1
## 17 1 0 1 1 1 3
## 1 0 0 0 0 0 0
## 0 0 0 0 0 0 1
## 0 1 0 0 0 0 1
## 2 1 0 0 0 0 0
## 0 0 0 0 0 0 0
## 1 0 0 0 0 0 0
## 1 0 0 0 0 0 0
## 0 0 0 0 0 0 0
## 2 1 0 0 1 0 0
## 0 0 0 0 0 0 0
## 0 1 0 0 0 0 0
## 0 1 0 0 0 0 0
## 0 2 0 0 0 0 0
## 1 0 0 0 0 0 1
## 0 0 0 0 0 0 0
## 1 0 0 0 0 0 0
## 0 0 0 0 0 0 0
## 0 0 0 0 0 0 0
## 0 1 0 0 0 0 0
## 0 0 0 0 0 0 0
## 0 0 0 0 0 0 0
## 0 0 0 0 0 0 0
## 1 0 0 0 0 0 0
## 0 0 0 0 0 0 0
## 0 0 0 0 0 0 0
## Weather
## 31
## 6
## 12
## 14
## 6
## 2
## 0
## 0
## 0
## 0
## 1
## 0
## 0
## 0
## 0
## 0
## 0
## 0
## 0
## 0
## 0
## 0
## 0
## 0
## 0
## 0
## 0
## 0
## 0
## 0
## 0
## 0
## 0
## 0
## 0
## 0
my_data %>%
inspect_num() %>%
show_plot()
# Distribution of numeric variable
my_data %>%
select(size_bytes,price,rating_count_tot,user_rating,ipadSc_urls.num, lang.num) %>%
inspect_cor() %>%
show_plot()
Finding:
my_data %>% select(size_bytes,price,rating_count_tot,user_rating,ipadSc_urls.num, lang.num,cont_rating) %>%
ggpairs()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Finding:
User rating have strong correlation with the number of screen shot.
my_data %>%
arrange(desc(rating_count_tot)) %>%
slice(1:10) %>%
ggplot(aes(x = reorder(track_name,-rating_count_tot), y = rating_count_tot, fill = prime_genre)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Top 10 High Total Rating Count Apps by Genre" , x= "App Name") +
scale_fill_discrete(name = "Genre") +
theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0))
Findings:
In top 10 rating count, 4 of 10 are Game genre, 2 of 10 are Social Networking, 2 of 10 are music.
Top 1 Facebook has a gap with other Apps.
my_data %>%
arrange(desc(rating_count_ver)) %>%
slice(1:10) %>%
ggplot(aes(x = reorder(track_name,-rating_count_ver), y = rating_count_ver, fill = prime_genre)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Top 10 high total rating count Apps (Current Version) by Genre" , x= "App Name") +
scale_fill_discrete(name = "Genre") +
theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0))
my_data %>%
arrange(desc(size_bytes_MB)) %>%
slice(1:10) %>%
ggplot(aes(x = reorder(track_name,-size_bytes_MB), y = size_bytes_MB, fill = prime_genre)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Top 10 high Apps Size (MByte) by Genre" , x= "App Name") +
scale_fill_discrete(name = "Genre") +
theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0))
Findings:
my_data %>%
arrange(desc(price)) %>%
slice(1:10) %>%
ggplot(aes(x = reorder(track_name,-price), y = price, fill = prime_genre)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Top 10 high price of Apps by Genre" , x= "App Name") +
scale_fill_discrete(name = "Genre") +
theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0))
Findings:
In top 10 rating count, 4 of 10 are Education genre, 2 of 10 are Business.
Top 2 Educations App has huge gap with other Apps.
my_data %>%
#revenue = price * rating_count_tot
arrange(desc(revenue)) %>%
slice(1:10) %>%
ggplot(aes(x = reorder(track_name,-revenue), y = revenue, fill = prime_genre)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Top 10 Revenue Ranking by Genre" , x= "App Name") +
scale_fill_discrete(name = "Genre") +
theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0))
Comments:
We assumed that all the rating counts are from user who already download the Apps. The real revenue will must be higher than this data, because not everyone download the App will also rating it.
grid.arrange(
my_data %>%
group_by(prime_genre) %>%
summarise(med_rating_count_tot = median(rating_count_tot)) %>%
ggplot(aes(x = reorder(prime_genre,-med_rating_count_tot), y = med_rating_count_tot)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Median of Total rating count across Genre" , x= "App Genre") +
theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0)),
my_data %>%
group_by(prime_genre,size_bytes_MB) %>%
summarise(avg_app_size = mean(size_bytes_MB)) %>%
ggplot(aes(x = reorder(prime_genre,-avg_app_size), y = avg_app_size)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Average App Size of Download across Genre" , x= "App Genre") +
theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0)),
ncol =1
)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` regrouping output by 'prime_genre' (override with `.groups` argument)
grid.arrange(
my_data %>%
arrange(desc(user_rating)) %>%
group_by(prime_genre) %>%
summarise(avg_rate = mean(user_rating)) %>%
ggplot(aes(x = reorder(prime_genre,-avg_rate), y = avg_rate)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Average User Rating by Genre" , x= "App Genre") +
theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0)),
my_data %>%
group_by(prime_genre) %>%
summarise(med_lang.num = median(lang.num))%>%
ggplot(aes(x = reorder(prime_genre,-med_lang.num), y = med_lang.num)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "The median number of language supported by Genre" , x= "App Genre") +
theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0)),
ncol = 1)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
grid.arrange(
my_data%>%
ggplot(my_data, mapping = aes(x=user_rating , y= lang.num))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE)+
labs(title = "The Correlation of number of language \nsupport and Total Rating Count" , x= "User Rating"),
my_data%>%
ggplot(my_data, mapping = aes(x=user_rating , y=ipadSc_urls.num ))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE)+
labs(title = "The Correlation of the number of \nscreenshots showed for display and \nTotal Rating Count" , x= "User Rating"),
my_data%>%
ggplot(my_data, mapping = aes(x=user_rating , y=rating_count_tot ))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE)+
labs(title = "The Correlation of the User Rating \nand Total Rating Count" , x= "User Rating" ),
my_data%>%
filter(price != 0) %>%
ggplot(my_data, mapping = aes(x= user_rating, y=price ))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE) +
labs(title = "The Correlation of the Price and \nTotal Rating Count" , x= "User Rating")
, ncol = 2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
Findings:
The number of screenshots showed in the display has strong impact on the User rating.
The number of language supported, total rating count and price only slighly change the trend for user rating.
grid.arrange(
my_data%>%
ggplot(my_data, mapping = aes(x=rating_count_tot , y= lang.num))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE)+
labs(title = "The Correlation of number of language \nsupport and Total Rating Count" , x= "Total rating count"),
my_data%>%
ggplot(my_data, mapping = aes(x=rating_count_tot , y=ipadSc_urls.num ))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE)+
labs(title = "The Correlation of the number of \nscreenshots showed for display and \nTotal Rating Count" , x= "Total rating count"),
my_data%>%
ggplot(my_data, mapping = aes(x=rating_count_tot , y=user_rating ))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE)+
labs(title = "The Correlation of the User Rating and \nTotal Rating Count" , x= "Total rating count" ),
my_data%>%
filter(price != 0) %>%
ggplot(my_data, mapping = aes(x=rating_count_tot , y=price ))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE)+
labs(title = "The Correlation of the Price and Total \nRating Count" , x= "Total rating count"),
ncol = 2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
Findings:
my_data%>%
ggplot(my_data, mapping = aes(x=size_bytes_MB , y=sup_devices.num ))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE)+
labs(title = "The Correlation of the number of device support and App Download Size" , x= "App Download Size")
## `geom_smooth()` using formula 'y ~ x'
Finding:
Ther more download size, the more number of devices supports
my_data %>%
filter(price != 0) %>%
arrange(desc(price)) %>%
group_by(prime_genre) %>%
summarise(median_price = median(price)) %>%
ggplot(aes(x = reorder(prime_genre,-median_price), y = median_price)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Paid App Median Price by Genre" , x= "App Genre") +
theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0))
## `summarise()` ungrouping output (override with `.groups` argument)
Findings:
grid.arrange(
my_data%>%
filter(price !=0 )%>%
ggplot(my_data, mapping = aes(x=price , y= lang.num))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE)+
labs(title = "The Correlation of number of language \nsupport and Price" , x= "Price"),
my_data%>%
filter(price !=0 )%>%
ggplot(my_data, mapping = aes(x=price , y=ipadSc_urls.num ))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE)+
labs(title = "The Correlation of the number of \nscreenshots showed for display and \nPrice" , x= "Price"),
my_data%>%
filter(price !=0 )%>%
ggplot(my_data, mapping = aes(x=price , y=sup_devices.num ))+geom_point(alpha = 0.5,size=1)+stat_smooth(method="lm",se=FALSE)+
labs(title = "The Correlation of the number of device support and Price" , x= "Price"),
ncol = 2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
Finding:
Price with the number of language support and screeshot show have positive correlation, but with the number of devices support have negative correlation.
paid_title <- c('Free', 'Paid')
paid_value <- c(4065,3141)
paid_table <- data.frame(paid_title,paid_value)
paid_table
## paid_title paid_value
## 1 Free 4065
## 2 Paid 3141
grid.arrange(
my_data %>%
ggplot(my_data,mapping = aes(x = paid, fill = paid)) +
geom_bar() +
geom_text(stat = "count", aes(label = ..count..), vjust = 2.0,size=4) +
labs(title="Number of Paid & Free Apps"),
my_data %>%
ggplot(my_data, mapping = aes(x=paid,fill = paid)) +
geom_bar() +
geom_text(aes(label=round((..count..)/sum(..count..)*100,2), vjust = 2.0),
stat='count',nudge_y=0.125)+
labs(y = "Percent") +
scale_y_continuous(labels = scales::percent) +
theme(axis.title.y = element_blank()) +
theme_classic()+
theme(legend.position = "None")
,ncol = 2)
ggplot(my_data, aes(x = user_rating))+
geom_density(aes(fill = paid), alpha = 0.4) +
scale_color_manual(values = c("#868686FF", "#EFC000FF"))+
scale_fill_manual(values = c("#868686FF", "#EFC000FF"))
Findings:
The higher rate, the larger gap shown between free and paid Apps.
At 0 rating, free Apps are more than paid Apps.
my_data %>%
group_by(paid) %>%
summarise(avg_rate = mean(user_rating)) %>%
arrange(desc(avg_rate)) %>%
ggplot(aes(x = paid, y = avg_rate, fill = "paid")) +
geom_bar(stat = "identity", position = "dodge",fill=c("#C4961A","#FFDB6D")) +
geom_text(stat = "identity", aes(label = round(avg_rate,2)),position = position_dodge(width = 1), vjust = 2.0,color = "black") +
labs(title = "Average User Rating by Free Apps and Paid Apps" , x= "",y="Average User Rating") +
theme(axis.text.x = element_text(size=10)) +
theme_classic() +
scale_fill_manual(values = c("#C4961A", "#FFDB6D")) +
theme(plot.title = element_text(size=16,face = "bold")) +
theme(axis.text.x = element_text(size = 13 ,face = "bold")) +
theme(axis.ticks.y = element_blank(),axis.ticks.x = element_blank()) +
theme(legend.title = element_blank())
## `summarise()` ungrouping output (override with `.groups` argument)
genre_list <- c("Games","Entertainment","Education","Photo & Video")
group_data <- my_data %>% mutate(prime_genre = ifelse(prime_genre != genre_list, "Others", ifelse(prime_genre == "Games", "Games", ifelse(prime_genre == "Entertainment", "Entertainment", ifelse(prime_genre == "Education", "Education", "Photo & Video")))))
## Warning: Problem with `mutate()` input `prime_genre`.
## ℹ longer object length is not a multiple of shorter object length
## ℹ Input `prime_genre` is `ifelse(...)`.
## Warning in prime_genre != genre_list: longer object length is not a multiple of
## shorter object length
head(group_data)
## id track_name size_bytes
## 1 281656475 PAC-MAN Premium 100788224
## 2 281796108 Evernote - stay organized 158578688
## 3 281940292 WeatherBug - Local Weather, Radar, Maps, Alerts 100524032
## 4 282614216 eBay: Best App to Buy, Sell, Save! Online Shopping 128512000
## 5 282935706 Bible 92774400
## 6 283619399 Shanghai Mahjong 10485713
## currency price rating_count_tot rating_count_ver user_rating user_rating_ver
## 1 USD 3.99 21292 26 4.0 4.5
## 2 USD 0.00 161065 26 4.0 3.5
## 3 USD 0.00 188583 2822 3.5 4.5
## 4 USD 0.00 262241 649 4.0 4.5
## 5 USD 0.00 985920 5320 4.5 5.0
## 6 USD 0.99 8253 5516 4.0 4.0
## ver cont_rating prime_genre sup_devices.num ipadSc_urls.num lang.num
## 1 6.3.5 4+ Games 38 5 10
## 2 8.2.2 4+ Others 37 5 23
## 3 5.0.0 4+ Others 37 5 3
## 4 5.10.0 12+ Others 37 5 9
## 5 7.5.1 4+ Others 37 5 45
## 6 1.8 4+ Others 47 5 1
## vpp_lic paid size_bytes_MB revenue
## 1 1 Paid 96.119141 84955.08
## 2 1 Free 151.232422 0.00
## 3 1 Free 95.867188 0.00
## 4 1 Free 122.558594 0.00
## 5 1 Free 88.476562 0.00
## 6 1 Paid 9.999955 8170.47
my_data %>%
group_by(prime_genre,paid) %>%
ggplot(aes(prime_genre, fill = paid)) +
geom_bar(position = "fill")+
coord_flip() +
labs(title = "Paid and Free App Comparision Accross Different Genre ") +
geom_hline(yintercept=0.5, color= "black") +
theme_classic()
Findings:
Apps from Shopping genre are almost free.
The highest percentage of paid apps is from Education genre.
group_data %>%
group_by(prime_genre,paid) %>%
summarise(avg_rate = mean(user_rating)) %>%
arrange(desc(avg_rate)) %>%
ggplot(aes(x = reorder(prime_genre,-avg_rate), y = avg_rate, fill = paid)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Average User Rating by Genre" , x= "App Genre") +
theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0))
## `summarise()` regrouping output by 'prime_genre' (override with `.groups` argument)
Finding:
my_data %>%
group_by(paid) %>%
summarise(median_rate_tot = median(rating_count_tot)) %>%
arrange(desc(median_rate_tot)) %>%
ggplot(mapping = aes(x = paid, y = median_rate_tot)) +
geom_bar(stat = "identity", position = "dodge",fill=c("#C4961A","#FFDB6D")) +
geom_text(stat = "identity", aes(label = round(median_rate_tot,2)),position = position_dodge(width = 1), vjust = 2.0, color = "black") +
labs(title = "Median of Total Review Count by Free Apps and Paid Apps" , x= "", y="Median of Total Review Count") +
theme(axis.text.x = element_text(size=10)) +
theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0)) +
theme_classic() +
theme(plot.title = element_text(size=16,face = "bold")) +
theme(axis.text.x = element_text(size = 13 ,face = "bold")) +
theme(axis.ticks.y = element_blank(),axis.ticks.x = element_blank()) +
theme(legend.title = element_blank())
## `summarise()` ungrouping output (override with `.groups` argument)
group_data %>%
group_by(prime_genre,paid) %>%
summarise(avg_rat_tot = median(rating_count_tot)) %>%
arrange(desc(avg_rat_tot)) %>%
ggplot(aes(x = reorder(prime_genre,-avg_rat_tot), y = avg_rat_tot, fill = paid)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Median of Total Rating Count by Genre" , x= "App Genre",y= "Median of Total Rating Count") +
theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0))
## `summarise()` regrouping output by 'prime_genre' (override with `.groups` argument)
group_data %>%
group_by(paid, prime_genre) %>%
ggplot(mapping = aes(x = user_rating, y = rating_count_tot)) +
geom_point() +
facet_grid(prime_genre ~ paid)
#prepare popularity data
pop_data <- my_data %>% mutate(popularity = rating_count_tot * user_rating)
head(pop_data)
## id track_name size_bytes
## 1 281656475 PAC-MAN Premium 100788224
## 2 281796108 Evernote - stay organized 158578688
## 3 281940292 WeatherBug - Local Weather, Radar, Maps, Alerts 100524032
## 4 282614216 eBay: Best App to Buy, Sell, Save! Online Shopping 128512000
## 5 282935706 Bible 92774400
## 6 283619399 Shanghai Mahjong 10485713
## currency price rating_count_tot rating_count_ver user_rating user_rating_ver
## 1 USD 3.99 21292 26 4.0 4.5
## 2 USD 0.00 161065 26 4.0 3.5
## 3 USD 0.00 188583 2822 3.5 4.5
## 4 USD 0.00 262241 649 4.0 4.5
## 5 USD 0.00 985920 5320 4.5 5.0
## 6 USD 0.99 8253 5516 4.0 4.0
## ver cont_rating prime_genre sup_devices.num ipadSc_urls.num lang.num
## 1 6.3.5 4+ Games 38 5 10
## 2 8.2.2 4+ Productivity 37 5 23
## 3 5.0.0 4+ Weather 37 5 3
## 4 5.10.0 12+ Shopping 37 5 9
## 5 7.5.1 4+ Reference 37 5 45
## 6 1.8 4+ Games 47 5 1
## vpp_lic paid size_bytes_MB revenue popularity
## 1 1 Paid 96.119141 84955.08 85168.0
## 2 1 Free 151.232422 0.00 644260.0
## 3 1 Free 95.867188 0.00 660040.5
## 4 1 Free 122.558594 0.00 1048964.0
## 5 1 Free 88.476562 0.00 4436640.0
## 6 1 Paid 9.999955 8170.47 33012.0
pop_data %>%
arrange(desc(popularity)) %>%
slice(1:10) %>%
ggplot(aes(x = reorder(track_name,popularity), y = popularity)) +
geom_bar(stat = "identity", position = "dodge",fill = "#C3D7A4") +
labs(title = "Top 10 High Popularity of Apps in the Market" ,x= "",y="Popularity", caption = "Popularity = Total Review Count * User Rating") +
scale_fill_discrete(name = "Genre") +
coord_flip() +
theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0)) +
theme_classic() +
theme(plot.title = element_text(size=16,face = "bold")) +
theme(axis.text.y = element_text(size = 13 ,face = "bold")) +
theme(axis.ticks.y = element_blank(),axis.ticks.x = element_blank()) +
theme(legend.title = element_blank())
Findings:
The order of Popularity across Apps is similar as the the order of Total Rating Count, but Pinterest is behind Pandora - Music & Radio in the Total Rating Count ranking, now is ahead Pandora - Music & Radio.
pop_data %>%
group_by(prime_genre) %>%
summarise(med_pop = median(popularity))%>%
ggplot(aes(x = reorder(prime_genre,med_pop), y = med_pop)) +
geom_bar(stat = "identity", position = "dodge",fill = "#C3D7A4") +
labs(title = "The Median of Popularity Across Genre" , x= "",y="Median of Popularity", caption = "Popularity = Total Review Count * User Rating" ) +
theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0)) +
coord_flip() +
theme_classic() +
theme(plot.title = element_text(size=16,face = "bold")) +
theme(axis.text.y = element_text(size = 11 ,face = "bold")) +
theme(axis.ticks.y = element_blank(),axis.ticks.x = element_blank()) +
theme(legend.title = element_blank())
## `summarise()` ungrouping output (override with `.groups` argument)
Finding:
#set up the growth rate by formula: (current - previous)/previous)
grow_rate_data <- my_data %>% mutate(rating_count_previous = rating_count_tot- rating_count_ver)
grow_rate_data <- grow_rate_data %>% mutate(rating_count_growth = ifelse(rating_count_previous == 0 | rating_count_ver == 0, 0, ((rating_count_ver - rating_count_previous)/rating_count_previous)))
head(grow_rate_data)
## id track_name size_bytes
## 1 281656475 PAC-MAN Premium 100788224
## 2 281796108 Evernote - stay organized 158578688
## 3 281940292 WeatherBug - Local Weather, Radar, Maps, Alerts 100524032
## 4 282614216 eBay: Best App to Buy, Sell, Save! Online Shopping 128512000
## 5 282935706 Bible 92774400
## 6 283619399 Shanghai Mahjong 10485713
## currency price rating_count_tot rating_count_ver user_rating user_rating_ver
## 1 USD 3.99 21292 26 4.0 4.5
## 2 USD 0.00 161065 26 4.0 3.5
## 3 USD 0.00 188583 2822 3.5 4.5
## 4 USD 0.00 262241 649 4.0 4.5
## 5 USD 0.00 985920 5320 4.5 5.0
## 6 USD 0.99 8253 5516 4.0 4.0
## ver cont_rating prime_genre sup_devices.num ipadSc_urls.num lang.num
## 1 6.3.5 4+ Games 38 5 10
## 2 8.2.2 4+ Productivity 37 5 23
## 3 5.0.0 4+ Weather 37 5 3
## 4 5.10.0 12+ Shopping 37 5 9
## 5 7.5.1 4+ Reference 37 5 45
## 6 1.8 4+ Games 47 5 1
## vpp_lic paid size_bytes_MB revenue rating_count_previous rating_count_growth
## 1 1 Paid 96.119141 84955.08 21266 -0.9987774
## 2 1 Free 151.232422 0.00 161039 -0.9998385
## 3 1 Free 95.867188 0.00 185761 -0.9848084
## 4 1 Free 122.558594 0.00 261592 -0.9975190
## 5 1 Free 88.476562 0.00 980600 -0.9945748
## 6 1 Paid 9.999955 8170.47 2737 1.0153453
#plot the growth rate by genre
grow_rate_data %>%
group_by(prime_genre) %>%
summarise(avg_growth = mean(rating_count_growth)) %>%
#arrange(desc(avg_growth)) %>%
ggplot(aes(x = reorder(prime_genre,-avg_growth), y = avg_growth)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Average of Total Rating Count Growth Rate by Genre", x= "App Genre") +
theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0))+
scale_y_continuous(limits=c(-1, 7))
## `summarise()` ungrouping output (override with `.groups` argument)
Finding:
app_lm <- my_data %>% mutate_if(is.integer, as.factor)
mod <- glm(rating_count_tot ~ price + paid + ipadSc_urls.num + size_bytes_MB + lang.num ,
family = binomial(link='logit'),
data = app_lm)
summary(mod)
##
## Call:
## glm(formula = rating_count_tot ~ price + paid + ipadSc_urls.num +
## size_bytes_MB + lang.num, family = binomial(link = "logit"),
## data = app_lm)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3550 0.1176 0.3713 0.5212 1.7317
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.258e+00 3.317e-01 -3.792 0.000150 ***
## price -3.653e-03 6.421e-03 -0.569 0.569450
## paidPaid 7.065e-01 8.676e-02 8.144 3.84e-16 ***
## ipadSc_urls.num1 1.062e+00 2.809e-01 3.780 0.000157 ***
## ipadSc_urls.num2 4.846e-01 2.270e-01 2.135 0.032793 *
## ipadSc_urls.num3 5.582e-01 1.808e-01 3.087 0.002021 **
## ipadSc_urls.num4 7.790e-01 1.308e-01 5.953 2.63e-09 ***
## ipadSc_urls.num5 1.228e+00 8.758e-02 14.022 < 2e-16 ***
## size_bytes_MB -2.041e-04 1.243e-04 -1.642 0.100672
## lang.num1 1.998e+00 3.282e-01 6.088 1.15e-09 ***
## lang.num2 9.897e-01 3.357e-01 2.948 0.003196 **
## lang.num3 2.223e+00 3.876e-01 5.736 9.68e-09 ***
## lang.num4 2.798e+00 4.651e-01 6.015 1.80e-09 ***
## lang.num5 2.623e+00 4.248e-01 6.173 6.69e-10 ***
## lang.num6 3.405e+00 5.617e-01 6.062 1.35e-09 ***
## lang.num7 3.826e+00 6.701e-01 5.710 1.13e-08 ***
## lang.num8 4.483e+00 7.850e-01 5.711 1.12e-08 ***
## lang.num9 2.985e+00 4.917e-01 6.069 1.28e-09 ***
## lang.num10 5.161e+00 1.055e+00 4.891 1.00e-06 ***
## lang.num11 5.657e+00 1.053e+00 5.370 7.88e-08 ***
## lang.num12 3.631e+00 5.588e-01 6.497 8.17e-11 ***
## lang.num13 3.204e+00 5.617e-01 5.704 1.17e-08 ***
## lang.num14 4.595e+00 1.059e+00 4.341 1.42e-05 ***
## lang.num15 1.774e+01 4.149e+02 0.043 0.965906
## lang.num16 2.665e+00 4.927e-01 5.409 6.32e-08 ***
## lang.num17 1.773e+01 5.663e+02 0.031 0.975016
## lang.num18 4.319e+00 1.062e+00 4.066 4.77e-05 ***
## lang.num19 3.579e+00 1.075e+00 3.331 0.000866 ***
## lang.num20 1.759e+01 8.528e+02 0.021 0.983542
## lang.num21 1.777e+01 6.475e+02 0.027 0.978102
## lang.num22 1.783e+01 6.759e+02 0.026 0.978956
## lang.num23 1.768e+01 7.963e+02 0.022 0.982282
## lang.num24 2.830e+00 1.091e+00 2.593 0.009501 **
## lang.num25 1.806e+01 1.024e+03 0.018 0.985931
## lang.num26 1.760e+01 8.249e+02 0.021 0.982974
## lang.num27 1.810e+01 1.443e+03 0.013 0.989994
## lang.num28 2.077e+00 1.152e+00 1.804 0.071296 .
## lang.num29 1.802e+01 8.507e+02 0.021 0.983103
## lang.num30 1.780e+01 7.212e+02 0.025 0.980307
## lang.num31 2.031e+00 5.071e-01 4.006 6.18e-05 ***
## lang.num32 1.798e+01 9.368e+02 0.019 0.984685
## lang.num33 3.725e+00 1.074e+00 3.468 0.000525 ***
## lang.num34 2.780e+00 1.095e+00 2.539 0.011131 *
## lang.num35 1.850e+01 2.752e+03 0.007 0.994638
## lang.num36 1.803e+01 1.923e+03 0.009 0.992519
## lang.num37 1.762e+01 2.797e+03 0.006 0.994974
## lang.num39 1.761e+01 2.797e+03 0.006 0.994976
## lang.num40 1.762e+01 3.956e+03 0.004 0.996447
## lang.num41 1.807e+01 2.797e+03 0.006 0.994845
## lang.num42 3.588e-02 1.287e+00 0.028 0.977751
## lang.num43 4.734e-01 1.470e+00 0.322 0.747509
## lang.num45 1.775e+01 1.313e+03 0.014 0.989216
## lang.num46 1.762e+01 1.978e+03 0.009 0.992892
## lang.num47 1.763e+01 3.956e+03 0.004 0.996444
## lang.num50 1.761e+01 3.956e+03 0.004 0.996448
## lang.num54 1.691e+01 2.797e+03 0.006 0.995177
## lang.num55 1.762e+01 2.797e+03 0.006 0.994975
## lang.num56 1.764e+01 3.956e+03 0.004 0.996442
## lang.num58 1.710e+01 1.135e+03 0.015 0.987979
## lang.num59 1.761e+01 3.956e+03 0.004 0.996449
## lang.num63 1.809e+01 3.956e+03 0.005 0.996352
## lang.num68 1.886e+01 3.956e+03 0.005 0.996197
## lang.num69 1.804e+01 2.270e+03 0.008 0.993658
## lang.num74 1.883e+01 3.956e+03 0.005 0.996202
## lang.num75 1.763e+01 3.956e+03 0.004 0.996445
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5536.5 on 7196 degrees of freedom
## Residual deviance: 4586.1 on 7132 degrees of freedom
## AIC: 4716.1
##
## Number of Fisher Scoring iterations: 16
# plot residuals to check for patterns
par(mfrow = c(1, 1))
plot(group_data$price, mod$residuals)
par(mfrow = c(1, 1))
plot(group_data$size_bytes_MB, mod$residuals)
par(mfrow = c(1, 1))
plot(group_data$sup_devices.num, mod$residuals)
pop_data %>%
group_by(prime_genre,paid) %>%
summarise(med_popular = median(popularity)) %>%
arrange(desc(med_popular)) %>%
ggplot(aes(x = reorder(prime_genre,-med_popular), y = med_popular, fill = paid)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Median Popularity by Genre", x= "App Genre") +
theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0))
## `summarise()` regrouping output by 'prime_genre' (override with `.groups` argument)
pop_data_grow <- pop_data %>% mutate(prime_genre = ifelse(prime_genre != genre_list, "Others", ifelse(prime_genre == "Games", "Games", ifelse(prime_genre == "Entertainment", "Entertainment", ifelse(prime_genre == "Education", "Education", "Photo & Video")))))
## Warning: Problem with `mutate()` input `prime_genre`.
## ℹ longer object length is not a multiple of shorter object length
## ℹ Input `prime_genre` is `ifelse(...)`.
## Warning in prime_genre != genre_list: longer object length is not a multiple of
## shorter object length
pop_genre_pic <- pop_data_grow %>%
group_by(prime_genre,paid) %>%
summarise(med_popular = median(popularity)) %>%
#arrange(desc(med_popular)) %>%
ggplot(aes(x = reorder(prime_genre,desc(prime_genre)), y = med_popular, fill = paid)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Popularity Comparison Across Different Genres by Free \n& Paid Apps", subtitle = "Most free apps have higher median of popularity than paid apps ", x= "", y="Median of Popularity", caption = "Popularity = Total Review Count * User Rating") +
scale_fill_manual(values = c("#C4961A", "#FFDB6D")) +
theme_classic() +
theme(plot.title = element_text(size=16,face = "bold")) +
theme(axis.text.x = element_text(size = 13 ,face = "bold")) +
theme(legend.title = element_blank())
## `summarise()` regrouping output by 'prime_genre' (override with `.groups` argument)
pop_genre_pic
Finding:
When I only focus on top 4 market share of Apps, the results show Free App is more popular than Paid App. However, does paid app really perform worse than free App? It will be dig out more in the following.
grow_group_rate <- grow_rate_data %>% mutate(prime_genre = ifelse(prime_genre != genre_list, "Others", ifelse(prime_genre == "Games", "Games", ifelse(prime_genre == "Entertainment", "Entertainment", ifelse(prime_genre == "Education", "Education", "Photo & Video")))))
## Warning: Problem with `mutate()` input `prime_genre`.
## ℹ longer object length is not a multiple of shorter object length
## ℹ Input `prime_genre` is `ifelse(...)`.
## Warning in prime_genre != genre_list: longer object length is not a multiple of
## shorter object length
grow_rate_pic <- grow_group_rate %>%
group_by(prime_genre,paid) %>%
summarise(avg_rat_tot_grow = mean(rating_count_growth)) %>%
ggplot(aes(x = reorder(prime_genre,desc(prime_genre)), y = avg_rat_tot_grow, fill = paid)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Average Growth Rate of Total Reviews Count Across Different \nGenres by Free & Paid Apps" , x="",y="Average Growth Rate of Total Reviews", subtitle = "Growth rate of free apps are better than paid apps in Games genre", caption = "Growth Rate = (Total reviews count of current version - past version) / past version ") +
theme(axis.text.x = element_text(size=7,angle=25,hjust = 1.0)) +
scale_fill_manual(values = c("#C4961A", "#FFDB6D")) +
theme_classic() +
theme(plot.title = element_text(size=16,face = "bold")) +
theme(axis.text.x = element_text(size = 13 ,face = "bold")) +
theme(legend.title = element_blank())
## `summarise()` regrouping output by 'prime_genre' (override with `.groups` argument)
grow_rate_pic
Findings:
#save to png for memo finding 1
ggsave(filename = "pop_grow_pic.png", width = 9, height = 6, plot = pop_genre_pic)
ggsave(filename = "grow_rate_pic.png", width = 9, height = 6, plot = grow_rate_pic)
#load data for
library(ggwordcloud)
library(dplyr)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(stringi)
library(ggwordcloud)
library (wordcloud)
## Loading required package: RColorBrewer
library (RColorBrewer)
library (SnowballC)
myCorpus <- Corpus(VectorSource(my_data$track_name))
# Convert the text to lower case
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(tolower)):
## transformation drops documents
myCorpus <- tm_map(myCorpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(myCorpus, removeNumbers): transformation drops
## documents
# Remove english common stopwords
myCorpus <- tm_map(myCorpus, removeWords, stopwords('english'))
## Warning in tm_map.SimpleCorpus(myCorpus, removeWords, stopwords("english")):
## transformation drops documents
myCorpus <- tm_map(myCorpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(myCorpus, removePunctuation): transformation
## drops documents
toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
myCorpus <- tm_map(myCorpus, toSpace,"\",")
## Warning in tm_map.SimpleCorpus(myCorpus, toSpace, "\","): transformation drops
## documents
myCorpus <- tm_map(myCorpus, function(x) iconv(enc2utf8(x), sub = "byte"))
## Warning in tm_map.SimpleCorpus(myCorpus, function(x) iconv(enc2utf8(x), :
## transformation drops documents
myCorpus <- tm_map(myCorpus, tolower)
## Warning in tm_map.SimpleCorpus(myCorpus, tolower): transformation drops
## documents
myCorpus <- stri_trans_general(myCorpus, "latin-ascii")
## Warning in stri_trans_general(myCorpus, "latin-ascii"): argument is not an
## atomic vector; coercing
myCorpus <- Corpus(VectorSource(myCorpus))
tdm <- TermDocumentMatrix(myCorpus)
tdmatrix <- as.matrix(tdm)
wordFreq <- sort(rowSums(tdmatrix), decreasing = TRUE)
wordFrame <- as.data.frame(as.table(wordFreq))
wordFrame$Var1=str_replace(wordFrame$Var1, "\",","")
wordFrame$Var1=str_replace(wordFrame$Var1, "\"","")
head(wordFrame,30)
## Var1 Freq
## 1 game 179
## 2 game 149
## 3 full 123
## 4 hidden 103
## 5 pro 102
## 6 object 93
## 7 video 87
## 8 free 81
## 9 hd 78
## 10 photo 76
## 11 mystery 72
## 12 simulator 66
## 13 games 65
## 14 music 60
## 15 app 59
## 16 live 59
## 17 minecraft 58
## 18 ipad 55
## 19 pro 55
## 20 simulator 54
## 21 app 53
## 22 world 53
## 23 games 49
## 24 edition 48
## 25 car 45
## 26 puzzle 44
## 27 adventure 41
## 28 free 41
## 29 best 39
## 30 pocket 38
word_freq <- wordcloud(words = wordFrame$Var1, freq = wordFrame$Freq, min.freq = 1,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
word_freq
## NULL
word_data <- my_data %>% filter(prime_genre == "Games")
myCorpus <- Corpus(VectorSource(word_data$track_name))
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(tolower)):
## transformation drops documents
myCorpus <- tm_map(myCorpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(myCorpus, removeNumbers): transformation drops
## documents
myCorpus <- tm_map(myCorpus, removeWords, stopwords('english'))
## Warning in tm_map.SimpleCorpus(myCorpus, removeWords, stopwords("english")):
## transformation drops documents
myCorpus <- tm_map(myCorpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(myCorpus, removePunctuation): transformation
## drops documents
toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
myCorpus <- tm_map(myCorpus, toSpace,"\",")
## Warning in tm_map.SimpleCorpus(myCorpus, toSpace, "\","): transformation drops
## documents
myCorpus <- tm_map(myCorpus, function(x) iconv(enc2utf8(x), sub = "byte"))
## Warning in tm_map.SimpleCorpus(myCorpus, function(x) iconv(enc2utf8(x), :
## transformation drops documents
myCorpus <- tm_map(myCorpus, tolower)
## Warning in tm_map.SimpleCorpus(myCorpus, tolower): transformation drops
## documents
myCorpus <- stri_trans_general(myCorpus, "latin-ascii")
## Warning in stri_trans_general(myCorpus, "latin-ascii"): argument is not an
## atomic vector; coercing
myCorpus <- Corpus(VectorSource(myCorpus))
tdm <- TermDocumentMatrix(myCorpus)
tdmatrix <- as.matrix(tdm)
wordFreq <- sort(rowSums(tdmatrix), decreasing = TRUE)
wordFrame <- as.data.frame(as.table(wordFreq))
wordFrame$Var1=str_replace(wordFrame$Var1, "\",","")
head(wordFrame,20)
## Var1 Freq
## 1 game 172
## 2 game 133
## 3 full 123
## 4 hidden 103
## 5 object 93
## 6 mystery 71
## 7 simulator 61
## 8 games 56
## 9 simulator 54
## 10 hd 50
## 11 puzzle 44
## 12 world 41
## 13 adventure 39
## 14 car 39
## 15 free 39
## 16 racing 34
## 17 endless 32
## 18 adventure 30
## 19 "escape 29
## 20 games 29
wordFrame %>%
filter(Freq > 20) %>%
ggplot(mapping = aes(label = Var1 , size = Freq)) +
#geom_text_wordcloud(area_corr = TRUE) +
geom_text_wordcloud(na.rm = TRUE, eccentricity = 0.5) +
#geom_text_wordcloud_area(rm_outside = TRUE)
#scale_size_area(max_size = 10) +
scale_radius(range = c(0, 15), limits = c(0, NA)) +
theme_minimal()
Games <- c("game","full", "simulator", "hidden","object")
word_data <- my_data %>% filter(prime_genre == "Entertainment")
myCorpus <- Corpus(VectorSource(word_data$track_name))
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(tolower)):
## transformation drops documents
myCorpus <- tm_map(myCorpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(myCorpus, removeNumbers): transformation drops
## documents
myCorpus <- tm_map(myCorpus, removeWords, stopwords('english'))
## Warning in tm_map.SimpleCorpus(myCorpus, removeWords, stopwords("english")):
## transformation drops documents
myCorpus <- tm_map(myCorpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(myCorpus, removePunctuation): transformation
## drops documents
myCorpus <- tm_map(myCorpus, function(x) iconv(enc2utf8(x), sub = "byte"))
## Warning in tm_map.SimpleCorpus(myCorpus, function(x) iconv(enc2utf8(x), :
## transformation drops documents
myCorpus <- tm_map(myCorpus, tolower)
## Warning in tm_map.SimpleCorpus(myCorpus, tolower): transformation drops
## documents
myCorpus <- stri_trans_general(myCorpus, "latin-ascii")
## Warning in stri_trans_general(myCorpus, "latin-ascii"): argument is not an
## atomic vector; coercing
myCorpus <- Corpus(VectorSource(myCorpus))
tdm <- TermDocumentMatrix(myCorpus)
tdmatrix <- as.matrix(tdm)
wordFreq <- sort(rowSums(tdmatrix), decreasing = TRUE)
wordFrame <- as.data.frame(as.table(wordFreq))
wordFrame$Var1=str_replace(wordFrame$Var1, "\",","")
wordFrame$Var1=str_replace(wordFrame$Var1, "\"","")
head(wordFrame,25)
## Var1 Freq
## 1 minecraft 25
## 2 live 15
## 3 pocket 14
## 4 watch 14
## 5 app 13
## 6 movies 13
## 7 edition 12
## 8 tv 12
## 9 santa 11
## 10 disney 10
## 11 talking 10
## 12 best 10
## 13 emoji 10
## 14 full 10
## 15 color 9
## 16 ipad 9
## 17 maps 9
## 18 stream 9
## 19 app 8
## 20 town 8
## 21 video 8
## 22 coloring 7
## 23 episodes 7
## 24 official 7
## 25 wallpapers 7
wordFrame %>%
filter(Freq > 5) %>%
ggplot(mapping = aes(label = Var1 , size = Freq)) +
#geom_text_wordcloud(area_corr = TRUE) +
geom_text_wordcloud(na.rm = TRUE, eccentricity = 0.5) +
#geom_text_wordcloud_area(rm_outside = TRUE)
#scale_size_area(max_size = 10) +
scale_radius(range = c(0, 15), limits = c(0, NA)) +
theme_minimal()
Entertainment <- c("minecraft","app","live","pocket", "watch")
word_data <- my_data %>% filter(prime_genre == "Education")
myCorpus <- Corpus(VectorSource(word_data$track_name))
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(tolower)):
## transformation drops documents
myCorpus <- tm_map(myCorpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(myCorpus, removeNumbers): transformation drops
## documents
myCorpus <- tm_map(myCorpus, removeWords, stopwords('english'))
## Warning in tm_map.SimpleCorpus(myCorpus, removeWords, stopwords("english")):
## transformation drops documents
myCorpus <- tm_map(myCorpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(myCorpus, removePunctuation): transformation
## drops documents
myCorpus <- tm_map(myCorpus, function(x) iconv(enc2utf8(x), sub = "byte"))
## Warning in tm_map.SimpleCorpus(myCorpus, function(x) iconv(enc2utf8(x), :
## transformation drops documents
myCorpus <- tm_map(myCorpus, tolower)
## Warning in tm_map.SimpleCorpus(myCorpus, tolower): transformation drops
## documents
myCorpus <- stri_trans_general(myCorpus, "latin-ascii")
## Warning in stri_trans_general(myCorpus, "latin-ascii"): argument is not an
## atomic vector; coercing
myCorpus <- Corpus(VectorSource(myCorpus))
tdm <- TermDocumentMatrix(myCorpus)
tdmatrix <- as.matrix(tdm)
#tdmatrix <- tm_map(replace, "\",","")
#tdmatrix = str_replace_all(tdmatrix$"1", "game\",", "game")
#head(tdmatrix,20)
wordFreq <- sort(rowSums(tdmatrix), decreasing = TRUE)
wordFrame <- as.data.frame(as.table(wordFreq))
wordFrame$Var1=str_replace(wordFrame$Var1, "\"","")
wordFrame$Var1=str_replace(wordFrame$Var1, "\",","")
wordFrame$Var1=str_replace(wordFrame$Var1, ",","")
head(wordFrame,30)
## Var1 Freq
## 1 toca 28
## 2 dr 24
## 3 learn 24
## 4 panda 20
## 5 mini 17
## 6 sago 16
## 7 kids 15
## 8 kids 15
## 9 test 13
## 10 math 11
## 11 tinybop 11
## 12 learning 10
## 13 hd 9
## 14 brain 8
## 15 game 8
## 16 spanish 8
## 17 paw 7
## 18 life 7
## 19 pandas 7
## 20 patrol 7
## 21 play 7
## 22 school 7
## 23 math 6
## 24 star 6
## 25 app 6
## 26 fun 6
## 27 games 6
## 28 games 6
## 29 monster 6
## 30 sky 6
wordFrame %>%
filter(Freq > 5) %>%
ggplot(mapping = aes(label = Var1 , size = Freq)) +
#geom_text_wordcloud(area_corr = TRUE) +
geom_text_wordcloud(na.rm = TRUE, eccentricity = 0.5) +
#geom_text_wordcloud_area(rm_outside = TRUE)
#scale_size_area(max_size = 10) +
scale_radius(range = c(0, 15), limits = c(0, NA)) +
theme_minimal()
Education <- c("kids","toca","learn","math","pandas")
word_data <- my_data %>% filter(prime_genre == "Photo & Video")
myCorpus <- Corpus(VectorSource(word_data$track_name))
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(tolower)):
## transformation drops documents
myCorpus <- tm_map(myCorpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(myCorpus, removeNumbers): transformation drops
## documents
myCorpus <- tm_map(myCorpus, removeWords, stopwords('english'))
## Warning in tm_map.SimpleCorpus(myCorpus, removeWords, stopwords("english")):
## transformation drops documents
myCorpus <- tm_map(myCorpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(myCorpus, removePunctuation): transformation
## drops documents
myCorpus <- tm_map(myCorpus, function(x) iconv(enc2utf8(x), sub = "byte"))
## Warning in tm_map.SimpleCorpus(myCorpus, function(x) iconv(enc2utf8(x), :
## transformation drops documents
myCorpus <- tm_map(myCorpus, tolower)
## Warning in tm_map.SimpleCorpus(myCorpus, tolower): transformation drops
## documents
myCorpus <- stri_trans_general(myCorpus, "latin-ascii")
## Warning in stri_trans_general(myCorpus, "latin-ascii"): argument is not an
## atomic vector; coercing
myCorpus <- Corpus(VectorSource(myCorpus))
tdm <- TermDocumentMatrix(myCorpus)
tdmatrix <- as.matrix(tdm)
wordFreq <- sort(rowSums(tdmatrix), decreasing = TRUE)
wordFrame <- as.data.frame(as.table(wordFreq))
wordFrame$Var1=str_replace(wordFrame$Var1, "\"","")
head(wordFrame,30)
## Var1 Freq
## 1 photo 68
## 2 video 42
## 3 editor 33
## 4 collage 23
## 5 editor, 23
## 6 photos 22
## 7 camera 20
## 8 camera, 20
## 9 maker 15
## 10 maker, 15
## 11 pro 15
## 12 effects 13
## 13 photos, 12
## 14 face 10
## 15 selfie 10
## 16 photo 9
## 17 add 8
## 18 filters 8
## 19 live 8
## 20 videos 8
## 21 effects, 7
## 22 movie 7
## 23 pic 7
## 24 picture 7
## 25 video, 7
## 26 video 6
## 27 collage, 6
## 28 design 6
## 29 editing 6
## 30 filters, 6
wordFrame %>%
filter(Freq > 3) %>%
ggplot(mapping = aes(label = Var1 , size = Freq)) +
#geom_text_wordcloud(area_corr = TRUE) +
geom_text_wordcloud(na.rm = TRUE, eccentricity = 0.5) +
#geom_text_wordcloud_area(rm_outside = TRUE)
#scale_size_area(max_size = 10) +
scale_radius(range = c(0, 15), limits = c(0, NA)) +
theme_minimal()
Photo_Video <- c("Photo","video","editor","camera","maker")
word_data <- my_data %>% filter(prime_genre == "Utilities")
myCorpus <- Corpus(VectorSource(word_data$track_name))
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(myCorpus, content_transformer(tolower)):
## transformation drops documents
myCorpus <- tm_map(myCorpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(myCorpus, removeNumbers): transformation drops
## documents
myCorpus <- tm_map(myCorpus, removeWords, stopwords('english'))
## Warning in tm_map.SimpleCorpus(myCorpus, removeWords, stopwords("english")):
## transformation drops documents
myCorpus <- tm_map(myCorpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(myCorpus, removePunctuation): transformation
## drops documents
myCorpus <- tm_map(myCorpus, function(x) iconv(enc2utf8(x), sub = "byte"))
## Warning in tm_map.SimpleCorpus(myCorpus, function(x) iconv(enc2utf8(x), :
## transformation drops documents
myCorpus <- tm_map(myCorpus, tolower)
## Warning in tm_map.SimpleCorpus(myCorpus, tolower): transformation drops
## documents
myCorpus <- stri_trans_general(myCorpus, "latin-ascii")
## Warning in stri_trans_general(myCorpus, "latin-ascii"): argument is not an
## atomic vector; coercing
myCorpus <- Corpus(VectorSource(myCorpus))
tdm <- TermDocumentMatrix(myCorpus)
tdmatrix <- as.matrix(tdm)
wordFreq <- sort(rowSums(tdmatrix), decreasing = TRUE)
wordFrame <- as.data.frame(as.table(wordFreq))
wordFrame$Var1=str_replace(wordFrame$Var1, "\"","")
wordFrame$Var1=str_replace(wordFrame$Var1, "\",","")
wordFrame$Var1=str_replace(wordFrame$Var1, ",","")
head(wordFrame,30)
## Var1 Freq
## 1 keyboard 16
## 2 pro 12
## 3 minecraft 11
## 4 keyboard 10
## 5 web 10
## 6 emoji 9
## 7 browser 8
## 8 emoji 7
## 9 calculator 6
## 10 browser 6
## 11 go 6
## 12 calculator 5
## 13 calculator 5
## 14 clock 5
## 15 creator 5
## 16 custom 5
## 17 edition 5
## 18 file 5
## 19 flash 5
## 20 free 5
## 21 iphone 5
## 22 pe 5
## 23 pocket 5
## 24 pokemon 5
## 25 pro 5
## 26 remote 5
## 27 smart 5
## 28 themes 5
## 29 code 4
## 30 emojis 4
wordFrame %>%
filter(Freq > 3) %>%
ggplot(mapping = aes(label = Var1 , size = Freq)) +
#geom_text_wordcloud(area_corr = TRUE) +
geom_text_wordcloud(na.rm = TRUE, eccentricity = 0.5) +
#geom_text_wordcloud_area(rm_outside = TRUE)
#scale_size_area(max_size = 10) +
scale_radius(range = c(0, 15), limits = c(0, NA)) +
theme_minimal()
Utilities <- c("keyboard","emoji","pro","calculator","browser")
# build up the table for high frequent keywords
Ranking <- c("1","2","3","4","5")
freq_word_list <- data.frame(Ranking,Games, Entertainment, Education,Photo_Video,Utilities)
formattable(freq_word_list)
| Ranking | Games | Entertainment | Education | Photo_Video | Utilities |
|---|---|---|---|---|---|
| 1 | game | minecraft | kids | Photo | keyboard |
| 2 | full | app | toca | video | emoji |
| 3 | simulator | live | learn | editor | pro |
| 4 | hidden | math | camera | calculator | |
| 5 | object | watch | pandas | maker | browser |
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
kable(freq_word_list) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
add_header_above(c(" ", "High-Frequency Keywords List by Genre" = 5))
| Ranking | Games | Entertainment | Education | Photo_Video | Utilities |
|---|---|---|---|---|---|
| 1 | game | minecraft | kids | Photo | keyboard |
| 2 | full | app | toca | video | emoji |
| 3 | simulator | live | learn | editor | pro |
| 4 | hidden | math | camera | calculator | |
| 5 | object | watch | pandas | maker | browser |
After analyzing, I summarized several findings as follow:
Top 5 app based on total reviews count are Facebook, Instagram, Clash of Clans, Temple Run, and Pandora - Music & Radio.
Almost 60% of Apps are rated between 4 and 4.5.
More than 50% of Apps market share are in Games genre and follow by Entertainment, Education, and Photo & Video.
56% of Apps are free in the market, but overall the paid apps have better user ratings than free apps.
Free apps can bring more reviews and popularity than free apps do.
The top 3 expensive genre are from Medical, catalogs, and business.
The top 3 popular genre are from Shopping, Business, and Photo & Video.
The top frequent words that used as app name are “Game”, “pro”, “free”, full“, and”simulator".