Notebook description: This exercise explores both apps and games on the Google App Store, looking into their rating scores, installs, pricing models, ad-support, categories and content ratings.
Data Cleaning
#clean names and parse dates
df= playstore
df = df %>% clean_names()
df$released = mdy(df$released)
df$last_updated = mdy(df$last_updated)
df$last_updated_year = year(df$last_updated)
df$release_year = year(df$released)
#remove alphabet after number in size
df$size = sub("(\\d)[^0-9]+$", "\\1", df$size)
#remove 'and above' in minimum andriod
df$minimum_android = sub("(\\d)[^0-9]+$", "\\1", df$minimum_android)
summary(df$released)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
"2010-01-28" "2017-02-19" "2018-11-18" "2018-04-22" "2020-01-23" "2020-12-03" "7730"
summary(df$last_updated)
Min. 1st Qu. Median Mean 3rd Qu. Max.
"2008-10-22" "2019-02-04" "2020-03-06" "2019-08-18" "2020-09-11" "2020-12-03"
The dataset contains:
- apps that were released from 2010 to 2020
- earliest: 2010-01-28, latest: 2020-12-03
- 7730 obs without release year
- apps that were last updated in 2008 to 2020
- earliest: 2008-10-22, latest: 2020-12-03
- we can assume that games that did not have an update in the year 2020
- while the earliest release year is 2010, the earliest update year is 2008
length(unique(df$app_id))
[1] 1118136
length(unique(df$app_name))
[1] 1046870
length(unique(df$app_id)) - length(unique(df$app_name))
[1] 71266
length(unique(df$developer_id))
[1] 165594
- 1118136 unique IDs, there are no duplicates in the dataset
- 1046870 unique app names, there are some apps with the same names in the dataset
- 165594 unique developers
df$free = as.integer(as.logical(df$free))
df$in_app_purchases = as.integer(as.logical(df$in_app_purchases))
df$ad_supported = as.integer(as.logical(df$ad_supported))
Hmisc::describe(as.factor(df$free))
as.factor(df$free)
n missing distinct
1118136 0 2
Value 0 1
Frequency 46803 1071333
Proportion 0.042 0.958
Hmisc::describe(as.factor(df$in_app_purchases))
as.factor(df$in_app_purchases)
n missing distinct
1118136 0 2
Value 0 1
Frequency 998025 120111
Proportion 0.893 0.107
Hmisc::describe(as.factor(df$ad_supported))
as.factor(df$ad_supported)
n missing distinct
1118136 0 2
Value 0 1
Frequency 392320 725816
Proportion 0.351 0.649
Attribute Construction
print(unique(df$category))
[1] "Communication" "Strategy" "Tools" "Business"
[5] "Music & Audio" "Maps & Navigation" "Lifestyle" "Educational"
[9] "Productivity" "News & Magazines" "Food & Drink" "Photography"
[13] "Books & Reference" "Shopping" "Board" "Sports"
[17] "Medical" "Health & Fitness" "House & Home" "Education"
[21] "Auto & Vehicles" "Finance" "Parenting" "Travel & Local"
[25] "Personalization" "Entertainment" "Video Players & Editors" "Puzzle"
[29] "Casual" "Events" "Arcade" "Music"
[33] "Trivia" "Action" "Simulation" "Racing"
[37] "Weather" "Role Playing" "Adventure" "Social"
[41] "Word" "Comics" "Art & Design" "Card"
[45] "Libraries & Demo" "Casino" "Beauty" "Dating"
[49] ""
Category There are 49 levels in the category feature, including a NA. The levels are grouped into (1) Apps or (2) Games as described by Google Play Console Help in the following section.
#17 game genres
gamegenres= c("Action","Adventure","Arcade","Board","Casino","Casual","Educational","Music","Puzzle","Racing","Role Playing","Simulation","Sports","Strategy","Trivia","Word")
df = df %>% mutate(is_game = if_else(category %in% gamegenres,"1","0"))
Hmisc::describe(df$is_game)
df$is_game
n missing distinct
1118136 0 2
Value 0 1
Frequency 947111 171025
Proportion 0.847 0.153
- 15.3% of the obs in the dataset are games and 84.7% are apps.
#price model
df = df %>% mutate(price_model = case_when(
(free==1 & in_app_purchases == 1) ~ "freemium",
(free==1 & in_app_purchases == 0) ~ "free",
(free==0 & in_app_purchases == 1) ~ "paymium",
(free==0 & in_app_purchases == 0) ~ "paid",
TRUE ~"other"
))
Hmisc::describe(df$price_model)
df$price_model
n missing distinct
1118136 0 4
Value free freemium paid paymium
Frequency 954156 117177 43869 2934
Proportion 0.853 0.105 0.039 0.003
- Price model attribute combines free and in_app_purchases features to understand the developers pricing strategy
- Majority of the observations in the dataset are free applications and least are paymium.
str(df)
'data.frame': 1118136 obs. of 27 variables:
$ app_name : chr "HTTrack Website Copier" "World War 2: Offline Strategy" "WPSApp" "OfficeSuite - Office, PDF, Word, Excel, PowerPoint" ...
$ app_id : chr "com.httrack.android" "com.skizze.wwii" "com.themausoft.wpsapp" "com.mobisystems.office" ...
$ category : chr "Communication" "Strategy" "Tools" "Business" ...
$ rating : num 3.6 4.3 4.2 4.2 4.2 3.5 4.4 4 4.1 3.3 ...
$ rating_count : int 2848 17297 488639 1224420 665 377 3346 44700 10990 1141 ...
$ installs : chr "100,000+" "1,000,000+" "50,000,000+" "100,000,000+" ...
$ minimum_installs : num 1e+05 1e+06 5e+07 1e+08 5e+04 1e+04 1e+05 1e+07 1e+07 1e+05 ...
$ maximum_installs : num 3.52e+05 2.16e+06 7.93e+07 1.64e+08 7.35e+04 ...
$ free : int 1 1 1 1 1 1 1 1 1 1 ...
$ price : num 0 0 0 0 0 0 0 0 0 0 ...
$ currency : chr "USD" "USD" "USD" "USD" ...
$ size : chr "2.7" "86" "5.8" "59" ...
$ minimum_android : chr "2.3" "5.1" "4.1" "4.4" ...
$ developer_id : chr "Xavier Roche" "Skizze Games" "TheMauSoft" "MobiSystems" ...
$ developer_website: chr "http://www.httrack.com/" "http://stereo7.com/" "http://www.themausoft.com" "http://www.mobisystems.com" ...
$ developer_email : chr "roche+android@httrack.com" "Skizze.Games@gmail.com" "wpsapp.app@gmail.com" "support-officesuite-android@mobisystems.com" ...
$ released : Date, format: "2013-08-12" "2018-07-19" "2016-03-07" "2011-12-22" ...
$ last_updated : Date, format: "2017-05-20" "2020-11-26" "2020-10-21" "2020-11-23" ...
$ content_rating : chr "Everyone" "Everyone 10+" "Everyone" "Everyone" ...
$ privacy_policy : chr "http://android.httrack.com/privacy-policy.html" "https://www.iubenda.com/privacy-policy/8032781" "https://sites.google.com/view/wpsapppolicy/main" "http://www.mobisystems.com/mobile/privacy-policy.html" ...
$ ad_supported : int 0 1 1 1 0 0 0 1 1 1 ...
$ in_app_purchases : int 0 1 0 1 0 0 1 1 1 0 ...
$ editors_choice : chr "False" "False" "False" "False" ...
$ last_updated_year: num 2017 2020 2020 2020 2020 ...
$ release_year : num 2013 2018 2016 2011 2016 ...
$ is_game : chr "0" "1" "0" "0" ...
$ price_model : chr "free" "freemium" "free" "freemium" ...
#age of app (days between collection_date and released_date)
collection_date = '2020-12-03'
df$interval = interval(ymd(df$released),ymd(collection_date))
df$age<- as.period(df$interval)
df$age <- df$age %/% days(1) #convert to days
head(df$age)
[1] 2669 866 1730 3267 1530 668
summary(df$age)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0 314 745 954 1383 3961 7730
- median of 745 days between data collection date and app release date
#age of update (days between collection date and last_update)
df$interval2 = interval(ymd(df$last_updated),ymd(collection_date))
df$age_update <- as.period(df$interval2)
df$age_update <- df$age_update %/% days(1) #convert to days
head(df$age_update)
[1] 1291 7 42 10 11 27
summary(df$age_update)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0 82.0 270.0 471.1 668.0 4424.0
- median of 270 days between data collection date and app last update date
Data Exploration
Entire dataset
#apps and games released over the years
df %>% group_by(is_game,release_year) %>% tally() %>% ggplot(aes(x=release_year, y= n, fill=is_game)) + geom_area(alpha=0.9) + scale_fill_jama()

#games release year
df %>% group_by(release_year) %>% tally() %>% mutate(prop = round(n/sum(n),3))
#games update year
df %>% group_by(last_updated_year) %>% tally() %>% mutate(prop = round(n/sum(n),3))
- Release year:
- more than 60% of the apps in the dataset are released from 2018, 2019 and 2020.
- 7730 apps in the dataset have no release year indicated
- Last updated year:
- 55.8 % of the apps in the dataset had at least one update in 2020, we will consider these as active applications in this notebook.
#ad supported over years (last_updated_year 2016 to 2020)
df %>% filter(last_updated_year>2015) %>% group_by(is_game,last_updated_year, ad_supported) %>% tally() %>% mutate(prop=n/sum(n)) %>% ggplot(aes(x=last_updated_year, y=prop, fill=factor(ad_supported))) + geom_col(width=0.6,alpha=0.9) + facet_wrap(~is_game,labeller=label_both) + scale_fill_uchicago() + theme(legend.position="bottom") + labs(y="proportion",fill="ad_supported")

Last updated dates: 2010-01-28 to 2020-12-03 * The ad support feature could help in understanding the developers’ pricing strategy * The above chart used last updated year as a comparision as ad support could change with app updates * Apps: increased use of ads from 2016 to 2019, and a slight decrease in 2020 (Jan to Nov) * Games: increasing porportion of ad support games from 2016 to 2020
#installs levels
length(unique(df$installs))
[1] 23
unique(df$installs)
[1] "100,000+" "1,000,000+" "50,000,000+" "100,000,000+" "50,000+" "10,000+"
[7] "10,000,000+" "5,000,000+" "500,000,000+" "500+" "500,000+" "1,000+"
[13] "100+" "5,000,000,000+" "5,000+" "10+" "50+" "1+"
[19] "" "5+" "1,000,000,000+" "0+" "10,000,000,000+"
#minimum_installs distribution
df %>% filter(!is.na(minimum_installs)) %>% group_by(is_game, minimum_installs) %>% tally() %>% mutate(prop=n/sum(n)) %>% ggplot(aes(x=factor(minimum_installs), y=prop, fill= is_game)) + geom_col() + facet_wrap(~is_game,labeller=label_both) + theme(legend.position="none") + theme(axis.text.x=element_text(angle=90)) + scale_fill_jama() + labs(y="proportion",x="minimum_installs")

- There are 23 levels of installs categories including one NA
- Similar distribution shape of apps and games installs(minimum installs)
#ratio of app to dev
df %>% group_by(release_year) %>% summarise(devcount=n_distinct(developer_id), appcount=n_distinct(app_id)) %>% mutate(ratio= (appcount/devcount))
#plot release_year, ratio, is_game
df %>% group_by(is_game, release_year) %>% summarise(devcount=n_distinct(developer_id), appcount=n_distinct(app_id)) %>% mutate(ratio= (appcount/devcount)) %>% ggplot(aes(x=release_year, y=ratio, fill=is_game)) + geom_area() + scale_fill_uchicago() + labs(title="Ratio of apps to developers" )

- The ratio of apps (app and game classes) to developers have increased over the years from 2.08 in 2011 to 3.98 in 2019.
- suggesting that developers are increasingly releasing more apps on the play store per year
- Comparison of app to developer ratio between apps and games classes
- game developers release more games a year as compared to app developers
- the growth in ratio is more significant over the years in the games class compared to apps class
Subset
The following subset is created to try reduce biasness for further exploration, with the following conditions: + are active (have at least one update in 2020) + have a release year + have a rating score + minimum installs equal or above 5000 + rating count above 500
#subset
#last updated in 2020, a rating score, rating count >500, minimum_installs =>5000
active = df %>% filter (last_updated_year== 2020) %>% filter(rating>0) %>% filter(rating_count>500) %>% filter(minimum_installs>1000)
#release year
active %>% group_by(release_year) %>% tally()
#drop obs with no release year
active = active %>% filter (!is.na(release_year))
active$size = as.numeric(active$size)
active$minimum_android = as.numeric(active$minimum_android)
active = active %>% filter(!is.na(size)) %>% filter(!is.na(minimum_android))
dim(active)
[1] 80657 32
- 80657 out of 1118136 observations met the above observations.
#plot release year of active apps
active %>% group_by(release_year) %>% tally() %>% ggplot(aes(x=as.factor(release_year), y=n, fill=n)) + geom_col(width=0.8) + labs(y="","x=release_year") + geom_text(aes(label=n), vjust = -0.3, size = 3.5) + scale_fill_viridis() + theme(legend.position="none")

- As expected, majority of the active apps are released in the recent years
- The oldest active apps are released in 2010 and there are 855 observations.
- There are more apps from 2019 than 2020 in the subset, this could be attributed to conditions of the subset as newer apps released in the recent months could have lower install count and rating count.
#comparing games and apps rating distribution
active %>% ggplot(aes(x=rating, fill=is_game)) + geom_bar() + facet_wrap(~is_game, labeller=label_both) + scale_fill_jama()

- There is a similar distribution shape of rating score between games and apps, both are positively skewed.
#proportion of rating score between free and paid apps
#apps
rf0 = active %>% filter(is_game==0) %>% group_by(free, rating) %>% tally() %>% mutate(prop=n/sum(n)) %>% ggplot(aes(x=rating,y=prop, fill=factor(free))) + geom_col() + facet_wrap(~free,labeller=label_both) + theme(legend.position="none") + scale_fill_manual(values = c("#dda15e", "#bc6c25")) + labs(y="proportion", title="Apps") + scale_y_continuous(limits=c(0,0.13))
#games
rf1 = active %>% filter(is_game==1) %>% group_by(free, rating) %>% tally() %>% mutate(prop=n/sum(n)) %>% ggplot(aes(x=rating,y=prop, fill=factor(free))) + geom_col() + facet_wrap(~free,labeller=label_both) + theme(legend.position="none") +
scale_fill_manual(values = c("#606c38", "#283618")) + labs(y="proportion", title="Games") + scale_y_continuous(limits=c(0,0.13))
#plot
ggarrange(rf0,rf1,ncol=1)

- The distribution of rating score between free games and free apps are very similar.
- Free apps have a larger proportion of ratings 2 and below, compared to paid apps.
- Free games have a larger proportion of ratings 2.5 and below compared to paid games.
- In both apps and games, free class has a lower proportion of ratings 4.5 and above, compared to paid class.
- Paid apps have a larger proportion of ratings 4.5 and above compared to paid games.
#maximum_installs, rating_count, rating
pair_vars = active %>% select(maximum_installs, rating_count, rating, age, age_update)
ggpairs(pair_vars)

- As expected there is a positive relationship between
- maximum_installs and rating_count
- rating(score) and maximum_installs
- rating(score) and rating_count
- Age of app has a negative correlation with rating, positive with rating count and maximum_installs
- Age of app updates negative correlation with maximum installs, rating_count and rating
- The distribution of ratings in the subset is negatively skewed, with majority of the observations having rating(score) of 4.0 and above.
#categories
c0 = active %>% filter(is_game==0) %>% group_by(category) %>% tally() %>% ggplot(aes(x=reorder(category,n), y=n)) + geom_col(fill="#52796f") + coord_flip() + labs(y="category",x="")
c1 = active %>% filter(is_game==1) %>% group_by(category) %>% tally() %>% ggplot(aes(x=reorder(category,n), y=n)) + geom_col(fill="#495057") + coord_flip() + labs(y="category",x="")
ggarrange(c0,c1)

- Apps: Tools are the most frequent category while events are the least frequent
- Games: Simulation genre is the most frequent category while music is the least frequent
#avg rating: app
active %>% filter(is_game==0) %>% group_by(category) %>% summarise(avg= mean(rating)) %>% arrange(desc(avg)) %>% as.data.frame()
#avg rating: game
active %>% filter(is_game==1) %>% group_by(category) %>% summarise(avg= mean(rating)) %>% arrange(desc(avg))
Average app ratings for categories:
- In apps: Books & Reference have the highest average rating score, while Dating have the lowest.
- In games: Casino and Word genre have the highest average rating score, while Simulation genre have the lowest.
- interestingly, simulation games are the most common game apps (shown in the previous section)
#active installs levels
active %>% group_by(is_game, minimum_installs) %>% tally() %>% mutate(prop=n/sum(n)) %>% ggplot(aes(x=factor(minimum_installs), y=prop, fill= is_game)) + geom_col(alpha=0.9) + facet_wrap(~is_game) + theme(legend.position="top") + theme(axis.text.x=element_text(angle=90)) + scale_fill_manual(values=wes_palette("Moonrise2")) + labs(x="minimum_installs",y="proportion",title="Install levels")

- When comparing install levels between apps and games,
- apps have a higher proportion in 100,000 installs and below, compared to games
- apps have a lower proportion in levels 1 million installs and above, compared to games
Most frequent categories in apps and games
#comparing categories in subset and complete dataset
#apps
df %>% filter(is_game==0) %>% group_by(category) %>% tally(sort=T)
active %>% filter(is_game==0) %>% group_by(category) %>% tally(sort=T)
#games
df %>% filter(is_game==1) %>% group_by(category) %>% tally(sort=T)
active %>% filter(is_game==1) %>% group_by(category) %>% tally(sort=T)
- In comparing subset(active apps) and entire dataset, there are some similarities in the most frequent categories in both games and apps.
- The following exploration will use the top 5 categories in the subset as it contains (active) apps that have at least an update in 2020.
- Apps: tools, entertainment, education, personalization, music & audio
- Games: simulation, puzzle, casual, sports, action,
#proportion of top 5 most frequent categories across the years
#apps
top_cats = c("Tools","Entertainment","Education","Personalization","Music & Audio")
df %>% filter(last_updated_year==2020) %>% group_by(release_year, category) %>% tally() %>% mutate(prop=n/sum(n)) %>% filter(category %in% top_cats)%>% ggplot(aes(x=release_year, y=prop, fill=category)) + geom_col(alpha=0.9) + scale_fill_jama() + theme(legend.position="left") + labs(y="proportion", x="", fill="Apps")

#games
top_genres = c("Simulation","Puzzle","Casual","Sports","Action")
df %>% filter(last_updated_year==2020) %>% filter(is_game==1) %>% group_by(release_year, category) %>% tally() %>% mutate(prop=n/sum(n)) %>% filter(category %in% top_genres) %>% ggplot(aes(x=release_year, y=prop, fill=category)) + geom_col(alpha=0.9) + scale_fill_jco() + theme(legend.position="left") + labs(y="proportion", x="", fill="Games")

Looking into the proportion of most frequent categories in active apps (last updated in 2020) to explore the saturation of new releases:
- Apps:
- The identified 5 categories jointly have increased in proportion to other categories over the years.
- Music & Audio category have increased in the past two years (2019 and 2020) while Personalization have decreased in the same time period.
- Games:
- The identified 5 genres jointly make up more than 50% of the games across the years.
- Proportion of Sports games have decreased over the years
- Between the identified 5 genres, Action games make up the lowest proportion from 2015 to 2020.
Pricing strategies
#price model by app type
p1 = active %>% group_by(is_game,price_model) %>% tally() %>% mutate(prop=n/sum(n)) %>% ggplot(aes(x=is_game, y=prop, fill=price_model)) + geom_col(width=0.5) + coord_flip() + scale_fill_jama() + theme(legend.position="right")
#ad supported by app type
p2 = active %>% group_by(is_game,ad_supported) %>% tally() %>% mutate(prop=n/sum(n)) %>% ggplot(aes(x=is_game, y=prop, fill=factor(ad_supported))) + geom_col(width=0.5) + coord_flip() + scale_fill_uchicago() + theme(legend.position="right")
ggarrange(p1,p2, nrow=2)

- Price model:
- freemium is the most popular price model among game developer, while free is most popular among app developers
- there is a higher proportion of paid and paymium price model in game when compared to apps
- Add supported:
- games have a higher proportion of ad supported than apps
#price model and ad supported
active %>% group_by(is_game, price_model, ad_supported) %>% tally() %>% mutate(prop = n/sum(n)) %>% ggplot(aes(x=price_model, y=prop, fill=factor(ad_supported))) + geom_col(width=0.6,alpha=0.9) + coord_flip() + facet_wrap(~is_game, labeller=label_both) + scale_fill_uchicago() + theme(legend.position="bottom") + labs(y="proportion",fill="ad_supported")

- Looking at both price model and ad supported:
- across both games and apps, as expected free to download model (free and freemium) have a higher proportion of ad supported than paid and paymium models
- free to download games have a higher proportion of ad support than free to download apps
- paymium apps have a higher proportion of ad supported than paymium games
- freemium games have the most ad support while paid games have the least ad support
#most frequent categories and price_model
active %>% filter (is_game==0) %>% filter(category %in% top_cats) %>% group_by(category, price_model) %>% tally() %>% mutate(prop=n/sum(n)) %>% ggplot(aes(x=category, y=prop, fill=price_model)) + geom_col(alpha=0.9, width=0.7) + scale_fill_jama() + theme(legend.position="bottom") + labs(y="proportion", x="app category") + coord_flip()

active %>% filter (is_game==1) %>% filter(category %in% top_genres) %>% group_by(category, price_model) %>% tally() %>% mutate(prop=n/sum(n)) %>% ggplot(aes(x=category, y=prop, fill=price_model)) + geom_col(alpha=0.9, width=0.7) + scale_fill_jama() + theme(legend.position="bottom") + labs(y="proportion", x="game category") + coord_flip()

Looking at developers’ pricing (model) strategy in the identified 5 categories:
- Apps:
- personalization has the most paymium model compared to other categories
- tools have paid model compared to other categories
- Games:
- action genre have the most paymium price model
- sports genre have the most free model and least freemium model
#summary of paid apps by is_game
active_copy = active %>% filter(free==0)
by(active_copy$price, active_copy$is_game, summary)
active_copy$is_game: 0
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.990 1.990 2.990 5.353 4.990 379.990
---------------------------------------------------------------------------------------
active_copy$is_game: 1
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.99 1.49 2.99 3.95 4.99 49.99
#paid games prices of 5 most frequent categories
active %>% filter (is_game==1) %>% filter(free==0) %>% filter(category %in% top_genres) %>% ggplot(aes(x=category, y=price, color=category)) + geom_jitter(show.legend=FALSE, size=2, alpha=0.5) + coord_flip() + scale_color_futurama() + labs(y="price (USD)", title="is_game = 1")

#paid apps prices of 5 most frequent categories
active %>% filter (is_game==0) %>% filter(free==0) %>% filter(category %in% top_cats) %>% ggplot(aes(x=category, y=price, color=category)) + geom_jitter(show.legend=FALSE, size=2, alpha=0.5) + coord_flip() + scale_color_futurama() + labs(y="price (USD)", title="is_game = 0")

- For pay to download models,
- the lowest price is 0.99USD for both games and apps
- highest priced pay to download game is 49.99USD and app is 379.99
- median of 2.99 USD for both games and apps
- Looking at prices between 0.99 and 50.00USD
- games: sports and simulation have games more than 10USD but puzzle, casual and action do not
- apps: tools and education have apps that are more than 20USD, but personalization, music & audio, and entertainment categories do not.
Content rating
#content rating
active %>% group_by(is_game,content_rating) %>% tally() %>% mutate(prop=round(n/sum(n),2))
active %>% group_by(is_game,content_rating) %>% tally() %>% mutate(prop=round(n/sum(n),2)) %>% ggplot(aes(x=is_game, y=prop, fill=factor(content_rating))) + geom_col(width=0.5) + coord_flip() + scale_fill_uchicago() + theme(legend.position="bottom") + labs(y="proportion",fill="content_rating")

- in both games and apps types, ‘Everyone’ rating is the most common and "Adults only 18+’ is the least common
- compared to apps, games have a higher proportion of ‘Teen’ and ‘Everyone 10+’ content but lower proportion of ‘Mature 17+’ and ‘Everyone’ rating.
#content_rating of most frequent categories
active %>% filter (is_game==1) %>% filter(category %in% top_genres) %>% group_by(category, content_rating) %>% tally() %>% mutate(prop=n/sum(n)) %>% ggplot(aes(x=category, y=prop, fill=content_rating)) + geom_col(alpha=0.9, width=0.7) + scale_fill_uchicago() + theme(legend.position="bottom") + labs(y="proportion", x="game category") + coord_flip()

active %>% filter (is_game==0) %>% filter(category %in% top_cats) %>% group_by(category, content_rating) %>% tally() %>% mutate(prop=n/sum(n)) %>% ggplot(aes(x=category, y=prop, fill=content_rating)) + geom_col(width=0.7) + scale_fill_jco() + theme(legend.position="bottom") + labs(y="proportion", x="app category") + coord_flip()

- Games:
- majority of sports, simulation, puzzle and casual genres are rated ‘Everyone’, but majority of action is rated ‘Teen’
- action games have the highest proportion of ‘Mature 17+’ content compared to the other identified genres
- Apps:
- across the five identified genres, majority of the apps are rated ‘Everyone’, least are ‘Everyone 10+’, and none are rated ‘Adults only 18+’
- entertainment apps have the highest proportion of ‘Teen’ and ‘Mature 17+’ content compared to the other identified app categories
Export dataframe
#shortlisted variables for modeling
games1 = active %>% filter(is_game == 1) %>% select(app_id, category, rating, rating_count, minimum_installs, maximum_installs, free, price, size, content_rating, ad_supported, in_app_purchases, last_updated_year, release_year, age, age_update)
dim(games1)
[1] 22591 16
apps1 = active %>% filter(is_game == 0) %>% select(app_id, category, rating, rating_count, minimum_installs, maximum_installs, free, price, size, content_rating, ad_supported, in_app_purchases, last_updated_year, release_year, age, age_update)
dim(apps1)
[1] 58066 16
#export df
write.csv(games1,"games1.csv",row.names = FALSE)
write.csv(apps1,"apps1.csv",row.names = FALSE)
---
title: "Play Store EDA"
output: html_notebook
---

Dataset: [Google Play Store Apps](https://github.com/gauthamp10/Google-Playstore-Dataset) by Gauthan Prakash, containing 1.1 million applications with 23 attributes (collected in December 2020). 

Notebook description: This exercise explores both apps and games on the Google App Store, looking into their rating scores, installs, pricing models, ad-support, categories and content ratings.

Notebook Contents:

* Data Cleaning 
* Attribute Construction
* Data Exploration

```{r, message=FALSE}
library(tidyverse)
library(lubridate)
library(janitor)
library(Hmisc)
library(skimr)
library(ggpubr)
library(GGally)
library(wesanderson)
library(ggsci)
library(viridis)
theme_set(theme_bw())
```

### Import Data
```{r}
playstore = read.csv("Google-Playstore.csv",header=TRUE)
dim(playstore)
```
* 1118136 rows and 23 cols

### Data Cleaning
```{r}
#clean names and parse dates
df= playstore
df = df %>% clean_names()
df$released = mdy(df$released)
df$last_updated = mdy(df$last_updated)
df$last_updated_year = year(df$last_updated)
df$release_year = year(df$released)

#remove alphabet after number in size
df$size = sub("(\\d)[^0-9]+$", "\\1", df$size)
#remove 'and above' in minimum andriod
df$minimum_android = sub("(\\d)[^0-9]+$", "\\1", df$minimum_android)
```

```{r, warning=FALSE}
summary(df$released)
summary(df$last_updated)
```

The dataset contains:

* apps that were released from 2010 to 2020
  + earliest: 2010-01-28, latest: 2020-12-03
  + 7730 obs without release year
* apps that were last updated in 2008 to 2020
  + earliest: 2008-10-22, latest: 2020-12-03
  + we can assume that games that did not have an update in the year 2020 
* while the earliest release year is 2010, the earliest update year is 2008

```{r}
length(unique(df$app_id)) 
length(unique(df$app_name)) 
length(unique(df$app_id)) - length(unique(df$app_name)) 
length(unique(df$developer_id)) 
```
* 1118136 unique IDs, there are no duplicates in the dataset
* 1046870 unique app names, there are some apps with the same names in the dataset
* 165594 unique developers 

```{r}
df$free = as.integer(as.logical(df$free))
df$in_app_purchases = as.integer(as.logical(df$in_app_purchases))
df$ad_supported = as.integer(as.logical(df$ad_supported))

Hmisc::describe(as.factor(df$free))
Hmisc::describe(as.factor(df$in_app_purchases))
Hmisc::describe(as.factor(df$ad_supported))
```

### Attribute Construction 

```{r}
print(unique(df$category))
```

**Category**
There are 49 levels in the category feature, including a NA. The levels are grouped into (1) Apps or (2) Games as described by [Google Play Console Help](https://support.google.com/googleplay/android-developer/answer/9859673?hl=en) in the following section. 

```{r}
#17 game genres
gamegenres= c("Action","Adventure","Arcade","Board","Casino","Casual","Educational","Music","Puzzle","Racing","Role Playing","Simulation","Sports","Strategy","Trivia","Word")
df = df %>% mutate(is_game = if_else(category %in% gamegenres,"1","0"))
Hmisc::describe(df$is_game)
```

* 15.3% of the obs in the dataset are games and 84.7% are apps.  

```{r}
#price model
df = df %>% mutate(price_model = case_when(
	(free==1 & in_app_purchases == 1) ~ "freemium",
	(free==1 & in_app_purchases == 0) ~ "free",
	(free==0 & in_app_purchases == 1) ~ "paymium",
	(free==0 & in_app_purchases == 0) ~ "paid", 
	TRUE ~"other"
	))
Hmisc::describe(df$price_model)
```

* Price model attribute combines free and in_app_purchases features to understand the developers pricing strategy
* Majority of the observations in the dataset are free applications and least are paymium. 


```{r}
str(df)
```



```{r}
#age of app (days between collection_date and released) 
collection_date = '2020-12-03'
df$interval = interval(ymd(df$released),ymd(collection_date))
df$age<- as.period(df$interval)
df$age <- df$age %/% days(1) #convert to days
head(df$age)
summary(df$age)
```
* median of 745 days between data collection date and app release date

```{r}
#age of update (days between collection date and last_update)
df$interval2 = interval(ymd(df$last_updated),ymd(collection_date))
df$age_update <- as.period(df$interval2)
df$age_update <- df$age_update %/% days(1) #convert to days
head(df$age_update)
summary(df$age_update)
```
* median of 270 days between data collection date and app last update date

### Data Exploration 

#### Entire dataset

```{r, warning=FALSE}
#apps and games released over the years 
df %>% group_by(is_game,release_year) %>% tally() %>% ggplot(aes(x=release_year, y= n, fill=is_game)) + geom_area(alpha=0.9) + scale_fill_jama()
```


```{r}
#games release year 
df %>% group_by(release_year) %>% tally() %>% mutate(prop = round(n/sum(n),3)) 
#games update year
df %>% group_by(last_updated_year) %>% tally() %>% mutate(prop = round(n/sum(n),3))
```

* Release year: 
  + more than 60% of the apps in the dataset are released from 2018, 2019 and 2020. 
  + 7730 apps in the dataset have no release year indicated
* Last updated year: 
  + 55.8 % of the apps in the dataset had at least one update in 2020, we will consider these as active applications in     this notebook. 


```{r}
#proportion of ad supported over years (last_updated_year 2016 to 2020)
df %>% filter(last_updated_year>2015) %>% group_by(is_game,last_updated_year, ad_supported) %>% tally() %>% mutate(prop=n/sum(n)) %>% ggplot(aes(x=last_updated_year, y=prop, fill=factor(ad_supported))) + geom_col(width=0.6,alpha=0.9) + facet_wrap(~is_game,labeller=label_both) + scale_fill_uchicago() + theme(legend.position="bottom") + labs(y="proportion",fill="ad_supported")
```
Last updated dates: 2010-01-28 to 2020-12-03
* The ad support feature could help in understanding the developers' pricing strategy 
* The above chart used last updated year as a comparision as ad support could change with app updates
* Apps: increased use of ads from 2016 to 2019, and a slight decrease in 2020 (Jan to Nov)
* Games: increasing porportion of ad support games from 2016 to 2020 

```{r}
#installs levels
length(unique(df$installs))
unique(df$installs)

#minimum_installs distribution
df %>% filter(!is.na(minimum_installs)) %>% group_by(is_game, minimum_installs) %>% tally() %>% mutate(prop=n/sum(n)) %>% ggplot(aes(x=factor(minimum_installs), y=prop, fill= is_game)) + geom_col() + facet_wrap(~is_game,labeller=label_both) + theme(legend.position="none") + theme(axis.text.x=element_text(angle=90)) + scale_fill_jama() + labs(y="proportion",x="minimum_installs")
```

* There are 23 levels of installs categories including one NA
* Similar distribution shape of apps and games installs(minimum installs)

```{r, message=FALSE, warning=FALSE}
#ratio of app to dev
df %>% group_by(release_year) %>% summarise(devcount=n_distinct(developer_id), appcount=n_distinct(app_id)) %>% mutate(ratio= (appcount/devcount))

#plot release_year, ratio, is_game
df %>% group_by(is_game, release_year) %>% summarise(devcount=n_distinct(developer_id), appcount=n_distinct(app_id)) %>% mutate(ratio= (appcount/devcount)) %>% ggplot(aes(x=release_year, y=ratio, fill=is_game)) + geom_area() + scale_fill_uchicago() + labs(title="Ratio of apps to developers" )
```

* The ratio of apps (app and game classes) to developers have increased over the years from 2.08 in 2011 to 3.98 in 2019.
  + suggesting that developers are increasingly releasing more apps on the play store per year
* Comparison of app to developer ratio between apps and games classes
  + game developers release more games a year as compared to app developers
  + the growth in ratio is more significant over the years in the games class compared to apps class




#### Subset
The following subset is created to try reduce biasness for further exploration, with the following conditions: 
  + are active (have at least one update in 2020) 
  + have a release year 
  + have a rating score
  + minimum installs equal or above 5000
  + rating count above 500
  
```{r, warning=FALSE, message=FALSE}
#subset 
#last updated in 2020, a rating score, rating count >500, minimum_installs =>5000
active = df %>% filter (last_updated_year== 2020) %>% filter(rating>0) %>% filter(rating_count>500) %>% filter(minimum_installs>1000)

#release year
active %>% group_by(release_year) %>% tally() 
#drop obs with no release year 
active = active %>% filter (!is.na(release_year)) 

active$size = as.numeric(active$size)
active$minimum_android = as.numeric(active$minimum_android)
active = active %>% filter(!is.na(size)) %>% filter(!is.na(minimum_android)) 
dim(active)
```

* 80657 out of 1118136 observations met the above observations. 


```{r}
#plot release year of active apps
active %>% group_by(release_year) %>% tally() %>% ggplot(aes(x=as.factor(release_year), y=n, fill=n)) + geom_col(width=0.8) + labs(y="","x=release_year") + geom_text(aes(label=n), vjust = -0.3, size = 3.5) + scale_fill_viridis() + theme(legend.position="none")
```

* As expected, majority of the active apps are released in the recent years
* The oldest active apps are released in 2010 and there are 855 observations. 
* There are more apps from 2019 than 2020 in the subset, this could be attributed to conditions of the subset as newer apps released in the recent months could have lower install count and rating count. 

```{r}
#comparing games and apps rating distribution 
active %>% ggplot(aes(x=rating, fill=is_game)) + geom_bar() + facet_wrap(~is_game, labeller=label_both) + scale_fill_jama()
```

* There is a similar distribution shape of rating score between games and apps, both are positively skewed. 

```{r}
#proportion of rating score between free and paid apps
#apps
rf0 = active %>% filter(is_game==0) %>% group_by(free, rating) %>% tally() %>% mutate(prop=n/sum(n)) %>% ggplot(aes(x=rating,y=prop, fill=factor(free))) + geom_col() + facet_wrap(~free,labeller=label_both) + theme(legend.position="none") + scale_fill_manual(values = c("#dda15e", "#bc6c25")) + labs(y="proportion", title="Apps") + scale_y_continuous(limits=c(0,0.13))
#games
rf1 = active %>% filter(is_game==1) %>% group_by(free, rating) %>% tally() %>% mutate(prop=n/sum(n)) %>% ggplot(aes(x=rating,y=prop, fill=factor(free))) + geom_col() + facet_wrap(~free,labeller=label_both) + theme(legend.position="none") + 
scale_fill_manual(values = c("#606c38", "#283618")) + labs(y="proportion", title="Games") + scale_y_continuous(limits=c(0,0.13))
#plot
ggarrange(rf0,rf1,ncol=1)
```

* The distribution of rating score between free games and free apps are very similar.
* Free apps have a larger proportion of ratings 2 and below, compared to paid apps.
* Free games have a larger proportion of ratings 2.5 and below compared to paid games. 
* In both apps and games, free class has a lower proportion of ratings 4.5 and above, compared to paid class. 
* Paid apps have a larger proportion of ratings 4.5 and above compared to paid games.

```{r, message=FALSE, warning=FALSE}
#maximum_installs, rating_count, rating
pair_vars = active %>% select(maximum_installs, rating_count, rating, age, age_update)
ggpairs(pair_vars)
```

* As expected there is a positive relationship between
  + maximum_installs and rating_count
  + rating(score) and maximum_installs
  + rating(score) and rating_count 
* Age of app has a negative correlation with rating, positive with rating count and maximum_installs
* Age of app updates negative correlation with maximum installs, rating_count and rating
* The distribution of ratings in the subset is negatively skewed, with majority of the observations having rating(score) of 4.0    and above. 

```{r}
#categories
c0 = active %>% filter(is_game==0) %>% group_by(category) %>% tally() %>% ggplot(aes(x=reorder(category,n), y=n)) + geom_col(fill="#52796f") + coord_flip() + labs(y="category",x="")
c1 = active %>% filter(is_game==1) %>% group_by(category) %>% tally() %>% ggplot(aes(x=reorder(category,n), y=n)) + geom_col(fill="#495057") + coord_flip() + labs(y="category",x="")
ggarrange(c0,c1)
```

* Apps: Tools are the most frequent category while events are the least frequent
* Games: Simulation genre is the most frequent category while music is the least frequent

```{r, message=FALSE}
#avg rating: app
active %>% filter(is_game==0) %>% group_by(category) %>% summarise(avg= mean(rating)) %>% arrange(desc(avg)) %>% as.data.frame()
#avg rating: game
active %>% filter(is_game==1) %>% group_by(category) %>% summarise(avg= mean(rating)) %>% arrange(desc(avg))
```

Average app ratings for categories:  

* In apps: *Books & Reference* have the highest average rating score, while *Dating* have the lowest.
* In games: *Casino* and *Word* genre have the highest average rating score, while *Simulation* genre have the lowest.
  + interestingly, simulation games are the most common game apps (shown in the previous section)

```{r}
#active installs levels
active %>% group_by(is_game, minimum_installs) %>% tally() %>% mutate(prop=n/sum(n)) %>% ggplot(aes(x=factor(minimum_installs), y=prop, fill= is_game)) + geom_col(alpha=0.9) + facet_wrap(~is_game) + theme(legend.position="top") + theme(axis.text.x=element_text(angle=90)) + scale_fill_manual(values=wes_palette("Moonrise2")) + labs(x="minimum_installs",y="proportion",title="Install levels")
```

* When comparing install levels between apps and games, 
  + apps have a higher proportion in 100,000 installs and below, compared to games 
  + apps have a lower proportion in levels 1 million installs and above, compared to games 



#### Most frequent categories in apps and games

```{r}
#comparing categories in subset and complete dataset
#apps
df %>% filter(is_game==0) %>% group_by(category) %>% tally(sort=T)
active %>% filter(is_game==0) %>% group_by(category) %>% tally(sort=T)
#games
df %>% filter(is_game==1) %>% group_by(category) %>% tally(sort=T)
active %>% filter(is_game==1) %>% group_by(category) %>% tally(sort=T)
```

* In comparing subset(active apps) and entire dataset, there are some similarities in the most frequent categories in both games and apps. 
* The following exploration will use the top 5 categories in the subset as it contains (active) apps that have at least an update in 2020. 
  + Apps: tools, entertainment, education, personalization, music & audio
  + Games: simulation, puzzle, casual, sports, action, 
  

```{r, warning=FALSE}
#proportion of top 5 most frequent categories across the years 
#apps
top_cats = c("Tools","Entertainment","Education","Personalization","Music & Audio")
df %>% filter(last_updated_year==2020) %>% group_by(release_year, category) %>% tally() %>% mutate(prop=n/sum(n)) %>% filter(category %in% top_cats)%>% ggplot(aes(x=release_year, y=prop, fill=category)) + geom_col(alpha=0.9) + scale_fill_jama() + theme(legend.position="left") + labs(y="proportion", x="", fill="Apps")
#games
top_genres = c("Simulation","Puzzle","Casual","Sports","Action")
df %>% filter(last_updated_year==2020) %>% filter(is_game==1) %>% group_by(release_year, category) %>% tally() %>% mutate(prop=n/sum(n)) %>% filter(category %in% top_genres) %>% ggplot(aes(x=release_year, y=prop, fill=category)) + geom_col(alpha=0.9) + scale_fill_jco() + theme(legend.position="left") + labs(y="proportion", x="", fill="Games")
```

Looking into the proportion of most frequent categories in active apps (last updated in 2020) to explore the saturation of new releases:

* Apps:
  + The identified 5 categories jointly have increased in proportion to other categories over the years. 
  + Music & Audio category have increased in the past two years (2019 and 2020) while Personalization have decreased in the same     time period. 
* Games:
  + The identified 5 genres jointly make up more than 50% of the games across the years.
  + Proportion of Sports games have  decreased over the years
  + Between the identified 5 genres, Action games make up the lowest proportion from 2015 to 2020. 




#### Pricing strategies 

```{r}
#price model by app type
p1 = active %>% group_by(is_game,price_model) %>% tally() %>% mutate(prop=n/sum(n)) %>% ggplot(aes(x=is_game, y=prop, fill=price_model)) + geom_col(width=0.5) + coord_flip() + scale_fill_jama() + theme(legend.position="right")

#ad supported by app type
p2 = active %>% group_by(is_game,ad_supported) %>% tally() %>% mutate(prop=n/sum(n)) %>% ggplot(aes(x=is_game, y=prop, fill=factor(ad_supported))) + geom_col(width=0.5) + coord_flip() + scale_fill_uchicago() + theme(legend.position="right")

ggarrange(p1,p2, nrow=2)
```
* Price model:
  + freemium is the most popular price model among game developer, while free is most popular among app developers
  + there is a higher proportion of paid and paymium price model in game when compared to apps
* Add supported:
  + games have a higher proportion of ad supported than apps


```{r}
#price model and ad supported 
active %>% group_by(is_game, price_model, ad_supported) %>% tally() %>% mutate(prop = n/sum(n)) %>% ggplot(aes(x=price_model, y=prop, fill=factor(ad_supported))) + geom_col(width=0.6,alpha=0.9) + coord_flip() + facet_wrap(~is_game, labeller=label_both) + scale_fill_uchicago() + theme(legend.position="bottom") + labs(y="proportion",fill="ad_supported")
```

* Looking at both price model and ad supported:
  + across both games and apps, as expected free to download model (free and freemium) have a higher proportion of ad supported        than paid and paymium models
  + free to download games have a higher proportion of ad support than free to download apps 
  + paymium apps have a higher proportion of ad supported than paymium games 
  + freemium games have the most ad support while paid games have the least ad support
 

```{r}
#most frequent categories and price_model
active %>% filter (is_game==0) %>% filter(category %in% top_cats) %>% group_by(category, price_model) %>% tally() %>% mutate(prop=n/sum(n)) %>% ggplot(aes(x=category, y=prop, fill=price_model)) + geom_col(alpha=0.9, width=0.7) + scale_fill_jama() + theme(legend.position="bottom") + labs(y="proportion", x="app category") + coord_flip()

active %>% filter (is_game==1) %>% filter(category %in% top_genres) %>% group_by(category, price_model) %>% tally() %>% mutate(prop=n/sum(n)) %>% ggplot(aes(x=category, y=prop, fill=price_model)) + geom_col(alpha=0.9, width=0.7) + scale_fill_jama() + theme(legend.position="bottom") + labs(y="proportion", x="game category") + coord_flip()
```

Looking at developers' pricing (model) strategy in the identified 5 categories:

* Apps: 
  + personalization has the most paymium model compared to other categories
  + tools have paid model compared to other categories
* Games:
  + action genre have the most paymium price model 
  + sports genre have the most free model and least freemium model 



```{r}
#summary of paid apps by is_game
active_copy = active %>% filter(free==0) 
by(active_copy$price, active_copy$is_game, summary)

#paid games prices of 5 most frequent categories
active %>% filter (is_game==1) %>% filter(free==0) %>% filter(category %in% top_genres) %>% ggplot(aes(x=category, y=price, color=category)) + geom_jitter(show.legend=FALSE, size=2, alpha=0.5) + coord_flip() + scale_color_futurama() + labs(y="price (USD)", title="is_game = 1")

#paid apps prices of 5 most frequent categories
active %>% filter (is_game==0) %>% filter(free==0) %>% filter(category %in% top_cats) %>% ggplot(aes(x=category, y=price, color=category)) + geom_jitter(show.legend=FALSE, size=2, alpha=0.5) + coord_flip() + scale_color_futurama() + labs(y="price (USD)", title="is_game = 0")
```
* For pay to download models, 
  + the lowest price is 0.99USD for both games and apps
  + highest priced pay to download game is 49.99USD and app is 379.99
  + median of 2.99 USD for both games and apps

* Looking at prices between 0.99 and 50.00USD
  + games: sports and simulation have games more than 10USD but puzzle, casual and action do not
  + apps: tools and education have apps that are more than 20USD, but personalization, music & audio, and entertainment categories     do not. 


#### Content rating
```{r}
#content rating
active %>% group_by(is_game,content_rating) %>% tally() %>% mutate(prop=round(n/sum(n),2))

active %>% group_by(is_game,content_rating) %>% tally() %>% mutate(prop=round(n/sum(n),2)) %>% ggplot(aes(x=is_game, y=prop, fill=factor(content_rating))) + geom_col(width=0.5) + coord_flip() + scale_fill_uchicago() + theme(legend.position="bottom") + labs(y="proportion",fill="content_rating")
```

* in both games and apps types, 'Everyone' rating is the most common  and "Adults only 18+' is the least common
* compared to apps, games have a higher proportion of 'Teen' and 'Everyone 10+' content but lower proportion of 'Mature 17+' and     'Everyone' rating.  


```{r}
#content_rating of most frequent categories 
active %>% filter (is_game==1) %>% filter(category %in% top_genres) %>% group_by(category, content_rating) %>% tally() %>% mutate(prop=n/sum(n)) %>% ggplot(aes(x=category, y=prop, fill=content_rating)) + geom_col(alpha=0.9, width=0.7) + scale_fill_uchicago() + theme(legend.position="bottom") + labs(y="proportion", x="game category") + coord_flip()

active %>% filter (is_game==0) %>% filter(category %in% top_cats) %>% group_by(category, content_rating) %>% tally() %>% mutate(prop=n/sum(n)) %>% ggplot(aes(x=category, y=prop, fill=content_rating)) + geom_col(width=0.7) + scale_fill_jco() + theme(legend.position="bottom") + labs(y="proportion", x="app category") + coord_flip()
```

* Games: 
  + majority of sports, simulation, puzzle and casual genres are rated 'Everyone', but majority of action is rated 'Teen'
  + action games have the highest proportion of 'Mature 17+' content compared to the other identified genres
* Apps: 
  + across the five identified genres, majority of the apps are rated 'Everyone', least are 'Everyone 10+', and none are rated         'Adults only 18+'
  + entertainment apps have the highest proportion of 'Teen' and 'Mature 17+' content compared to the other identified app             categories


### Export dataframe 

```{r}
#shortlisted variables for modeling 
games1 = active %>% filter(is_game == 1) %>% select(app_id, category, rating, rating_count, minimum_installs, maximum_installs, free, price, size, content_rating, ad_supported, in_app_purchases, last_updated_year, release_year, age, age_update) 
dim(games1)

apps1 = active %>% filter(is_game == 0) %>% select(app_id, category, rating, rating_count, minimum_installs, maximum_installs, free, price, size, content_rating, ad_supported, in_app_purchases, last_updated_year, release_year, age, age_update)
dim(apps1)
```

```{r}
#export df
write.csv(games1,"games1.csv",row.names = FALSE)
write.csv(apps1,"apps1.csv",row.names = FALSE)
```


