Note: I wish I could share the HTML file with you; it is a lot more readable than the PDF.
dim(account)
## [1] 112792 6
head(account)
## account_id created_time created_device created_platform
## 1 13514010 2016-03-02 17:11:00.332 iPhone6,2 iOS
## 2 4308483975 2016-03-02 20:57:46.14 MIDC147PJ Android
## 3 17193137415 2016-03-02 13:52:16.735 SM-G360F Android
## 4 21488104920 2016-03-02 12:43:27.899 H60-L01 Android
## 5 21488107995 2016-03-02 17:20:12.145 GT-I9500 Android
## 6 25783063200 2016-03-02 04:03:33.005 vivo Y27 Android
## country_code created_app_store_id
## 1 GB 1
## 2 FR 2
## 3 IT 2
## 4 CN 8
## 5 RU 2
## 6 CN 15
str(account)
## 'data.frame': 112792 obs. of 6 variables:
## $ account_id : chr "13514010" "4308483975" "17193137415" "21488104920" ...
## $ created_time : chr "2016-03-02 17:11:00.332" "2016-03-02 20:57:46.14" "2016-03-02 13:52:16.735" "2016-03-02 12:43:27.899" ...
## $ created_device : chr "iPhone6,2" "MIDC147PJ" "SM-G360F" "H60-L01" ...
## $ created_platform : chr "iOS" "Android" "Android" "Android" ...
## $ country_code : chr "GB" "FR" "IT" "CN" ...
## $ created_app_store_id: int 1 2 2 8 2 15 2 2 4 2 ...
So, the dataset "account" consists of 112,792 rows and 6 columns.
All columns are characters (I will convert the date later), with the exception of the last one, "created_app_store_id", which is an integer.
account_id should be a unique identifier for each account.
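Before relying on that, a quick sanity check of my own (not part of the original output):
#TRUE would mean every row is a distinct account.
length(unique(account$account_id)) == nrow(account)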
sum(is.na(account))
## [1] 107
columns_with_na1 <- colSums(is.na(account)) > 0
# Print the result
print(columns_with_na1)
## account_id created_time created_device
## FALSE FALSE FALSE
## created_platform country_code created_app_store_id
## FALSE TRUE FALSE
So apparently we have some NAs in the country_code column of "account". I will take this into account later.
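To pin down where they sit, a quick check of my own:
#All 107 NAs should be in country_code, the only column flagged above.
sum(is.na(account$country_code))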
dim(account_date_session)
## [1] 1698974 4
head(account_date_session)
## account_id date session_count session_duration_sec
## 1 68730811144 2016-01-01 1 47
## 2 68730812806 2016-01-01 1 204
## 3 68730829426 2016-01-01 12 4703
## 4 68730829426 2016-01-02 9 4676
## 5 68730829426 2016-01-03 9 2271
## 6 68730829426 2016-01-04 6 1356
str(account_date_session)
## 'data.frame': 1698974 obs. of 4 variables:
## $ account_id : chr "68730811144" "68730812806" "68730829426" "68730829426" ...
## $ date : chr "2016-01-01" "2016-01-01" "2016-01-01" "2016-01-02" ...
## $ session_count : int 1 1 12 9 9 6 1 4 2 5 ...
## $ session_duration_sec: int 47 204 4703 4676 2271 1356 50 2554 857 755 ...
So, the dataset "account_date_session" consists of 1,698,974 rows and 4 columns.
account_id identifies the account, as in the "account" dataset, but here it is not unique per row: the same account appears once for every day it was active.
sum(is.na(account_date_session))
## [1] 0
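Building on the note above, one assumption worth verifying (my own check): the natural key here should be the pair (account_id, date), not account_id alone.
#0 means no account appears twice on the same date.
anyDuplicated(account_date_session[, c("account_id", "date")])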
dim(iap_purchase)
## [1] 9909 5
str(iap_purchase)
## 'data.frame': 9909 obs. of 5 variables:
## $ account_id : chr "30077202816" "30077202816" "21487283560" "21487152816" ...
## $ created_time : chr "2016-03-26 23:59:59.355" "2016-05-31 11:24:37.283" "2016-02-13 03:40:28.644" "2016-02-28 00:53:26.678" ...
## $ package_id_hash : chr "ae0253c27c34edd1ab4fe21d6bfc91f8" "dd4c1bda4f2c904075fb2fbfcf30f30e" "99a9e0e63efa2fdce8fc8de74c66cea9" "99a9e0e63efa2fdce8fc8de74c66cea9" ...
## $ iap_price_usd_cents: int 739 369 184 184 184 184 36 184 1849 184 ...
## $ app_store_id : int 0 0 0 0 0 0 10 1 1 0 ...
head(iap_purchase)
## account_id created_time package_id_hash
## 1 30077202816 2016-03-26 23:59:59.355 ae0253c27c34edd1ab4fe21d6bfc91f8
## 2 30077202816 2016-05-31 11:24:37.283 dd4c1bda4f2c904075fb2fbfcf30f30e
## 3 21487283560 2016-02-13 03:40:28.644 99a9e0e63efa2fdce8fc8de74c66cea9
## 4 21487152816 2016-02-28 00:53:26.678 99a9e0e63efa2fdce8fc8de74c66cea9
## 5 8602037685 2016-02-11 01:03:04.727 99a9e0e63efa2fdce8fc8de74c66cea9
## 6 8602037685 2016-02-25 12:03:48.61 99a9e0e63efa2fdce8fc8de74c66cea9
## iap_price_usd_cents app_store_id
## 1 739 0
## 2 369 0
## 3 184 0
## 4 184 0
## 5 184 0
## 6 184 0
sum(is.na(iap_purchase))
## [1] 0
So, the dataset "iap_purchase" consists of 9,909 rows and 5 columns.
Here too we have our "account_id".
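Since I will join this table with "account" later on, here is a sanity check I would run (result not shown):
#TRUE would mean every purchaser also exists in the account table.
all(iap_purchase$account_id %in% account$account_id)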
Okay. Let’s jump into it.
account_date_session$date <- as.Date(account_date_session$date,format = "%Y-%m-%d")
account_date_session_aggregated_by_day <- account_date_session %>%
group_by(date)%>%
summarize(total_number_of_sessions = sum(session_count),
average_session_duration= mean(session_duration_sec),
tot_number_active_users = n_distinct(account_id))
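As mentioned earlier, the millisecond timestamps in "account" could be parsed too; a minimal sketch, where %OS picks up the fractional seconds and the UTC timezone is my assumption:
#Kept in a separate variable so the raw column stays untouched; not needed for the daily aggregation below.
created_time_parsed <- as.POSIXct(account$created_time,
                                  format = "%Y-%m-%d %H:%M:%OS", tz = "UTC")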
#When it comes to visualizing a time trend, my first choice is the area chart.
p <- account_date_session_aggregated_by_day %>%
ggplot( aes(x=date, y=tot_number_active_users)) +
geom_area(fill="#69b3a2", alpha=0.5) +
geom_line(color="#69b3a2") +
scale_x_date(date_breaks = "1 month", date_labels = "%b")+
scale_y_continuous(breaks = c(1000,2000,3000, 4000, 5000))+
labs(title = "Daily trend - Number of active users",
subtitle = "The amount of users who played for at least a session in a given day",
caption = "2016",
y = "Number of active user",
x = "Month")+
theme_bw()+
theme(plot.title.position = "plot",
text = element_text(family = "Georgia"),
axis.text.y = element_text(size = 8),
plot.title = element_text(size = 20, margin = margin(b= 10), hjust= 0),
plot.subtitle = element_text(size = 12, color = "darkslategrey", margin = margin(b = 10, l =-25)))
p
So, key takeaways here:
-The number of daily active users seems to have nearly tripled from January to March 2016. A closer look:
week_comparison_JanMarch <- account_date_session_aggregated_by_day%>%
select(date, tot_number_active_users)
week_comparison_JanMarch$date <- floor_date(week_comparison_JanMarch$date, unit = "week", week_start = 1)
week_comparison_JanMarch <- week_comparison_JanMarch %>%
filter(date >= "2016-01-01" & date <= "2016-03-31")%>%
group_by(date)%>%
summarize(peak_users = max(tot_number_active_users)
)
week_comparison_JanMarch$peak_users_percentage_variation <- (week_comparison_JanMarch$peak_users - lag(week_comparison_JanMarch$peak_users)) /lag(week_comparison_JanMarch$peak_users) * 100
#For this, I'll take the peak (max) number of daily users for each week.
p1 <- ggplot(week_comparison_JanMarch, aes(x = date, y = peak_users))+
geom_line(aes(group = 1), size = 1.5)+
geom_point(shape= 21,fill = "red", size = 3)+
scale_x_date(date_breaks = "1 week", date_labels = "%Y-%m-%d")+
scale_y_continuous(breaks = c(3000, 4000, 5000))+
labs(title = "Weekly trend - Maximum number of active users",
subtitle = "The peak daily user count achieved in the weeks of January, February, and March",
caption = "2016",
y = "Peak number of active user",
x = "Week")+
geom_label(aes(label = peak_users),
nudge_x = 0.2,
nudge_y = -150,
size = 3.2)+
theme_bw()+
theme(plot.title.position = "plot",
text = element_text(family = "Georgia"),
axis.text.y = element_text(size = 8),
axis.text.x = element_text(size = 10,angle = 90, vjust = 0.5, color = "black"),
plot.title = element_text(size = 20, margin = margin(b= 10), hjust= 0),
plot.subtitle = element_text(size = 12, color = "darkslategrey", margin = margin(b = 10, l =-25)))
p1
Yes, from the first week of January to the start of March there is a steady upward trend.
This is no surprise: a quick look at the release calendar tells me that in those very days you were launching Clash Royale.
Fun fact: it is very likely that I am in between these numbers.
“I was there, Gandalf…”
-After March, the trend looks fairly flat, with an average of around 5k daily active users.
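A quick check of that eyeballed figure (the window choice is mine):
#Average daily active users from April onwards.
account_date_session_aggregated_by_day %>%
  filter(date >= as.Date("2016-04-01")) %>%
  summarize(avg_daily_users = mean(tot_number_active_users))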
Let's take a look at a more insightful chart: the weekly percentage variation in peak users.
ggplot(week_comparison_JanMarch, aes(x = date, y = peak_users_percentage_variation, fill = peak_users_percentage_variation > 0)) +
geom_bar(stat = "identity", position = "identity") +
scale_fill_manual(values = c("darkred", "steelblue")) +
scale_x_date(breaks = "1 week")+
labs(title = "Peak Weekly users percentage change(%)",
x = "Week",
y = "Percentage Change") +
theme_bw()+
theme(
legend.position = "none",
plot.title.position = "plot",
text = element_text(family = "Georgia"),
axis.text.y = element_text(size = 8),
axis.text.x = element_text(size = 10,angle = 90, vjust = 0.5, color = "black"),
plot.title = element_text(size = 20, margin = margin(b= 10), hjust= 0),
plot.subtitle = element_text(size = 12, color = "darkslategrey", margin = margin(b = 10, l =-25)))
## Warning: Removed 1 rows containing missing values (geom_bar).
-It seems that there is less variability in July and August. Being a player of your games myself, I suppose this could be because in those months there is no school or university, and many people are off work.
The only drop worth noticing in the first chart is the one around mid-March.
Let’s take a look at the date.
march_data <- account_date_session_aggregated_by_day %>%
filter(format(date, "%m") == "03")
min_date <- march_data$date[which.min(march_data$tot_number_active_users)]
min_date
## [1] "2016-03-15"
Apparently 15 March was a local minimum. Did something happen?
I'll let you tell me.
There are, however, many micro ups and downs worth noticing.
Supposition: the days of the week? Is it possible that the spikes are caused by weekends?
Let’s take a look.
#To have a better look, I will zoom in on the weeks of April and May, since from the chart they look like a
#fairly "constant" timeframe.
week_comparison_AprMay <- account_date_session_aggregated_by_day%>%
select(date, tot_number_active_users)%>%
filter(date >= "2016-04-01" & date <= "2016-05-31")
week_comparison_AprMay$date <- as.Date(week_comparison_AprMay$date, format = "%Y-%m-%d")
week_comparison_AprMay$weekend <- ifelse(weekdays(week_comparison_AprMay$date) %in% c('Saturday', 'Sunday'), TRUE, FALSE)
ggplot(week_comparison_AprMay, aes(x = date, y = tot_number_active_users)) +
geom_line() + # Line chart
geom_point(aes(color = weekend), size = 3) + # Add points, different color for weekends
scale_color_manual(values = c("FALSE" = "black", "TRUE" = "red")) + # Set weekend points to red
theme_bw()+
labs(title = "Daily Trend - Active Users - Weekend Comparison",
x = "Date",
y = "Active Users")+
theme(plot.title.position = "plot",
text = element_text(family = "Georgia"),
axis.text.y = element_text(size = 8),
axis.text.x = element_text(size = 10,angle = 90, vjust = 0.5, color = "black"),
plot.title = element_text(size = 20, margin = margin(b= 10), hjust= 0),
plot.subtitle = element_text(size = 12, color = "darkslategrey", margin = margin(b = 10, l =-25)))
This chart confirms my supposition: the spikes are likely explained by people having more time to enjoy your games on weekends.
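To put a number on it (my own follow-up):
#Average active users on weekends vs. weekdays in the April-May window.
week_comparison_AprMay %>%
  group_by(weekend) %>%
  summarize(avg_users = mean(tot_number_active_users))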
#Data preparation for the choropleth.
map <- st_read("C:/Users/Michele/Desktop/Lavoro/Supercell_TEST/ne_10m_admin_0_countries/ne_10m_admin_0_countries.shp")
## Reading layer `ne_10m_admin_0_countries' from data source
## `C:\Users\Michele\Desktop\Lavoro\Supercell_TEST\ne_10m_admin_0_countries\ne_10m_admin_0_countries.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 258 features and 168 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -180 ymin: -90 xmax: 180 ymax: 83.6341
## Geodetic CRS: WGS 84
map$ISO_A2[map$NAME=="France"] <- "FR"
map$ISO_A2[map$NAME=="Norway"] <- "NO"
#Manually fixing these, since for some reason the ISO codes for France and Norway were missing.
map_cleaned <- map%>%
select(ISO_A2, NAME)
#taking the country code column from "account"
iap_purchase2 <- left_join(iap_purchase, account %>%
select(account_id, country_code), by = "account_id")
revenue_by_country <- iap_purchase2%>%
dplyr::group_by(country_code)%>%
summarize(tot_revenue_country = sum(iap_price_usd_cents))
#I'll normalize this in USD.
revenue_by_country$tot_revenue_country <- revenue_by_country$tot_revenue_country/100
account_by_country <- account%>%
dplyr::group_by(country_code )%>%
summarize(accounts = n_distinct(account_id))
account_choro <- merge(account_by_country, map_cleaned, by.x = "country_code", by.y = "ISO_A2")
revenue_choro <- merge(revenue_by_country, map_cleaned, by.x = "country_code", by.y = "ISO_A2")
#Taking population data from 2015. I need it to normalize the data by the population of each country.
data(pop)
pop2015 <- pop%>%
select(name, "2015")
pop2015$name[pop2015$name == "Russian Federation"] <- "Russia"
#Manually renaming this, since the population dataset uses the official name "Russian Federation".
account_choro2 <- merge(account_choro, pop2015, by.x= "NAME", by.y= "name" )
colnames(account_choro2) <- c("NAME","country_code","accounts","geometry","population_2015")
account_choro3 <- account_choro2%>%
mutate(accounts_per_1000 = (account_choro2$accounts / account_choro2$population_2015) * 1000)%>%
arrange(desc(accounts_per_1000))
The first thing I'm going to do is visualize where the vast majority of the revenue comes from, in percentage terms.
A simple pie chart can help us here.
pie_data <- revenue_by_country%>%
group_by(country_code)%>%
summarize(tot_revenue = sum(tot_revenue_country))%>%
mutate(perc = round(100*tot_revenue/sum(tot_revenue),1))
top_countries <- pie_data %>%
select(country_code, perc)
top_countries <-top_countries %>%
arrange(desc(perc)) %>%
slice(1:7)
others <- sum(pie_data$perc) - sum(top_countries$perc)
others_row <- data.frame(country_code = "Others", perc = others)
pie_data2 <- rbind(top_countries, others_row)
positions <- pie_data2 %>%
mutate(csum = rev(cumsum(rev(perc))),
pos = perc/2 + lead(csum, 1),
pos = if_else(is.na(pos), perc/2, pos))
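#Manually nudging a few label positions that would otherwise overlap.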
positions$pos[positions$country_code == "US"] = 10
positions$pos[positions$country_code == "TR"] = 32
positions$pos[positions$country_code == "CN"] = 75
positions$pos[positions$country_code == "Others"] = 40
positions$pos[positions$country_code == "CH"] = 92
positions$pos[positions$country_code == "CA"] = 98
positions$pos[positions$country_code == "KR"] = 53
positions$pos[positions$country_code == "FR"] = 60
ggplot(pie_data2, aes(x = "", y = perc, fill = country_code)) +
geom_bar( stat = "identity") +
coord_polar("y", start = 0) +
scale_fill_brewer()+
geom_label_repel(
data = positions, aes( y= pos, label = paste0(perc, "%")),
size = 4.5, nudge_x = 1, show.legend = F)+
theme_void()+
labs(title = "Percentage of revenue by country")+
guides(fill = guide_legend(title = "Country Code"))+
theme(plot.title.position = "plot",
text = element_text(family = "Georgia"),
plot.title = element_text(size = 20, margin = margin(b= 10), hjust= 0),
plot.subtitle = element_text(size = 12, color = "darkslategrey", margin = margin(b = 10, l =-25)))
And now what I want to see is the map with this information.
Countries generating 0 revenue will be automatically excluded.
#When it comes to analyzing a geographic split, the best choice is usually the choropleth.
#Let's start from revenue.
clean1<-revenue_choro%>%
mutate(clss=case_when(
tot_revenue_country<100~"1",
tot_revenue_country<200~"2",
tot_revenue_country<500~"3",
tot_revenue_country<1000~"4",
tot_revenue_country<2000~"5",
TRUE~"6"
))
bck <- "#001219"
theme_custom1 <- theme_void()+
theme(
plot.margin = margin(1,1,10,1,"pt"),
plot.background = element_rect(fill=bck,color=NA),
legend.position = "bottom",
legend.title = element_text(hjust=0.5,color="white",face="bold"),
legend.text = element_text(color="white")
)
pal <- c("#bb3e03","#ee9b00","#e9d8a6","#94d2bd","#0a9396","#005f73")
ggplot(clean1, aes(fill=clss, geometry=geometry))+
geom_sf()+
labs(fill="Total Revenue by country in $")+
guides(
fill=guide_legend(
nrow=1,
title.position="top",
label.position="bottom"
)
)+
scale_fill_manual(
values=pal,
label=c("< 100 ","< 200 ","< 500","< 1000","< 2000", ">=2000 ")
)+
theme_custom1
# Now that we have everything up and running, let's visualize it.
#To start with, we can take a look at which countries have the most players in absolute terms. Let's plot the top 10.
account_choro_barchart <- account_choro2%>%
arrange(desc(accounts))%>%
slice_head(n= 10)
ggplot(account_choro_barchart, aes(x = reorder(NAME, accounts), y =accounts))+
geom_bar(stat = "identity", fill = "steelblue")+
geom_label(aes(label = accounts))+
labs(title = "Countries with the most accounts registered",
subtitle = "China comes first, of course, in terms of absolute number of players.
",
x = "Country",
y = "Number of accounts")+
coord_flip()+
theme_bw()+
theme(plot.title.position = "plot",
text = element_text(family = "Georgia"),
axis.text.y = element_text(size = 8),
plot.title = element_text(size = 20, margin = margin(b= 10), hjust= 0),
plot.subtitle = element_text(size = 12, color = "darkslategrey", margin = margin(b = 10, l =-25)))
But let’s try to make this more insightful by normalizing the number of players by the population.
account_choro_barchart2 <- account_choro3 %>%
arrange(desc(accounts_per_1000))%>%
slice_head(n= 10)
account_choro_barchart2$accounts_per_1000 <- round(account_choro_barchart2$accounts_per_1000,1)
ggplot(account_choro_barchart2, aes(x = reorder(NAME, accounts_per_1000), y =accounts_per_1000))+
geom_bar(stat = "identity", fill = "steelblue")+
geom_label(aes(label = accounts_per_1000))+
labs(title = "Countries with the most accounts per 1000 inhabitants",
subtitle = "Yes, I am as surprised as you.
",
x = "Country",
y = "Number of accounts per 1000 inhabitants")+
coord_flip()+
theme_bw()+
theme(plot.title.position = "plot",
text = element_text(family = "Georgia"),
axis.text.y = element_text(size = 8),
plot.title = element_text(size = 20, margin = margin(b= 10), hjust= 0),
plot.subtitle = element_text(size = 12, color = "darkslategrey", margin = margin(b = 10, l =-25)))
To be honest, I didn't even know Guam was a country (a U.S. territory, as it turns out), let alone expect it to show up on this list.
Let's check this. I'll start with the number of inhabitants.
pop2015$"2015"[pop2015$name == "Guam"]
## [1] 169.885
And then the number of players.
account_choro2$accounts[account_choro2$NAME == "Guam"]
## [1] 13
So apparently we have a territory of around 170,000 inhabitants, 13 of whom play your games.
That’s pretty impressive.
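And a quick check of the ratio itself, as the bar chart computed it:
#Accounts per 1000 inhabitants for Guam, taken straight from account_choro3.
account_choro3$accounts_per_1000[account_choro3$NAME == "Guam"]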
But now let’s check this on the map.
clean2<-account_choro3%>%
mutate(clss=case_when(
accounts_per_1000<10~"1",
accounts_per_1000<20~"2",
accounts_per_1000<30~"3",
accounts_per_1000<40~"4",
accounts_per_1000<50~"5",
TRUE~"6"
))
bck <- "#001219"
theme_custom2 <- theme_void()+
theme(
plot.margin = margin(1,1,10,1,"pt"),
plot.background = element_rect(fill=bck,color=NA),
legend.position = "bottom",
legend.title = element_text(hjust=0.5,color="white",face="bold"),
legend.text = element_text(color="white")
)
pal <- c("#bb3e03","#ee9b00","#e9d8a6","#94d2bd","#0a9396","#005f73")
ggplot(clean2, aes(fill=clss, geometry=geometry))+
geom_sf()+
labs(fill="Players per 1000 inhabitants")+
guides(
fill=guide_legend(
nrow=1,
title.position="top",
label.position="bottom"
)
)+
scale_fill_manual(
values=pal,
label=c("< 10 ","< 20 ","< 30","< 40","< 50", ">= 50 ")
)+
theme_custom2
Sorry if some African and South American countries are missing here and there: it isn't easy to keep track of all their ISO codes and alternative names. If you want me to chase them down, we can discuss it and I'll do my best.
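A minimal sketch of what I would try, assuming the countrycode package is available:
#Rebuild ISO2 codes from the country names instead of relying on ISO_A2,
#which was missing for a few countries (France and Norway above).
library(countrycode)
map$ISO_A2_fixed <- countrycode(map$NAME,
                                origin = "country.name",
                                destination = "iso2c")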
I assume that when you say "per market" you mean "per country".
#Calculating the average revenue per user per country.
user_aggregated <- account%>%
group_by(country_code)%>%
summarize(users = n_distinct(account_id))
revenue_aggregated <- iap_purchase2%>%
group_by(country_code)%>%
summarize(total_revenue_usd_cents = sum(iap_price_usd_cents))
#Taking just the countries where purchases were made.
user_revenue_aggregated <- left_join(revenue_aggregated, user_aggregated, by = "country_code")
#Creating a new column indicating the revenue by user.
user_revenue_aggregated$revenue_by_user <- user_revenue_aggregated$total_revenue_usd_cents/user_revenue_aggregated$users
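#Purchases whose account has no country code are labeled "NA" so they still appear in the chart.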
user_revenue_aggregated$country_code[is.na(user_revenue_aggregated$country_code)] <- "NA"
user_revenue_aggregated <- user_revenue_aggregated%>%
arrange(desc(revenue_by_user))
user_revenue_aggregated$revenue_by_user <- round(user_revenue_aggregated$revenue_by_user,2)
user_revenue_aggregated$country_code <- factor(user_revenue_aggregated$country_code, levels = user_revenue_aggregated$country_code[order(user_revenue_aggregated$revenue_by_user)])
ggplot(user_revenue_aggregated, aes(revenue_by_user, country_code, label = revenue_by_user))+
geom_segment(aes(x = 0, y = country_code, xend = revenue_by_user, yend = country_code))+
geom_point(shape= 21,fill = "red")+
geom_text(nudge_x = 50)+
labs(title = "Average revenue per user by country (USD cents)",
subtitle = "Switzerland leads the way with a huge advantage on every other country")+
theme_bw()+
theme(axis.title = element_blank(),
panel.grid.minor = element_blank(),
legend.position = "none",
plot.title.position = "plot",
text = element_text(family = "Georgia"),
axis.text.y = element_text(size = 8),
plot.title = element_text(size = 20, margin = margin(b= 10), hjust= 0),
plot.subtitle = element_text(size = 12, color = "darkslategrey", margin = margin(b = 15, l =-25)))
To conclude, we can say that countries with a higher GDP per capita (like Switzerland) obviously tend to have more resources to spend. A good continuation of this analysis would be to bring such external data sources into the picture, but unfortunately there is no time.
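For what it's worth, a minimal sketch of how I would start, assuming the WDI package (a wrapper around the World Bank API):
#Fetch 2015 GDP per capita (current US$) and join it on the ISO2 code.
#country_code is a factor by this point, so dplyr will coerce it with a warning.
library(WDI)
gdp2015 <- WDI(indicator = "NY.GDP.PCAP.CD", start = 2015, end = 2015)
user_revenue_gdp <- left_join(user_revenue_aggregated, gdp2015,
                              by = c("country_code" = "iso2c"))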
Goodbye!
Michele