Note: I wish I could share with you the HTML file. It is a lot more readable than the PDF.

1. Give us a short description of the datasets

dim(account)
## [1] 112792      6
head(account)
##    account_id            created_time created_device created_platform
## 1    13514010 2016-03-02 17:11:00.332      iPhone6,2              iOS
## 2  4308483975  2016-03-02 20:57:46.14      MIDC147PJ          Android
## 3 17193137415 2016-03-02 13:52:16.735       SM-G360F          Android
## 4 21488104920 2016-03-02 12:43:27.899        H60-L01          Android
## 5 21488107995 2016-03-02 17:20:12.145       GT-I9500          Android
## 6 25783063200 2016-03-02 04:03:33.005       vivo Y27          Android
##   country_code created_app_store_id
## 1           GB                    1
## 2           FR                    2
## 3           IT                    2
## 4           CN                    8
## 5           RU                    2
## 6           CN                   15
str(account)
## 'data.frame':    112792 obs. of  6 variables:
##  $ account_id          : chr  "13514010" "4308483975" "17193137415" "21488104920" ...
##  $ created_time        : chr  "2016-03-02 17:11:00.332" "2016-03-02 20:57:46.14" "2016-03-02 13:52:16.735" "2016-03-02 12:43:27.899" ...
##  $ created_device      : chr  "iPhone6,2" "MIDC147PJ" "SM-G360F" "H60-L01" ...
##  $ created_platform    : chr  "iOS" "Android" "Android" "Android" ...
##  $ country_code        : chr  "GB" "FR" "IT" "CN" ...
##  $ created_app_store_id: int  1 2 2 8 2 15 2 2 4 2 ...

So, the dataset “account” consists of 112792 rows and 6 columns.

All columns are characters (I will convert the date column later), with the exception of the last one, “created_app_store_id”, which is an integer.

account_id should be a unique identifier for each account.
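
A quick one-line check of that assumption (a sketch; n_distinct comes from dplyr, which this report already uses):

#If account_id is truly unique, this should return 112792, the number of rows
n_distinct(account$account_id)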

sum(is.na(account))
## [1] 107
columns_with_na1 <- colSums(is.na(account)) > 0

# Print the result
print(columns_with_na1)
##           account_id         created_time       created_device 
##                FALSE                FALSE                FALSE 
##     created_platform         country_code created_app_store_id 
##                FALSE                 TRUE                FALSE

So apparently we have some NAs, all sitting in the country_code column of “account”. I will keep this in mind.
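
For completeness, a direct count on that column (it should match the 107 total NAs found above):

sum(is.na(account$country_code))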

dim(account_date_session)
## [1] 1698974       4
head(account_date_session)
##    account_id       date session_count session_duration_sec
## 1 68730811144 2016-01-01             1                   47
## 2 68730812806 2016-01-01             1                  204
## 3 68730829426 2016-01-01            12                 4703
## 4 68730829426 2016-01-02             9                 4676
## 5 68730829426 2016-01-03             9                 2271
## 6 68730829426 2016-01-04             6                 1356
str(account_date_session)
## 'data.frame':    1698974 obs. of  4 variables:
##  $ account_id          : chr  "68730811144" "68730812806" "68730829426" "68730829426" ...
##  $ date                : chr  "2016-01-01" "2016-01-01" "2016-01-01" "2016-01-02" ...
##  $ session_count       : int  1 1 12 9 9 6 1 4 2 5 ...
##  $ session_duration_sec: int  47 204 4703 4676 2271 1356 50 2554 857 755 ...

So, the dataset “account_date_session” consists of 1698974 rows and 4 columns.

account_id is the same identifier as in the “account” dataset, but here it is not unique per row: the same account appears once for each day it was active.
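
A quick sketch to confirm that the real key is the account/date pair (again assuming dplyr is loaded):

#account_id alone repeats across dates...
n_distinct(account_date_session$account_id)

#...but each account should appear at most once per date
account_date_session %>%
  count(account_id, date) %>%
  filter(n > 1)   #expect zero rows if (account_id, date) is the key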

sum(is.na(account_date_session))
## [1] 0
dim(iap_purchase)
## [1] 9909    5
str(iap_purchase)
## 'data.frame':    9909 obs. of  5 variables:
##  $ account_id         : chr  "30077202816" "30077202816" "21487283560" "21487152816" ...
##  $ created_time       : chr  "2016-03-26 23:59:59.355" "2016-05-31 11:24:37.283" "2016-02-13 03:40:28.644" "2016-02-28 00:53:26.678" ...
##  $ package_id_hash    : chr  "ae0253c27c34edd1ab4fe21d6bfc91f8" "dd4c1bda4f2c904075fb2fbfcf30f30e" "99a9e0e63efa2fdce8fc8de74c66cea9" "99a9e0e63efa2fdce8fc8de74c66cea9" ...
##  $ iap_price_usd_cents: int  739 369 184 184 184 184 36 184 1849 184 ...
##  $ app_store_id       : int  0 0 0 0 0 0 10 1 1 0 ...
head(iap_purchase)
##    account_id            created_time                  package_id_hash
## 1 30077202816 2016-03-26 23:59:59.355 ae0253c27c34edd1ab4fe21d6bfc91f8
## 2 30077202816 2016-05-31 11:24:37.283 dd4c1bda4f2c904075fb2fbfcf30f30e
## 3 21487283560 2016-02-13 03:40:28.644 99a9e0e63efa2fdce8fc8de74c66cea9
## 4 21487152816 2016-02-28 00:53:26.678 99a9e0e63efa2fdce8fc8de74c66cea9
## 5  8602037685 2016-02-11 01:03:04.727 99a9e0e63efa2fdce8fc8de74c66cea9
## 6  8602037685  2016-02-25 12:03:48.61 99a9e0e63efa2fdce8fc8de74c66cea9
##   iap_price_usd_cents app_store_id
## 1                 739            0
## 2                 369            0
## 3                 184            0
## 4                 184            0
## 5                 184            0
## 6                 184            0
sum(is.na(iap_purchase))
## [1] 0

So, the dataset “iap_purchase” consists of 9909 rows and 5 columns.

Here too we have our “account_id”.
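
Before jumping in, two quick figures worth pulling from this table, sketched here without output:

#How many of the 112792 accounts ever made a purchase
n_distinct(iap_purchase$account_id)

#Total revenue in USD (prices are stored in cents)
sum(iap_purchase$iap_price_usd_cents) / 100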

Okay. Let’s jump into it.

2. Analyse the daily active users

Compare DAU changes over time.

account_date_session$date <- as.Date(account_date_session$date,format = "%Y-%m-%d")

account_date_session_aggregated_by_day <- account_date_session %>%
  group_by(date)%>%
  summarize(total_number_of_sessions = sum(session_count),
            average_session_duration= mean(session_duration_sec),
            tot_number_active_users = n_distinct(account_id))
#When it comes to visualizing a time trend, there is only one rule: use the area chart.


p <- account_date_session_aggregated_by_day %>% 
  ggplot( aes(x=date, y=tot_number_active_users)) +
    geom_area(fill="#69b3a2", alpha=0.5) +
    geom_line(color="#69b3a2") +
    scale_x_date(date_breaks = "1 month", date_labels = "%b")+
    scale_y_continuous(breaks = c(1000,2000,3000, 4000, 5000))+  
    labs(title = "Daily trend - Number of active users",
       subtitle = "The amount of users who played for at least a session in a given day",
       caption = "2016",
       y = "Number of active user",
       x = "Month")+
    theme_bw()+
  theme(plot.title.position = "plot",
        text = element_text(family = "Georgia"),
        axis.text.y = element_text(size = 8),
        plot.title = element_text(size = 20, margin = margin(b= 10), hjust= 0),
        plot.subtitle = element_text(size = 12, color = "darkslategrey", margin = margin(b = 10, l = -25)))


p

So, key takeaways here:

- The number of daily active users seems to have nearly tripled from January to March 2016. A closer look:

week_comparison_JanMarch <- account_date_session_aggregated_by_day%>%
  select(date, tot_number_active_users)

week_comparison_JanMarch$date <- floor_date(week_comparison_JanMarch$date, unit = "week", week_start = 1)

week_comparison_JanMarch <- week_comparison_JanMarch %>%
  filter(date >= "2016-01-01" & date <= "2016-03-31")%>%
  group_by(date)%>%
  summarize(peak_users = max(tot_number_active_users))
week_comparison_JanMarch$peak_users_percentage_variation <- (week_comparison_JanMarch$peak_users - lag(week_comparison_JanMarch$peak_users)) /lag(week_comparison_JanMarch$peak_users) * 100
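
The week-over-week variation column is computed but never displayed; a quick peek helps (output omitted):

head(week_comparison_JanMarch)   #date, peak_users, and the % variation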

#For this, I'll take the peak (max) number of daily users for each week.
p1 <- ggplot(week_comparison_JanMarch, aes(x = date, y = peak_users))+
  geom_line(aes(group = 1), size = 1.5)+
  geom_point(shape= 21,fill = "red", size = 3)+
  scale_x_date(date_breaks = "1 week", date_labels = "%Y-%m-%d")+
  scale_y_continuous(breaks = c(3000, 4000, 5000))+
  labs(title = "Weekly trend - Maximum number of active users",
       subtitle = "The peak daily user count achieved in the weeks of January, February, and March",
       caption = "2016",
       y = "Peak number of active user",
       x = "Week")+
  geom_label(aes(label = peak_users),
             nudge_x = 0.2,
             nudge_y = -150,
             size = 3.2)+
  theme_bw()+
  theme(plot.title.position = "plot",
        text = element_text(family = "Georgia"),
        axis.text.y = element_text(size = 8),
        axis.text.x = element_text(size = 10,angle = 90, vjust = 0.5, color = "black"),
        plot.title = element_text(size = 20, margin = margin(b= 10), hjust= 0),
        plot.subtitle = element_text(size = 12, color = "darkslategrey", margin = margin(b = 10, l = -25)))


p1

Can you find any ups or drops that are out of the normal behaviour?

The only drop worthy of note that I can see in the first chart is the one around mid-March.

Let’s take a look at the date.

march_data <- account_date_session_aggregated_by_day %>%
  filter(format(date, "%m") == "03")

min_date <- march_data$date[which.min(march_data$tot_number_active_users)]

min_date
## [1] "2016-03-15"

Apparently the user count bottomed out on 15 March. Did something happen?

I’ll let you tell me.
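
Either way, it helps to see how sharp the dip is; a quick look at the surrounding days (a sketch, output omitted):

account_date_session_aggregated_by_day %>%
  filter(date >= as.Date("2016-03-13"), date <= as.Date("2016-03-17"))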

There are, however, many micro ups and downs worth noticing.

Why do you think they happen?

My supposition: the day of the week. Is it possible that the spikes are caused by weekends?
Let’s take a look.

#To have a better look, I will "zoom" in on the weeks of April and May, since from the chart
#this looks like a fairly "constant" timeframe.

week_comparison_AprMay <- account_date_session_aggregated_by_day%>%
  select(date, tot_number_active_users)%>%
  filter(date >= "2016-04-01" & date <= "2016-05-31")

#date is already of class Date (converted earlier), so no re-conversion is needed.
#Note: weekdays() returns locale-dependent day names, so this assumes an English locale.
week_comparison_AprMay$weekend <- weekdays(week_comparison_AprMay$date) %in% c('Saturday', 'Sunday')


ggplot(week_comparison_AprMay, aes(x = date, y = tot_number_active_users)) +
  geom_line() +  # Line chart
  geom_point(aes(color = weekend), size = 3) +  # Add points, different color for weekends
  scale_color_manual(values = c("FALSE" = "black", "TRUE" = "red")) +  # Set weekend points to red
  theme_bw()+
  labs(title = "Daily Trend - Active Users - Weekend Comparison",
       x = "Date",
       y = "Active Users")+
  theme(plot.title.position = "plot",
        text = element_text(family = "Georgia"),
        axis.text.y = element_text(size = 8),
        axis.text.x = element_text(size = 10,angle = 90, vjust = 0.5, color = "black"),
        plot.title = element_text(size = 20, margin = margin(b= 10), hjust= 0),
        plot.subtitle = element_text(size = 12, color = "darkslategrey", margin = margin(b = 10, l = -25)))

This chart confirms my supposition: the spikes are likely explained by people having more time to enjoy your games on weekends.
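
To back the visual impression with numbers, we can compare the average user count on weekend days versus weekdays in this window (a sketch, output omitted):

week_comparison_AprMay %>%
  group_by(weekend) %>%
  summarize(mean_active_users = mean(tot_number_active_users),
            days = n())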

3. Analyse sales:

Analyse the geographic split of the revenue and the users.

#Data preparation for the choropleth.

map <- st_read("C:/Users/Michele/Desktop/Lavoro/Supercell_TEST/ne_10m_admin_0_countries/ne_10m_admin_0_countries.shp")
## Reading layer `ne_10m_admin_0_countries' from data source 
##   `C:\Users\Michele\Desktop\Lavoro\Supercell_TEST\ne_10m_admin_0_countries\ne_10m_admin_0_countries.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 258 features and 168 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -180 ymin: -90 xmax: 180 ymax: 83.6341
## Geodetic CRS:  WGS 84
map$ISO_A2[map$NAME=="France"] <- "FR"
map$ISO_A2[map$NAME=="Norway"] <- "NO"
#Manually fixing these two: the shapefile stores the ISO code for France and Norway as -99.
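#Side note: this is a known Natural Earth quirk. If the shapefile also carries the
#ISO_A2_EH field, as recent Natural Earth releases do (an assumption here), a more
#general fix would be:
#map$ISO_A2 <- ifelse(map$ISO_A2 == "-99", map$ISO_A2_EH, map$ISO_A2)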

map_cleaned <- map%>%
  select(ISO_A2, NAME)

#taking the country code column from "account"
iap_purchase2 <- left_join(iap_purchase, account %>%
                            select(account_id, country_code), by = "account_id")

revenue_by_country <- iap_purchase2%>%
  dplyr::group_by(country_code)%>%
  summarize(tot_revenue_country = sum(iap_price_usd_cents))

#I'll convert this from cents to USD.

revenue_by_country$tot_revenue_country <-  revenue_by_country$tot_revenue_country/100
  

account_by_country <- account%>%
  dplyr::group_by(country_code )%>%
  summarize(accounts = n_distinct(account_id))

account_choro <- merge(account_by_country, map_cleaned, by.x = "country_code", by.y = "ISO_A2")

revenue_choro <- merge(revenue_by_country, map_cleaned, by.x = "country_code", by.y = "ISO_A2")

#Taking population data from 2015. I need it to normalize the data by the population of each country.

data(pop)
#Assumption: the `pop` table comes from a population data package (e.g. wpp2019),
#with values expressed in thousands of inhabitants.

pop2015 <- pop%>%
  select(name, "2015")

pop2015$name[pop2015$name == "Russian Federation"] <- "Russia"
#Renaming to match the map: the UN data uses the official name "Russian Federation".

account_choro2 <- merge(account_choro, pop2015, by.x= "NAME", by.y= "name" )

colnames(account_choro2) <- c("NAME","country_code","accounts","geometry","population_2015")

account_choro3 <- account_choro2%>%
  mutate(accounts_per_1000 = (accounts / population_2015) * 1000)%>%
  arrange(desc(accounts_per_1000))
#Caveat: population_2015 is in thousands, so strictly this ratio works out to accounts
#per million inhabitants rather than per 1000; the ranking of countries is unaffected.

The first thing I’m going to do is visualize where the vast majority of the revenue comes from, in percentage terms.

A simple pie chart can help us here.

pie_data <- revenue_by_country%>%
  group_by(country_code)%>%
  summarize(tot_revenue = sum(tot_revenue_country))%>%
  mutate(perc = round(100*tot_revenue/sum(tot_revenue),1))

top_countries <- pie_data %>%
  select(country_code, perc)

top_countries <-top_countries %>%
  arrange(desc(perc)) %>%
  slice(1:7)

others <- sum(pie_data$perc) - sum(top_countries$perc)
others_row <- data.frame(country_code = "Others", perc = others)

pie_data2 <- rbind(top_countries, others_row)


positions  <- pie_data2 %>% 
  mutate(csum = rev(cumsum(rev(perc))), 
         pos = perc/2 + lead(csum, 1),
         pos = if_else(is.na(pos), perc/2, pos))

#Manually overriding the computed positions to keep the labels readable.
positions$pos[positions$country_code == "US"] = 10
positions$pos[positions$country_code == "TR"] = 32
positions$pos[positions$country_code == "CN"] = 75
positions$pos[positions$country_code == "Others"] = 40
positions$pos[positions$country_code == "CH"] = 92
positions$pos[positions$country_code == "CA"] = 98
positions$pos[positions$country_code == "KR"] = 53
positions$pos[positions$country_code == "FR"] = 60

ggplot(pie_data2, aes(x = "", y = perc, fill = country_code)) +
  geom_bar( stat = "identity") +
  coord_polar("y", start = 0) +
  scale_fill_brewer()+
  geom_label_repel(
    data = positions, aes( y= pos, label = paste0(perc, "%")),
    size = 4.5, nudge_x = 1, show.legend = F)+
  theme_void()+
    labs(title = "Percentage of revenue by country")+
  guides(fill = guide_legend(title = "Country Code"))+
  theme(plot.title.position = "plot",
        text = element_text(family = "Georgia"),
        plot.title = element_text(size = 20, margin = margin(b= 10), hjust= 0),
        plot.subtitle = element_text(size = 12, color = "darkslategrey", margin = margin(b = 10, l = -25)))

And now what I want to see is the map with this information.

Countries generating 0 revenue will be automatically excluded.
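
If we want to list them explicitly, a quick sketch (dropping the geometry column first; output omitted):

#Map countries with no recorded revenue; these will be absent from the choropleth
map_cleaned %>%
  st_drop_geometry() %>%
  filter(!ISO_A2 %in% revenue_by_country$country_code) %>%
  head()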

#When it comes to analyzing a geographic split, the best choice is usually the choropleth.
#Let's start with revenue.

clean1<-revenue_choro%>%
  mutate(clss=case_when(
    tot_revenue_country<100~"1",
    tot_revenue_country<200~"2",
    tot_revenue_country<500~"3",
    tot_revenue_country<1000~"4",
    tot_revenue_country<2000~"5",
    TRUE~"6"
  ))

bck <- "#001219"


theme_custom1 <- theme_void()+
  theme(
    plot.margin = margin(1,1,10,1,"pt"),
    plot.background = element_rect(fill=bck,color=NA),
    legend.position = "bottom",
    legend.title = element_text(hjust=0.5,color="white",face="bold"),
    legend.text = element_text(color="white")
  )
pal <- c("#bb3e03","#ee9b00","#e9d8a6","#94d2bd","#0a9396","#005f73")


ggplot(clean1, aes(fill=clss, geometry=geometry))+
  geom_sf()+
  labs(fill="Total Revenue by country in $")+
  guides(
    fill=guide_legend(
      nrow=1,
      title.position="top",
      label.position="bottom"
    )
  )+
  scale_fill_manual(
    values=pal,
    labels=c("< 100 ","< 200 ","< 500","< 1000","< 2000", ">= 2000 ")
  )+
  theme_custom1

# Now that we have everything up and running, let's visualize it.
#To start with, we can look at which countries have the most players in absolute terms. Let's plot the top 10.

account_choro_barchart <- account_choro2%>%
  arrange(desc(accounts))%>%
  slice_head(n= 10)

ggplot(account_choro_barchart, aes(x = reorder(NAME, accounts), y =accounts))+
  geom_bar(stat = "identity", fill = "steelblue")+
  geom_label(aes(label = accounts))+
  labs(title = "Countries with the most accounts registered",
       subtitle = "China comes first, of course, in terms of absolute number of players.
",
       x = "Country", 
       y = "Number of accounts")+
  coord_flip()+
  theme_bw()+
   theme(plot.title.position = "plot",
        text = element_text(family = "Georgia"),
        axis.text.y = element_text(size = 8),
        plot.title = element_text(size = 20, margin = margin(b= 10), hjust= 0),
        plot.subtitle = element_text(size = 12, color = "darkslategrey", margin = margin(b = 10, l = -25)))

But let’s try to make this more insightful by normalizing the number of players by each country’s population.

account_choro_barchart2 <- account_choro3 %>%
   arrange(desc(accounts_per_1000))%>%
   slice_head(n= 10)

account_choro_barchart2$accounts_per_1000 <- round(account_choro_barchart2$accounts_per_1000,1)

ggplot(account_choro_barchart2, aes(x = reorder(NAME, accounts_per_1000), y =accounts_per_1000))+
  geom_bar(stat = "identity", fill = "steelblue")+
  geom_label(aes(label = accounts_per_1000))+
  labs(title = "Countries with the most accounts per 1000 inhabitants",
       subtitle = "Yes, I am as surprised as you.
",
       x = "Country", 
       y = "Number of accounts per 1000 inhabitants")+
  coord_flip()+
  theme_bw()+
   theme(plot.title.position = "plot",
        text = element_text(family = "Georgia"),
        axis.text.y = element_text(size = 8),
        plot.title = element_text(size = 20, margin = margin(b= 10), hjust= 0),
        plot.subtitle = element_text(size = 12, color = "darkslategrey", margin = margin(b = 10, l = -25)))

To be honest, I didn’t even know that “Guam” was a country (strictly speaking, it’s a US territory), let alone imagine it could show up on this list.

Let’s check this. I’ll start with the number of inhabitants.

pop2015$"2015"[pop2015$name == "Guam"]
## [1] 169.885

And then the number of players.

account_choro2$accounts[account_choro2$NAME == "Guam"]
## [1] 13

So apparently we have a territory of around 170,000 inhabitants (remember the pop values are in thousands), and 13 of them play your games.

That’s pretty impressive.
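
For context, the arithmetic behind Guam’s figure, keeping in mind the thousands caveat noted earlier:

13 / 169.885 * 1000   #~76.5, the value computed above and labelled "per 1000"
13 / 169885 * 1000    #~0.08, the strict accounts-per-1000-inhabitants figure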

But now let’s check this on the map.

clean2<-account_choro3%>%
  mutate(clss=case_when(
    accounts_per_1000<10~"1",
    accounts_per_1000<20~"2",
    accounts_per_1000<30~"3",
    accounts_per_1000<40~"4",
    accounts_per_1000<50~"5",
    TRUE~"6"
  ))

bck <- "#001219"


theme_custom2 <- theme_void()+
  theme(
    plot.margin = margin(1,1,10,1,"pt"),
    plot.background = element_rect(fill=bck,color=NA),
    legend.position = "bottom",
    legend.title = element_text(hjust=0.5,color="white",face="bold"),
    legend.text = element_text(color="white")
  )
pal <- c("#bb3e03","#ee9b00","#e9d8a6","#94d2bd","#0a9396","#005f73")


ggplot(clean2, aes(fill=clss, geometry=geometry))+
  geom_sf()+
  labs(fill="Players per 1000 inhabitants")+
  guides(
    fill=guide_legend(
      nrow=1,
      title.position="top",
      label.position="bottom"
    )
  )+
  scale_fill_manual(
    values=pal,
    labels=c("< 10 ","< 20 ","< 30","< 40","< 50", ">= 50 ")
  )+
  theme_custom2

Apologies if a few African and South American countries are missing here and there: keeping track of their ISO codes and alternative names isn’t easy. If you want them included, we can discuss it and I’ll do my best.

Calculate average revenue per user per market

I assume that when you say “per market” you mean “per country”.

#Calculating the average revenue per user per country.

user_aggregated <- account%>%
  group_by(country_code)%>%
  summarize(users = n_distinct(account_id))

revenue_aggregated <- iap_purchase2%>%
  group_by(country_code)%>%
  summarize(total_revenue_usd_cents = sum(iap_price_usd_cents))

#Taking just the countries where purchases were made.
user_revenue_aggregated <- left_join(revenue_aggregated, user_aggregated, by = "country_code")

#Creating a new column indicating the revenue by user.
user_revenue_aggregated$revenue_by_user <-  user_revenue_aggregated$total_revenue_usd_cents/user_revenue_aggregated$users


#Labeling missing country codes as the string "NA" for plotting.
#Caveat: "NA" is also Namibia's ISO 3166-1 code, so some of these could be genuine
#Namibia rows misread as missing values on import.
user_revenue_aggregated$country_code[is.na(user_revenue_aggregated$country_code)] <- "NA"



user_revenue_aggregated <- user_revenue_aggregated%>%
  arrange(desc(revenue_by_user))

user_revenue_aggregated$revenue_by_user <- round(user_revenue_aggregated$revenue_by_user,2)


#Ordering the factor levels by revenue so the chart sorts from highest to lowest.
user_revenue_aggregated$country_code <- factor(user_revenue_aggregated$country_code,
  levels = user_revenue_aggregated$country_code[order(user_revenue_aggregated$revenue_by_user)])

ggplot(user_revenue_aggregated, aes(revenue_by_user, country_code, label = revenue_by_user))+
  geom_segment(aes(x = 0, y = country_code, xend = revenue_by_user, yend = country_code))+
  geom_point(shape= 21,fill = "red")+
  geom_text(nudge_x = 50)+
  labs(title = "Average revenue per user by country (USD cents)",
       subtitle = "Switzerland leads the way with a huge advantage on every other country")+
  theme_bw()+
  theme(axis.title = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "none",
        plot.title.position = "plot",
        text = element_text(family = "Georgia"),
        axis.text.y = element_text(size = 8),
        plot.title = element_text(size = 20, margin = margin(b= 10), hjust= 0),
        plot.subtitle = element_text(size = 12, color = "darkslategrey", margin = margin(b = 15, l = -25)))

Conclusions

To conclude, we can say that, unsurprisingly, countries with a higher GDP per capita (like Switzerland) tend to have more resources to spend. A good continuation of this analysis would be to bring in such external data sources, but unfortunately there is no time.
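
As a purely hypothetical sketch of that continuation (the gdp_per_capita table and its gdp_usd column are assumptions for illustration, e.g. something pulled from World Bank data, not part of this dataset):

#Hypothetical: relate average revenue per user to GDP per capita.
#Assumes a data frame gdp_per_capita with columns country_code and gdp_usd (not provided here).
user_revenue_aggregated %>%
  left_join(gdp_per_capita, by = "country_code") %>%
  ggplot(aes(x = gdp_usd, y = revenue_by_user)) +
  geom_point() +
  labs(x = "GDP per capita (USD)", y = "Average revenue per user (USD cents)")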

Goodbye!

Michele