Data Exploration with R

setwd("C:/Users/macro/Documents/Maquina vieja/Bunny Studio") 

pages = read.csv("pages.csv", header=TRUE, sep=",")
# page views: interaction with the platform, identified by anonymous_id while navigating

identifies = read.csv("identifies.csv", header=TRUE, sep=",")
# each user has two ids, anonymous_id and user_id

projects = read.csv("projects.csv", header=TRUE, sep=",")
# projects are submitted under user_id. Same field name as in identifies
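# Quick sanity check of the join keys implied by the comments above:
# pages links to identifies via anonymous_id, and identifies links to
# projects via user_id (exploratory check only, assuming the column names
# shown in the summaries below)
stopifnot("anonymous_id" %in% names(pages),
          all(c("anonymous_id", "user_id") %in% names(identifies)),
          "user_id" %in% names(projects))
# Share of page views whose anonymous_id also appears in identifies
mean(pages$anonymous_id %in% identifies$anonymous_id)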

Summary of Datasets

summary(identifies)
##       id            anonymous_id         user_id           timestamp        
##  Length:86606       Length:86606       Length:86606       Length:86606      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character
#check date format
head(identifies$timestamp,3)
## [1] "9/3/2020 23:07:22.27"   "17/3/2020 04:22:31.146" "22/5/2020 00:48:39.925"
# cast to datetime type
identifies$timestamp = as.POSIXct(identifies$timestamp, format = "%d/%m/%Y %H:%M:%S")
# Rename to differentiate from the other files when merging
colnames(identifies)[colnames(identifies) == 'timestamp'] <- 'timestamp_i'
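# Note: the raw strings carry fractional seconds (e.g. "23:07:22.27"), which
# the "%S" format above silently truncates. If sub-second precision were
# needed, "%OS" would keep it; a minimal illustration on a literal value
# taken from the head() output above:
options(digits.secs = 3)
as.POSIXct("9/3/2020 23:07:22.27", format = "%d/%m/%Y %H:%M:%OS")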
summary(pages)
##        X                id            anonymous_id           path          
##  Min.   :      0   Length:2322593     Length:2322593     Length:2322593    
##  1st Qu.: 580648   Class :character   Class :character   Class :character  
##  Median :1161296   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1161296                                                           
##  3rd Qu.:1741944                                                           
##  Max.   :2322592                                                           
##   timestamp        
##  Length:2322593    
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
#check date format
head(pages$timestamp,3)
## [1] "15/6/2020 21:20:23.023" "15/6/2020 21:20:24.522" "15/6/2020 21:20:25.286"
# cast to datetime type
pages$timestamp = as.POSIXct(pages$timestamp, format = "%d/%m/%Y %H:%M:%S")
# Rename to differentiate from the other files when merging
colnames(pages)[colnames(pages) == 'timestamp'] <- 'timestamp_p'
# Explore path structure
length(unique(pages$path))
## [1] 105392
head(pages$path,20)
##  [1] "/reads/add"                                                                
##  [2] "/reads/expired/3760702EFEC556C487EC/"                                      
##  [3] "/"                                                                         
##  [4] "/users/accepted_projects/"                                                 
##  [5] "/audio/"                                                                   
##  [6] "/"                                                                         
##  [7] "/dashboard/"                                                               
##  [8] "/users/signin/"                                                            
##  [9] "/dashboard/"                                                               
## [10] "/projects/e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
## [11] "/blog/the-growing-world-of-the-spanish-podcast/"                           
## [12] "/reads/add"                                                                
## [13] "/reads/expired/3760702EFEC556C487EC/"                                      
## [14] "/search"                                                                   
## [15] "/"                                                                         
## [16] "/"                                                                         
## [17] "/voice/how-it-works/"                                                      
## [18] "/dashboard/"                                                               
## [19] "/projects/e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
## [20] "/blog/why-is-translating-from-japanese-to-english-so-hard/"
library(stringr)
## Warning: package 'stringr' was built under R version 4.0.5
# Remove path segments that contain numbers (IDs and hashes)

# Split by slash
strim_path_split = str_split(pages$path, "/",simplify = TRUE)

# Flag path segments that contain a number
mask_contains_number <- grepl("[[:digit:]]", strim_path_split)

# Replace the flagged segments (names containing numbers) with "/"
strim_path_split  = as.data.frame(replace(strim_path_split, mask_contains_number, "/"))


# Reconstruct the path without the number-containing segments
# get column names from new dataframe
cols <- names(strim_path_split)

strim_path_split$merged <- apply( strim_path_split[ , cols ] , 1 , paste , collapse = "/" )

# Adding column to Pages df
pages = cbind(pages, strim_path = strim_path_split$merged)

# Remove the duplicated slashes left by the replaced segments
pages$strim_path = gsub("/{2,}", "/", pages$strim_path)

# Count slashes to measure path depth
library(stringr)

pages$path_depth <- str_count(pages$path,coll("/")) - 1
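# Spot check of the path cleanup, mirroring the steps above on a single raw
# path taken from the head(pages$path) output:
p <- "/projects/e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
parts <- str_split(p, "/", simplify = TRUE)
parts[grepl("[[:digit:]]", parts)] <- "/"
gsub("/{2,}", "/", paste(parts, collapse = "/"))  # expected: "/projects/"
str_count(p, coll("/")) - 1                       # path_depth for this path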
summary(projects)
##        X              id              user_id          fulfillment_type  
##  Min.   :    0   Length:15804       Length:15804       Length:15804      
##  1st Qu.: 3951   Class :character   Class :character   Class :character  
##  Median : 7902   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 7902                                                           
##  3rd Qu.:11852                                                           
##  Max.   :15803                                                           
##                                                                          
##    created            category         revenue_in_usd     site        
##  Length:15804       Length:15804       Min.   :   0.0   Mode:logical  
##  Class :character   Class :character   1st Qu.:  59.0   NA's:15804    
##  Mode  :character   Mode  :character   Median :  85.0                 
##                                        Mean   : 136.8                 
##                                        3rd Qu.: 152.0                 
##                                        Max.   :4613.0                 
##                                        NA's   :102                    
##    language         gender_and_age        pro_id         
##  Length:15804       Length:15804       Length:15804      
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
## 
#check date format
head(projects$created,3)
## [1] "20/4/2020 22:19:49" "16/6/2020 15:46:02" "6/4/2020 17:05:08"
# cast to datetime type
projects$created = as.POSIXct(projects$created, format = "%d/%m/%Y %H:%M:%S")
# Unique user_id from identifies
print(length(unique(identifies$user_id)))
## [1] 26322
# Unique anonymous_id from identifies
print(length(unique(identifies$anonymous_id)))
## [1] 86431
# Unique user_id from projects
print(length(unique(projects$user_id)))
## [1] 3019
# If a user_id from identifies exists among projects' user_ids, then
# the user submitted a project --> converted.
# Add the converted label to the identifies data frame.

identifies$converted_user_id <- as.integer(identifies$user_id %in% projects$user_id)
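# Coverage check (exploratory): how many project-submitting user_ids are
# present in identifies at all? Any missing ones could not be linked back
# to browsing activity.
length(unique(projects$user_id))
sum(unique(projects$user_id) %in% identifies$user_id)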
# Get the earliest date of user_id activity 
# This can be taken as the "user creation date"

#install.packages("dplyr")
library(dplyr)
library(tidyverse)
library(lubridate)

identifies = identifies %>%
  group_by(user_id) %>%
  mutate(earliest_timestamp = min(timestamp_i, na.rm = TRUE)) %>%
  as.data.frame()
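# Spot check the derived "user creation date": for a single user_id the
# earliest_timestamp should equal the minimum of that user's identify
# timestamps (the first user_id is picked purely for illustration)
one_user <- identifies$user_id[1]
min(identifies$timestamp_i[identifies$user_id == one_user]) ==
  identifies$earliest_timestamp[identifies$user_id == one_user][1]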
# Activity log timeframe of each dataset
print(range(identifies$timestamp_i))
## [1] "2020-03-01 00:00:06 -03" "2020-06-30 23:58:20 -03"
print(range(projects$created))
## [1] "2020-03-01 00:06:58 -03" "2020-06-30 23:53:01 -03"
print(range(pages$timestamp_p))
## [1] "2020-03-01 00:00:05 -03" "2020-06-30 23:59:59 -03"
# count of converted/non converted
# to understand balance

balance_count_sessions=addmargins(table(as.factor(identifies$converted_user_id)))

# Proportion of identify events (sessions) from unconverted users (%)
round((balance_count_sessions[1]/  balance_count_sessions[3]) * 100 ,2)  
##    0 
## 81.7
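# The same imbalance expressed directly as percentages of identify events
# (an equivalent view using prop.table)
round(prop.table(table(identifies$converted_user_id)) * 100, 2)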
# Count of unique users by category 

balance_count_unique_users <- aggregate(data = identifies,
               user_id ~ converted_user_id,
              function(x) length(unique(x)))

# Proportion of unique users that never converted (%)
round((balance_count_unique_users[1,2] / sum(balance_count_unique_users$user_id)) * 100, 2)
## [1] 88.53
# There is an unbalanced distribution of the class "converted vs. unconverted"
# Count unique users by their earliest_timestamp month 

unique_users_by_month <- aggregate(data = identifies,
                          user_id ~ month(identifies$earliest_timestamp),
                          function(x) length(unique(x)))
unique_users_by_month
# There is a peak of "user creation" during March; the other three months are stable at around 5700 users
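# A quick visual of the same monthly counts (column order as produced by
# aggregate() above: month number first, unique-user count second)
barplot(unique_users_by_month[, 2],
        names.arg = month.name[unique_users_by_month[, 1]],
        ylab = "unique user_ids",
        main = "Unique users by month of earliest activity")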
# Revenue Distribution
boxplot(projects$revenue_in_usd)

# Revenue Distribution
hist(projects$revenue_in_usd, breaks  = 40)

# Right skewed / very asymmetric distribution
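# Given the heavy right skew, a log-scaled histogram can be easier to read
# (dropping NA and zero revenues first)
rev_pos <- projects$revenue_in_usd[!is.na(projects$revenue_in_usd) &
                                     projects$revenue_in_usd > 0]
hist(log10(rev_pos), breaks = 40,
     xlab = "log10(revenue_in_usd)", main = "Revenue distribution (log scale)")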
library(dplyr)
projects  %>% group_by(fulfillment_type) %>% 
  summarize(revenue_avg = mean(revenue_in_usd, na.rm = TRUE),
            revenue_total = sum(revenue_in_usd, na.rm = TRUE),
            count = length(revenue_in_usd),
            revenue_sd = sd(revenue_in_usd, na.rm = TRUE))
# Most of the revenue is generated by booking, driven not only by volume but also by average price.
# Contest services are not that frequent; however, their average revenue is the highest.
# The Booking and Contest standard deviations are quite similar, considering the difference in frequency.
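# Complementary view: revenue share per fulfillment_type (% of total),
# using the same grouping as above
projects %>% group_by(fulfillment_type) %>%
  summarize(revenue_total = sum(revenue_in_usd, na.rm = TRUE)) %>%
  mutate(share_pct = round(100 * revenue_total / sum(revenue_total), 1))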
options(scipen = 1)
projects  %>% group_by(month(created)) %>% 
  summarize(revenue_avg = mean(revenue_in_usd, na.rm = TRUE),
            revenue_total = sum(revenue_in_usd, na.rm = TRUE),
            count = length(revenue_in_usd),
            revenue_sd = sd(revenue_in_usd, na.rm = TRUE))
# Monthly revenue is fairly stable across the analyzed period
# The last analyzed month shows an increase in total revenue

Merging Dataframes

# Left join identifies with pages by anonymous_id
merged = merge(x = identifies, y = pages, by = "anonymous_id", all.x = TRUE)

#gc()
#memory.limit(4000)
# Left join with projects by user_id
# merged =merge(x = merged, y = projects, by = "user_id")

# These two dataframes cannot be merged here due to memory limitations;
# the join can be done in Tableau instead.
# Export the merged dataframe with the new attributes.
# No changes were made to projects; the original file will be used.
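# If the projects join were needed in R after all, a lighter option would be
# to merge only the columns required downstream (sketch only, kept commented
# out here for the same memory reasons; column names as in summary(projects))
# merged = merge(x = merged,
#                y = projects[, c("user_id", "created", "revenue_in_usd", "fulfillment_type")],
#                by = "user_id", all.x = TRUE)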

write.csv(merged,"identifies_with_pages.csv",na="NA",row.names=TRUE, quote = TRUE)