Summary of Datasets
summary(identifies)
## id anonymous_id user_id timestamp
## Length:86606 Length:86606 Length:86606 Length:86606
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
#check date format
head(identifies$timestamp,3)
## [1] "9/3/2020 23:07:22.27" "17/3/2020 04:22:31.146" "22/5/2020 00:48:39.925"
# cast to datetime type
identifies$timestamp = as.POSIXlt(identifies$timestamp, format = "%d/%m/%Y %H:%M:%S")
# Rename to differenciate from other files when merging
colnames(identifies)[colnames(identifies) == 'timestamp'] <- 'timestamp_i'
summary(pages)
## X id anonymous_id path
## Min. : 0 Length:2322593 Length:2322593 Length:2322593
## 1st Qu.: 580648 Class :character Class :character Class :character
## Median :1161296 Mode :character Mode :character Mode :character
## Mean :1161296
## 3rd Qu.:1741944
## Max. :2322592
## timestamp
## Length:2322593
## Class :character
## Mode :character
##
##
##
#check date format
head(pages$timestamp,3)
## [1] "15/6/2020 21:20:23.023" "15/6/2020 21:20:24.522" "15/6/2020 21:20:25.286"
# cast to datetime type
pages$timestamp = as.POSIXlt(pages$timestamp, format = "%d/%m/%Y %H:%M:%S")
# Rename to differenciate from other files when merging
colnames(pages)[colnames(pages) == 'timestamp'] <- 'timestamp_p'
# Explore path structure
length(unique(pages$path))
## [1] 105392
head(pages$path,20)
## [1] "/reads/add"
## [2] "/reads/expired/3760702EFEC556C487EC/"
## [3] "/"
## [4] "/users/accepted_projects/"
## [5] "/audio/"
## [6] "/"
## [7] "/dashboard/"
## [8] "/users/signin/"
## [9] "/dashboard/"
## [10] "/projects/e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
## [11] "/blog/the-growing-world-of-the-spanish-podcast/"
## [12] "/reads/add"
## [13] "/reads/expired/3760702EFEC556C487EC/"
## [14] "/search"
## [15] "/"
## [16] "/"
## [17] "/voice/how-it-works/"
## [18] "/dashboard/"
## [19] "/projects/e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
## [20] "/blog/why-is-translating-from-japanese-to-english-so-hard/"
library(stringr)
## Warning: package 'stringr' was built under R version 4.0.5
# Remove from path the strings containing numbers
# Split by slash
strim_path_split = str_split(pages$path, "/",simplify = TRUE)
# Identify which path folder with number
mask_contains_number <- grepl("[[:digit:]]", strim_path_split)
# Replace where string is True (folder name with number)
strim_path_split = as.data.frame(replace(strim_path_split, mask_contains_number, "/"))
# Reconstruct path without folder with numbers
# get column names from new dataframe
cols <- names(strim_path_split)
strim_path_split$merged <- apply( strim_path_split[ , cols ] , 1 , paste , collapse = "/" )
# Adding column to Pages df
pages = cbind(pages, strim_path = strim_path_split$merged)
# Removing duplicated slashes
to_remove <- gsub("\\b(.)\\1+", "\\1", pages$strim_path, ignore.case = TRUE, perl = TRUE)
pages$strim_path = gsub("(.)\\1+\\b", "\\1", to_remove)
# Count slash / to know path depth
library(stringr)
pages$path_depth <- str_count(pages$path,coll("/")) - 1
summary(projects)
## X id user_id fulfillment_type
## Min. : 0 Length:15804 Length:15804 Length:15804
## 1st Qu.: 3951 Class :character Class :character Class :character
## Median : 7902 Mode :character Mode :character Mode :character
## Mean : 7902
## 3rd Qu.:11852
## Max. :15803
##
## created category revenue_in_usd site
## Length:15804 Length:15804 Min. : 0.0 Mode:logical
## Class :character Class :character 1st Qu.: 59.0 NA's:15804
## Mode :character Mode :character Median : 85.0
## Mean : 136.8
## 3rd Qu.: 152.0
## Max. :4613.0
## NA's :102
## language gender_and_age pro_id
## Length:15804 Length:15804 Length:15804
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
#check date format
head(projects$created,3)
## [1] "20/4/2020 22:19:49" "16/6/2020 15:46:02" "6/4/2020 17:05:08"
# cast to datetime type
projects$created = as.POSIXlt(projects$created, format = "%d/%m/%Y %H:%M:%S")
# Unique user_id from identifies
print(length(unique(identifies$user_id)))
## [1] 26322
# Unique anonymous_id from identifies
print(length(unique(identifies$anonymous_id)))
## [1] 86431
# Unique user_id from projects
print(length(unique(projects$user_id)))
## [1] 3019
# if user_id at identifies exists among projects´ user_id then
# user submitted project --> converted
# add converted label at identifies file
identifies$converted_user_id <- as.integer(identifies$user_id %in% projects$user_id)
# Get the earliest date of user_id activity
# This can be taken as the "user creation date"
#install.packages("dplyr")
library(dplyr)
library(tidyverse)
library(lubridate)
identifies = as.data.frame(identifies %>% group_by(user_id) %>% mutate(earliest_timestamp = as.POSIXlt(min(ymd_hms(timestamp_i)), format = "%d/%m/%Y %H:%M:%S")))
# Activity log timeframe identifies
print(range(identifies$timestamp))
## [1] "2020-03-01 00:00:06 -03" "2020-06-30 23:58:20 -03"
print(range(projects$created))
## [1] "2020-03-01 00:06:58 -03" "2020-06-30 23:53:01 -03"
print(range(pages$timestamp))
## [1] "2020-03-01 00:00:05 -03" "2020-06-30 23:59:59 -03"
# count of converted/non converted
# to understand balance
balance_count_sessions=addmargins(table(as.factor(identifies$converted_user_id)))
# Proportion of activity of unconverted users (%)
round((balance_count_sessions[1]/ balance_count_sessions[3]) * 100 ,2)
## 0
## 81.7
# Count of unique users by category
balance_count_unique_users <- aggregate(data = identifies,
user_id ~ converted_user_id,
function(x) length(unique(x)))
# Proportion of activity of unconverted users (%)
round((balance_count_unique_users[1,2]/sum(balance_count_unique_users)) * 100 ,2)
## [1] 88.53
# There is an unbalanced distribution of the class "converted vs. uncorverted"
# Count unique users by their earliest_timestamp month
unique_users_by_month <- aggregate(data = identifies,
user_id ~ month(identifies$earliest_timestamp),
function(x) length(unique(x)))
unique_users_by_month
# There is a peak of "user creation" during March, all other 3 months are stable around 5700 users
# Revenue Distribution
boxplot(projects$revenue_in_usd)

# Revenue Distribution
hist(projects$revenue_in_usd, breaks = 40)

# Right skewed / very asymmetric distribution
library(dplyr)
projects %>% group_by(fulfillment_type) %>%
summarize(revenue_avg = mean(revenue_in_usd, na.rm = TRUE),
revenue_total = sum(revenue_in_usd, na.rm = TRUE),
count = length(revenue_in_usd),
revenue_sd = sd(revenue_in_usd, na.rm = TRUE))
# Most of income is generated by booking, not only due to volume, but also by average price.
# Contest services are not that frequent, however, their average revenue is the highest
# Booking and Contest Standard Deviation are pretty similar considering the difference in frequency
options(scipen = 1)
projects %>% group_by(month(created)) %>%
summarize(revenue_avg = mean(revenue_in_usd, na.rm = TRUE),
revenue_total = sum(revenue_in_usd, na.rm = TRUE),
count = length(revenue_in_usd),
revenue_sd = sd(revenue_in_usd, na.rm = TRUE))
# Monthly revenue is pretty stable across the analyzed period
# Last analyzed month has an increase in total revenue