The 80/20 rule has proven true for many businesses-only a small percentage of customers produce most of the revenue. As such, marketing teams are challenged to make appropriate investments in promotional strategies.
GStore RStudio, the developer of free and open tools for R and enterprise-ready products for teams to scale and share work, has partnered with Google Cloud and Kaggle to demonstrate the business impact that thorough data analysis can have.
In this competition, you’re challenged to analyze a Google Merchandise Store (also known as GStore, where Google swag is sold) customer dataset to predict revenue per customer. Hopefully, the outcome will be more actionable operational changes and a better use of marketing budgets for those companies who choose to use data analysis on top of GA data.
library(stringr)
library(lubridate)
library(jsonlite)
library(data.table)
library(dplyr)
library(countrycode)
library(caret)
library(randomForest)
setwd("G:/DataScienceProject/GoogleAnalyticsCustomerRevenuePrediction")
Train <- read.csv(file = "train.csv", header = TRUE)
Test <- read.csv(file = "test.csv", header = TRUE)
Test <- Test[!duplicated(Test$fullVisitorId),]
head(Train)
## channelGrouping date
## 1 Organic Search 20160902
## 2 Organic Search 20160902
## 3 Organic Search 20160902
## 4 Organic Search 20160902
## 5 Organic Search 20160902
## 6 Organic Search 20160902
## device
## 1 {"browser": "Chrome", "browserVersion": "not available in demo dataset", "browserSize": "not available in demo dataset", "operatingSystem": "Windows", "operatingSystemVersion": "not available in demo dataset", "isMobile": false, "mobileDeviceBranding": "not available in demo dataset", "mobileDeviceModel": "not available in demo dataset", "mobileInputSelector": "not available in demo dataset", "mobileDeviceInfo": "not available in demo dataset", "mobileDeviceMarketingName": "not available in demo dataset", "flashVersion": "not available in demo dataset", "language": "not available in demo dataset", "screenColors": "not available in demo dataset", "screenResolution": "not available in demo dataset", "deviceCategory": "desktop"}
## 2 {"browser": "Firefox", "browserVersion": "not available in demo dataset", "browserSize": "not available in demo dataset", "operatingSystem": "Macintosh", "operatingSystemVersion": "not available in demo dataset", "isMobile": false, "mobileDeviceBranding": "not available in demo dataset", "mobileDeviceModel": "not available in demo dataset", "mobileInputSelector": "not available in demo dataset", "mobileDeviceInfo": "not available in demo dataset", "mobileDeviceMarketingName": "not available in demo dataset", "flashVersion": "not available in demo dataset", "language": "not available in demo dataset", "screenColors": "not available in demo dataset", "screenResolution": "not available in demo dataset", "deviceCategory": "desktop"}
## 3 {"browser": "Chrome", "browserVersion": "not available in demo dataset", "browserSize": "not available in demo dataset", "operatingSystem": "Windows", "operatingSystemVersion": "not available in demo dataset", "isMobile": false, "mobileDeviceBranding": "not available in demo dataset", "mobileDeviceModel": "not available in demo dataset", "mobileInputSelector": "not available in demo dataset", "mobileDeviceInfo": "not available in demo dataset", "mobileDeviceMarketingName": "not available in demo dataset", "flashVersion": "not available in demo dataset", "language": "not available in demo dataset", "screenColors": "not available in demo dataset", "screenResolution": "not available in demo dataset", "deviceCategory": "desktop"}
## 4 {"browser": "UC Browser", "browserVersion": "not available in demo dataset", "browserSize": "not available in demo dataset", "operatingSystem": "Linux", "operatingSystemVersion": "not available in demo dataset", "isMobile": false, "mobileDeviceBranding": "not available in demo dataset", "mobileDeviceModel": "not available in demo dataset", "mobileInputSelector": "not available in demo dataset", "mobileDeviceInfo": "not available in demo dataset", "mobileDeviceMarketingName": "not available in demo dataset", "flashVersion": "not available in demo dataset", "language": "not available in demo dataset", "screenColors": "not available in demo dataset", "screenResolution": "not available in demo dataset", "deviceCategory": "desktop"}
## 5 {"browser": "Chrome", "browserVersion": "not available in demo dataset", "browserSize": "not available in demo dataset", "operatingSystem": "Android", "operatingSystemVersion": "not available in demo dataset", "isMobile": true, "mobileDeviceBranding": "not available in demo dataset", "mobileDeviceModel": "not available in demo dataset", "mobileInputSelector": "not available in demo dataset", "mobileDeviceInfo": "not available in demo dataset", "mobileDeviceMarketingName": "not available in demo dataset", "flashVersion": "not available in demo dataset", "language": "not available in demo dataset", "screenColors": "not available in demo dataset", "screenResolution": "not available in demo dataset", "deviceCategory": "mobile"}
## 6 {"browser": "Chrome", "browserVersion": "not available in demo dataset", "browserSize": "not available in demo dataset", "operatingSystem": "Windows", "operatingSystemVersion": "not available in demo dataset", "isMobile": false, "mobileDeviceBranding": "not available in demo dataset", "mobileDeviceModel": "not available in demo dataset", "mobileInputSelector": "not available in demo dataset", "mobileDeviceInfo": "not available in demo dataset", "mobileDeviceMarketingName": "not available in demo dataset", "flashVersion": "not available in demo dataset", "language": "not available in demo dataset", "screenColors": "not available in demo dataset", "screenResolution": "not available in demo dataset", "deviceCategory": "desktop"}
## fullVisitorId
## 1 1.131660e+18
## 2 3.773060e+17
## 3 3.895546e+18
## 4 4.763447e+18
## 5 2.729444e+16
## 6 2.938943e+18
## geoNetwork
## 1 {"continent": "Asia", "subContinent": "Western Asia", "country": "Turkey", "region": "Izmir", "metro": "(not set)", "city": "Izmir", "cityId": "not available in demo dataset", "networkDomain": "ttnet.com.tr", "latitude": "not available in demo dataset", "longitude": "not available in demo dataset", "networkLocation": "not available in demo dataset"}
## 2 {"continent": "Oceania", "subContinent": "Australasia", "country": "Australia", "region": "not available in demo dataset", "metro": "not available in demo dataset", "city": "not available in demo dataset", "cityId": "not available in demo dataset", "networkDomain": "dodo.net.au", "latitude": "not available in demo dataset", "longitude": "not available in demo dataset", "networkLocation": "not available in demo dataset"}
## 3 {"continent": "Europe", "subContinent": "Southern Europe", "country": "Spain", "region": "Community of Madrid", "metro": "(not set)", "city": "Madrid", "cityId": "not available in demo dataset", "networkDomain": "unknown.unknown", "latitude": "not available in demo dataset", "longitude": "not available in demo dataset", "networkLocation": "not available in demo dataset"}
## 4 {"continent": "Asia", "subContinent": "Southeast Asia", "country": "Indonesia", "region": "not available in demo dataset", "metro": "not available in demo dataset", "city": "not available in demo dataset", "cityId": "not available in demo dataset", "networkDomain": "unknown.unknown", "latitude": "not available in demo dataset", "longitude": "not available in demo dataset", "networkLocation": "not available in demo dataset"}
## 5 {"continent": "Europe", "subContinent": "Northern Europe", "country": "United Kingdom", "region": "not available in demo dataset", "metro": "not available in demo dataset", "city": "not available in demo dataset", "cityId": "not available in demo dataset", "networkDomain": "unknown.unknown", "latitude": "not available in demo dataset", "longitude": "not available in demo dataset", "networkLocation": "not available in demo dataset"}
## 6 {"continent": "Europe", "subContinent": "Southern Europe", "country": "Italy", "region": "not available in demo dataset", "metro": "not available in demo dataset", "city": "not available in demo dataset", "cityId": "not available in demo dataset", "networkDomain": "fastwebnet.it", "latitude": "not available in demo dataset", "longitude": "not available in demo dataset", "networkLocation": "not available in demo dataset"}
## sessionId socialEngagementType
## 1 1131660440785968503_1472830385 Not Socially Engaged
## 2 377306020877927890_1472880147 Not Socially Engaged
## 3 3895546263509774583_1472865386 Not Socially Engaged
## 4 4763447161404445595_1472881213 Not Socially Engaged
## 5 27294437909732085_1472822600 Not Socially Engaged
## 6 2938943183656635653_1472807194 Not Socially Engaged
## totals
## 1 {"visits": "1", "hits": "1", "pageviews": "1", "bounces": "1", "newVisits": "1"}
## 2 {"visits": "1", "hits": "1", "pageviews": "1", "bounces": "1", "newVisits": "1"}
## 3 {"visits": "1", "hits": "1", "pageviews": "1", "bounces": "1", "newVisits": "1"}
## 4 {"visits": "1", "hits": "1", "pageviews": "1", "bounces": "1", "newVisits": "1"}
## 5 {"visits": "1", "hits": "1", "pageviews": "1", "bounces": "1"}
## 6 {"visits": "1", "hits": "1", "pageviews": "1", "bounces": "1", "newVisits": "1"}
## trafficSource
## 1 {"campaign": "(not set)", "source": "google", "medium": "organic", "keyword": "(not provided)", "adwordsClickInfo": {"criteriaParameters": "not available in demo dataset"}}
## 2 {"campaign": "(not set)", "source": "google", "medium": "organic", "keyword": "(not provided)", "adwordsClickInfo": {"criteriaParameters": "not available in demo dataset"}}
## 3 {"campaign": "(not set)", "source": "google", "medium": "organic", "keyword": "(not provided)", "adwordsClickInfo": {"criteriaParameters": "not available in demo dataset"}}
## 4 {"campaign": "(not set)", "source": "google", "medium": "organic", "keyword": "google + online", "adwordsClickInfo": {"criteriaParameters": "not available in demo dataset"}}
## 5 {"campaign": "(not set)", "source": "google", "medium": "organic", "keyword": "(not provided)", "adwordsClickInfo": {"criteriaParameters": "not available in demo dataset"}, "isTrueDirect": true}
## 6 {"campaign": "(not set)", "source": "google", "medium": "organic", "keyword": "(not provided)", "adwordsClickInfo": {"criteriaParameters": "not available in demo dataset"}}
## visitId visitNumber visitStartTime
## 1 1472830385 1 1472830385
## 2 1472880147 1 1472880147
## 3 1472865386 1 1472865386
## 4 1472881213 1 1472881213
## 5 1472822600 2 1472822600
## 6 1472807194 1 1472807194
Lets convert time format
# convert date column from character to Date class
Train$date <- as.Date(as.character(Train$date), format='%Y%m%d')
Test$date <- as.Date(as.character(Test$date), format='%Y%m%d')
# convert visitStartTime to POSIXct
Train$visitStartTime <- as_datetime(Train$visitStartTime)
Test$visitStartTime <- as_datetime(Test$visitStartTime)
Since JSON is not easy to work with on R, we will convert it.
tr_device <- paste("[", paste(Train$device, collapse = ","), "]") %>% fromJSON(flatten = T)
ts_device <- paste("[", paste(Test$device, collapse = ","), "]") %>% fromJSON(flatten = T)
tr_geoNetwork <- paste("[", paste(Train$geoNetwork, collapse = ","), "]") %>% fromJSON(flatten = T)
ts_geoNetwork <- paste("[", paste(Test$geoNetwork, collapse = ","), "]") %>% fromJSON(flatten = T)
tr_totals <- paste("[", paste(Train$totals, collapse = ","), "]") %>% fromJSON(flatten = T)
ts_totals <- paste("[", paste(Test$totals, collapse = ","), "]") %>% fromJSON(flatten = T)
tr_trafficSource <- paste("[", paste(Train$trafficSource, collapse = ","), "]") %>% fromJSON(flatten = T)
ts_trafficSource <- paste("[", paste(Test$trafficSource, collapse = ","), "]") %>% fromJSON(flatten = T)
Train <- cbind(Train, tr_device, tr_geoNetwork, tr_totals, tr_trafficSource) %>%
as.data.table()
Test <- cbind(Test, ts_device, ts_geoNetwork, ts_totals, ts_trafficSource) %>%
as.data.table()
# drop the old json columns
Train[, c('device', 'geoNetwork', 'totals', 'trafficSource') := NULL]
Test[, c('device', 'geoNetwork', 'totals', 'trafficSource') := NULL]
Calculate log of user transaction according to Kaggle format.
Train <- Train[!is.na(Train$transactionRevenue), ]
Train$transactionRevenue <- as.numeric(Train$transactionRevenue)
Train$ActTransacPredic <- 0
i <- 1
for (i in 1:length(Train$fullVisitorId)) {
Train$ActTransacPredic[i] <- round(log((Train$transactionRevenue[i] / 1000000) + 1), 3)
}
Train[, c('transactionRevenue') := NULL]
#Creating the 20/80 rule
Train$Rule <- 0
i <- 1
for (i in 1:length(Train$ActTransacPredic)) {
myRule <- (Train$ActTransacPredic[i] - mean(Train$ActTransacPredic)) / sd(Train$ActTransacPredic)
if(myRule >= 0.291){
Train$Rule[i] <- 1
}
i <- i + 1
}
#Test$PredictedLogRevenue <- 0
Test$ActTransacPredic <- 0 #for col alignment
#Test$Rule <- 0
write.csv(Train, file = "train_1.csv", row.names = FALSE)
write.csv(Test, file = "test_1.csv", row.names = FALSE)
Remove all single value columns & NAs
# values to convert to NA
na_vals <- c('unknown.unknown', '(not set)', 'not available in demo dataset',
'(not provided)', '(none)', '<NA>')
for(col in names(Train)) {
set(Train, i=which(Train[[col]] %in% na_vals), j=col, value=NA)
}
#Train <-Train[!complete.cases(Train),]
nonUniqueCol <- c()
i <- 1
for (i in 1:length(names(Train))){
if(length(unique(Train[[names(Train)[i]]][!is.na(Train[[names(Train)[i]]])])) < 2 | sum(is.na(Train[[names(Train)[i]]])) / length(Train[[names(Train)[i]]]) > 0.6){
nonUniqueCol <- append(nonUniqueCol, names(Train)[i])
}
i <- i + 1
}
Train <- subset(Train,select = !names(Train) %in% nonUniqueCol)
write.csv(Train, file = "train_2.csv", row.names = FALSE)
Since, I already checked the coulumns importance by first ML running, We can remove those non relevant columns. Let’s start with Train side.
#Train <- na.omit(Train)
Train$channelGrouping <- NULL
Train$sessionId <- NULL
Train$visitId <- NULL
Train$date <- NULL
Train$visitStartTime <- NULL
Train$date <- NULL
Train$visitStartTime <- NULL
Train$sessionId <- NULL
Train$visitId <- NULL
Train$channelGrouping <- NULL
Train$browser <- NULL
operatingSystem <- NULL
Train$isMobile <- NULL
Train$deviceCategory <- NULL
Train$continent <- NULL
Train$subContinent <- NULL
Train$country <- NULL
Train$networkDomain <- NULL
Train$keyword <- NULL
Train$campaign <- NULL
Train$region <- NULL
Train$metro <- NULL
Train$city <- NULL
Train$referralPath <- NULL
Train$source <- NULL
Train$medium <- NULL
Train$ActTransacPredic <- NULL
#Handling daate: as weekday & day in month
Train$weekday <- 0
Train$DayofMonth <- 0
Train$Month <- 0
Train$hour <- 0
i <- 1
Sys.setlocale("LC_TIME", "C") #for not write in native language
for (i in 1:length(Train$weekday)){
Train$weekday[i] <- as.numeric(format(Train$date[i],format="%u"))
Train$DayofMonth[i] <- as.numeric(format(Train$date[i],format="%d"))
Train$Month[i] <- as.numeric(format(Train$date[i],format="%m"))
Train$hour[i] <- hour(Train$visitStartTime[i])
i <- i + 1
}
Train$hits <- as.numeric(Train$hits)
Train$pageviews <- as.numeric(Train$pageviews)
write.csv(Train, file = "train_3.csv", row.names = FALSE)
Remove non relevant columns on Test side.
#Test <- na.omit(Test)
Test$date <- NULL
Test$visitStartTime <- NULL
Test$sessionId <- NULL
Test$visitId <- NULL
Test$channelGrouping <- NULL
Test$browser <- NULL
operatingSystem <- NULL
Test$isMobile <- NULL
Test$deviceCategory <- NULL
Test$continent <- NULL
Test$subContinent <- NULL
Test$country <- NULL
Test$networkDomain <- NULL
Test$keyword <- NULL
Test$campaign <- NULL
Test$ActTransacPredic <- NULL
Test$hits <- as.numeric(Test$hits)
Test$pageviews <- as.numeric(Test$pageviews)
write.csv(Test, file = "test_1.csv", row.names = FALSE)
#Handling daate: as weekday & day in month
Test$weekday <- 0
Test$DayofMonth <- 0
Test$Month <- 0
Test$hour <- 0
i <- 1
Sys.setlocale("LC_TIME", "C") #for not write in native language
for (i in 1:length(Test$weekday)){
Test$weekday[i] <- as.numeric(format(Test$date[i],format="%u"))
Test$DayofMonth[i] <- as.numeric(format(Test$date[i],format="%d"))
Test$Month[i] <- as.numeric(format(Test$date[i],format="%m"))
Test$hour[i] <- hour(Test$visitStartTime[i])
#print(i)
i <- i + 1
}
write.csv(Test, file = "test_1.csv", row.names = FALSE)
prepare our cross validation for ML part
#Train <- na.exclude(Train)
Training_cv <- createDataPartition(Train$Rule, p=0.8, list=FALSE)
Train_cv <- Train[Training_cv,]
Test_cv <- Train[-Training_cv,]
Prediction Test_cv as cross validation with random forest - ML
model1 <- randomForest(Rule ~.-fullVisitorId, data = Train_cv, importance = TRUE)
model1
predicTrain <- predict(model1, Test_cv, type = "response")
We need to check the columns impact on estimated year transaction prediction.
importance(model1)
After selecting the relevant columns from above. I re-test and remove some columns.
predicTest <- predict(model1, Test, type = "response")
i <- 1
for (i in 1:length(predicTest)) {
Test$PredictedLogRevenue[i] <- round(predicTest[i], 2)
i <- i + 1
print(i)
}
i <- 1
for (i in 1:length(predicTest)) {
if(is.na(Test$PredictedLogRevenue[i]) == TRUE){
Test$PredictedLogRevenue[i] <- 0
}
i <- i + 1
print(i)
}
Test$visitNumberoperatingSystem <- NULL
Test$hits <- NULL
Test$pageviews <- NULL
Test$weekday <- NULL
Test$DayofMonth <- NULL
Test$Month <- NULL
Test$hour <- NULL
Test$Rule <- NULL
Test$visitNumber <- NULL
Test$operatingSystem <- NULL
write.csv(Test, file = "predicTest.csv", row.names = FALSE)