In this document, we first obtain data in December 2017, from SQL database, and then use five different methods: Linear, Regression, Random Forest, Gradient Boosting Machine(GBM), and Deep Learning model to predict total number of people who watch the media, based on device information and weekday.
In the following, we will illustrate all processes step by step, including obtaining data, cleaning data, building models, and making predictions. In the end, we will also list the comparisons of the five methods.
We will use some packages to help us complete this task and have to download and install them first.
getwd()
## [1] "C:/Users/cc.li/Documents/R"
setwd("C:/Users/cc.li/Documents/R/boards")
list.of.packages <- c("dplyr", "ggplot2", "magrittr", "RODBC", "dbConnect", "DBI", "gWidgets", "RMySQL", "xlsx", "rJava", "RJDBC", "h2o")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
library(RODBC)
library(dbConnect)
## Loading required package: RMySQL
## Loading required package: DBI
## Loading required package: gWidgets
library(DBI)
library(gWidgets)
library(RMySQL)
library(xlsx)
## Loading required package: rJava
## Loading required package: xlsxjars
library(rJava)
library(RJDBC)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gWidgets':
##
## id
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:gWidgets':
##
## add
Now we connect to database to obtain data for further analysis. Note that we only obtain data from qualified devices and the number of people are “extended” to person times per 3,600 seconds.
conn <- odbcDriverConnect(connection="driver=SQL Server;server=coretronic.database.windows.net;database=coredw;uid=CoreAnalysis;pwd=$Core$Analysis123$")
query <-"select
Id,
EnterDate,--Hour,
SumPlayTimeSec, NumberOfFemale, NumberOfMale, NumberOfNeutral,
[1],[2],[3],[4],[5],[6],[7],[8],[9],[10],
[11],[12],[13],[14],[15],[16],[17],[18],[19],[20],
[21],[22],[23],[24],[25],[26],[27],[28],[29],[30],
[31],[32],[33],[34],[35],[36],[37],[38],[39]
from (
select mm.BoardTagId, db.Id,
cc.EnterDate,
--cc.Hour,
sum(3600) SumPlayTimeSec,
sum(cc.CountF) NumberOfFemale,
sum(cc.CountM) NumberOfMale,
sum(cc.CountN) NumberOfNeutral
from dw_dt_board db
join dw_mt_board_tag mm on (db.Id=mm.BoardId)
join dm_ft_cameraevents_pivot_hourly cc on (cc.DeviceUid=db.LastDeviceUid)
where db.qualified=1
and cc.EnterDate between 20171201 and 20171231
group by mm.BoardTagId, db.Id, cc.EnterDate
) as SourceTable
pivot (
count(BoardTagId)
for BoardTagId in ([1],[2],[3],[4],[5],[6],[7],[8],[9],[10],
[11],[12],[13],[14],[15],[16],[17],[18],[19],[20],
[21],[22],[23],[24],[25],[26],[27],[28],[29],[30],
[31],[32],[33],[34],[35],[36],[37],[38],[39])
) as PivotTable
"
data <- sqlQuery(conn, query)
close(conn)
We can take a look at raw data before further processes.
head(data)
## Id EnterDate SumPlayTimeSec NumberOfFemale NumberOfMale NumberOfNeutral
## 1 1 20171201 36000 18 6 0
## 2 1 20171202 39600 104 25 0
## 3 1 20171203 36000 82 28 2
## 4 1 20171204 14400 8 1 0
## 5 1 20171206 28800 16 4 0
## 6 1 20171207 28800 11 4 1
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
## 1 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0
## 2 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0
## 3 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0
## 4 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0
## 5 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0
## 6 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0
## 28 29 30 31 32 33 34 35 36 37 38 39
## 1 1 0 0 0 0 0 0 1 0 0 0 0
## 2 1 0 0 0 0 0 0 1 0 0 0 0
## 3 1 0 0 0 0 0 0 1 0 0 0 0
## 4 1 0 0 0 0 0 0 1 0 0 0 0
## 5 1 0 0 0 0 0 0 1 0 0 0 0
## 6 1 0 0 0 0 0 0 1 0 0 0 0
We have to clean the data to the suitable form. First, we compute the total number of people per minute by summation all genders and then group by EnterDate.
newdata <- data %>%
mutate(TotalPeople =60*(NumberOfMale + NumberOfFemale + NumberOfNeutral)/SumPlayTimeSec) %>%
group_by(EnterDate)
Moreover, we get weekday from Enterdate.
newdata$wday <- as.POSIXlt(as.Date(as.character(newdata$EnterDate), "%Y%m%d"))$wday
newdata <- as.data.frame(newdata)
For analysis, we transfer weekday into binary tags.
weekdaytag <- data.frame(matrix(ncol = 7, nrow = nrow(newdata)))
names(weekdaytag) <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
index <-1
for (index in 1:nrow(newdata)){
if (newdata$wday[index]==1){
weekdaytag$"Mon"[index] = 1
}else if (newdata$wday[index]==2){
weekdaytag$"Tue"[index] = 1
}else if (newdata$wday[index]==3){
weekdaytag$"Wed"[index] = 1
}else if (newdata$wday[index]==4){
weekdaytag$"Thu"[index] = 1
}else if (newdata$wday[index]==5){
weekdaytag$"Fri"[index] = 1
}else if (newdata$wday[index]==6){
weekdaytag$"Sat"[index] = 1
}else if (newdata$wday[index]==0){
weekdaytag$"Sun"[index] = 1
}
index <- index +1
}
result <- cbind(newdata, weekdaytag)
We change “NA” into “0”, remove some unused columns, remove some “untaged” columns, and re-order columns to make “TotalPeople” in the last one.
result[is.na(result)] <- 0
result$Id <- NULL
result$EnterDate <- NULL
result$wday <- NULL
result$SumPlayTimeSec <- NULL
result$NumberOfMale <- NULL
result$NumberOfFemale <- NULL
result$NumberOfNeutral <- NULL
result <- result[, colSums(result != 0) > 0]
result <- result[,c(1:(which(colnames(result)=="TotalPeople")-1),(which(colnames(result)=="TotalPeople")+1):ncol(result),which(colnames(result)=="TotalPeople"))]
Finally, we have to remove outliers that are two standard deviations from the mean.
findOutlier <- function(data, cutoff = 2) {
## Calculate the sd
sds <- apply(data, 2, sd, na.rm = TRUE)
## Identify the cells with value greater than cutoff * sd (column wise)
result <- mapply(function(d, s) {
which(d > cutoff * s)
}, data, sds)
result
}
removeOutlier <- function(data, outliers) {
result <- data[-outliers, ]
return(as.data.frame(result))
}
outliers <- findOutlier(result)
result<- removeOutlier(result, outliers$TotalPeople)
Finally, we also take a look at the data that are ready to be modeled:
head(result)
## 1 2 4 5 7 8 10 12 13 15 17 18 21 22 23 26 27 28 30 33 35 36 38 Mon Tue
## 1 1 0 1 0 0 1 0 0 1 0 0 0 0 1 1 1 0 1 0 0 1 0 0 0 0
## 2 1 0 1 0 0 1 0 0 1 0 0 0 0 1 1 1 0 1 0 0 1 0 0 0 0
## 3 1 0 1 0 0 1 0 0 1 0 0 0 0 1 1 1 0 1 0 0 1 0 0 0 0
## 4 1 0 1 0 0 1 0 0 1 0 0 0 0 1 1 1 0 1 0 0 1 0 0 1 0
## 5 1 0 1 0 0 1 0 0 1 0 0 0 0 1 1 1 0 1 0 0 1 0 0 0 0
## 6 1 0 1 0 0 1 0 0 1 0 0 0 0 1 1 1 0 1 0 0 1 0 0 0 0
## Wed Thu Fri Sat Sun TotalPeople
## 1 0 0 1 0 0 0.04000000
## 2 0 0 0 1 0 0.19545455
## 3 0 0 0 0 1 0.18666667
## 4 0 0 0 0 0 0.03750000
## 5 1 0 0 0 0 0.04166667
## 6 0 1 0 0 0 0.03333333
smp_size <- floor(0.75 * nrow(result))
set.seed(123)
train_ind <- sample(seq_len(nrow(result)), size = smp_size)
cleanTrain <- result[train_ind, ]
cleanTest <- result[-train_ind, ]
Original <- cleanTest$TotalPeople
We first build a linear model based on train data.
lm_model<-lm(TotalPeople~., data=cleanTrain)
summary(lm_model)
##
## Call:
## lm(formula = TotalPeople ~ ., data = cleanTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.47219 -0.09973 -0.02978 0.06732 0.74707
##
## Coefficients: (16 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.26586 0.18497 1.437 0.152249
## `1` 0.30650 0.18042 1.699 0.090970 .
## `2` NA NA NA NA
## `4` NA NA NA NA
## `5` NA NA NA NA
## `7` -0.11590 0.04027 -2.878 0.004451 **
## `8` NA NA NA NA
## `10` NA NA NA NA
## `12` 0.02899 0.05778 0.502 0.616469
## `13` -0.24919 0.05427 -4.592 7.91e-06 ***
## `15` -0.02803 0.05556 -0.505 0.614430
## `17` -0.25900 0.06588 -3.932 0.000118 ***
## `18` NA NA NA NA
## `21` 1.00606 0.18739 5.369 2.26e-07 ***
## `22` NA NA NA NA
## `23` -0.01188 0.03951 -0.301 0.764027
## `26` NA NA NA NA
## `27` NA NA NA NA
## `28` NA NA NA NA
## `30` NA NA NA NA
## `33` NA NA NA NA
## `35` NA NA NA NA
## `36` NA NA NA NA
## `38` NA NA NA NA
## Mon -0.20530 0.04825 -4.255 3.26e-05 ***
## Tue -0.23365 0.04813 -4.854 2.49e-06 ***
## Wed -0.20678 0.04438 -4.659 5.91e-06 ***
## Thu -0.21285 0.04506 -4.723 4.45e-06 ***
## Fri -0.13337 0.04236 -3.149 0.001900 **
## Sat -0.09032 0.04610 -1.959 0.051544 .
## Sun NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1768 on 193 degrees of freedom
## Multiple R-squared: 0.5438, Adjusted R-squared: 0.5107
## F-statistic: 16.43 on 14 and 193 DF, p-value: < 2.2e-16
Use the model to predict values:
predict.lm<-predict(lm_model, newdata = cleanTest, interval = "prediction", level =0.95)
## Warning in predict.lm(lm_model, newdata = cleanTest, interval =
## "prediction", : prediction from a rank-deficient fit may be misleading
predict.lm <- as.data.frame(predict.lm)
One can see that some variables’ Pr are greater than 0.05, it means these variables do not have strong connection to result (total people). In this model, R-squared are slightly larger than 0.5, it means its fitness is satisfied but not good.
In the following, we will use h2o package to build models.
library(h2o)
## Warning: package 'h2o' was built under R version 3.4.3
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
localH2O <- h2o.init(nthreads = -1)
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 4 hours 47 minutes
## H2O cluster version: 3.16.0.2
## H2O cluster version age: 1 month and 17 days
## H2O cluster name: H2O_started_from_R_cc.li_eem226
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.53 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Algos, AutoML, Core V3, Core V4
## R Version: R version 3.4.2 (2017-09-28)
train.h2o <- as.h2o(cleanTrain)
##
|
| | 0%
|
|=================================================================| 100%
test.h2o <- as.h2o(cleanTest)
##
|
| | 0%
|
|=================================================================| 100%
colnames(train.h2o)
## [1] "1" "2" "4" "5" "7"
## [6] "8" "10" "12" "13" "15"
## [11] "17" "18" "21" "22" "23"
## [16] "26" "27" "28" "30" "33"
## [21] "35" "36" "38" "Mon" "Tue"
## [26] "Wed" "Thu" "Fri" "Sat" "Sun"
## [31] "TotalPeople"
y.dep <- length(colnames(train.h2o))
x.indep <- c(1:length(colnames(train.h2o))-1)
For regression model:
regression.model <- h2o.glm( y = y.dep, x = x.indep, training_frame = train.h2o, family = "gaussian")
## Warning in .h2o.startModelJob(algo, params, h2oRestApiVersion): Dropping bad and constant columns: [28].
##
|
| | 0%
|
|=================================================================| 100%
h2o.performance(regression.model)
## H2ORegressionMetrics: glm
## ** Reported on training data. **
##
## MSE: 0.032211
## RMSE: 0.1794742
## MAE: 0.1338425
## RMSLE: 0.1315522
## Mean Residual Deviance : 0.032211
## R^2 : 0.4934777
## Null Deviance :13.22723
## Null D.o.F. :207
## Residual Deviance :6.699889
## Residual D.o.F. :197
## AIC :-100.2946
#check variable importance
h2o.varimp(regression.model)
## Standardized Coefficient Magnitudes: standardized coefficient magnitudes
## names coefficients sign
## 1 30 0.068754 POS
## 2 21 0.068754 POS
## 3 Sun 0.045108 POS
## 4 13 0.036545 NEG
## 5 12 0.030924 POS
##
## ---
## names coefficients sign
## 24 33 0.000000 POS
## 25 35 0.000000 POS
## 26 36 0.000000 POS
## 27 Mon 0.000000 POS
## 28 Wed 0.000000 POS
## 29 Thu 0.000000 POS
#make predictions
predict.reg <- as.data.frame(h2o.predict(regression.model, test.h2o))
##
|
| | 0%
|
|=================================================================| 100%
Since the package and data are done, we can build the model and predict the result directly.
# Random Forest Model------------------------------------
rforest.model <- h2o.randomForest(y=y.dep, x=x.indep, training_frame = train.h2o, ntrees = 1000, mtries = 3, max_depth = 4, seed = 1122)
## Warning in .h2o.startModelJob(algo, params, h2oRestApiVersion): Dropping bad and constant columns: [28].
##
|
| | 0%
|
|=== | 5%
|
|========================== | 40%
|
|====================================== | 59%
|
|================================================ | 74%
|
|=================================================================| 100%
h2o.performance(rforest.model)
## H2ORegressionMetrics: drf
## ** Reported on training data. **
## ** Metrics reported on Out-Of-Bag training samples **
##
## MSE: 0.03957866
## RMSE: 0.1989439
## MAE: 0.1475737
## RMSLE: 0.1461759
## Mean Residual Deviance : 0.03957866
#check variable importance
h2o.varimp(rforest.model)
## Variable Importances:
## variable relative_importance scaled_importance percentage
## 1 30 594.560974 1.000000 0.138087
## 2 21 552.664368 0.929534 0.128356
## 3 5 475.573181 0.799873 0.110452
## 4 4 463.402985 0.779404 0.107625
## 5 1 435.730621 0.732861 0.101198
##
## ---
## variable relative_importance scaled_importance percentage
## 24 18 20.475264 0.034438 0.004755
## 25 Sat 19.605469 0.032975 0.004553
## 26 36 18.813929 0.031643 0.004370
## 27 Thu 17.284122 0.029070 0.004014
## 28 26 11.570848 0.019461 0.002687
## 29 23 9.034933 0.015196 0.002098
#make predictions
predict.rforest <- as.data.frame(h2o.predict(rforest.model, test.h2o))
##
|
| | 0%
|
|=================================================================| 100%
Based on the similar way, we build GBM and deep learning models.
#GBM Model------------------------------------
gbm.model <- h2o.gbm(y=y.dep, x=x.indep, training_frame = train.h2o, ntrees = 1000, max_depth = 4, learn_rate = 0.01,
seed = 1122)
## Warning in .h2o.startModelJob(algo, params, h2oRestApiVersion): Dropping bad and constant columns: [28].
##
|
| | 0%
|
|=== | 5%
|
|======================== | 38%
|
|===================================== | 57%
|
|=============================================== | 72%
|
|=============================================================== | 98%
|
|=================================================================| 100%
h2o.performance (gbm.model)
## H2ORegressionMetrics: gbm
## ** Reported on training data. **
##
## MSE: 0.02550984
## RMSE: 0.159718
## MAE: 0.1119923
## RMSLE: 0.1175391
## Mean Residual Deviance : 0.02550984
#check variable importance
h2o.varimp(gbm.model)
## Variable Importances:
## variable relative_importance scaled_importance percentage
## 1 21 223.986801 1.000000 0.562711
## 2 Sun 42.934624 0.191684 0.107862
## 3 13 25.745617 0.114943 0.064679
## 4 17 18.778776 0.083839 0.047177
## 5 12 18.291016 0.081661 0.045952
##
## ---
## variable relative_importance scaled_importance percentage
## 24 10 0.000000 0.000000 0.000000
## 25 18 0.000000 0.000000 0.000000
## 26 22 0.000000 0.000000 0.000000
## 27 26 0.000000 0.000000 0.000000
## 28 27 0.000000 0.000000 0.000000
## 29 30 0.000000 0.000000 0.000000
#make predictions
predict.gbm <- as.data.frame(h2o.predict(gbm.model, test.h2o))
##
|
| | 0%
|
|=================================================================| 100%
#deep learning model------------------------------------
dlearning.model <- h2o.deeplearning(y = y.dep,
x = x.indep,
training_frame = train.h2o,
epoch = 1000,
#hidden = c(200, 100, 100, 50, 20),
activation = "RectifierWithDropout",
seed = 1122
)
## Warning in .h2o.startModelJob(algo, params, h2oRestApiVersion): Dropping bad and constant columns: [28].
##
|
| | 0%
|
|=== | 4%
|
|======== | 13%
|
|=============== | 23%
|
|==================== | 31%
|
|=========================== | 41%
|
|================================== | 52%
|
|=========================================== | 66%
|
|================================================= | 76%
|
|========================================================= | 88%
|
|================================================================ | 99%
|
|=================================================================| 100%
h2o.performance(dlearning.model)
## H2ORegressionMetrics: deeplearning
## ** Reported on training data. **
## ** Metrics reported on full training frame **
##
## MSE: 0.02290942
## RMSE: 0.1513586
## MAE: 0.1053971
## RMSLE: 0.1134806
## Mean Residual Deviance : 0.02290942
#make predictions
predict.dl2 <- as.data.frame(h2o.predict(dlearning.model, test.h2o))
##
|
| | 0%
|
|=================================================================| 100%
We show the test data and predictions of all five methods.
predictions <- data.frame("Original" = Original,
"Linear"= predict.lm$fit,
"Regression" = predict.reg$predict,
"Random Forest" = predict.rforest$predict,
"GBM" = predict.gbm$predict,
"Deep Learning" = predict.dl2$predict
)
head(predictions)
## Original Linear Regression Random.Forest GBM Deep.Learning
## 1 0.18666667 0.31129217 0.2741256 0.2388036 0.39091392 0.42299507
## 2 0.03750000 0.10599167 0.1538414 0.1620972 0.06714267 0.09114255
## 3 0.04166667 0.10451489 0.1538414 0.1557462 0.06054010 0.07222114
## 4 0.03333333 0.09843866 0.1538414 0.1557802 0.08198531 0.07674303
## 5 0.54242424 0.31129217 0.2741256 0.2388036 0.39091392 0.42299507
## 6 0.03958333 0.07764175 0.1376659 0.1491999 0.05004449 0.08360956
Their RMSEs are:
rmse <- function(error)
{
sqrt(mean(error^2))
}
RMSE_LIN <- rmse(predictions$Original - predictions$Linear)
RMSE_REG <- rmse(predictions$Original - predictions$Regression)
RMSE_RFE <- rmse(predictions$Original - predictions$Random.Forest)
RMSE_GBM <- rmse(predictions$Original - predictions$GBM)
RMSE_DPL <- rmse(predictions$Original - predictions$Deep.Learning)
RMSE <- data.frame("Linear" = RMSE_LIN,
"Regression" = RMSE_REG,
"Random Forest" = RMSE_RFE,
"GBM" = RMSE_GBM,
"Deep Learning"= RMSE_DPL
)
RMSE
## Linear Regression Random.Forest GBM Deep.Learning
## 1 0.1705908 0.1832619 0.1948878 0.167609 0.1692455
Their R-squared are:
rsq <- function (x, y) {
1 - (sum((x-y)^2)/sum((x-mean(x))^2))
}
RSQ_LIN <- rsq(predictions$Original, predictions$Linear)
RSQ_REG <- rsq(predictions$Original, predictions$Regression)
RSQ_RFE <- rsq(predictions$Original, predictions$Random.Forest)
RSQ_GBM <- rsq(predictions$Original, predictions$GBM)
RSQ_DPL <- rsq(predictions$Original, predictions$Deep.Learning)
RSQ <- data.frame("Linear" = RSQ_LIN,
"Regression" = RSQ_REG,
"Random Forest" = RSQ_RFE,
"GBM" = RSQ_GBM,
"Deep Learning"= RSQ_DPL
)
RSQ
## Linear Regression Random.Forest GBM Deep.Learning
## 1 0.5994514 0.537738 0.477227 0.6133315 0.605744
We also visualize the predictions by plot their trends:
order_pred <- predictions[order(Original),]
order_pred$ID <- c(1:nrow(predictions))
library("reshape2")
library("ggplot2")
data_long <- melt(order_pred, id="ID")
plot <- ggplot(data_long,
aes(x=ID, y=value, colour=variable)) +
geom_line()+
ggtitle("Use Five Methods to Predict Total People")
plot
In the plot, we sort the original (test) data in ascending order and GBM and deeping learing methods’ fitness are better than others.
In sum, based on RMSE, R-squared and the plot, GBM and deeping learning are worthy to further study. We may focus on hyperparameter optimization to obtain a better result.