Capstone Modeling
library(tidyverse)
library(caret)
library(DMwR2)
library(e1071)
library(ipred)
library(xgboost)
library(kableExtra)
library(ggthemes)
library(egg)
library(forcats)
library(summarytools)
library(forecast)
library(vtreat)
library(doParallel)
Loading Data
# Read the modeling data set from the working directory.
df <- read.csv('modeling_data.csv')
# Logical mask, one entry per column: TRUE where the column was read as text.
char_columns <- vapply(df, is.character, logical(1))
# Text columns are categorical predictors, so store them as factors.
df[char_columns] <- lapply(df[char_columns], as.factor)
# Quick structural overview: dimensions, column types, first few values.
glimpse(df)
## Rows: 1,722
## Columns: 19
## $ rated <fct> PG-13, PG-13, PG-13, PG-13, PG-13, PG-13, PG, PG-13, …
## $ runtime <int> 143, 164, 142, 143, 115, 136, 93, 169, 106, 93, 86, 1…
## $ genre <fct> Action, Action, Action, Action, Adventure, Action, An…
## $ language <fct> English, English, English, English, English, English,…
## $ country <fct> United States, United States, United States, United S…
## $ boxoffice <int> 623357910, 448149584, 408010692, 304360277, 292324737…
## $ budget <int> 220000000, 250000000, 78000000, 200000000, 120000000,…
## $ num_directors <int> 1, 1, 1, 1, 1, 1, 3, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1, 2,…
## $ num_actors <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
## $ actor_films5 <int> 17, 17, 7, 12, 17, 8, 4, 0, 12, 10, 3, 11, 7, 5, 12, …
## $ actor_films10 <int> 34, 24, 10, 17, 20, 9, 10, 5, 17, 25, 4, 22, 9, 12, 1…
## $ actor_rev5 <dbl> 1799862677, 1523557430, 438810084, 790069947, 3307645…
## $ actor_rev10 <dbl> 2411207730, 2081390095, 603184119, 1071362460, 337077…
## $ director_films5 <int> 0, 2, 0, 2, 1, 1, 0, 1, 0, 4, 1, 0, 0, 2, 0, 0, 2, 2,…
## $ director_films10 <int> 1, 4, 1, 3, 3, 1, 0, 4, 0, 7, 1, 2, 0, 2, 0, 0, 2, 2,…
## $ director_rev5 <dbl> 0, 827574406, 0, 32363426, 281287133, 32391374, 0, 44…
## $ director_rev10 <dbl> 25514517, 1087527776, 120277854, 95021646, 394908068,…
## $ month <fct> May, July, March, November, November, July, June, Dec…
## $ bud_avail <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes…
We create copies of the dataset to use for each modeling approach,
Helper Functions
We use the following helper functions to streamline our processes:
Calculating Metrics
Column IQR Factorizer
Factorizes column by IQR with separate level for NA
#' Bin a numeric column into quartile groups, with a dedicated level for NAs.
#'
#' @param column A numeric vector; may contain `NA`.
#' @return A factor the same length as `column` with levels "1", "2", "3",
#'   "4" (quartile bins, lowest to highest) and "NA" (missing input).
create_factored_column <- function(column) {
  # Quartile cut points. Heavily tied data (e.g. many zeros) can yield repeated
  # quantiles, and cut() rejects duplicated breaks, so collapse them first; the
  # tied quartiles then share a bin and some of the levels "1"-"4" stay empty.
  breaks <- unique(quantile(column, seq(0, 1, 0.25), na.rm = TRUE))
  if (length(breaks) < 2) {
    # Constant (or all-NA) column: there is nothing to split on.
    codes <- rep(1L, length(column))
  } else {
    codes <- cut(column, breaks = breaks, labels = FALSE, include.lowest = TRUE)
  }
  codes[is.na(column)] <- 5L # Assigning 5 for NA
  factor(codes, levels = 1:5, labels = c("1", "2", "3", "4", "NA"))
}
Box-Cox Transformer
Box-Cox transforms the response variable.
#' Box-Cox transform the response column of a data frame.
#'
#' @param df A data frame containing the response column.
#' @param response Name of the column to transform (default "boxoffice").
#' @return A copy of `df` in which `response` holds the transformed values;
#'   all other columns are returned unchanged.
box.response <- function(df, response = "boxoffice") {
  # Work on a copy so the caller's data frame is left untouched
  t.df <- df
  # Keep the response as a one-column data frame, which is the shape that
  # preProcess() and its predict() method are given here
  response.col <- df[, response, drop = FALSE]
  # Estimate the Box-Cox parameters from the response column only
  # NOTE(review): caret documents Box-Cox as requiring strictly positive
  # values -- confirm the response has no zeros.
  preprocess.params <- preProcess(response.col, method = "BoxCox")
  # Apply the fitted transformation and store it back under the same name
  t.df[response] <- predict(preprocess.params, newdata = response.col)
  t.df
}
Generating Dummies
Creates dummy variables for categorical predictors.
#' Replace factor columns with 0/1 indicator columns.
#'
#' Indicator columns are built with `model.matrix(~ . - 1)`. With R's default
#' treatment contrasts this gives one indicator per level for the first
#' factor and one per non-reference level for every later factor.
#'
#' @param df A data frame.
#' @return A data frame holding the non-factor columns of `df` followed by the
#'   indicator columns. `df` is returned unchanged when it has no factors.
generate.dummies <- function(df) {
  # locate the categorical variables
  is.fct <- vapply(df, is.factor, logical(1))
  cols <- names(df)[is.fct]
  if (length(cols) == 0) {
    return(df)
  }
  # drop = FALSE keeps a data frame even for a single factor column, so the
  # indicator names are built from the real column name
  dummies <- model.matrix(~ . - 1, data = df[, cols, drop = FALSE])
  # model.matrix() omits rows with NA by default; fail with a clear message
  # rather than an obscure row-count error from cbind()
  if (nrow(dummies) != nrow(df)) {
    stop("factor columns contain NA values; cannot build dummy variables",
         call. = FALSE)
  }
  # combine the untouched non-factor columns with the indicators
  cbind(df[, !is.fct, drop = FALSE], dummies)
}
Data Splitting
#' Shuffle a data frame and split it 80/20 into training and test sets.
#'
#' The seed is fixed inside the function, so repeated calls on the same data
#' return the same split.
#'
#' @param df A data frame with a `boxoffice` column.
#' @return A list with elements `train.set` and `test.set`.
data.splitter <- function(df) {
  set.seed(1)
  # Shuffle the dataset
  shuffled <- df[sample(nrow(df)), ]
  # Partition on the SHUFFLED response: the returned indices are applied to
  # `shuffled` below, so they must be computed from its row order. Computing
  # them from the unshuffled `df` made the outcome-based partition meaningless.
  # NOTE(review): this selects different rows than before, so previously
  # reported metrics will shift slightly on re-run.
  split.index <- createDataPartition(shuffled$boxoffice, p = 0.8, list = FALSE)
  # Split the data
  train.set <- shuffled[split.index, ]
  test.set <- shuffled[-split.index, ]
  # Report the size of each partition
  cat(sprintf("Training set contains %d rows.", nrow(train.set)),
      sprintf("\nTesting set contains %d rows.", nrow(test.set)))
  list(train.set = train.set, test.set = test.set)
}
Bagged Modeling
#' Train a bagged-trees model ("treebag") and score it on the held-out split.
#'
#' @param df A data frame with a `boxoffice` response column.
#' @param ctrl A caret `trainControl` object. Defaults to the script-level
#'   `bag.ctrl`, which is what this function always used.
#' @return A list with `model` (the caret fit) and `results` (the output of
#'   `metrics()` on the test set, with the training time in seconds added
#'   under "TIME").
bag.deploy <- function(df, ctrl = bag.ctrl) {
  set.seed(1)
  # split data
  split.data <- data.splitter(df)
  train.set <- split.data$train.set
  test.set <- split.data$test.set
  # enable clusters for parallel processing
  cl <- makeCluster(4)
  # Release the workers even when train()/predict() fails part-way through
  on.exit(stopCluster(cl), add = TRUE)
  registerDoParallel(cl)
  start <- Sys.time()
  model <- train(
    boxoffice ~ .,
    data = train.set,
    method = "treebag",
    trControl = ctrl,
    tuneLength = 10
  )
  end <- Sys.time()
  elapsed.time <- as.numeric(difftime(end, start, units = "secs"))
  preds <- predict(model, test.set)
  # metrics() is a helper defined elsewhere in the report
  results <- metrics(preds, test.set$boxoffice)
  results['TIME'] <- elapsed.time
  list(model = model, results = results)
}
Random Forest Modeling
#' Train a random forest ("rf") and score it on the held-out split.
#'
#' @param df A data frame with a `boxoffice` response column.
#' @param ctrl A caret `trainControl` object. Defaults to the script-level
#'   `rf.ctrl`, which is what this function always used.
#' @return A list with `model` (the caret fit) and `results` (the output of
#'   `metrics()` on the test set, with the training time in seconds added
#'   under "TIME").
rf.deploy <- function(df, ctrl = rf.ctrl) {
  set.seed(1)
  # split data
  split.data <- data.splitter(df)
  train.set <- split.data$train.set
  test.set <- split.data$test.set
  # enable clusters for parallel processing
  cl <- makeCluster(4)
  # Release the workers even when train()/predict() fails part-way through
  on.exit(stopCluster(cl), add = TRUE)
  registerDoParallel(cl)
  start <- Sys.time()
  model <- train(
    boxoffice ~ .,
    data = train.set,
    method = "rf",
    # randomForest's argument is `ntree`; the former `ntrees = 1000` matched
    # nothing, so the default number of trees was grown instead of 1000.
    # NOTE(review): honoring 1000 trees will raise run time and shift metrics.
    ntree = 1000,
    trControl = ctrl
    # `parallel = "multicore"` was dropped: it is not a train()/randomForest
    # argument; parallelism comes from the registered cluster above.
  )
  end <- Sys.time()
  elapsed.time <- as.numeric(difftime(end, start, units = "secs"))
  preds <- predict(model, test.set)
  # metrics() is a helper defined elsewhere in the report
  results <- metrics(preds, test.set$boxoffice)
  results['TIME'] <- elapsed.time
  list(model = model, results = results)
}
XGBoost Modeling
#' Train an XGBoost tree model ("xgbTree") and score it on the held-out split.
#'
#' @param df A data frame with a `boxoffice` response column.
#' @param ctrl A caret `trainControl` object.
#' @param grid A tuning grid (data frame) for `xgbTree`.
#' @param scale Optional character vector of caret pre-processing methods
#'   (e.g. "scale"); `NULL` (the default) applies no pre-processing.
#' @return A list with `model` (the caret fit) and `results` (the output of
#'   `metrics()` on the test set, with the training time in seconds added
#'   under "TIME").
xg.deploy <- function(df, ctrl, grid, scale = NULL) {
  set.seed(1)
  # split data
  split.data <- data.splitter(df)
  train.set <- split.data$train.set
  test.set <- split.data$test.set
  # enable clusters for parallel processing
  cl <- makeCluster(4)
  # Release the workers even when train()/predict() fails part-way through
  on.exit(stopCluster(cl), add = TRUE)
  registerDoParallel(cl)
  start <- Sys.time()
  # preProcess = NULL is train()'s own default, so one call covers both the
  # "no pre-processing" and "pre-processing requested" cases
  model <- train(
    boxoffice ~ .,
    data = train.set,
    method = "xgbTree",
    trControl = ctrl,
    tuneGrid = grid,
    preProcess = scale
  )
  end <- Sys.time()
  elapsed.time <- as.numeric(difftime(end, start, units = "secs"))
  preds <- predict(model, test.set)
  # metrics() is a helper defined elsewhere in the report
  results <- metrics(preds, test.set$boxoffice)
  results['TIME'] <- elapsed.time
  list(model = model, results = results)
}
Cubist Modeling
#' Train a Cubist model ("cubist") and score it on the held-out split.
#'
#' @param df A data frame with a `boxoffice` response column.
#' @param ctrl A caret `trainControl` object.
#' @param grid A tuning grid (data frame) for `cubist`.
#' @param scale Optional character vector of caret pre-processing methods
#'   (e.g. "scale"); `NULL` (the default) applies no pre-processing.
#' @return A list with `model` (the caret fit) and `results` (the output of
#'   `metrics()` on the test set, with the training time in seconds added
#'   under "TIME").
cube.deploy <- function(df, ctrl, grid, scale = NULL) {
  set.seed(1)
  # split data
  split.data <- data.splitter(df)
  train.set <- split.data$train.set
  test.set <- split.data$test.set
  # enable clusters for parallel processing
  cl <- makeCluster(4)
  # Release the workers even when train()/predict() fails part-way through
  on.exit(stopCluster(cl), add = TRUE)
  registerDoParallel(cl)
  start <- Sys.time()
  # preProcess = NULL is train()'s own default, so one call covers both the
  # "no pre-processing" and "pre-processing requested" cases
  model <- train(
    boxoffice ~ .,
    data = train.set,
    method = "cubist",
    trControl = ctrl,
    tuneGrid = grid,
    preProcess = scale
  )
  end <- Sys.time()
  elapsed.time <- as.numeric(difftime(end, start, units = "secs"))
  preds <- predict(model, test.set)
  # metrics() is a helper defined elsewhere in the report
  results <- metrics(preds, test.set$boxoffice)
  results['TIME'] <- elapsed.time
  list(model = model, results = results)
}
Variable Importance Plotting
# Horizontal bar chart of the `top_n` highest-scoring variables.
#   imp    object with an `importance` table that has an `Overall` column
#   top_n  number of variables to display
#   title  optional plot title; when supplied, the plot is drawn in a compact
#          form (blank axis labels, size-10 title)
plotImp <- function(imp, top_n, title=NULL){
  # Rank by overall importance and keep the leading rows
  ranked <- head(arrange(as.data.frame(imp$importance), desc(Overall)), top_n)
  # Bars shared by both variants, ordered by importance
  bars <- ggplot(ranked, aes(x = Overall, y = reorder(rownames(ranked), Overall))) +
    geom_col(fill="lightblue")
  if (!missing(title)) {
    # Compact variant with a caller-supplied title
    return(
      bars +
        labs(title = title, x = "", y = "") +
        theme_few() +
        theme(plot.title = element_text(hjust = 0.5, size = 10))
    )
  }
  # Stand-alone variant with default title and axis labels
  bars +
    labs(title = "Most Important Variables", x = "Importance", y = "Variable") +
    theme_few() +
    theme(plot.title = element_text(hjust = 0.5))
}
Modeling
Bagged Trees
We define the parameters for model training, enabling 10-fold cross-validation.
We perform the BoxCox transformation on the response variable,
Baseline Model
First we build a model including no star power variables and imputing zeros for NAs in Budget.
# Baseline predictors are picked by position (columns 1-7 and 18-19).
# NOTE(review): assuming b.df keeps the column order printed by glimpse(df),
# that is rated..budget plus month and bud_avail -- confirm.
b.base <- select(b.df, c(1:7, 18:19))
# Missing budgets are imputed as 0
b.base <- mutate(b.base, budget = ifelse(is.na(budget), 0, budget))
# Fit and score the baseline bagged model
bag.base <- bag.deploy(b.base)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.0935848 1.084371 0.6040531 0.8695907 -1.057409 5.428965
## TIME
## bag.baseline 18.72908
V1
- Full dataset.
- Imputing zeros for NAs in Budget.
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.084371 0.6040531 0.8695907 -1.057409 5.428965
## bag1 -0.09099274 1.062611 0.6193303 0.8592402 -1.015751 5.353042
## TIME
## bag.baseline 18.72908
## bag1 19.96190
V2
- Full dataset.
- Discretizing the budget variable into quartile bins, with a separate level for NAs.
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.084371 0.6040531 0.8695907 -1.057409 5.428965
## bag1 -0.09099274 1.062611 0.6193303 0.8592402 -1.015751 5.353042
## bag2 -0.09093805 1.065737 0.6172368 0.8764869 -1.016833 5.439538
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
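Turning the budget column into a categorical variable performs slightly worse than imputing zero for NAs (bag1) on RMSE, R-squared, MAE and MAPE. Compared with the baseline model it is worse on MAE and MAPE, although still better on RMSE and R-squared.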
V3
- Remove Near Zero Variance variables
- Impute Zero for NAs.
## [1] "language" "num_actors"
# Names of the predictors flagged as near-zero variance: keep the rows of the
# `nzv` table whose `nzv` column is TRUE and read off their row names.
cols <- nzv |>
  filter(nzv == TRUE) |>
  rownames()
# `cols` is a character vector living outside the data frame, so wrap it in
# all_of(): bare external vectors inside select() are deprecated and are
# ambiguous if a column is ever called "cols".
b.df3 <- b.df1 |>
  select(-all_of(cols))
# Fit and score the bagged model without the near-zero variance predictors
bag3 <- bag.deploy(b.df3)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.084371 0.6040531 0.8695907 -1.057409 5.428965
## bag1 -0.09099274 1.062611 0.6193303 0.8592402 -1.015751 5.353042
## bag2 -0.09093805 1.065737 0.6172368 0.8764869 -1.016833 5.439538
## bag3 -0.09099274 1.062611 0.6193303 0.8592402 -1.015751 5.353042
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
Removing the near-zero variance variables had no effect on the modeling results, and the run time was essentially unchanged (19.4 seconds for bag3 versus 20.0 seconds for the full model, bag1).
V4
- Including only 5-year Star Power
# Keep only the 5-year star power columns: drop every 10-year column, along
# with the number of directors.
b.df4 <- select(
  b.df3,
  -num_directors,
  -actor_films10,
  -actor_rev10,
  -director_films10,
  -director_rev10
)
bag4 <- bag.deploy(b.df4)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.084371 0.6040531 0.8695907 -1.057409 5.428965
## bag1 -0.09099274 1.062611 0.6193303 0.8592402 -1.015751 5.353042
## bag2 -0.09093805 1.065737 0.6172368 0.8764869 -1.016833 5.439538
## bag3 -0.09099274 1.062611 0.6193303 0.8592402 -1.015751 5.353042
## bag4 -0.08998240 1.064850 0.6175457 0.8607279 -1.010223 5.363212
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
V5
- Including only 10-year Star Power
# Keep only the 10-year star power columns: drop every 5-year column, along
# with the number of directors.
b.df5 <- select(
  b.df3,
  -num_directors,
  -actor_films5,
  -actor_rev5,
  -director_films5,
  -director_rev5
)
bag5 <- bag.deploy(b.df5)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.084371 0.6040531 0.8695907 -1.057409 5.428965
## bag1 -0.09099274 1.062611 0.6193303 0.8592402 -1.015751 5.353042
## bag2 -0.09093805 1.065737 0.6172368 0.8764869 -1.016833 5.439538
## bag3 -0.09099274 1.062611 0.6193303 0.8592402 -1.015751 5.353042
## bag4 -0.08998240 1.064850 0.6175457 0.8607279 -1.010223 5.363212
## bag5 -0.08916172 1.057499 0.6237698 0.8555682 -1.010743 5.328622
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
## bag5 17.85801
V6
- 5-year star power plus a separate 5-to-10-year window (10-year totals minus 5-year totals)
# Drop the number of directors, then turn each 10-year total into a
# non-overlapping "years 5-10" figure by subtracting its 5-year counterpart.
b.df6 <- select(b.df3, -num_directors)
b.df6 <- mutate(
  b.df6,
  actor_films10 = actor_films10 - actor_films5,
  actor_rev10 = actor_rev10 - actor_rev5,
  director_films10 = director_films10 - director_films5,
  director_rev10 = director_rev10 - director_rev5
)
bag6 <- bag.deploy(b.df6)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.084371 0.6040531 0.8695907 -1.057409 5.428965
## bag1 -0.09099274 1.062611 0.6193303 0.8592402 -1.015751 5.353042
## bag2 -0.09093805 1.065737 0.6172368 0.8764869 -1.016833 5.439538
## bag3 -0.09099274 1.062611 0.6193303 0.8592402 -1.015751 5.353042
## bag4 -0.08998240 1.064850 0.6175457 0.8607279 -1.010223 5.363212
## bag5 -0.08916172 1.057499 0.6237698 0.8555682 -1.010743 5.328622
## bag6 -0.09008839 1.065559 0.6170242 0.8612496 -1.011299 5.366995
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
## bag5 17.85801
## bag6 20.05631
Random Forest
We define the parameters for model training, enabling 10-fold cross-validation.
We perform the BoxCox transformation on the response variable,
Further, we are going to make the zero NA imputation our baseline for this model,
Baseline Model
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.084371 0.6040531 0.8695907 -1.0574089 5.428965
## bag1 -0.09099274 1.062611 0.6193303 0.8592402 -1.0157512 5.353042
## bag2 -0.09093805 1.065737 0.6172368 0.8764869 -1.0168330 5.439538
## bag3 -0.09099274 1.062611 0.6193303 0.8592402 -1.0157512 5.353042
## bag4 -0.08998240 1.064850 0.6175457 0.8607279 -1.0102230 5.363212
## bag5 -0.08916172 1.057499 0.6237698 0.8555682 -1.0107429 5.328622
## bag6 -0.09008839 1.065559 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline -0.05938704 1.058679 0.6200670 0.8361933 -0.7858701 5.200798
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
## bag5 17.85801
## bag6 20.05631
## rf.baseline 103.99030
The baseline Random Forest model is roughly on par with the best Bagged Trees model (bag5): it has a lower MAE and MAPE, but a marginally higher RMSE and lower R-squared. Now let’s introduce the star power variables.
V1
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2 -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4 -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5 -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6 -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1 -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
## bag5 17.85801
## bag6 20.05631
## rf.baseline 103.99030
## rf1 132.44422
The full random forest model performs better than the bagged trees models. However, this came at far greater computational expense. Let’s take a look at the important variables to this model:
## rf variable importance
##
## only 20 most important variables shown (out of 40)
##
## Overall
## budget 100.000
## bud_availYes 24.227
## director_rev10 14.873
## director_rev5 12.121
## runtime 10.782
## actor_rev5 9.714
## actor_rev10 9.310
## actor_films10 5.325
## actor_films5 4.569
## director_films10 2.603
## genreHorror 2.240
## ratedR 2.104
## countryUnited States 1.843
## monthDecember 1.808
## director_films5 1.687
## monthOctober 1.637
## genreAnimation 1.285
## genreComedy 1.252
## monthAugust 1.250
## ratedPG-13 1.155
We can see that the near zero variance variables are not amongst the top 20, nor are the number of directors and the number of actors. Let’s see if removing those variables improves the model.
V2
We remove the near zero variance variables as well as the number of actors and directors.
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2 -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4 -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5 -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6 -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1 -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2 -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
## bag5 17.85801
## bag6 20.05631
## rf.baseline 103.99030
## rf1 132.44422
## rf2 123.42354
This model performs marginally better computationally at the expense of accuracy. While the language variable has near zero variance, there is still some important information contained in the variable.
V3
Let’s focus in on the 5 year star power metrics and see how the model performs.
# 5-year star power only: drop the cast/crew counts and every 10-year column.
r.df3 <- select(
  r.df1,
  -num_actors,
  -num_directors,
  -actor_films10,
  -actor_rev10,
  -director_films10,
  -director_rev10
)
rf3 <- rf.deploy(r.df3)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2 -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4 -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5 -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6 -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1 -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2 -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3 -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
## bag5 17.85801
## bag6 20.05631
## rf.baseline 103.99030
## rf1 132.44422
## rf2 123.42354
## rf3 105.98884
V4
Now let’s compare with only the 10 year star power metrics,
# 10-year star power only: drop the cast/crew counts and every 5-year column.
r.df4 <- select(
  r.df1,
  -num_actors,
  -num_directors,
  -actor_films5,
  -actor_rev5,
  -director_films5,
  -director_rev5
)
rf4 <- rf.deploy(r.df4)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2 -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4 -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5 -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6 -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1 -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2 -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3 -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4 -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
## bag5 17.85801
## bag6 20.05631
## rf.baseline 103.99030
## rf1 132.44422
## rf2 123.42354
## rf3 105.98884
## rf4 104.72795
The model using only the 10 year totals has the lowest MAE and MAPE so far, although the full model (rf1) keeps a marginally lower RMSE. This might be because the 10 year data contains the 5 year data. We now split up the 5 and 10 year data.
V5
To test whether the 10 year totals do well simply because they contain the 5 year data, we split them into a 5 year window and a separate 5-to-10 year window:
# Drop the cast/crew counts, then turn each 10-year total into a
# non-overlapping "years 5-10" figure by subtracting its 5-year counterpart.
r.df5 <- select(r.df1, -num_actors, -num_directors)
r.df5 <- mutate(
  r.df5,
  actor_films10 = actor_films10 - actor_films5,
  actor_rev10 = actor_rev10 - actor_rev5,
  director_films10 = director_films10 - director_films5,
  director_rev10 = director_rev10 - director_rev5
)
rf5 <- rf.deploy(r.df5)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2 -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4 -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5 -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6 -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1 -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2 -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3 -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4 -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5 -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
## bag5 17.85801
## bag6 20.05631
## rf.baseline 103.99030
## rf1 132.44422
## rf2 123.42354
## rf3 105.98884
## rf4 104.72795
## rf5 113.90101
XGBoost
We define the parameters for model training, enabling 10-fold cross-validation, perform the Box-Cox transformation, and impute the NA values in the budget data. For the XGBoost model, we need to perform a grid search to determine the optimal parameters, so we initialize a grid search object. We also need to create dummy variables for the categorical data.
# set parameters
xg.ctrl <- trainControl(
method = "cv",
number = 10,
allowParallel = TRUE
)
# initialize grid for search
initial.grid <- expand.grid(
nrounds = c(50, 100, 150),
max_depth = c(3, 6),
colsample_bytree = c(0.6, 0.8),
eta = c(0.05, 0.1, 0.2),
gamma= 0,
min_child_weight = 10,
subsample = 1
)
# perform BoxCox transformation
x.df <- box.response(x.df)
# impute zeros for NAs
x.df <- x.df |>
mutate(budget = ifelse(is.na(budget),
0,
budget))
# generate dummy vars
x.df <- generate.dummies(x.df)
Baseline Model
First we produce the model without any star power variables.
# Baseline frame: strip the cast/crew counts and every star power column
xgb.base <- select(
  x.df,
  -num_actors,
  -num_directors,
  -actor_films5,
  -actor_films10,
  -actor_rev5,
  -actor_rev10,
  -director_films5,
  -director_films10,
  -director_rev5,
  -director_rev10
)
# Tune over the initial grid without any pre-processing
xg.base <- xg.deploy(df = xgb.base, ctrl = xg.ctrl, grid = initial.grid)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2 -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4 -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5 -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6 -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1 -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2 -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3 -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4 -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5 -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
## bag5 17.85801
## bag6 20.05631
## rf.baseline 103.99030
## rf1 132.44422
## rf2 123.42354
## rf3 105.98884
## rf4 104.72795
## rf5 113.90101
## xgb.baseline 33.11829
V1
Now we include the full dataset minus the number of actors and directors,
# Full predictor set except the two cast/crew count columns
xgb.df1 <- select(x.df, -num_actors, -num_directors)
# Tune over the initial grid without any pre-processing
xgb1 <- xg.deploy(df = xgb.df1, ctrl = xg.ctrl, grid = initial.grid)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2 -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4 -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5 -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6 -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1 -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2 -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3 -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4 -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5 -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
## bag5 17.85801
## bag6 20.05631
## rf.baseline 103.99030
## rf1 132.44422
## rf2 123.42354
## rf3 105.98884
## rf4 104.72795
## rf5 113.90101
## xgb.baseline 33.11829
## xgb1 41.03013
## eXtreme Gradient Boosting
##
## 1378 samples
## 39 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1240, 1239, 1240, 1240, 1240, 1241, ...
## Resampling results across tuning parameters:
##
## eta max_depth colsample_bytree nrounds RMSE Rsquared MAE
## 0.05 3 0.6 50 1.6379956 0.6216149 1.4238739
## 0.05 3 0.6 100 1.0043707 0.6344061 0.8013707
## 0.05 3 0.6 150 0.9984778 0.6346001 0.7917885
## 0.05 3 0.8 50 1.6352806 0.6207552 1.4226454
## 0.05 3 0.8 100 1.0070404 0.6327254 0.8028029
## 0.05 3 0.8 150 1.0022758 0.6318723 0.7930077
## 0.05 6 0.6 50 1.6439423 0.6218046 1.4299996
## 0.05 6 0.6 100 1.0143390 0.6269557 0.8076141
## 0.05 6 0.6 150 1.0153622 0.6218429 0.8015324
## 0.05 6 0.8 50 1.6366411 0.6243291 1.4241675
## 0.05 6 0.8 100 1.0087875 0.6304816 0.8037345
## 0.05 6 0.8 150 1.0129008 0.6236401 0.7987113
## 0.10 3 0.6 50 1.0054021 0.6329870 0.8037944
## 0.10 3 0.6 100 1.0037020 0.6303860 0.7947996
## 0.10 3 0.6 150 1.0088341 0.6269252 0.7988991
## 0.10 3 0.8 50 1.0064614 0.6317026 0.8026161
## 0.10 3 0.8 100 1.0049060 0.6297010 0.7958916
## 0.10 3 0.8 150 1.0120986 0.6244223 0.8027101
## 0.10 6 0.6 50 1.0083643 0.6311659 0.8047561
## 0.10 6 0.6 100 1.0185589 0.6204094 0.8067567
## 0.10 6 0.6 150 1.0357809 0.6088416 0.8193894
## 0.10 6 0.8 50 1.0134177 0.6269693 0.8047117
## 0.10 6 0.8 100 1.0312322 0.6112587 0.8129940
## 0.10 6 0.8 150 1.0477260 0.5998182 0.8246396
## 0.20 3 0.6 50 1.0094588 0.6265192 0.8016020
## 0.20 3 0.6 100 1.0270647 0.6144371 0.8153585
## 0.20 3 0.6 150 1.0424950 0.6039809 0.8270089
## 0.20 3 0.8 50 1.0077571 0.6279545 0.7980564
## 0.20 3 0.8 100 1.0232970 0.6169444 0.8066051
## 0.20 3 0.8 150 1.0408695 0.6045921 0.8206761
## 0.20 6 0.6 50 1.0384229 0.6056873 0.8209327
## 0.20 6 0.6 100 1.0739422 0.5813701 0.8497973
## 0.20 6 0.6 150 1.0941007 0.5682342 0.8665790
## 0.20 6 0.8 50 1.0398593 0.6048097 0.8217671
## 0.20 6 0.8 100 1.0725093 0.5852262 0.8460600
## 0.20 6 0.8 150 1.0925463 0.5730316 0.8644093
##
## Tuning parameter 'gamma' was held constant at a value of 0
## Tuning
## parameter 'min_child_weight' was held constant at a value of 10
##
## Tuning parameter 'subsample' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nrounds = 150, max_depth = 3, eta
## = 0.05, gamma = 0, colsample_bytree = 0.6, min_child_weight = 10 and
## subsample = 1.
V2
What effect, if any, does scaling the predictors and introducing a lower learning rate to the grid search have on our model?
# update grid search
updated.grid <- expand.grid(
nrounds = c(100, 150),
max_depth = c(3, 6),
colsample_bytree = c(0.3, 0.6),
eta = c(0.01, 0.05, 0.1),
gamma= 0,
min_child_weight = 10,
subsample = 1
)
# store preprocessing
scale <- "scale"
xgb.df2 <- xgb.df1
xgb2 <- xg.deploy(xgb.df2, xg.ctrl, updated.grid, scale)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2 -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4 -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5 -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6 -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1 -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2 -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3 -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4 -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5 -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
## bag5 17.85801
## bag6 20.05631
## rf.baseline 103.99030
## rf1 132.44422
## rf2 123.42354
## rf3 105.98884
## rf4 104.72795
## rf5 113.90101
## xgb.baseline 33.11829
## xgb1 41.03013
## xgb2 32.23954
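Scaling has no effect on the model accuracy. The run was faster (32.2 seconds versus 41.0 seconds), but the updated grid is also smaller (24 parameter combinations instead of 36), so the speed-up cannot be attributed to scaling alone.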
## xgbTree variable importance
##
## only 20 most important variables shown (out of 39)
##
## Overall
## budget 100.0000
## bud_availYes 31.7508
## director_rev5 8.7267
## director_rev10 8.2283
## genreHorror 3.2017
## runtime 2.2959
## actor_rev10 2.2645
## `countryUnited States` 1.9715
## actor_rev5 1.9705
## ratedR 1.9617
## genreAnimation 1.4434
## actor_films5 1.1664
## actor_films10 0.6857
## monthDecember 0.5387
## genreCrime 0.4462
## `ratedPG-13` 0.4287
## ratedG 0.4092
## monthOctober 0.3991
## director_films10 0.3447
## monthJanuary 0.2837
V3
Checking the model against previously tuned grid.
# Single parameter combination taken from an earlier tuning run, so train()
# fits exactly one candidate
tuned.grid <- do.call(
  expand.grid,
  list(
    nrounds = 100,
    max_depth = 6,
    colsample_bytree = 0.6,
    eta = 0.1,
    gamma= 0,
    min_child_weight = 10,
    subsample = 1
  )
)
xgb3 <- xg.deploy(df = xgb.df2, ctrl = xg.ctrl, grid = tuned.grid, scale = scale)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2 -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4 -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5 -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6 -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1 -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2 -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3 -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4 -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5 -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3 -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
## bag5 17.85801
## bag6 20.05631
## rf.baseline 103.99030
## rf1 132.44422
## rf2 123.42354
## rf3 105.98884
## rf4 104.72795
## rf5 113.90101
## xgb.baseline 33.11829
## xgb1 41.03013
## xgb2 32.23954
## xgb3 16.24166
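This model trains about twice as fast and is roughly as accurate: MAE and MAPE improve slightly, while RMSE and R-squared are marginally worse than xgb2.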
V4
Using only 10-year data:
# V4 feature set: drop the four 5-year Star Power columns so that only the
# 10-year counterparts remain.
xgb.df4 <- select(
  xgb.df2,
  !c(actor_films5, actor_rev5, director_films5, director_rev5)
)

# Same control object and fixed grid as V3.
xgb4 <- xg.deploy(xgb.df4, xg.ctrl, tuned.grid, scale)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2 -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4 -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5 -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6 -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1 -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2 -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3 -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4 -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5 -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3 -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## xgb4 -0.07641102 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
## bag5 17.85801
## bag6 20.05631
## rf.baseline 103.99030
## rf1 132.44422
## rf2 123.42354
## rf3 105.98884
## rf4 104.72795
## rf5 113.90101
## xgb.baseline 33.11829
## xgb1 41.03013
## xgb2 32.23954
## xgb3 16.24166
## xgb4 16.91872
With only the 10-year data the results are mixed: MAE and MAPE worsen slightly relative to xgb3, while RMSE and R-squared improve marginally.
V5
We replace the 10-year film counts and revenue with the values for years 5-10 only (the 10-year value minus the 5-year value) to see if the model improves further. We also exclude the zero-variance variables languageNone and countryNone.
# V5 feature set: turn each 10-year column into the increment over its 5-year
# counterpart (10-year value minus 5-year value), then drop the two
# zero-variance dummy columns.
xgb.df5 <- xgb.df2 |>
  mutate(
    actor_films10    = actor_films10 - actor_films5,
    actor_rev10      = actor_rev10 - actor_rev5,
    director_films10 = director_films10 - director_films5,
    director_rev10   = director_rev10 - director_rev5
  ) |>
  select(!c(languageNone, countryNone))

# Same control object and fixed grid as V3.
xgb5 <- xg.deploy(xgb.df5, xg.ctrl, tuned.grid, scale)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2 -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4 -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5 -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6 -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1 -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2 -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3 -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4 -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5 -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3 -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## xgb4 -0.07641102 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493
## xgb5 -0.03487635 1.0313299 0.6385238 0.8361075 -0.6351771 5.183738
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
## bag5 17.85801
## bag6 20.05631
## rf.baseline 103.99030
## rf1 132.44422
## rf2 123.42354
## rf3 105.98884
## rf4 104.72795
## rf5 113.90101
## xgb.baseline 33.11829
## xgb1 41.03013
## xgb2 32.23954
## xgb3 16.24166
## xgb4 16.91872
## xgb5 17.00468
V6
Considering only 5 year data,
# V6 feature set: keep only the 5-year Star Power columns by dropping their
# 10-year counterparts, and drop the two zero-variance dummy columns as well.
xgb.df6 <- select(
  xgb.df2,
  !c(
    actor_films10, actor_rev10,
    director_films10, director_rev10,
    languageNone, countryNone
  )
)

# Same control object and fixed grid as V3.
xgb6 <- xg.deploy(xgb.df6, xg.ctrl, tuned.grid, scale)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2 -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4 -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5 -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6 -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1 -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2 -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3 -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4 -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5 -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3 -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## xgb4 -0.07641102 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493
## xgb5 -0.03487635 1.0313299 0.6385238 0.8361075 -0.6351771 5.183738
## xgb6 -0.05525757 1.0191473 0.6477012 0.8211301 -0.7502124 5.100705
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
## bag5 17.85801
## bag6 20.05631
## rf.baseline 103.99030
## rf1 132.44422
## rf2 123.42354
## rf3 105.98884
## rf4 104.72795
## rf5 113.90101
## xgb.baseline 33.11829
## xgb1 41.03013
## xgb2 32.23954
## xgb3 16.24166
## xgb4 16.91872
## xgb5 17.00468
## xgb6 16.63881
V7
Removing zero-variance variables from our best model “V3”
# V7 feature set: the V3 data minus the two zero-variance dummy columns.
xgb.df7 <- select(xgb.df2, !c(languageNone, countryNone))

# Same control object and fixed grid as V3.
xgb7 <- xg.deploy(xgb.df7, xg.ctrl, tuned.grid, scale)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2 -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4 -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5 -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6 -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1 -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2 -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3 -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4 -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5 -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3 -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## xgb4 -0.07641102 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493
## xgb5 -0.03487635 1.0313299 0.6385238 0.8361075 -0.6351771 5.183738
## xgb6 -0.05525757 1.0191473 0.6477012 0.8211301 -0.7502124 5.100705
## xgb7 -0.05353738 1.0265252 0.6428285 0.8250519 -0.7540614 5.123845
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
## bag5 17.85801
## bag6 20.05631
## rf.baseline 103.99030
## rf1 132.44422
## rf2 123.42354
## rf3 105.98884
## rf4 104.72795
## rf5 113.90101
## xgb.baseline 33.11829
## xgb1 41.03013
## xgb2 32.23954
## xgb3 16.24166
## xgb4 16.91872
## xgb5 17.00468
## xgb6 16.63881
## xgb7 15.97367
This model actually performs worse. Random forest is outperforming XGBoost on this data.
V8
Best XGBoost model with Gamma (regularization parameter) = 5
# Same single-row grid as tuned.grid except for gamma, which is raised
# from 0 to 5.
gamma.grid <- expand.grid(
  nrounds          = 100,
  max_depth        = 6,
  colsample_bytree = 0.6,
  eta              = 0.1,
  gamma            = 5,
  min_child_weight = 10,
  subsample        = 1
)

# Refit on the V3 data (xgb.df2) with the gamma = 5 grid.
xgb8 <- xg.deploy(xgb.df2, xg.ctrl, gamma.grid, scale)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## -0.06616451 1.01878086 0.64899210 0.82032832 -0.82920226 5.09587033
## TIME
## 17.04133391
Cubist
We define the parameters for model training (enabling 10-fold cross-validation), perform the Box-Cox transformation, and set the grid search parameters. Cubist models can handle categorical variables, so we do not need to create dummy variables.
# Resampling setup: 10-fold cross-validation, parallel execution permitted.
cube.ctrl <- trainControl(
  method        = "cv",
  number        = 10,
  allowParallel = TRUE
)

# Apply the Box-Cox helper defined earlier in the file to the Cubist copy of
# the data.
c.df <- box.response(c.df)

# Replace missing budgets with zero.
c.df <- mutate(c.df, budget = ifelse(is.na(budget), 0, budget))

# Search grid: committees 1-10 and 20-100 in steps of 10, crossed with
# 0, 1, 5 or 9 neighbors.
cube.grid1 <- expand.grid(
  .committees = c(seq(1, 10), seq(20, 100, by = 10)),
  .neighbors  = c(0, 1, 5, 9)
)
Baseline
Baseline model without Star Power,
# Baseline feature set: remove every Star Power column (actor/director counts,
# film counts and revenues for both the 5- and 10-year windows).
cube.base <- select(
  c.df,
  !c(
    num_actors, num_directors,
    actor_films5, actor_films10,
    actor_rev5, actor_rev10,
    director_films5, director_films10,
    director_rev5, director_rev10
  )
)

# Run the full grid search on the baseline data. Note that cube.base is reused:
# it holds the data frame above and is then overwritten with the result of
# cube.deploy().
cube.base <- cube.deploy(cube.base, cube.ctrl, cube.grid1, scale)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2 -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4 -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5 -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6 -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1 -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2 -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3 -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4 -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5 -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3 -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## xgb4 -0.07641102 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493
## xgb5 -0.03487635 1.0313299 0.6385238 0.8361075 -0.6351771 5.183738
## xgb6 -0.05525757 1.0191473 0.6477012 0.8211301 -0.7502124 5.100705
## xgb7 -0.05353738 1.0265252 0.6428285 0.8250519 -0.7540614 5.123845
## cube.baseline -0.17794846 1.0650029 0.6256989 0.8365519 -1.5383026 5.254428
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
## bag5 17.85801
## bag6 20.05631
## rf.baseline 103.99030
## rf1 132.44422
## rf2 123.42354
## rf3 105.98884
## rf4 104.72795
## rf5 113.90101
## xgb.baseline 33.11829
## xgb1 41.03013
## xgb2 32.23954
## xgb3 16.24166
## xgb4 16.91872
## xgb5 17.00468
## xgb6 16.63881
## xgb7 15.97367
## cube.baseline 144.29722
V1
Full model including Star Power minus number of actors and directors.
# V1 feature set: everything except the actor and director head counts.
cube.df1 <- select(c.df, !c(num_actors, num_directors))

# Full grid search over cube.grid1.
cube1 <- cube.deploy(cube.df1, cube.ctrl, cube.grid1, scale)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2 -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4 -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5 -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6 -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1 -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2 -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3 -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4 -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5 -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3 -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## xgb4 -0.07641102 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493
## xgb5 -0.03487635 1.0313299 0.6385238 0.8361075 -0.6351771 5.183738
## xgb6 -0.05525757 1.0191473 0.6477012 0.8211301 -0.7502124 5.100705
## xgb7 -0.05353738 1.0265252 0.6428285 0.8250519 -0.7540614 5.123845
## cube.baseline -0.17794846 1.0650029 0.6256989 0.8365519 -1.5383026 5.254428
## cube1 -0.13139698 1.0324528 0.6445176 0.8220364 -1.2410078 5.137437
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
## bag5 17.85801
## bag6 20.05631
## rf.baseline 103.99030
## rf1 132.44422
## rf2 123.42354
## rf3 105.98884
## rf4 104.72795
## rf5 113.90101
## xgb.baseline 33.11829
## xgb1 41.03013
## xgb2 32.23954
## xgb3 16.24166
## xgb4 16.91872
## xgb5 17.00468
## xgb6 16.63881
## xgb7 15.97367
## cube.baseline 144.29722
## cube1 198.45836
## committees neighbors
## 57 60 0
Running the model with the tuned grid to gauge performance,
# Single-row grid holding the best tuning values reported for cube1
# (committees = 60, neighbors = 0).
# The original chained assignment (`cube.tuned <- cube.grid1 <- ...`) also
# overwrote the search grid cube.grid1 with this one row, so re-running an
# earlier chunk would silently skip the grid search. Only cube.tuned is
# assigned here; cube.grid1 keeps its full search grid.
cube.tuned <- expand.grid(
  .committees = 60,
  .neighbors  = 0
)

# Refit the V1 data with the fixed grid to gauge performance.
cube1.scaled <- cube.deploy(cube.df1, cube.ctrl, cube.tuned, scale)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE TIME
## [1,] -0.1313970 1.032453 0.6445176 0.8220364 -1.241008 5.137437 31.85722
## [2,] -0.1290602 1.035148 0.6419848 0.8250736 -1.223889 5.154409 31.10698
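The tuned-grid run is essentially identical in accuracy to cube1 (RMSE 1.035 vs. 1.032) and far faster than the full grid search (TIME about 31 vs. 198), but it still underperforms XGBoost and Random Forest.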
V2
Full model with 10-year data only:
# V2 feature set: drop the head counts and all four 5-year Star Power columns,
# leaving only the 10-year columns.
cube.df2 <- select(
  c.df,
  !c(
    num_actors, num_directors,
    actor_films5, actor_rev5,
    director_films5, director_rev5
  )
)

# Fit with the fixed tuned grid.
cube2 <- cube.deploy(cube.df2, cube.ctrl, cube.tuned, scale)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2 -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4 -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5 -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6 -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1 -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2 -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3 -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4 -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5 -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3 -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## xgb4 -0.07641102 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493
## xgb5 -0.03487635 1.0313299 0.6385238 0.8361075 -0.6351771 5.183738
## xgb6 -0.05525757 1.0191473 0.6477012 0.8211301 -0.7502124 5.100705
## xgb7 -0.05353738 1.0265252 0.6428285 0.8250519 -0.7540614 5.123845
## cube.baseline -0.17794846 1.0650029 0.6256989 0.8365519 -1.5383026 5.254428
## cube1 -0.13139698 1.0324528 0.6445176 0.8220364 -1.2410078 5.137437
## cube2 -0.16125278 1.0361215 0.6448753 0.8198624 -1.4205780 5.142243
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
## bag5 17.85801
## bag6 20.05631
## rf.baseline 103.99030
## rf1 132.44422
## rf2 123.42354
## rf3 105.98884
## rf4 104.72795
## rf5 113.90101
## xgb.baseline 33.11829
## xgb1 41.03013
## xgb2 32.23954
## xgb3 16.24166
## xgb4 16.91872
## xgb5 17.00468
## xgb6 16.63881
## xgb7 15.97367
## cube.baseline 144.29722
## cube1 198.45836
## cube2 28.57859
Worse than full model, but fast now that we have the parameters.
V3
Separating out years 5-10.
# V3 feature set: turn each 10-year column into the increment over its 5-year
# counterpart (10-year value minus 5-year value), then drop the head counts.
cube.df3 <- c.df |>
  mutate(
    actor_films10    = actor_films10 - actor_films5,
    actor_rev10      = actor_rev10 - actor_rev5,
    director_films10 = director_films10 - director_films5,
    director_rev10   = director_rev10 - director_rev5
  ) |>
  select(!c(num_actors, num_directors))

# Fit with the fixed tuned grid.
cube3 <- cube.deploy(cube.df3, cube.ctrl, cube.tuned, scale)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2 -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4 -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5 -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6 -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1 -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2 -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3 -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4 -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5 -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3 -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## xgb4 -0.07641102 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493
## xgb5 -0.03487635 1.0313299 0.6385238 0.8361075 -0.6351771 5.183738
## xgb6 -0.05525757 1.0191473 0.6477012 0.8211301 -0.7502124 5.100705
## xgb7 -0.05353738 1.0265252 0.6428285 0.8250519 -0.7540614 5.123845
## cube.baseline -0.17794846 1.0650029 0.6256989 0.8365519 -1.5383026 5.254428
## cube1 -0.13139698 1.0324528 0.6445176 0.8220364 -1.2410078 5.137437
## cube2 -0.16125278 1.0361215 0.6448753 0.8198624 -1.4205780 5.142243
## cube3 -0.13158143 1.0355657 0.6419864 0.8265906 -1.2402341 5.162284
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
## bag5 17.85801
## bag6 20.05631
## rf.baseline 103.99030
## rf1 132.44422
## rf2 123.42354
## rf3 105.98884
## rf4 104.72795
## rf5 113.90101
## xgb.baseline 33.11829
## xgb1 41.03013
## xgb2 32.23954
## xgb3 16.24166
## xgb4 16.91872
## xgb5 17.00468
## xgb6 16.63881
## xgb7 15.97367
## cube.baseline 144.29722
## cube1 198.45836
## cube2 28.57859
## cube3 31.94294
## cubist variable importance
##
## only 20 most important variables shown (out of 39)
##
## Overall
## budget 100.000
## actor_films5 48.765
## ratedR 48.148
## ratedPG-13 45.062
## bud_availYes 38.889
## genreAnimation 37.037
## ratedPG 36.420
## States 35.802
## director_films5 34.568
## runtime 33.333
## genreHorror 30.864
## monthOctober 20.370
## genreComedy 17.901
## actor_rev10 17.284
## director_films10 16.667
## monthJuly 16.049
## genreDrama 9.259
## genreBiography 8.025
## monthJune 4.938
## genreCrime 4.938
V4
Considering only 5 year data,
# V4 feature set: drop the head counts and all four 10-year Star Power columns,
# leaving only the 5-year columns.
cube.df4 <- select(
  c.df,
  !c(
    num_actors, num_directors,
    actor_films10, actor_rev10,
    director_films10, director_rev10
  )
)

# Fit with the fixed tuned grid.
cube4 <- cube.deploy(cube.df4, cube.ctrl, cube.tuned, scale)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2 -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4 -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5 -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6 -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1 -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2 -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3 -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4 -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5 -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3 -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## xgb4 -0.07641102 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493
## xgb5 -0.03487635 1.0313299 0.6385238 0.8361075 -0.6351771 5.183738
## xgb6 -0.05525757 1.0191473 0.6477012 0.8211301 -0.7502124 5.100705
## xgb7 -0.05353738 1.0265252 0.6428285 0.8250519 -0.7540614 5.123845
## cube.baseline -0.17794846 1.0650029 0.6256989 0.8365519 -1.5383026 5.254428
## cube1 -0.13139698 1.0324528 0.6445176 0.8220364 -1.2410078 5.137437
## cube2 -0.16125278 1.0361215 0.6448753 0.8198624 -1.4205780 5.142243
## cube3 -0.13158143 1.0355657 0.6419864 0.8265906 -1.2402341 5.162284
## cube4 -0.14600407 1.0336423 0.6451466 0.8212243 -1.3293151 5.137742
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
## bag5 17.85801
## bag6 20.05631
## rf.baseline 103.99030
## rf1 132.44422
## rf2 123.42354
## rf3 105.98884
## rf4 104.72795
## rf5 113.90101
## xgb.baseline 33.11829
## xgb1 41.03013
## xgb2 32.23954
## xgb3 16.24166
## xgb4 16.91872
## xgb5 17.00468
## xgb6 16.63881
## xgb7 15.97367
## cube.baseline 144.29722
## cube1 198.45836
## cube2 28.57859
## cube3 31.94294
## cube4 29.58219
## cubist variable importance
##
## only 20 most important variables shown (out of 35)
##
## Overall
## budget 100.000
## director_films5 51.412
## actor_films5 38.418
## ratedR 37.288
## genreHorror 35.028
## genreAnimation 32.768
## States 32.203
## ratedPG-13 31.073
## runtime 30.508
## bud_availYes 29.944
## monthOctober 25.424
## ratedPG 23.729
## genreDrama 21.469
## genreBiography 19.209
## genreCrime 16.384
## genreComedy 14.689
## monthJuly 13.559
## director_rev5 9.040
## monthJune 8.475
## monthSeptember 5.650
V5
Including only the number of films in the past 5 years,
# V5 feature set: of the Star Power columns, keep only the 5-year film counts
# (actor_films5, director_films5); drop the head counts, the 10-year film
# counts and every revenue column.
cube.df5 <- select(
  c.df,
  !c(
    num_actors, num_directors,
    actor_films10,
    actor_rev5, actor_rev10,
    director_films10,
    director_rev5, director_rev10
  )
)

# Fit with the fixed tuned grid.
cube5 <- cube.deploy(cube.df5, cube.ctrl, cube.tuned, scale)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2 -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4 -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5 -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6 -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1 -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2 -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3 -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4 -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5 -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3 -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## xgb4 -0.07641102 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493
## xgb5 -0.03487635 1.0313299 0.6385238 0.8361075 -0.6351771 5.183738
## xgb6 -0.05525757 1.0191473 0.6477012 0.8211301 -0.7502124 5.100705
## xgb7 -0.05353738 1.0265252 0.6428285 0.8250519 -0.7540614 5.123845
## cube.baseline -0.17794846 1.0650029 0.6256989 0.8365519 -1.5383026 5.254428
## cube1 -0.13139698 1.0324528 0.6445176 0.8220364 -1.2410078 5.137437
## cube2 -0.16125278 1.0361215 0.6448753 0.8198624 -1.4205780 5.142243
## cube3 -0.13158143 1.0355657 0.6419864 0.8265906 -1.2402341 5.162284
## cube4 -0.14600407 1.0336423 0.6451466 0.8212243 -1.3293151 5.137742
## cube5 -0.13784628 1.0388059 0.6399798 0.8260460 -1.2760452 5.162864
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
## bag5 17.85801
## bag6 20.05631
## rf.baseline 103.99030
## rf1 132.44422
## rf2 123.42354
## rf3 105.98884
## rf4 104.72795
## rf5 113.90101
## xgb.baseline 33.11829
## xgb1 41.03013
## xgb2 32.23954
## xgb3 16.24166
## xgb4 16.91872
## xgb5 17.00468
## xgb6 16.63881
## xgb7 15.97367
## cube.baseline 144.29722
## cube1 198.45836
## cube2 28.57859
## cube3 31.94294
## cube4 29.58219
## cube5 28.81885
V6
Removing the bud_avail variable (together with actor_rev5, director_films5, and director_rev5),
# V6 feature set: drop the head counts, bud_avail, and three of the 5-year
# Star Power columns.
# The original drop list named director_films5 twice; the redundant entry is
# removed here, which leaves the selected columns unchanged.
# NOTE(review): actor_films5 is never dropped, so it stays in the model. If the
# intent was a 10-year-only feature set without bud_avail (as in V2), the
# duplicate was probably meant to be actor_films5. Confirm before changing,
# since that would alter the reported cube6 metrics.
cube.df6 <- c.df |>
  select(-c(
    num_actors,
    num_directors,
    actor_rev5,
    director_films5,
    director_rev5,
    bud_avail
  ))

# Fit with the fixed tuned grid.
cube6 <- cube.deploy(cube.df6, cube.ctrl, cube.tuned, scale)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2 -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4 -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5 -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6 -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1 -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2 -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3 -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4 -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5 -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3 -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## xgb4 -0.07641102 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493
## xgb5 -0.03487635 1.0313299 0.6385238 0.8361075 -0.6351771 5.183738
## xgb6 -0.05525757 1.0191473 0.6477012 0.8211301 -0.7502124 5.100705
## xgb7 -0.05353738 1.0265252 0.6428285 0.8250519 -0.7540614 5.123845
## cube.baseline -0.17794846 1.0650029 0.6256989 0.8365519 -1.5383026 5.254428
## cube1 -0.13139698 1.0324528 0.6445176 0.8220364 -1.2410078 5.137437
## cube2 -0.16125278 1.0361215 0.6448753 0.8198624 -1.4205780 5.142243
## cube3 -0.13158143 1.0355657 0.6419864 0.8265906 -1.2402341 5.162284
## cube4 -0.14600407 1.0336423 0.6451466 0.8212243 -1.3293151 5.137742
## cube5 -0.13784628 1.0388059 0.6399798 0.8260460 -1.2760452 5.162864
## cube6 -0.15464835 1.0369273 0.6451362 0.8205757 -1.3968214 5.147856
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
## bag5 17.85801
## bag6 20.05631
## rf.baseline 103.99030
## rf1 132.44422
## rf2 123.42354
## rf3 105.98884
## rf4 104.72795
## rf5 113.90101
## xgb.baseline 33.11829
## xgb1 41.03013
## xgb2 32.23954
## xgb3 16.24166
## xgb4 16.91872
## xgb5 17.00468
## xgb6 16.63881
## xgb7 15.97367
## cube.baseline 144.29722
## cube1 198.45836
## cube2 28.57859
## cube3 31.94294
## cube4 29.58219
## cube5 28.81885
## cube6 28.57339
V7
Full dataset without bud_avail,
# V7 feature set: all Star Power columns retained; only the head counts and
# bud_avail are dropped.
cube.df7 <- select(c.df, !c(num_actors, num_directors, bud_avail))

# Fit with the fixed tuned grid.
cube7 <- cube.deploy(cube.df7, cube.ctrl, cube.tuned, scale)
## Training set contains 1378 rows.
## Testing set contains 344 rows.
## ME RMSE Rsquared MAE MPE MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2 -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3 -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4 -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5 -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6 -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1 -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2 -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3 -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4 -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5 -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2 -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3 -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## xgb4 -0.07641102 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493
## xgb5 -0.03487635 1.0313299 0.6385238 0.8361075 -0.6351771 5.183738
## xgb6 -0.05525757 1.0191473 0.6477012 0.8211301 -0.7502124 5.100705
## xgb7 -0.05353738 1.0265252 0.6428285 0.8250519 -0.7540614 5.123845
## cube.baseline -0.17794846 1.0650029 0.6256989 0.8365519 -1.5383026 5.254428
## cube1 -0.13139698 1.0324528 0.6445176 0.8220364 -1.2410078 5.137437
## cube2 -0.16125278 1.0361215 0.6448753 0.8198624 -1.4205780 5.142243
## cube3 -0.13158143 1.0355657 0.6419864 0.8265906 -1.2402341 5.162284
## cube4 -0.14600407 1.0336423 0.6451466 0.8212243 -1.3293151 5.137742
## cube5 -0.13784628 1.0388059 0.6399798 0.8260460 -1.2760452 5.162864
## cube6 -0.15464835 1.0369273 0.6451362 0.8205757 -1.3968214 5.147856
## cube7 -0.14615186 1.0369947 0.6443757 0.8210387 -1.3477388 5.144373
## TIME
## bag.baseline 18.72908
## bag1 19.96190
## bag2 18.32149
## bag3 19.38999
## bag4 17.60555
## bag5 17.85801
## bag6 20.05631
## rf.baseline 103.99030
## rf1 132.44422
## rf2 123.42354
## rf3 105.98884
## rf4 104.72795
## rf5 113.90101
## xgb.baseline 33.11829
## xgb1 41.03013
## xgb2 32.23954
## xgb3 16.24166
## xgb4 16.91872
## xgb5 17.00468
## xgb6 16.63881
## xgb7 15.97367
## cube.baseline 144.29722
## cube1 198.45836
## cube2 28.57859
## cube3 31.94294
## cube4 29.58219
## cube5 28.81885
## cube6 28.57339
## cube7 30.65470
Results
| | ME | RMSE | Rsquared | MAE | MPE | MAPE | TIME |
---|---|---|---|---|---|---|---|
bag.baseline | -0.0935848 | 1.0843706 | 0.6040531 | 0.8695907 | -1.0574089 | 5.428965 | 18.72908 |
bag1 | -0.0909927 | 1.0626108 | 0.6193303 | 0.8592402 | -1.0157512 | 5.353042 | 19.96190 |
bag2 | -0.0909381 | 1.0657372 | 0.6172368 | 0.8764869 | -1.0168330 | 5.439538 | 18.32149 |
bag3 | -0.0909927 | 1.0626108 | 0.6193303 | 0.8592402 | -1.0157512 | 5.353042 | 19.38999 |
bag4 | -0.0899824 | 1.0648496 | 0.6175457 | 0.8607279 | -1.0102230 | 5.363212 | 17.60555 |
bag5 | -0.0891617 | 1.0574986 | 0.6237698 | 0.8555682 | -1.0107429 | 5.328622 | 17.85801 |
bag6 | -0.0900884 | 1.0655591 | 0.6170242 | 0.8612496 | -1.0112991 | 5.366995 | 20.05631 |
rf.baseline | -0.0593870 | 1.0586792 | 0.6200670 | 0.8361933 | -0.7858701 | 5.200798 | 103.99030 |
rf1 | -0.0675847 | 0.9972875 | 0.6655080 | 0.7960606 | -0.8410135 | 4.948810 | 132.44422 |
rf2 | -0.0636647 | 1.0033461 | 0.6610371 | 0.7992679 | -0.8212055 | 4.971403 | 123.42354 |
rf3 | -0.0575567 | 1.0071221 | 0.6571241 | 0.8033652 | -0.7766925 | 4.998547 | 105.98884 |
rf4 | -0.0798557 | 0.9985825 | 0.6646830 | 0.7919069 | -0.9103610 | 4.923883 | 104.72795 |
rf5 | -0.0487212 | 0.9975813 | 0.6644236 | 0.7960901 | -0.7273081 | 4.946893 | 113.90101 |
xgb.baseline | -0.0786091 | 1.0508742 | 0.6271185 | 0.8423056 | -0.9343947 | 5.249769 | 33.11829 |
xgb1 | -0.0626545 | 1.0168226 | 0.6508696 | 0.8221444 | -0.8164864 | 5.113196 | 41.03013 |
xgb2 | -0.0626545 | 1.0168226 | 0.6508696 | 0.8221444 | -0.8164864 | 5.113196 | 32.23954 |
xgb3 | -0.0600570 | 1.0179837 | 0.6487246 | 0.8185924 | -0.7790652 | 5.085187 | 16.24166 |
xgb4 | -0.0764110 | 1.0149441 | 0.6515209 | 0.8255738 | -0.8732818 | 5.121493 | 16.91872 |
xgb5 | -0.0348763 | 1.0313299 | 0.6385238 | 0.8361075 | -0.6351771 | 5.183738 | 17.00468 |
xgb6 | -0.0552576 | 1.0191473 | 0.6477012 | 0.8211301 | -0.7502124 | 5.100705 | 16.63881 |
xgb7 | -0.0535374 | 1.0265252 | 0.6428285 | 0.8250519 | -0.7540614 | 5.123845 | 15.97367 |
cube.baseline | -0.1779485 | 1.0650029 | 0.6256989 | 0.8365519 | -1.5383026 | 5.254428 | 144.29722 |
cube1 | -0.1313970 | 1.0324528 | 0.6445176 | 0.8220364 | -1.2410078 | 5.137437 | 198.45836 |
cube2 | -0.1612528 | 1.0361215 | 0.6448753 | 0.8198624 | -1.4205780 | 5.142243 | 28.57859 |
cube3 | -0.1315814 | 1.0355657 | 0.6419864 | 0.8265906 | -1.2402341 | 5.162284 | 31.94294 |
cube4 | -0.1460041 | 1.0336423 | 0.6451466 | 0.8212243 | -1.3293151 | 5.137742 | 29.58219 |
cube5 | -0.1378463 | 1.0388059 | 0.6399798 | 0.8260460 | -1.2760452 | 5.162864 | 28.81885 |
cube6 | -0.1546484 | 1.0369273 | 0.6451362 | 0.8205757 | -1.3968214 | 5.147856 | 28.57339 |
cube7 | -0.1461519 | 1.0369947 | 0.6443757 | 0.8210387 | -1.3477388 | 5.144373 | 30.65470 |
Importance Measures
Bagged Models
Full Model (V1)
| | Overall |
---|---|
budget | 100.000000 |
director_rev5 | 55.293303 |
director_rev10 | 52.923247 |
bud_availYes | 40.369767 |
actor_rev5 | 23.843383 |
actor_rev10 | 15.965978 |
director_films5 | 11.463206 |
runtime | 11.151040 |
ratedPG-13 | 7.541583 |
genreHorror | 6.703949 |
Random Forest
Full Model (V1)
| | Overall |
---|---|
budget | 100.000000 |
bud_availYes | 24.226549 |
director_rev10 | 14.872920 |
director_rev5 | 12.121446 |
runtime | 10.781919 |
actor_rev5 | 9.714001 |
actor_rev10 | 9.310301 |
actor_films10 | 5.324959 |
actor_films5 | 4.569498 |
director_films10 | 2.602546 |
Tuning Parameter:
## Random Forest
##
## 1378 samples
## 18 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1240, 1239, 1240, 1240, 1240, 1241, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 1.089755 0.6019913 0.8554087
## 21 1.011198 0.6250762 0.8035619
## 40 1.017215 0.6203623 0.8064708
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 21.
SP-5 Model
Using only Star Power 5
| | Overall |
---|---|
budget | 100.000000 |
bud_availYes | 24.583393 |
director_rev10 | 14.534784 |
director_rev5 | 12.811826 |
runtime | 11.007748 |
actor_rev5 | 10.115440 |
actor_rev10 | 9.865975 |
actor_films10 | 5.168749 |
actor_films5 | 4.399251 |
director_films10 | 2.694230 |
Best Model
| | Overall |
---|---|
budget | 100.000000 |
bud_availYes | 25.623682 |
director_rev10 | 20.453446 |
actor_rev10 | 14.125389 |
runtime | 13.432925 |
actor_films10 | 8.610988 |
director_films10 | 5.036964 |
ratedR | 2.496025 |
genreHorror | 2.403049 |
countryUnited States | 2.153712 |
XGBoost
Full Model
| | Overall |
---|---|
budget | 100.000000 |
bud_availYes | 31.750788 |
director_rev5 | 8.726736 |
director_rev10 | 8.228347 |
genreHorror | 3.201705 |
runtime | 2.295859 |
actor_rev10 | 2.264511 |
countryUnited States | 1.971501 |
actor_rev5 | 1.970460 |
ratedR | 1.961712 |
Cubist
Full Model
| | Overall |
---|---|
budget | 100.00000 |
ratedR | 53.33333 |
director_films5 | 46.00000 |
actor_films5 | 43.33333 |
ratedPG-13 | 41.33333 |
bud_availYes | 40.66667 |
States | 38.00000 |
runtime | 35.33333 |
genreHorror | 34.66667 |
ratedPG | 34.66667 |
Side-by-Side
Full Model
# One importance plot per full-model fit; the second argument (10) is
# presumably the number of variables to display -- confirm against plotImp().
f.v1.b <- plotImp(b1.import, 10, title = "Bagged Trees")
f.v1.r <- plotImp(rf1.import, 10, title = "Random Forest")
f.v1.x <- plotImp(xgb1.import, 10, title = "XGBoost")
f.v1.c <- plotImp(cube1.import, 10, title = "Cubist")

# Arrange the four panels in two columns under a shared heading.
ggarrange(
  plots = list(f.v1.b, f.v1.r, f.v1.x, f.v1.c),
  ncol = 2,
  top = "Variable Importance: Full Model"
)