Capstone Modeling

library(tidyverse)
library(caret)
library(DMwR2)
library(e1071)
library(ipred)
library(xgboost)
library(kableExtra)
library(ggthemes)
library(egg)
library(forcats)
library(summarytools)
library(forecast)
library(vtreat)
library(doParallel)

Loading Data

df <- read.csv('modeling_data.csv')
# Identify character columns
char_columns <- sapply(df, is.character)

# Convert character columns to factors
df[char_columns] <- lapply(df[char_columns], as.factor)

glimpse(df)
## Rows: 1,722
## Columns: 19
## $ rated            <fct> PG-13, PG-13, PG-13, PG-13, PG-13, PG-13, PG, PG-13, …
## $ runtime          <int> 143, 164, 142, 143, 115, 136, 93, 169, 106, 93, 86, 1…
## $ genre            <fct> Action, Action, Action, Action, Adventure, Action, An…
## $ language         <fct> English, English, English, English, English, English,…
## $ country          <fct> United States, United States, United States, United S…
## $ boxoffice        <int> 623357910, 448149584, 408010692, 304360277, 292324737…
## $ budget           <int> 220000000, 250000000, 78000000, 200000000, 120000000,…
## $ num_directors    <int> 1, 1, 1, 1, 1, 1, 3, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1, 2,…
## $ num_actors       <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
## $ actor_films5     <int> 17, 17, 7, 12, 17, 8, 4, 0, 12, 10, 3, 11, 7, 5, 12, …
## $ actor_films10    <int> 34, 24, 10, 17, 20, 9, 10, 5, 17, 25, 4, 22, 9, 12, 1…
## $ actor_rev5       <dbl> 1799862677, 1523557430, 438810084, 790069947, 3307645…
## $ actor_rev10      <dbl> 2411207730, 2081390095, 603184119, 1071362460, 337077…
## $ director_films5  <int> 0, 2, 0, 2, 1, 1, 0, 1, 0, 4, 1, 0, 0, 2, 0, 0, 2, 2,…
## $ director_films10 <int> 1, 4, 1, 3, 3, 1, 0, 4, 0, 7, 1, 2, 0, 2, 0, 0, 2, 2,…
## $ director_rev5    <dbl> 0, 827574406, 0, 32363426, 281287133, 32391374, 0, 44…
## $ director_rev10   <dbl> 25514517, 1087527776, 120277854, 95021646, 394908068,…
## $ month            <fct> May, July, March, November, November, July, June, Dec…
## $ bud_avail        <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes…

We create copies of the dataset, one for each modeling approach:

b.df <- df # data for bagging
r.df <- df # data for random forest
x.df <- df # data for XGBoost
c.df <- df # data for cubist

Helper Functions

We use the following helper functions to streamline our processes:

Calculating Metrics

metrics <- function(predicted, actual){
  # forecast::accuracy() supplies ME, MPE, and MAPE
  results <- accuracy(predicted, actual)
  mape <- results['Test set', 'MAPE']
  me <- results['Test set', 'ME']
  mpe <- results['Test set', 'MPE']
  # caret::postResample() supplies RMSE, Rsquared, and MAE
  measures <- postResample(predicted, actual)
  metrics <- c(ME = me, measures, MPE = mpe, MAPE = mape)
  return(metrics)
}
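
For example, calling the helper on a pair of toy vectors (illustrative values, not from our data) returns a single named vector combining both packages' measures:

# illustrative call: forecast::accuracy() supplies ME/MPE/MAPE, caret::postResample() the rest
metrics(predicted = c(1.1, 1.9, 3.2), actual = c(1, 2, 3))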

Column IQR Factorizer

Discretizes a numeric column into quartile bins, with a separate factor level for NA values.

# create a factor column from quartile bins; level 5 is reserved for NA
create_factored_column <- function(column) {
  q <- quantile(column, seq(0, 1, 0.25), na.rm = TRUE)
  factor_column <- cut(column, breaks = q, labels = FALSE, include.lowest = TRUE)
  factor_column[is.na(column)] <- 5  # Assigning 5 for NA
  factor_column <- factor(factor_column, levels = 1:5, labels = c("1", "2", "3", "4", "NA"))
  return(factor_column)
}
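
A minimal sanity check on a toy vector: the quartile breaks fall at 20, 30, and 40, and the missing value is routed to its own level.

# toy example: 10 and 20 -> "1", 30 -> "2", 40 -> "3", 50 -> "4", NA -> "NA"
x <- c(10, 20, 30, 40, NA, 50)
create_factored_column(x)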

Box-Cox Transformer

Box-Cox transforms the response variable, estimating the power lambda that best normalizes it: y is mapped to (y^lambda - 1)/lambda, or log(y) when lambda = 0.

box.response <- function(df){
  # make a copy of the master dataframe 
  t.df <- df

  # Specify the response variable
  response <- "boxoffice"

  # Apply Box-Cox transformation only to the response variable
  preprocess.params <- preProcess(df[, response, drop = FALSE], method = "BoxCox")

  # Apply the preProcess function to the response variable
  transformed.response <- predict(preprocess.params, newdata = df[, response, drop = FALSE])

  # Store the transformed response in the boxoffice column
  t.df['boxoffice'] <- transformed.response
  
  return(t.df)
}
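
If we want to report the fitted transformation, the estimated lambda can be read off the preProcess object. A quick sketch, assuming caret's usual bc slot of per-column BoxCoxTrans objects:

# sketch: recover the lambda that preProcess estimated for the response
pp <- preProcess(df[, "boxoffice", drop = FALSE], method = "BoxCox")
pp$bc$boxoffice$lambda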

Generating Dummies

Creates dummy variables for categorical predictors.

generate.dummies <- function(df){
  # get names of categorical variables
  cols <- names(df)[sapply(df, is.factor)]

  # One-hot encode factor columns
  dummies <- model.matrix(~.-1, data = df[, cols])

  # combine dummies with numerical columns
  df.encoded <- cbind(df[, !names(df) %in% cols], dummies)
  
  return(df.encoded)
}
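
As a small illustration, model.matrix() expands each factor level into an indicator column; the -1 drops the intercept. (Note that with several factors in one formula, only the first keeps all of its levels; the rest use treatment contrasts.)

# illustrative: one-hot encoding a single toy factor
toy <- data.frame(genre = factor(c("Action", "Comedy", "Action")))
model.matrix(~ . - 1, data = toy)
#   genreAction genreComedy
# 1           1           0
# 2           0           1
# 3           1           0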

Data Splitting

data.splitter <- function(df) {
  set.seed(1)

  # Shuffle the dataset
  shuffled <- df[sample(nrow(df)), ]

  # Create split point, stratified on the shuffled outcome
  # (createDataPartition samples within quantile groups of boxoffice, so the
  # index must be computed on the same rows it is applied to)
  split.index <- createDataPartition(shuffled$boxoffice, p = 0.8, list = FALSE)

  # Split the data
  train.set <- shuffled[split.index, ]
  test.set <- shuffled[-split.index, ]

  # Print the split sizes
  cat(sprintf("Training set contains %d rows.", nrow(train.set)),
      sprintf("\nTesting set contains %d rows.", nrow(test.set)))

  # Return the processed data
  return(list(train.set = train.set, test.set = test.set))
}

Bagged Modeling

bag.deploy <- function(df){
  set.seed(1)
  
  # split data
  split.data <- data.splitter(df)
  train.set <- split.data$train.set
  test.set <- split.data$test.set
  
  # enable clusters for parallel processing
  cl <- makeCluster(4)
  registerDoParallel(cl)

  start <- Sys.time()

  model <- train(
    boxoffice ~ .,
    data = train.set,
    method = "treebag",
    trControl = bag.ctrl,  # uses the globally defined control object
    tuneLength = 10        # note: treebag has no tuning parameters, so this is ignored
  )

  end <- Sys.time()

  elapsed.time <- as.numeric(difftime(end, start, units = "secs"))

  preds <- predict(model, test.set)
  results <- metrics(preds, test.set$boxoffice)
  results['TIME'] <- elapsed.time

  stopCluster(cl)
  
  return(list(model = model, results = results))
}

Random Forest Modeling

rf.deploy <- function(df){
  
  set.seed(1)
  
  # split data
  split.data <- data.splitter(df)
  train.set <- split.data$train.set
  test.set <- split.data$test.set
  
  # enable clusters for parallel processing
  cl <- makeCluster(4)
  registerDoParallel(cl)

  start <- Sys.time()

  model <- train(
    boxoffice ~ ., 
    data = train.set,
    method = "rf",
    ntree = 1000,        # randomForest's argument is ntree ("ntrees" would be silently ignored)
    trControl = rf.ctrl  # parallelism is handled by allowParallel and the registered cluster
  )

  end <- Sys.time()

  elapsed.time <- as.numeric(difftime(end, start, units = "secs"))

  preds <- predict(model, test.set)
  results <- metrics(preds, test.set$boxoffice)
  results['TIME'] <- elapsed.time

  stopCluster(cl)
  
  return(list(model = model, results = results))
}

XGBoost Modeling

xg.deploy <- function(df, ctrl, grid, scale=NULL){
    
  set.seed(1)
  
  # split data
  split.data <- data.splitter(df)
  train.set <- split.data$train.set
  test.set <- split.data$test.set
  
  # enable clusters for parallel processing
  cl <- makeCluster(4)
  registerDoParallel(cl)

  start <- Sys.time()
  
  if (is.null(scale)) {
    model <- train(
      boxoffice ~ .,
      data = train.set,
      method = "xgbTree",
      trControl = ctrl,
      tuneGrid = grid
    )
  } else {
    model <- train(
      boxoffice ~ .,
      data = train.set,
      method = "xgbTree",
      trControl = ctrl,
      tuneGrid = grid,
      preProcess = scale
    )
  }
  
  end <- Sys.time()

  elapsed.time <- as.numeric(difftime(end, start, units = "secs"))

  preds <- predict(model, test.set)
  results <- metrics(preds, test.set$boxoffice)
  results['TIME'] <- elapsed.time

  stopCluster(cl)
  
  return(list(model = model, results = results))
}

Cubist Modeling

cube.deploy <- function(df, ctrl, grid, scale=NULL){
    
  set.seed(1)
  
  # split data
  split.data <- data.splitter(df)
  train.set <- split.data$train.set
  test.set <- split.data$test.set
  
  # enable clusters for parallel processing
  cl <- makeCluster(4)
  registerDoParallel(cl)

  start <- Sys.time()
  
  if (is.null(scale)) {
    model <- train(
      boxoffice ~ .,
      data = train.set,
      method = "cubist",
      trControl = ctrl,
      tuneGrid = grid
    )
  } else {
    model <- train(
      boxoffice ~ .,
      data = train.set,
      method = "cubist",
      trControl = ctrl,
      tuneGrid = grid,
      preProcess = scale
    )
  }
  
  end <- Sys.time()

  elapsed.time <- as.numeric(difftime(end, start, units = "secs"))

  preds <- predict(model, test.set)
  results <- metrics(preds, test.set$boxoffice)
  results['TIME'] <- elapsed.time

  stopCluster(cl)
  
  return(list(model = model, results = results))
}

Variable Importance Plotting

plotImp <- function(imp, top_n, title=NULL){

  # Select the top n most important variables
  top_vars <- imp$importance |> 
    as.data.frame() |>
    arrange(desc(Overall)) |> 
    head(top_n)
  
    if (is.null(title)) {
      # Create a horizontal bar plot
      ggplot(top_vars, aes(x = Overall, y = reorder(rownames(top_vars), Overall))) +
        geom_col(fill="lightblue") +
        labs(title = "Most Important Variables",
             x = "Importance",
             y = "Variable") +
        theme_few() +
        theme(plot.title = element_text(hjust = 0.5))
    } else {
      # Create a horizontal bar plot
      ggplot(top_vars, aes(x = Overall, y = reorder(rownames(top_vars), Overall))) +
        geom_col(fill="lightblue") +
        labs(title = title,
             x = "",
             y = "") +
        theme_few() +
        theme(plot.title = element_text(hjust = 0.5,
                                        size = 10))
    }
}

Variable Importance Table

tableImp <- function(imp, top_n){
  imp$importance |>
    rownames_to_column("Variable") |>  # keep variable names; arrange() may drop data.frame row names
    arrange(desc(Overall)) |>
    head(top_n) |>
    kable() |> kable_styling()
}

Modeling

Bagged Trees

We define the parameters for model training, enabling 10-fold cross-validation.

bag.ctrl <- trainControl(
  method = "cv",
  number = 10,
  allowParallel = TRUE
)

We perform the Box-Cox transformation on the response variable:

b.df <- box.response(b.df)

Baseline Model

First, we build a model that excludes the star power variables and imputes zeros for the NAs in budget.

b.base <- b.df |>
  select(c(1:7,18:19)) |>
  mutate(budget = ifelse(is.na(budget),
                         0,
                         budget))

bag.base <- bag.deploy(b.base)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind("bag.baseline" = bag.base$results)
stack.results
##                      ME     RMSE  Rsquared       MAE       MPE     MAPE
## bag.baseline -0.0935848 1.084371 0.6040531 0.8695907 -1.057409 5.428965
##                  TIME
## bag.baseline 18.72908

V1

  • Full dataset.
  • Imputing zeros for NAs in budget.

b.df1 <- b.df |>
  mutate(budget = ifelse(is.na(budget),
                         0,
                         budget))

bag1 <- bag.deploy(b.df1)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "bag1" = bag1$results)
stack.results
##                       ME     RMSE  Rsquared       MAE       MPE     MAPE
## bag.baseline -0.09358480 1.084371 0.6040531 0.8695907 -1.057409 5.428965
## bag1         -0.09099274 1.062611 0.6193303 0.8592402 -1.015751 5.353042
##                  TIME
## bag.baseline 18.72908
## bag1         19.96190

V2

  • Full dataset.
  • Discretizing the budget variable into quartile bins.

b.df2 <- b.df |>
  mutate(budget = create_factored_column(budget))

bag2 <- bag.deploy(b.df2)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "bag2" = bag2$results)
stack.results
##                       ME     RMSE  Rsquared       MAE       MPE     MAPE
## bag.baseline -0.09358480 1.084371 0.6040531 0.8695907 -1.057409 5.428965
## bag1         -0.09099274 1.062611 0.6193303 0.8592402 -1.015751 5.353042
## bag2         -0.09093805 1.065737 0.6172368 0.8764869 -1.016833 5.439538
##                  TIME
## bag.baseline 18.72908
## bag1         19.96190
## bag2         18.32149

Discretizing the budget column performs worse than imputing zeros for the NAs (bag1) on nearly every metric; its MAE and MAPE even trail the baseline model's.

V3

  • Remove near-zero variance variables.
  • Impute zeros for NAs.

nzv <- nearZeroVar(b.df, saveMetrics = TRUE, names = TRUE)
nzv |> filter(nzv == TRUE) |> rownames()
## [1] "language"   "num_actors"
cols <- nzv |>
  filter(nzv == TRUE) |>
  rownames()

b.df3 <- b.df1 |>
  select(-all_of(cols))

bag3 <- bag.deploy(b.df3)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "bag3" = bag3$results)
stack.results
##                       ME     RMSE  Rsquared       MAE       MPE     MAPE
## bag.baseline -0.09358480 1.084371 0.6040531 0.8695907 -1.057409 5.428965
## bag1         -0.09099274 1.062611 0.6193303 0.8592402 -1.015751 5.353042
## bag2         -0.09093805 1.065737 0.6172368 0.8764869 -1.016833 5.439538
## bag3         -0.09099274 1.062611 0.6193303 0.8592402 -1.015751 5.353042
##                  TIME
## bag.baseline 18.72908
## bag1         19.96190
## bag2         18.32149
## bag3         19.38999

Removing the near-zero variance variables had no effect on the predictive results (bag3 matches bag1 exactly), and the reduced model trained slightly faster.

V4

  • Including only the 5-year star power variables.
  • Dropping the number of directors.

b.df4 <- b.df3 |>
  select(-c(num_directors,
            actor_films10,
            actor_rev10,
            director_films10,
            director_rev10))

bag4 <- bag.deploy(b.df4)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "bag4" = bag4$results)
stack.results
##                       ME     RMSE  Rsquared       MAE       MPE     MAPE
## bag.baseline -0.09358480 1.084371 0.6040531 0.8695907 -1.057409 5.428965
## bag1         -0.09099274 1.062611 0.6193303 0.8592402 -1.015751 5.353042
## bag2         -0.09093805 1.065737 0.6172368 0.8764869 -1.016833 5.439538
## bag3         -0.09099274 1.062611 0.6193303 0.8592402 -1.015751 5.353042
## bag4         -0.08998240 1.064850 0.6175457 0.8607279 -1.010223 5.363212
##                  TIME
## bag.baseline 18.72908
## bag1         19.96190
## bag2         18.32149
## bag3         19.38999
## bag4         17.60555

V5

  • Including only the 10-year star power variables.
  • Dropping the number of directors.

b.df5 <- b.df3 |>
  select(-c(num_directors,
            actor_films5,
            actor_rev5,
            director_films5,
            director_rev5))

bag5 <- bag.deploy(b.df5)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "bag5" = bag5$results)
stack.results
##                       ME     RMSE  Rsquared       MAE       MPE     MAPE
## bag.baseline -0.09358480 1.084371 0.6040531 0.8695907 -1.057409 5.428965
## bag1         -0.09099274 1.062611 0.6193303 0.8592402 -1.015751 5.353042
## bag2         -0.09093805 1.065737 0.6172368 0.8764869 -1.016833 5.439538
## bag3         -0.09099274 1.062611 0.6193303 0.8592402 -1.015751 5.353042
## bag4         -0.08998240 1.064850 0.6175457 0.8607279 -1.010223 5.363212
## bag5         -0.08916172 1.057499 0.6237698 0.8555682 -1.010743 5.328622
##                  TIME
## bag.baseline 18.72908
## bag1         19.96190
## bag2         18.32149
## bag3         19.38999
## bag4         17.60555
## bag5         17.85801

V6

  • 5-year totals, with the 10-year variables converted to the years 5-10 increment (10-year minus 5-year).
  • Dropping the number of directors.

b.df6 <- b.df3 |>
  mutate(actor_films10 = actor_films10 - actor_films5,
         actor_rev10 = actor_rev10 - actor_rev5,
         director_films10 = director_films10 - director_films5,
         director_rev10 = director_rev10 - director_rev5) |>
  select(-c(num_directors))

bag6 <- bag.deploy(b.df6)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "bag6" = bag6$results)
stack.results
##                       ME     RMSE  Rsquared       MAE       MPE     MAPE
## bag.baseline -0.09358480 1.084371 0.6040531 0.8695907 -1.057409 5.428965
## bag1         -0.09099274 1.062611 0.6193303 0.8592402 -1.015751 5.353042
## bag2         -0.09093805 1.065737 0.6172368 0.8764869 -1.016833 5.439538
## bag3         -0.09099274 1.062611 0.6193303 0.8592402 -1.015751 5.353042
## bag4         -0.08998240 1.064850 0.6175457 0.8607279 -1.010223 5.363212
## bag5         -0.08916172 1.057499 0.6237698 0.8555682 -1.010743 5.328622
## bag6         -0.09008839 1.065559 0.6170242 0.8612496 -1.011299 5.366995
##                  TIME
## bag.baseline 18.72908
## bag1         19.96190
## bag2         18.32149
## bag3         19.38999
## bag4         17.60555
## bag5         17.85801
## bag6         20.05631

Random Forest

We define the parameters for model training, enabling 10-fold cross-validation.

rf.ctrl <- trainControl(
  method = "cv",
  number = 10,
  allowParallel = TRUE
)

We perform the Box-Cox transformation on the response variable:

r.df <- box.response(r.df)

Further, we fold the zero imputation for missing budgets into the baseline for this model:

r.df <- r.df |>
  mutate(budget = ifelse(is.na(budget),
                         0,
                         budget))

Baseline Model

r.base <- r.df |>
  select(c(1:7,18:19))

rf.base <- rf.deploy(r.base)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "rf.baseline" = rf.base$results)
stack.results
##                       ME     RMSE  Rsquared       MAE        MPE     MAPE
## bag.baseline -0.09358480 1.084371 0.6040531 0.8695907 -1.0574089 5.428965
## bag1         -0.09099274 1.062611 0.6193303 0.8592402 -1.0157512 5.353042
## bag2         -0.09093805 1.065737 0.6172368 0.8764869 -1.0168330 5.439538
## bag3         -0.09099274 1.062611 0.6193303 0.8592402 -1.0157512 5.353042
## bag4         -0.08998240 1.064850 0.6175457 0.8607279 -1.0102230 5.363212
## bag5         -0.08916172 1.057499 0.6237698 0.8555682 -1.0107429 5.328622
## bag6         -0.09008839 1.065559 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline  -0.05938704 1.058679 0.6200670 0.8361933 -0.7858701 5.200798
##                   TIME
## bag.baseline  18.72908
## bag1          19.96190
## bag2          18.32149
## bag3          19.38999
## bag4          17.60555
## bag5          17.85801
## bag6          20.05631
## rf.baseline  103.99030

The baseline random forest is competitive with the best bagged trees model: its MAE and MAPE improve, though its RMSE is marginally worse than bag5’s. Now let’s introduce the star power variables.

V1

r.df1 <- r.df 

rf1 <- rf.deploy(r.df1)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "rf1" = rf1$results)
stack.results
##                       ME      RMSE  Rsquared       MAE        MPE     MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2         -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4         -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5         -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6         -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline  -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1          -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
##                   TIME
## bag.baseline  18.72908
## bag1          19.96190
## bag2          18.32149
## bag3          19.38999
## bag4          17.60555
## bag5          17.85801
## bag6          20.05631
## rf.baseline  103.99030
## rf1          132.44422

The full random forest model outperforms every bagged trees model, though at far greater computational expense. Let’s take a look at the variables most important to this model:

varImp(rf1$model)
## rf variable importance
## 
##   only 20 most important variables shown (out of 40)
## 
##                      Overall
## budget               100.000
## bud_availYes          24.227
## director_rev10        14.873
## director_rev5         12.121
## runtime               10.782
## actor_rev5             9.714
## actor_rev10            9.310
## actor_films10          5.325
## actor_films5           4.569
## director_films10       2.603
## genreHorror            2.240
## ratedR                 2.104
## countryUnited States   1.843
## monthDecember          1.808
## director_films5        1.687
## monthOctober           1.637
## genreAnimation         1.285
## genreComedy            1.252
## monthAugust            1.250
## ratedPG-13             1.155

We can see that the near-zero variance variables are not among the top 20, nor are the numbers of directors and actors. Let’s see if removing those variables improves the model.

V2

We remove the near-zero variance variables as well as the numbers of actors and directors.

r.df2 <- r.df1 |>
  select(-c(language,
            num_actors,
            num_directors))

rf2 <- rf.deploy(r.df2)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "rf2" = rf2$results)
stack.results
##                       ME      RMSE  Rsquared       MAE        MPE     MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2         -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4         -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5         -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6         -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline  -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1          -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2          -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
##                   TIME
## bag.baseline  18.72908
## bag1          19.96190
## bag2          18.32149
## bag3          19.38999
## bag4          17.60555
## bag5          17.85801
## bag6          20.05631
## rf.baseline  103.99030
## rf1          132.44422
## rf2          123.42354

This model trains slightly faster at the cost of a little accuracy. Although the language variable has near-zero variance, it evidently still carries some useful information.

V3

Let’s focus on the 5-year star power metrics and see how the model performs.

r.df3 <- r.df1 |>
  select(-c(num_actors,
            num_directors,
            actor_films10,
            actor_rev10,
            director_films10,
            director_rev10))

rf3 <- rf.deploy(r.df3)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "rf3" = rf3$results)
stack.results
##                       ME      RMSE  Rsquared       MAE        MPE     MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2         -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4         -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5         -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6         -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline  -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1          -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2          -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3          -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
##                   TIME
## bag.baseline  18.72908
## bag1          19.96190
## bag2          18.32149
## bag3          19.38999
## bag4          17.60555
## bag5          17.85801
## bag6          20.05631
## rf.baseline  103.99030
## rf1          132.44422
## rf2          123.42354
## rf3          105.98884

V4

Now let’s compare with only the 10-year star power metrics:

r.df4 <- r.df1 |>
  select(-c(num_actors,
            num_directors,
            actor_films5,
            actor_rev5,
            director_films5,
            director_rev5))

rf4 <- rf.deploy(r.df4)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "rf4" = rf4$results)
stack.results
##                       ME      RMSE  Rsquared       MAE        MPE     MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2         -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4         -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5         -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6         -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline  -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1          -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2          -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3          -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4          -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
##                   TIME
## bag.baseline  18.72908
## bag1          19.96190
## bag2          18.32149
## bag3          19.38999
## bag4          17.60555
## bag5          17.85801
## bag6          20.05631
## rf.baseline  103.99030
## rf1          132.44422
## rf2          123.42354
## rf3          105.98884
## rf4          104.72795

Of the two restricted models, the one using only the 10-year totals performs better. This might be because the 10-year windows subsume the 5-year data.

V5

We now split the two windows apart, converting the 10-year variables to the years 5-10 increment:

r.df5 <- r.df1 |>
  mutate(actor_films10 = actor_films10 - actor_films5,
         actor_rev10 = actor_rev10 - actor_rev5,
         director_films10 = director_films10 - director_films5,
         director_rev10 = director_rev10 - director_rev5) |>
  select(-c(num_actors,
            num_directors))

rf5 <- rf.deploy(r.df5)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "rf5" = rf5$results)
stack.results
##                       ME      RMSE  Rsquared       MAE        MPE     MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2         -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4         -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5         -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6         -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline  -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1          -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2          -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3          -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4          -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5          -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
##                   TIME
## bag.baseline  18.72908
## bag1          19.96190
## bag2          18.32149
## bag3          19.38999
## bag4          17.60555
## bag5          17.85801
## bag6          20.05631
## rf.baseline  103.99030
## rf1          132.44422
## rf2          123.42354
## rf3          105.98884
## rf4          104.72795
## rf5          113.90101

XGBoost

We define the parameters for model training, enabling 10-fold cross-validation, perform the Box-Cox transformation, and impute the NA values in the budget data. For the XGBoost model we need a grid search to determine the optimal hyperparameters, so we initialize a tuning grid. We also need to create dummy variables for the categorical data, since XGBoost operates on numeric matrices.

# set parameters
xg.ctrl <- trainControl(
  method = "cv",
  number = 10,
  allowParallel = TRUE
)

# initialize grid for search
initial.grid <- expand.grid(
                       nrounds = c(50, 100, 150),
                       max_depth = c(3, 6),
                       colsample_bytree = c(0.6, 0.8),
                       eta = c(0.05, 0.1, 0.2),
                       gamma= 0,
                       min_child_weight = 10,
                       subsample = 1
                       )

# perform BoxCox transformation
x.df <- box.response(x.df)

# impute zeros for NAs
x.df <- x.df |>
  mutate(budget = ifelse(is.na(budget),
                         0,
                         budget))

# generate dummy vars
x.df <- generate.dummies(x.df)

Baseline Model

First, we produce the model without any star power variables.

xgb.base <- x.df |>
  select(-c(num_actors,
            num_directors,
            actor_films5,
            actor_films10,
            actor_rev5,
            actor_rev10,
            director_films5,
            director_films10,
            director_rev5,
            director_rev10)) 

xg.base <- xg.deploy(xgb.base, xg.ctrl, initial.grid)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "xgb.baseline" = xg.base$results)
stack.results
##                       ME      RMSE  Rsquared       MAE        MPE     MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2         -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4         -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5         -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6         -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline  -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1          -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2          -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3          -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4          -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5          -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
##                   TIME
## bag.baseline  18.72908
## bag1          19.96190
## bag2          18.32149
## bag3          19.38999
## bag4          17.60555
## bag5          17.85801
## bag6          20.05631
## rf.baseline  103.99030
## rf1          132.44422
## rf2          123.42354
## rf3          105.98884
## rf4          104.72795
## rf5          113.90101
## xgb.baseline  33.11829

V1

Now we include the full dataset minus the numbers of actors and directors:

xgb.df1 <- x.df |>
  select(-c(num_actors,
            num_directors)) 

xgb1 <- xg.deploy(xgb.df1, xg.ctrl, initial.grid)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "xgb1" = xgb1$results)
stack.results
##                       ME      RMSE  Rsquared       MAE        MPE     MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2         -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4         -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5         -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6         -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline  -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1          -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2          -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3          -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4          -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5          -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1         -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
##                   TIME
## bag.baseline  18.72908
## bag1          19.96190
## bag2          18.32149
## bag3          19.38999
## bag4          17.60555
## bag5          17.85801
## bag6          20.05631
## rf.baseline  103.99030
## rf1          132.44422
## rf2          123.42354
## rf3          105.98884
## rf4          104.72795
## rf5          113.90101
## xgb.baseline  33.11829
## xgb1          41.03013
xgb1$model
## eXtreme Gradient Boosting 
## 
## 1378 samples
##   39 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1240, 1239, 1240, 1240, 1240, 1241, ... 
## Resampling results across tuning parameters:
## 
##   eta   max_depth  colsample_bytree  nrounds  RMSE       Rsquared   MAE      
##   0.05  3          0.6                50      1.6379956  0.6216149  1.4238739
##   0.05  3          0.6               100      1.0043707  0.6344061  0.8013707
##   0.05  3          0.6               150      0.9984778  0.6346001  0.7917885
##   0.05  3          0.8                50      1.6352806  0.6207552  1.4226454
##   0.05  3          0.8               100      1.0070404  0.6327254  0.8028029
##   0.05  3          0.8               150      1.0022758  0.6318723  0.7930077
##   0.05  6          0.6                50      1.6439423  0.6218046  1.4299996
##   0.05  6          0.6               100      1.0143390  0.6269557  0.8076141
##   0.05  6          0.6               150      1.0153622  0.6218429  0.8015324
##   0.05  6          0.8                50      1.6366411  0.6243291  1.4241675
##   0.05  6          0.8               100      1.0087875  0.6304816  0.8037345
##   0.05  6          0.8               150      1.0129008  0.6236401  0.7987113
##   0.10  3          0.6                50      1.0054021  0.6329870  0.8037944
##   0.10  3          0.6               100      1.0037020  0.6303860  0.7947996
##   0.10  3          0.6               150      1.0088341  0.6269252  0.7988991
##   0.10  3          0.8                50      1.0064614  0.6317026  0.8026161
##   0.10  3          0.8               100      1.0049060  0.6297010  0.7958916
##   0.10  3          0.8               150      1.0120986  0.6244223  0.8027101
##   0.10  6          0.6                50      1.0083643  0.6311659  0.8047561
##   0.10  6          0.6               100      1.0185589  0.6204094  0.8067567
##   0.10  6          0.6               150      1.0357809  0.6088416  0.8193894
##   0.10  6          0.8                50      1.0134177  0.6269693  0.8047117
##   0.10  6          0.8               100      1.0312322  0.6112587  0.8129940
##   0.10  6          0.8               150      1.0477260  0.5998182  0.8246396
##   0.20  3          0.6                50      1.0094588  0.6265192  0.8016020
##   0.20  3          0.6               100      1.0270647  0.6144371  0.8153585
##   0.20  3          0.6               150      1.0424950  0.6039809  0.8270089
##   0.20  3          0.8                50      1.0077571  0.6279545  0.7980564
##   0.20  3          0.8               100      1.0232970  0.6169444  0.8066051
##   0.20  3          0.8               150      1.0408695  0.6045921  0.8206761
##   0.20  6          0.6                50      1.0384229  0.6056873  0.8209327
##   0.20  6          0.6               100      1.0739422  0.5813701  0.8497973
##   0.20  6          0.6               150      1.0941007  0.5682342  0.8665790
##   0.20  6          0.8                50      1.0398593  0.6048097  0.8217671
##   0.20  6          0.8               100      1.0725093  0.5852262  0.8460600
##   0.20  6          0.8               150      1.0925463  0.5730316  0.8644093
## 
## Tuning parameter 'gamma' was held constant at a value of 0
## Tuning
##  parameter 'min_child_weight' was held constant at a value of 10
## 
## Tuning parameter 'subsample' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were nrounds = 150, max_depth = 3, eta
##  = 0.05, gamma = 0, colsample_bytree = 0.6, min_child_weight = 10 and
##  subsample = 1.

V2

What effect, if any, does scaling the predictors and introducing a lower learning rate to the grid search have on our model?

# update grid search
updated.grid <- expand.grid(
                       nrounds = c(100, 150),
                       max_depth = c(3, 6),
                       colsample_bytree = c(0.3, 0.6),
                       eta = c(0.01, 0.05, 0.1),
                       gamma= 0,
                       min_child_weight = 10,
                       subsample = 1
                       )
# store preprocessing method (note: this shadows base::scale, which is harmless here)
scale <- "scale"

xgb.df2 <- xgb.df1

xgb2 <- xg.deploy(xgb.df2, xg.ctrl, updated.grid, scale)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "xgb2" = xgb2$results)
stack.results
##                       ME      RMSE  Rsquared       MAE        MPE     MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2         -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4         -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5         -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6         -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline  -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1          -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2          -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3          -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4          -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5          -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1         -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2         -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
##                   TIME
## bag.baseline  18.72908
## bag1          19.96190
## bag2          18.32149
## bag3          19.38999
## bag4          17.60555
## bag5          17.85801
## bag6          20.05631
## rf.baseline  103.99030
## rf1          132.44422
## rf2          123.42354
## rf3          105.98884
## rf4          104.72795
## rf5          113.90101
## xgb.baseline  33.11829
## xgb1          41.03013
## xgb2          32.23954

Scaling leaves the test metrics unchanged (xgb2 reproduces xgb1’s results exactly), which is expected: tree splits are invariant to monotone rescaling of the predictors. Training time, however, does improve.
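
We can confirm the scaling step was attached to the final fit by inspecting the preProcess slot that caret stores on the train object:

xgb2$model$preProcess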

varImp(xgb2$model)
## xgbTree variable importance
## 
##   only 20 most important variables shown (out of 39)
## 
##                         Overall
## budget                 100.0000
## bud_availYes            31.7508
## director_rev5            8.7267
## director_rev10           8.2283
## genreHorror              3.2017
## runtime                  2.2959
## actor_rev10              2.2645
## `countryUnited States`   1.9715
## actor_rev5               1.9705
## ratedR                   1.9617
## genreAnimation           1.4434
## actor_films5             1.1664
## actor_films10            0.6857
## monthDecember            0.5387
## genreCrime               0.4462
## `ratedPG-13`             0.4287
## ratedG                   0.4092
## monthOctober             0.3991
## director_films10         0.3447
## monthJanuary             0.2837

V3

Checking the model against a previously tuned parameter grid:

tuned.grid <- expand.grid(
                       nrounds = 100,
                       max_depth = 6,
                       colsample_bytree = 0.6,
                       eta = 0.1,
                       gamma= 0,
                       min_child_weight = 10,
                       subsample = 1
                       )

xgb3 <- xg.deploy(xgb.df2, xg.ctrl, tuned.grid, scale)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "xgb3" = xgb3$results)
stack.results
##                       ME      RMSE  Rsquared       MAE        MPE     MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2         -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4         -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5         -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6         -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline  -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1          -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2          -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3          -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4          -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5          -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1         -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2         -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3         -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
##                   TIME
## bag.baseline  18.72908
## bag1          19.96190
## bag2          18.32149
## bag3          19.38999
## bag4          17.60555
## bag5          17.85801
## bag6          20.05631
## rf.baseline  103.99030
## rf1          132.44422
## rf2          123.42354
## rf3          105.98884
## rf4          104.72795
## rf5          113.90101
## xgb.baseline  33.11829
## xgb1          41.03013
## xgb2          32.23954
## xgb3          16.24166

This model, tuned to a single parameter combination, trains in half the time and improves MAE and MAPE, though its RMSE is marginally worse.

V4

Using only 10-year data:

xgb.df4 <- xgb.df2 |>
  select(-c(
    actor_films5,
    actor_rev5,
    director_films5,
    director_rev5
  ))

xgb4 <- xg.deploy(xgb.df4, xg.ctrl, tuned.grid, scale)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "xgb4" = xgb4$results)
stack.results
##                       ME      RMSE  Rsquared       MAE        MPE     MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2         -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4         -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5         -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6         -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline  -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1          -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2          -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3          -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4          -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5          -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1         -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2         -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3         -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## xgb4         -0.07641102 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493
##                   TIME
## bag.baseline  18.72908
## bag1          19.96190
## bag2          18.32149
## bag3          19.38999
## bag4          17.60555
## bag5          17.85801
## bag6          20.05631
## rf.baseline  103.99030
## rf1          132.44422
## rf2          123.42354
## rf3          105.98884
## rf4          104.72795
## rf5          113.90101
## xgb.baseline  33.11829
## xgb1          41.03013
## xgb2          32.23954
## xgb3          16.24166
## xgb4          16.91872

With only the 10-year data, MAE and MAPE worsen slightly, although RMSE and R-squared improve.

V5

We convert the 10-year variables to cover only years 5-10 to see whether the model improves further. We also exclude the zero-variance dummy variables languageNone and countryNone.

xgb.df5 <- xgb.df2 |>
  mutate(actor_films10 = actor_films10 - actor_films5,
         actor_rev10 = actor_rev10 - actor_rev5,
         director_films10 = director_films10 - director_films5,
         director_rev10 = director_rev10 - director_rev5) |>
  select(-c(languageNone,
            countryNone))

xgb5 <- xg.deploy(xgb.df5, xg.ctrl, tuned.grid, scale)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "xgb5" = xgb5$results)
stack.results
##                       ME      RMSE  Rsquared       MAE        MPE     MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2         -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4         -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5         -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6         -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline  -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1          -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2          -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3          -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4          -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5          -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1         -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2         -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3         -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## xgb4         -0.07641102 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493
## xgb5         -0.03487635 1.0313299 0.6385238 0.8361075 -0.6351771 5.183738
##                   TIME
## bag.baseline  18.72908
## bag1          19.96190
## bag2          18.32149
## bag3          19.38999
## bag4          17.60555
## bag5          17.85801
## bag6          20.05631
## rf.baseline  103.99030
## rf1          132.44422
## rf2          123.42354
## rf3          105.98884
## rf4          104.72795
## rf5          113.90101
## xgb.baseline  33.11829
## xgb1          41.03013
## xgb2          32.23954
## xgb3          16.24166
## xgb4          16.91872
## xgb5          17.00468

V6

Considering only the 5-year data:

xgb.df6 <- xgb.df2 |>
  select(-c(
    actor_films10,
    actor_rev10,
    director_films10,
    director_rev10,
    languageNone,
    countryNone
  ))

xgb6 <- xg.deploy(xgb.df6, xg.ctrl, tuned.grid, scale)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "xgb6" = xgb6$results)
stack.results
##                       ME      RMSE  Rsquared       MAE        MPE     MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2         -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4         -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5         -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6         -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline  -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1          -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2          -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3          -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4          -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5          -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1         -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2         -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3         -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## xgb4         -0.07641102 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493
## xgb5         -0.03487635 1.0313299 0.6385238 0.8361075 -0.6351771 5.183738
## xgb6         -0.05525757 1.0191473 0.6477012 0.8211301 -0.7502124 5.100705
##                   TIME
## bag.baseline  18.72908
## bag1          19.96190
## bag2          18.32149
## bag3          19.38999
## bag4          17.60555
## bag5          17.85801
## bag6          20.05631
## rf.baseline  103.99030
## rf1          132.44422
## rf2          123.42354
## rf3          105.98884
## rf4          104.72795
## rf5          113.90101
## xgb.baseline  33.11829
## xgb1          41.03013
## xgb2          32.23954
## xgb3          16.24166
## xgb4          16.91872
## xgb5          17.00468
## xgb6          16.63881

V7

Removing the zero-variance dummy variables from our best model, “V3”:

xgb.df7 <- xgb.df2 |>
  select(-c(
    languageNone,
    countryNone
  ))

xgb7 <- xg.deploy(xgb.df7, xg.ctrl, tuned.grid, scale)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "xgb7" = xgb7$results)
stack.results
##                       ME      RMSE  Rsquared       MAE        MPE     MAPE
## bag.baseline -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2         -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3         -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4         -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5         -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6         -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline  -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1          -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2          -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3          -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4          -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5          -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1         -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2         -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3         -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## xgb4         -0.07641102 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493
## xgb5         -0.03487635 1.0313299 0.6385238 0.8361075 -0.6351771 5.183738
## xgb6         -0.05525757 1.0191473 0.6477012 0.8211301 -0.7502124 5.100705
## xgb7         -0.05353738 1.0265252 0.6428285 0.8250519 -0.7540614 5.123845
##                   TIME
## bag.baseline  18.72908
## bag1          19.96190
## bag2          18.32149
## bag3          19.38999
## bag4          17.60555
## bag5          17.85801
## bag6          20.05631
## rf.baseline  103.99030
## rf1          132.44422
## rf2          123.42354
## rf3          105.98884
## rf4          104.72795
## rf5          113.90101
## xgb.baseline  33.11829
## xgb1          41.03013
## xgb2          32.23954
## xgb3          16.24166
## xgb4          16.91872
## xgb5          17.00468
## xgb6          16.63881
## xgb7          15.97367

This model actually performs worse. So far, random forest is outperforming XGBoost on this data.

V8

The best XGBoost configuration, with gamma (the minimum loss reduction required to make a further split, XGBoost’s regularization parameter) set to 5:

gamma.grid <- expand.grid(
                       nrounds = 100,
                       max_depth = 6,
                       colsample_bytree = 0.6,
                       eta = 0.1,
                       gamma= 5,
                       min_child_weight = 10,
                       subsample = 1
                       )

xgb8 <- xg.deploy(xgb.df2, xg.ctrl, gamma.grid, scale)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
xgb8$results
##          ME        RMSE    Rsquared         MAE         MPE        MAPE 
## -0.06616451  1.01878086  0.64899210  0.82032832 -0.82920226  5.09587033 
##        TIME 
## 17.04133391
# gamma = 5 gives slightly worse RMSE, MAE, and MAPE than xgb3, so we leave it out of the table
# stack.results <- rbind(stack.results, "xgb8" = xgb8$results)
# stack.results

Cubist

We define the parameters for model training, enabling 10-fold cross-validation, perform the Box-Cox transformation, impute zeros for the missing budgets, and set the grid search parameters. Cubist models handle categorical variables natively, so we do not need to create dummies.

# define parameters
cube.ctrl <- trainControl(
  method = "cv",
  number = 10,
  allowParallel = TRUE
)

# perform Box-Cox
c.df <- box.response(c.df)

# impute zeros for NA
c.df <- c.df |>
  mutate(budget = ifelse(is.na(budget),
                         0,
                         budget))

# set grid search params
cube.grid1 <- expand.grid(
  .committees = c(seq(1, 10), seq(20, 100, by = 10)),
  .neighbors = c(0, 1, 5, 9))

Baseline

Baseline model without star power:

cube.base.df <- c.df |>  # keep the data and the fitted model under separate names
  select(-c(num_actors,
            num_directors,
            actor_films5,
            actor_films10,
            actor_rev5,
            actor_rev10,
            director_films5,
            director_films10,
            director_rev5,
            director_rev10)) 

cube.base <- cube.deploy(cube.base.df, cube.ctrl, cube.grid1, scale)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "cube.baseline" = cube.base$results)
stack.results
##                        ME      RMSE  Rsquared       MAE        MPE     MAPE
## bag.baseline  -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1          -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2          -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3          -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4          -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5          -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6          -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline   -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1           -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2           -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3           -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4           -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5           -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline  -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1          -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2          -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3          -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## xgb4          -0.07641102 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493
## xgb5          -0.03487635 1.0313299 0.6385238 0.8361075 -0.6351771 5.183738
## xgb6          -0.05525757 1.0191473 0.6477012 0.8211301 -0.7502124 5.100705
## xgb7          -0.05353738 1.0265252 0.6428285 0.8250519 -0.7540614 5.123845
## cube.baseline -0.17794846 1.0650029 0.6256989 0.8365519 -1.5383026 5.254428
##                    TIME
## bag.baseline   18.72908
## bag1           19.96190
## bag2           18.32149
## bag3           19.38999
## bag4           17.60555
## bag5           17.85801
## bag6           20.05631
## rf.baseline   103.99030
## rf1           132.44422
## rf2           123.42354
## rf3           105.98884
## rf4           104.72795
## rf5           113.90101
## xgb.baseline   33.11829
## xgb1           41.03013
## xgb2           32.23954
## xgb3           16.24166
## xgb4           16.91872
## xgb5           17.00468
## xgb6           16.63881
## xgb7           15.97367
## cube.baseline 144.29722

V1

Full model including Star Power, minus the number of actors and directors,

cube.df1 <- c.df |>
  select(-c(num_actors,
            num_directors
            )) 

cube1 <- cube.deploy(cube.df1, cube.ctrl, cube.grid1, scale)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "cube1" = cube1$results)
stack.results
##                        ME      RMSE  Rsquared       MAE        MPE     MAPE
## bag.baseline  -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1          -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2          -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3          -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4          -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5          -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6          -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline   -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1           -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2           -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3           -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4           -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5           -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline  -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1          -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2          -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3          -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## xgb4          -0.07641102 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493
## xgb5          -0.03487635 1.0313299 0.6385238 0.8361075 -0.6351771 5.183738
## xgb6          -0.05525757 1.0191473 0.6477012 0.8211301 -0.7502124 5.100705
## xgb7          -0.05353738 1.0265252 0.6428285 0.8250519 -0.7540614 5.123845
## cube.baseline -0.17794846 1.0650029 0.6256989 0.8365519 -1.5383026 5.254428
## cube1         -0.13139698 1.0324528 0.6445176 0.8220364 -1.2410078 5.137437
##                    TIME
## bag.baseline   18.72908
## bag1           19.96190
## bag2           18.32149
## bag3           19.38999
## bag4           17.60555
## bag5           17.85801
## bag6           20.05631
## rf.baseline   103.99030
## rf1           132.44422
## rf2           123.42354
## rf3           105.98884
## rf4           104.72795
## rf5           113.90101
## xgb.baseline   33.11829
## xgb1           41.03013
## xgb2           32.23954
## xgb3           16.24166
## xgb4           16.91872
## xgb5           17.00468
## xgb6           16.63881
## xgb7           15.97367
## cube.baseline 144.29722
## cube1         198.45836
cube.tune <- cube1$model
cube.tune$bestTune
##    committees neighbors
## 57         60         0

Running the model with the tuned grid to gauge performance,

cube.tuned <- expand.grid(
  .committees=60, 
  .neighbors=0)
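
# Equivalent sketch (ours, not part of the original pipeline): build the refit
# grid straight from bestTune to avoid transcribing the tuned values by hand
# cube.tuned <- expand.grid(.committees = cube.tune$bestTune$committees,
#                           .neighbors  = cube.tune$bestTune$neighbors)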

cube1.scaled <- cube.deploy(cube.df1, cube.ctrl, cube.tuned, scale)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
cube1.unscaled <- cube.deploy(cube.df1, cube.ctrl, cube.tuned)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
rbind(cube1.scaled$results, cube1.unscaled$results)
##              ME     RMSE  Rsquared       MAE       MPE     MAPE     TIME
## [1,] -0.1313970 1.032453 0.6445176 0.8220364 -1.241008 5.137437 31.85722
## [2,] -0.1290602 1.035148 0.6419848 0.8250736 -1.223889 5.154409 31.10698

The scaled model edges out the unscaled one (RMSE 1.0325 vs. 1.0351), but both still underperform XGBoost and Random Forest.

V2

Full model with the 10-year Star Power measures only,

cube.df2 <- c.df |>
  select(-c(num_actors,
            num_directors,
            actor_films5,
            actor_rev5,
            director_films5,
            director_rev5
            )) 

cube2 <- cube.deploy(cube.df2, cube.ctrl, cube.tuned, scale)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "cube2" = cube2$results)
stack.results
##                        ME      RMSE  Rsquared       MAE        MPE     MAPE
## bag.baseline  -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1          -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2          -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3          -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4          -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5          -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6          -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline   -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1           -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2           -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3           -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4           -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5           -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline  -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1          -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2          -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3          -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## xgb4          -0.07641102 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493
## xgb5          -0.03487635 1.0313299 0.6385238 0.8361075 -0.6351771 5.183738
## xgb6          -0.05525757 1.0191473 0.6477012 0.8211301 -0.7502124 5.100705
## xgb7          -0.05353738 1.0265252 0.6428285 0.8250519 -0.7540614 5.123845
## cube.baseline -0.17794846 1.0650029 0.6256989 0.8365519 -1.5383026 5.254428
## cube1         -0.13139698 1.0324528 0.6445176 0.8220364 -1.2410078 5.137437
## cube2         -0.16125278 1.0361215 0.6448753 0.8198624 -1.4205780 5.142243
##                    TIME
## bag.baseline   18.72908
## bag1           19.96190
## bag2           18.32149
## bag3           19.38999
## bag4           17.60555
## bag5           17.85801
## bag6           20.05631
## rf.baseline   103.99030
## rf1           132.44422
## rf2           123.42354
## rf3           105.98884
## rf4           104.72795
## rf5           113.90101
## xgb.baseline   33.11829
## xgb1           41.03013
## xgb2           32.23954
## xgb3           16.24166
## xgb4           16.91872
## xgb5           17.00468
## xgb6           16.63881
## xgb7           15.97367
## cube.baseline 144.29722
## cube1         198.45836
## cube2          28.57859

Slightly worse than the full model, but far faster (28.6 s vs. 198.5 s) now that the grid is fixed at the tuned parameters.

V3

Separating out years 5-10 by differencing, so each 10-year measure becomes the increment beyond its 5-year counterpart,

cube.df3 <- c.df |>
  mutate(actor_films10 = actor_films10 - actor_films5,
         actor_rev10 = actor_rev10 - actor_rev5,
         director_films10 = director_films10 - director_films5,
         director_rev10 = director_rev10 - director_rev5) |>
  select(-c(num_actors,
            num_directors))
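
# Added sanity check (assumes the 10-year windows subsume the 5-year windows,
# so the differenced film counts can never be negative)
stopifnot(all(cube.df3$actor_films10 >= 0),
          all(cube.df3$director_films10 >= 0))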

cube3 <- cube.deploy(cube.df3, cube.ctrl, cube.tuned, scale)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "cube3" = cube3$results)
stack.results
##                        ME      RMSE  Rsquared       MAE        MPE     MAPE
## bag.baseline  -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1          -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2          -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3          -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4          -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5          -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6          -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline   -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1           -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2           -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3           -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4           -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5           -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline  -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1          -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2          -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3          -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## xgb4          -0.07641102 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493
## xgb5          -0.03487635 1.0313299 0.6385238 0.8361075 -0.6351771 5.183738
## xgb6          -0.05525757 1.0191473 0.6477012 0.8211301 -0.7502124 5.100705
## xgb7          -0.05353738 1.0265252 0.6428285 0.8250519 -0.7540614 5.123845
## cube.baseline -0.17794846 1.0650029 0.6256989 0.8365519 -1.5383026 5.254428
## cube1         -0.13139698 1.0324528 0.6445176 0.8220364 -1.2410078 5.137437
## cube2         -0.16125278 1.0361215 0.6448753 0.8198624 -1.4205780 5.142243
## cube3         -0.13158143 1.0355657 0.6419864 0.8265906 -1.2402341 5.162284
##                    TIME
## bag.baseline   18.72908
## bag1           19.96190
## bag2           18.32149
## bag3           19.38999
## bag4           17.60555
## bag5           17.85801
## bag6           20.05631
## rf.baseline   103.99030
## rf1           132.44422
## rf2           123.42354
## rf3           105.98884
## rf4           104.72795
## rf5           113.90101
## xgb.baseline   33.11829
## xgb1           41.03013
## xgb2           32.23954
## xgb3           16.24166
## xgb4           16.91872
## xgb5           17.00468
## xgb6           16.63881
## xgb7           15.97367
## cube.baseline 144.29722
## cube1         198.45836
## cube2          28.57859
## cube3          31.94294
varImp(cube3$model)
## cubist variable importance
## 
##   only 20 most important variables shown (out of 39)
## 
##                  Overall
## budget           100.000
## actor_films5      48.765
## ratedR            48.148
## ratedPG-13        45.062
## bud_availYes      38.889
## genreAnimation    37.037
## ratedPG           36.420
## States            35.802
## director_films5   34.568
## runtime           33.333
## genreHorror       30.864
## monthOctober      20.370
## genreComedy       17.901
## actor_rev10       17.284
## director_films10  16.667
## monthJuly         16.049
## genreDrama         9.259
## genreBiography     8.025
## monthJune          4.938
## genreCrime         4.938

V4

Considering only the 5-year data,

cube.df4 <- c.df |>
  select(-c(num_actors,
            num_directors,
            actor_films10,
            actor_rev10,
            director_films10,
            director_rev10
            )) 

cube4 <- cube.deploy(cube.df4, cube.ctrl, cube.tuned, scale)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "cube4" = cube4$results)
stack.results
##                        ME      RMSE  Rsquared       MAE        MPE     MAPE
## bag.baseline  -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1          -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2          -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3          -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4          -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5          -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6          -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline   -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1           -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2           -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3           -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4           -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5           -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline  -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1          -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2          -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3          -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## xgb4          -0.07641102 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493
## xgb5          -0.03487635 1.0313299 0.6385238 0.8361075 -0.6351771 5.183738
## xgb6          -0.05525757 1.0191473 0.6477012 0.8211301 -0.7502124 5.100705
## xgb7          -0.05353738 1.0265252 0.6428285 0.8250519 -0.7540614 5.123845
## cube.baseline -0.17794846 1.0650029 0.6256989 0.8365519 -1.5383026 5.254428
## cube1         -0.13139698 1.0324528 0.6445176 0.8220364 -1.2410078 5.137437
## cube2         -0.16125278 1.0361215 0.6448753 0.8198624 -1.4205780 5.142243
## cube3         -0.13158143 1.0355657 0.6419864 0.8265906 -1.2402341 5.162284
## cube4         -0.14600407 1.0336423 0.6451466 0.8212243 -1.3293151 5.137742
##                    TIME
## bag.baseline   18.72908
## bag1           19.96190
## bag2           18.32149
## bag3           19.38999
## bag4           17.60555
## bag5           17.85801
## bag6           20.05631
## rf.baseline   103.99030
## rf1           132.44422
## rf2           123.42354
## rf3           105.98884
## rf4           104.72795
## rf5           113.90101
## xgb.baseline   33.11829
## xgb1           41.03013
## xgb2           32.23954
## xgb3           16.24166
## xgb4           16.91872
## xgb5           17.00468
## xgb6           16.63881
## xgb7           15.97367
## cube.baseline 144.29722
## cube1         198.45836
## cube2          28.57859
## cube3          31.94294
## cube4          29.58219
varImp(cube4$model)
## cubist variable importance
## 
##   only 20 most important variables shown (out of 35)
## 
##                 Overall
## budget          100.000
## director_films5  51.412
## actor_films5     38.418
## ratedR           37.288
## genreHorror      35.028
## genreAnimation   32.768
## States           32.203
## ratedPG-13       31.073
## runtime          30.508
## bud_availYes     29.944
## monthOctober     25.424
## ratedPG          23.729
## genreDrama       21.469
## genreBiography   19.209
## genreCrime       16.384
## genreComedy      14.689
## monthJuly        13.559
## director_rev5     9.040
## monthJune         8.475
## monthSeptember    5.650

V5

Including only the number of films in the past 5 years,

cube.df5 <- c.df |>
  select(-c(num_actors,
            num_directors,
            actor_films10,
            actor_rev5,
            actor_rev10,
            director_films10,
            director_rev5,
            director_rev10
            )) 

cube5 <- cube.deploy(cube.df5, cube.ctrl, cube.tuned, scale)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "cube5" = cube5$results)
stack.results
##                        ME      RMSE  Rsquared       MAE        MPE     MAPE
## bag.baseline  -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1          -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2          -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3          -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4          -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5          -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6          -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline   -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1           -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2           -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3           -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4           -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5           -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline  -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1          -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2          -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3          -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## xgb4          -0.07641102 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493
## xgb5          -0.03487635 1.0313299 0.6385238 0.8361075 -0.6351771 5.183738
## xgb6          -0.05525757 1.0191473 0.6477012 0.8211301 -0.7502124 5.100705
## xgb7          -0.05353738 1.0265252 0.6428285 0.8250519 -0.7540614 5.123845
## cube.baseline -0.17794846 1.0650029 0.6256989 0.8365519 -1.5383026 5.254428
## cube1         -0.13139698 1.0324528 0.6445176 0.8220364 -1.2410078 5.137437
## cube2         -0.16125278 1.0361215 0.6448753 0.8198624 -1.4205780 5.142243
## cube3         -0.13158143 1.0355657 0.6419864 0.8265906 -1.2402341 5.162284
## cube4         -0.14600407 1.0336423 0.6451466 0.8212243 -1.3293151 5.137742
## cube5         -0.13784628 1.0388059 0.6399798 0.8260460 -1.2760452 5.162864
##                    TIME
## bag.baseline   18.72908
## bag1           19.96190
## bag2           18.32149
## bag3           19.38999
## bag4           17.60555
## bag5           17.85801
## bag6           20.05631
## rf.baseline   103.99030
## rf1           132.44422
## rf2           123.42354
## rf3           105.98884
## rf4           104.72795
## rf5           113.90101
## xgb.baseline   33.11829
## xgb1           41.03013
## xgb2           32.23954
## xgb3           16.24166
## xgb4           16.91872
## xgb5           17.00468
## xgb6           16.63881
## xgb7           15.97367
## cube.baseline 144.29722
## cube1         198.45836
## cube2          28.57859
## cube3          31.94294
## cube4          29.58219
## cube5          28.81885

V6

Removing the bud_avail variable, along with actor_rev5, director_films5, and director_rev5,

cube.df6 <- c.df |>
  select(-c(num_actors,
            num_directors,
            actor_rev5,
            director_films5,
            director_rev5,
            bud_avail
            )) 

cube6 <- cube.deploy(cube.df6, cube.ctrl, cube.tuned, scale)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "cube6" = cube6$results)
stack.results
##                        ME      RMSE  Rsquared       MAE        MPE     MAPE
## bag.baseline  -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1          -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2          -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3          -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4          -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5          -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6          -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline   -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1           -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2           -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3           -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4           -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5           -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline  -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1          -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2          -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3          -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## xgb4          -0.07641102 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493
## xgb5          -0.03487635 1.0313299 0.6385238 0.8361075 -0.6351771 5.183738
## xgb6          -0.05525757 1.0191473 0.6477012 0.8211301 -0.7502124 5.100705
## xgb7          -0.05353738 1.0265252 0.6428285 0.8250519 -0.7540614 5.123845
## cube.baseline -0.17794846 1.0650029 0.6256989 0.8365519 -1.5383026 5.254428
## cube1         -0.13139698 1.0324528 0.6445176 0.8220364 -1.2410078 5.137437
## cube2         -0.16125278 1.0361215 0.6448753 0.8198624 -1.4205780 5.142243
## cube3         -0.13158143 1.0355657 0.6419864 0.8265906 -1.2402341 5.162284
## cube4         -0.14600407 1.0336423 0.6451466 0.8212243 -1.3293151 5.137742
## cube5         -0.13784628 1.0388059 0.6399798 0.8260460 -1.2760452 5.162864
## cube6         -0.15464835 1.0369273 0.6451362 0.8205757 -1.3968214 5.147856
##                    TIME
## bag.baseline   18.72908
## bag1           19.96190
## bag2           18.32149
## bag3           19.38999
## bag4           17.60555
## bag5           17.85801
## bag6           20.05631
## rf.baseline   103.99030
## rf1           132.44422
## rf2           123.42354
## rf3           105.98884
## rf4           104.72795
## rf5           113.90101
## xgb.baseline   33.11829
## xgb1           41.03013
## xgb2           32.23954
## xgb3           16.24166
## xgb4           16.91872
## xgb5           17.00468
## xgb6           16.63881
## xgb7           15.97367
## cube.baseline 144.29722
## cube1         198.45836
## cube2          28.57859
## cube3          31.94294
## cube4          29.58219
## cube5          28.81885
## cube6          28.57339

V7

Full model without bud_avail,

cube.df7 <- c.df |>
  select(-c(num_actors,
            num_directors,
            bud_avail
            )) 

cube7 <- cube.deploy(cube.df7, cube.ctrl, cube.tuned, scale)
## Training set contains 1378 rows. 
## Testing set contains 344 rows.
stack.results <- rbind(stack.results, "cube7" = cube7$results)
stack.results
##                        ME      RMSE  Rsquared       MAE        MPE     MAPE
## bag.baseline  -0.09358480 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965
## bag1          -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag2          -0.09093805 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538
## bag3          -0.09099274 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042
## bag4          -0.08998240 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212
## bag5          -0.08916172 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622
## bag6          -0.09008839 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995
## rf.baseline   -0.05938704 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798
## rf1           -0.06758472 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810
## rf2           -0.06366472 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403
## rf3           -0.05755666 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547
## rf4           -0.07985569 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883
## rf5           -0.04872120 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893
## xgb.baseline  -0.07860914 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769
## xgb1          -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb2          -0.06265449 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196
## xgb3          -0.06005700 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187
## xgb4          -0.07641102 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493
## xgb5          -0.03487635 1.0313299 0.6385238 0.8361075 -0.6351771 5.183738
## xgb6          -0.05525757 1.0191473 0.6477012 0.8211301 -0.7502124 5.100705
## xgb7          -0.05353738 1.0265252 0.6428285 0.8250519 -0.7540614 5.123845
## cube.baseline -0.17794846 1.0650029 0.6256989 0.8365519 -1.5383026 5.254428
## cube1         -0.13139698 1.0324528 0.6445176 0.8220364 -1.2410078 5.137437
## cube2         -0.16125278 1.0361215 0.6448753 0.8198624 -1.4205780 5.142243
## cube3         -0.13158143 1.0355657 0.6419864 0.8265906 -1.2402341 5.162284
## cube4         -0.14600407 1.0336423 0.6451466 0.8212243 -1.3293151 5.137742
## cube5         -0.13784628 1.0388059 0.6399798 0.8260460 -1.2760452 5.162864
## cube6         -0.15464835 1.0369273 0.6451362 0.8205757 -1.3968214 5.147856
## cube7         -0.14615186 1.0369947 0.6443757 0.8210387 -1.3477388 5.144373
##                    TIME
## bag.baseline   18.72908
## bag1           19.96190
## bag2           18.32149
## bag3           19.38999
## bag4           17.60555
## bag5           17.85801
## bag6           20.05631
## rf.baseline   103.99030
## rf1           132.44422
## rf2           123.42354
## rf3           105.98884
## rf4           104.72795
## rf5           113.90101
## xgb.baseline   33.11829
## xgb1           41.03013
## xgb2           32.23954
## xgb3           16.24166
## xgb4           16.91872
## xgb5           17.00468
## xgb6           16.63881
## xgb7           15.97367
## cube.baseline 144.29722
## cube1         198.45836
## cube2          28.57859
## cube3          31.94294
## cube4          29.58219
## cube5          28.81885
## cube6          28.57339
## cube7          30.65470

Results

stack.results |> kable() |> kable_styling()
                      ME      RMSE  Rsquared       MAE        MPE     MAPE       TIME
bag.baseline  -0.0935848 1.0843706 0.6040531 0.8695907 -1.0574089 5.428965   18.72908
bag1          -0.0909927 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042   19.96190
bag2          -0.0909381 1.0657372 0.6172368 0.8764869 -1.0168330 5.439538   18.32149
bag3          -0.0909927 1.0626108 0.6193303 0.8592402 -1.0157512 5.353042   19.38999
bag4          -0.0899824 1.0648496 0.6175457 0.8607279 -1.0102230 5.363212   17.60555
bag5          -0.0891617 1.0574986 0.6237698 0.8555682 -1.0107429 5.328622   17.85801
bag6          -0.0900884 1.0655591 0.6170242 0.8612496 -1.0112991 5.366995   20.05631
rf.baseline   -0.0593870 1.0586792 0.6200670 0.8361933 -0.7858701 5.200798  103.99030
rf1           -0.0675847 0.9972875 0.6655080 0.7960606 -0.8410135 4.948810  132.44422
rf2           -0.0636647 1.0033461 0.6610371 0.7992679 -0.8212055 4.971403  123.42354
rf3           -0.0575567 1.0071221 0.6571241 0.8033652 -0.7766925 4.998547  105.98884
rf4           -0.0798557 0.9985825 0.6646830 0.7919069 -0.9103610 4.923883  104.72795
rf5           -0.0487212 0.9975813 0.6644236 0.7960901 -0.7273081 4.946893  113.90101
xgb.baseline  -0.0786091 1.0508742 0.6271185 0.8423056 -0.9343947 5.249769   33.11829
xgb1          -0.0626545 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196   41.03013
xgb2          -0.0626545 1.0168226 0.6508696 0.8221444 -0.8164864 5.113196   32.23954
xgb3          -0.0600570 1.0179837 0.6487246 0.8185924 -0.7790652 5.085187   16.24166
xgb4          -0.0764110 1.0149441 0.6515209 0.8255738 -0.8732818 5.121493   16.91872
xgb5          -0.0348763 1.0313299 0.6385238 0.8361075 -0.6351771 5.183738   17.00468
xgb6          -0.0552576 1.0191473 0.6477012 0.8211301 -0.7502124 5.100705   16.63881
xgb7          -0.0535374 1.0265252 0.6428285 0.8250519 -0.7540614 5.123845   15.97367
cube.baseline -0.1779485 1.0650029 0.6256989 0.8365519 -1.5383026 5.254428  144.29722
cube1         -0.1313970 1.0324528 0.6445176 0.8220364 -1.2410078 5.137437  198.45836
cube2         -0.1612528 1.0361215 0.6448753 0.8198624 -1.4205780 5.142243   28.57859
cube3         -0.1315814 1.0355657 0.6419864 0.8265906 -1.2402341 5.162284   31.94294
cube4         -0.1460041 1.0336423 0.6451466 0.8212243 -1.3293151 5.137742   29.58219
cube5         -0.1378463 1.0388059 0.6399798 0.8260460 -1.2760452 5.162864   28.81885
cube6         -0.1546484 1.0369273 0.6451362 0.8205757 -1.3968214 5.147856   28.57339
cube7         -0.1461519 1.0369947 0.6443757 0.8210387 -1.3477388 5.144373   30.65470
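
To surface the strongest candidates at a glance, the stacked matrix can also be ordered by test-set RMSE (a convenience we add here, not part of the modeling pipeline):

# Five lowest-RMSE models across all four approaches
stack.results[order(stack.results[, "RMSE"]), ] |>
  head(5) |>
  kable() |>
  kable_styling()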

Importance Measures

Bagged Models

Full Model (V1)

b1.import <- varImp(bag1$model)

tableImp(b1.import, 10)
                   Overall
budget          100.000000
director_rev5    55.293303
director_rev10   52.923247
bud_availYes     40.369767
actor_rev5       23.843383
actor_rev10      15.965978
director_films5  11.463206
runtime          11.151040
ratedPG-13        7.541583
genreHorror       6.703949
plotImp(b1.import, 10)

Random Forest

Full Model (V1)

rf1.import <- varImp(rf1$model)

tableImp(rf1.import, 10)
                    Overall
budget           100.000000
bud_availYes      24.226549
director_rev10    14.872920
director_rev5     12.121446
runtime           10.781919
actor_rev5         9.714001
actor_rev10        9.310301
actor_films10      5.324959
actor_films5       4.569498
director_films10   2.602546
plotImp(rf1.import, 10)

Tuning Parameter:

rf1$model
## Random Forest 
## 
## 1378 samples
##   18 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1240, 1239, 1240, 1240, 1240, 1241, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE      
##    2    1.089755  0.6019913  0.8554087
##   21    1.011198  0.6250762  0.8035619
##   40    1.017215  0.6203623  0.8064708
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 21.

SP-5 Model

Using only Star Power 5

rf2.import <- varImp(rf2$model)

tableImp(rf2.import, 10)
                    Overall
budget           100.000000
bud_availYes      24.583393
director_rev10    14.534784
director_rev5     12.811826
runtime           11.007748
actor_rev5        10.115440
actor_rev10        9.865975
actor_films10      5.168749
actor_films5       4.399251
director_films10   2.694230

Best Model

rf4.import <- varImp(rf4$model)
tableImp(rf4.import, 10)
                        Overall
budget               100.000000
bud_availYes          25.623682
director_rev10        20.453446
actor_rev10           14.125389
runtime               13.432925
actor_films10          8.610988
director_films10       5.036964
ratedR                 2.496025
genreHorror            2.403049
countryUnited States   2.153712
plotImp(rf4.import, 10, title="Random Forest: SP-10")

XGBoost

Full Model

xgb1.import <- varImp(xgb1$model)

tableImp(xgb1.import, 10)
                        Overall
budget               100.000000
bud_availYes          31.750788
director_rev5          8.726736
director_rev10         8.228347
genreHorror            3.201705
runtime                2.295859
actor_rev10            2.264511
countryUnited States   1.971501
actor_rev5             1.970460
ratedR                 1.961712
plotImp(xgb1.import, 10)

Cubist

Full Model

cube1.import <- varImp(cube1$model)

tableImp(cube1.import, 10)
                  Overall
budget          100.00000
ratedR           53.33333
director_films5  46.00000
actor_films5     43.33333
ratedPG-13       41.33333
bud_availYes     40.66667
States           38.00000
runtime          35.33333
genreHorror      34.66667
ratedPG          34.66667
plotImp(cube1.import, 10)

Side-by-Side

Full Model

f.v1.b <- plotImp(b1.import, 10, title="Bagged Trees")
f.v1.r <- plotImp(rf1.import, 10, title="Random Forest")
f.v1.x <- plotImp(xgb1.import, 10, title="XGBoost")
f.v1.c <- plotImp(cube1.import, 10, title="Cubist")

ggarrange(f.v1.b,
          f.v1.r,
          f.v1.x,
          f.v1.c,
          ncol=2, top="Variable Importance: Full Model")
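
The four rankings can also be compared numerically by joining the importance tables on variable name (a sketch for illustration; it assumes each varImp object stores its scores in $importance with an Overall column, as caret's does):

# Hypothetical helper: flatten a caret varImp object to a two-column data frame
imp_tbl <- function(imp, label) {
  out <- tibble::rownames_to_column(imp$importance, "variable")
  names(out)[names(out) == "Overall"] <- label
  out
}

# Join the four models' scores and sort by the random forest ranking
imp_tbl(b1.import, "bagged") |>
  full_join(imp_tbl(rf1.import, "rf"), by = "variable") |>
  full_join(imp_tbl(xgb1.import, "xgb"), by = "variable") |>
  full_join(imp_tbl(cube1.import, "cubist"), by = "variable") |>
  arrange(desc(rf))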