Goal: Split our data and build a model. Click [here for the data](https://github.com/rfordatascience/tidytuesday/tree/master/data/2022/2022-11-01).

Import Data

# Download the TidyTuesday (2022-11-01) horror movies data set.
horror_movies <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-01/horror_movies.csv"
)

# Print a per-column summary of the imported data.
skimr::skim(horror_movies)
Data summary
Name horror_movies
Number of rows 32540
Number of columns 20
_______________________
Column type frequency:
character 10
Date 1
logical 1
numeric 8
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
original_title 0 1.00 1 191 0 30296 0
title 0 1.00 1 191 0 29563 0
original_language 0 1.00 2 2 0 97 0
overview 1286 0.96 1 1000 0 31020 0
tagline 19835 0.39 1 237 0 12513 0
poster_path 4474 0.86 30 32 0 28048 0
status 0 1.00 7 15 0 4 0
backdrop_path 18995 0.42 29 32 0 13536 0
genre_names 0 1.00 6 144 0 772 0
collection_name 30234 0.07 4 56 0 815 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
release_date 0 1 1950-01-01 2022-12-31 2012-12-09 10999

Variable type: logical

skim_variable n_missing complete_rate mean count
adult 0 1 0 FAL: 32540

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 445910.83 305744.67 17 146494.8 426521.00 707534.00 1033095.00 ▇▆▆▅▅
popularity 0 1.00 4.01 37.51 0 0.6 0.84 2.24 5088.58 ▇▁▁▁▁
vote_count 0 1.00 62.69 420.89 0 0.0 2.00 11.00 16900.00 ▇▁▁▁▁
vote_average 0 1.00 3.34 2.88 0 0.0 4.00 5.70 10.00 ▇▂▆▃▁
budget 0 1.00 543126.59 4542667.81 0 0.0 0.00 0.00 200000000.00 ▇▁▁▁▁
revenue 0 1.00 1349746.73 14430479.15 0 0.0 0.00 0.00 701842551.00 ▇▁▁▁▁
runtime 0 1.00 62.14 41.00 0 14.0 80.00 91.00 683.00 ▇▁▁▁▁
collection 30234 0.07 481534.88 324498.16 656 155421.0 471259.00 759067.25 1033032.00 ▇▅▅▅▅

Clean Data

# Build the modelling table: released movies only, one row per (movie, genre).
data <- horror_movies %>%
  # Keep released movies and only the columns used downstream
  filter(status == "Released") %>%
  select(id, vote_average, genre_names, overview, runtime) %>%
  # Log-transform vote_average; log1p(x) is log(x + 1), so zeroes are handled
  mutate(vote_average = log1p(vote_average)) %>%
  # genre_names holds a ", "-separated list: expand it to one row per genre
  separate_rows(genre_names, sep = ", ") %>%
  # Drop every row that still has a missing value
  na.omit()

Explore Data

# Scatter plot of vote_average against runtime.
ggplot(data, aes(x = runtime, y = vote_average)) +
  geom_point()

# Mean vote_average for every (runtime, vote_average) combination.
# NOTE(review): vote_average is itself a grouping key, so mean_group equals
# vote_average within each group -- confirm this grouping is intended.
data2 <- data %>%
  group_by(runtime, vote_average) %>%
  summarise(mean_group = mean(vote_average))

# Plot mean_group against runtime; colour follows runtime and point shape
# follows binned vote_average.
ggplot(
  data2,
  aes(
    x = runtime,
    y = mean_group,
    color = runtime,
    shape = vote_average,
    group = runtime,
    label = round(mean_group, 2)
  )
) +
  geom_point() +
  scale_shape_binned()

Prepare Data

# Drop the text columns, then convert the remaining columns into 0/1
# bin indicators for the correlation analysis.
data_binarized_tbl <- data %>%
  select(-c(overview, genre_names)) %>%
  binarize()

# Inspect the binarized columns.
glimpse(data_binarized_tbl)
## Rows: 62,252
## Columns: 11
## $ `id__-Inf_105927`                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ id__105927_387814                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ id__387814_654747                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ id__654747_Inf                                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `vote_average__-Inf_1.70474809223843`           <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_average__1.70474809223843_1.93152141160321 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_average__1.93152141160321_Inf              <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `runtime__-Inf_24`                              <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ runtime__24_84                                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ runtime__84_93                                  <dbl> 0, 0, 1, 1, 1, 0, 0, 0…
## $ runtime__93_Inf                                 <dbl> 1, 1, 0, 0, 0, 1, 1, 1…

Correlate

# Correlate every binarized feature with the lowest vote_average bin.
# NOTE(review): the target column name embeds a bin edge computed from the
# data, so it must be updated if the binning changes.
data_corr_tbl <- correlate(
  data_binarized_tbl,
  target = `vote_average__-Inf_1.70474809223843`
)

data_corr_tbl
## # A tibble: 11 × 3
##    feature      bin                               correlation
##    <fct>        <chr>                                   <dbl>
##  1 vote_average -Inf_1.70474809223843                  1     
##  2 vote_average 1.70474809223843_1.93152141160321     -0.585 
##  3 vote_average 1.93152141160321_Inf                  -0.579 
##  4 id           654747_Inf                             0.237 
##  5 id           -Inf_105927                           -0.233 
##  6 runtime      -Inf_24                                0.188 
##  7 runtime      93_Inf                                -0.178 
##  8 runtime      84_93                                 -0.0929
##  9 runtime      24_84                                  0.0778
## 10 id           105927_387814                         -0.0419
## 11 id           387814_654747                          0.0386

Plot

# Visualise the feature/target correlations as a correlation funnel.
plot_correlation_funnel(data_corr_tbl)

Build Models

Split Data

# Work on a random subset of 100 rows (presumably to keep tuning fast).
# Seed the RNG first so the same subset is drawn on every run; without this
# the sample, and every result derived from it, changes between runs.
set.seed(123)
data <- sample_n(data, 100)

# Split into train and test data set
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- rsample::training(data_split)
data_test <- rsample::testing(data_split)


# Further split training data set for cross-validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits         id    
##    <list>         <chr> 
##  1 <split [67/8]> Fold01
##  2 <split [67/8]> Fold02
##  3 <split [67/8]> Fold03
##  4 <split [67/8]> Fold04
##  5 <split [67/8]> Fold05
##  6 <split [68/7]> Fold06
##  7 <split [68/7]> Fold07
##  8 <split [68/7]> Fold08
##  9 <split [68/7]> Fold09
## 10 <split [68/7]> Fold10
library(usemodels)
## Warning: package 'usemodels' was built under R version 4.4.1
# Print template tidymodels code for an xgboost model of vote_average.
usemodels::use_xgboost(vote_average ~ ., data = data_train)
## xgboost_recipe <- 
##   recipe(formula = vote_average ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(6804)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
# Preprocessing recipe for the xgboost model.
xgboost_recipe <-
  recipe(formula = vote_average ~ ., data = data_train) %>%
  # id identifies the movie and overview is free text with (nearly) one
  # distinct value per movie. Keep both columns in the data but give them a
  # non-predictor role, so step_dummy() does not create one indicator column
  # per distinct overview and the model cannot memorise individual movies.
  update_role(id, overview, new_role = "id") %>%
  # Pool genres seen in fewer than 5% of the training rows into "other"
  step_other(genre_names, threshold = 0.05) %>%
  # One-hot encode the remaining nominal predictors
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  # Yeo-Johnson transform the numeric predictors
  step_YeoJohnson(all_numeric_predictors())

# Boosted-tree regression spec fitted with xgboost; every listed
# hyperparameter is marked for tuning.
xgboost_spec <-
  boost_tree(
    trees = tune(),
    tree_depth = tune(),
    min_n = tune(),
    loss_reduction = tune(),
    sample_size = tune(),
    learn_rate = tune()
  ) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

# Bundle the model spec and the preprocessing recipe into one workflow.
xgboost_workflow <-
  workflow() %>%
  add_model(xgboost_spec) %>%
  add_recipe(xgboost_recipe)

# Tune the workflow over the cross-validation folds with a grid of
# 5 candidate hyperparameter combinations.
set.seed(6804)
xgboost_tune <- xgboost_workflow %>%
  tune_grid(resamples = data_cv, grid = 5)


# Estimate the recipe on its training data and inspect the processed columns.
xgboost_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
## Rows: 75
## Columns: 84
## $ id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           <dbl> …
## $ runtime                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      <dbl> …
## $ vote_average                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 <dbl> …
## $ genre_names_Adventure                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        <dbl> …
## $ genre_names_Comedy                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           <dbl> …
## $ genre_names_Drama                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            <dbl> …
## $ genre_names_Horror                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           <dbl> …
## $ genre_names_Thriller                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         <dbl> …
## $ genre_names_other                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            <dbl> …
## $ overview_X2013.Chinese.horror.film.directed.by.Qiu.Chuji...After.being.kidnapped..a.young.woman.awakens.three.months.later.on.the.side.of.a.road.with.no.memory.of.her.abduction..Subsequently..she.begins.to.experience.psychological.episodes.that.mirror.her.abductor.s.persona.                                                                                                                                                                                                                                                                                                                                                                                                                                                                          <dbl> …
## $ overview_A.30.year.old.woman.who.works.as.an.exclusive.prostitute.likes.to.get.lost.in.her.daydreams..However..just.one.wrong.step.and.she.finds.herself.amidst.a.nightmare.instead..Can.she.escape.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         <dbl> …
## $ overview_A.brother.and.sister.are.sent.to.their.grandparents..remote.Pennsylvania.farm.for.a.week..where.they.discover.that.the.elderly.couple.is.involved.in.something.deeply.disturbing.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   <dbl> …
## $ overview_A.collection.of.horrifying.psychic.images.submitted.by.general.posts...It.was.really.there..Cursed.video...Fear.grows..Earth.bound.spirits..grudges..floating.spirits..guardian.spirits.....Full.of.mysterious.images.that.cannot.be.elucidated.by.science..including..Company.trip....Hydrogen.sulfide....Eyes.that.sneak.in....Spontaneous.combustion....Summer.Kawahara....House.Studio....Series.surveillance.camera.rental.office..and..Night.pond..                                                                                                                                                                                                                                                                                           <dbl> …
## $ overview_A.couple.are.watching.television.together..Over.time..the.shows.become.more.bizarre..a.news.report.about.the.arrest.of.two.terrorists.reveals.they.have.the.same.faces.as.the.two.spectators..in.another..a.female.strips.for.the.man.and.finally.a.politician.demonises.the.couple.and.declares.them.enemies.of.the.people..Panicked..the.couple.phone.a.television.exorcist.                                                                                                                                                                                                                                                                                                                                                                      <dbl> …
## $ overview_A.disturbed.young.woman.must.confront.her.worst.fears.when.she.finds.herself.trapped.alone.in.a.New.York.City.loft.during.the.2003.blackout.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        <dbl> …
## $ overview_A.group.of.four.bored.gamers.delve.into.the.world.of.Creepypasta..They.are.surprised.when.they.learn.that.the.story.of.a.cursed.Nintendo.cartridge.is.far.from.the.fringe.and.decide.to.seek.it.out.for.themselves.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 <dbl> …
## $ overview_A.group.of.friends.is.torn.apart.when.they.become.part.of.two.human.hunter.s.sick.game..The.men.give.them.ten.minutes.to.hide..and.then.the.real.hunt.begins.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       <dbl> …
## $ overview_A.group.of.motorcyclists.on.a..treasure.hunt..are.terrorized.by.a.gang.of.murderous.psychopaths.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    <dbl> …
## $ overview_A.journalist.desperately.tries.to.find.his.missing.fiancee.and.finally.uncover.the.truth.behind.a.sinister.folklore..leading.him.down.a.dangerous.road.of.discovery.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                <dbl> …
## $ overview_A.married.woman..who.s.been.getting.seductive.phone.calls.from.a.lesbian..and.a.man..who.believes.he.might.be.a.werewolf..are.about.to.find.out.who.they.deep.down.really.are.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      <dbl> …
## $ overview_A.mysterious.egg.hatches.into.a.demon..Scientists.try.to.find.the.source.of.the.egg.for.fear.the.world.will.be.overwhelmed.by.a.horde.of.the.monsters.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              <dbl> …
## $ overview_A.pregnant.woman.is.abandoned.by.her.lover..Enraged..she.goes.to.a.witch.doctor.and.uses.black.magic.to.have.her.ex.and.his.family.killed..only.for.another.woman.claiming.to.be.her.lover.s.secret.mistress.to.claim.his.inheritance.and.move.into.his.house.with.her.children..Not.long.after.that..inexplicable.things.happen.to.that.family.and..one.by.one..they.begin.to.die.                                                                                                                                                                                                                                                                                                                                                                 <dbl> …
## $ overview_A.reconstruction..made.from.still.photographs..of.the.lost.1927.Tod.Browning.film.London.After.Midnight..1927..starring.Lon.Chaney.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 <dbl> …
## $ overview_A.retired.couple..Bernard.and.Helen.Martin..inherit.a.house.in.rural.France..Bernard.s.father.had.liberated.this.same.village.from.the.Nazis.during.the.Second.World.War..in.a.rage.fueled.killing.spree..This.peaceful.couple.quickly.become.the.target.of.a.cruel.gang.of.street.kids..who.terrorise.the.village..Plugged.into.their.devices.and.devoid.of.empathy..they.are.a.new.breed.of.technological.psychopath....Bernard.and.Helen.s.lives.become.a.living.hell.as.they.are.harassed.and.tormented.by.the.gang..When.pushed.beyond.breaking.point..right.or.wrong.no.longer.matters..survival.is.everything..Can.Bernard.live.up.to.his.father.s.legend..And.could.they.live.with.the.consequences..This.is.the.old.generation.vs.the.new. <dbl> …
## $ overview_A.school.girl.visits.a.house.to.take.a.koto.lesson..She.meets.her.teacher.and.her.son.and.they.seem.to.be.playing..HIDE.and.SEEK..in.the.house..Koto.lesson.starts.but.the.girl.soon.realizes.that.there.s.something.very.odd.about.the.teacher.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    <dbl> …
## $ overview_A.search.and.recovery.team.heads.into.Victor.Crowley.s.haunted.swamp.to.pick.up.the.pieces..and.Marybeth.learns.the.secret.to.ending.the.voodoo.curse.that.has.left.Victor.Crowley.terrorizing.Honey.Island.Swamp.for.decades.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      <dbl> …
## $ overview_A.serial.killer.that.hates.woman.because.he.can.t.attract.them.goes.on.a.killing.spree.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             <dbl> …
## $ overview_A.short.prequel.of.the.post.apocalyptic.vampire.epic..Stake.Land..This.is.where.we.discover.the.origin.of.the.strong.and.silent..vampire.exterminator.known.simply.as.Mister.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       <dbl> …
## $ overview_A.sorority.mixer.at.a.local.bowling.alley.goes.terribly.wrong.when.the.five.women.who.own.the.building.turn.out.to.not.be.what.they.seem.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           <dbl> …
## $ overview_A.strange.Romanian.trying.to.enter.Canada.is.confronted.by.an.overzealous.customs.officer.and.his.curiously.cooperative.colleague.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  <dbl> …
## $ overview_A.Viet.Cong.soldier.stationed.in.the.claustrophobic.tunnels.of.Cu.Chi.during.the.Vietnam.War.finds.himself.haunted.by.the.ghost.of.a.fallen.comrade.after.the.burial.ceremony.is.compromised.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       <dbl> …
## $ overview_A.woman.in.a.red.dress.sets.herself.up.for.a.romantic.and.floral.bath.and.then.drowns.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              <dbl> …
## $ overview_A.young.couple.and.their.daughter.move.into.a.rambling.old.house..Soon..an.increasingly.alarming.string.of.events.and.supernatural.disturbances.connects.the.house..and.them..with.a.series.of.unsolved.murders.committed.three.years.earlier..They.are.the.only.living.witnesses..but.for.how.long.                                                                                                                                                                                                                                                                                                                                                                                                                                                <dbl> …
## $ overview_A.young.couple..physiologist.Agla.and.filmmaker.Gunnar..wake.up.at.a.glacier.drilling.camp.only.to.find.the.camp.mysteriously.abandoned.and.their.co.workers.gone..When.searching.for.the.lost.team.they.realize.they.re.up.against.an.unknown.deadly.force.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        <dbl> …
## $ overview_A.young.girl.tormented.by.the.tragedies.of.her.past.is.brought.in.for.questioning.by.the.police.over.the.death.of.a.man..who.she.claims.to.be.a.demon..Detective.Beckett.realizes.this.is.the.same.girl.he.made.a.broken.promise.to.seven.years.ago.that.he.d.find.the.monster.that.raped.and.murdered.her.12.year.old.sister.                                                                                                                                                                                                                                                                                                                                                                                                                      <dbl> …
## $ overview_A.young.woman.s.faith.is.put.to.the.ultimate.test.when.she.is.forced.to.uncover.the.truth.behind.her.husband.s.horrific.visions.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    <dbl> …
## $ overview_A.young.woman.hired.to.cater.the.post.funeral.gathering.for.accused.killer.Hank.Boyd.discovers.his.crimes.and.death.may.not.be.what.they.seem.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      <dbl> …
## $ overview_A.young.woman.visits.the.mysterious.property.she.has.inherited..While.hoping.to.learn.more.about.the.deaths.of.her.mother.and.sister..she.is.haunted.by.ghosts.and.so.must.uncover.the.truth.behind.the.curse.of.the.house..or.become.the.next.victim.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              <dbl> …
## $ overview_After.an.overly.ambitious.businessman.transports.an.80.foot.python.to.the.United.States..the.beast.escapes.and.starts.to.leave.behind.a.trail.of.human.victims..An.FBI.agent.and.a.snake.specialist.come.up.with.a.plot.to.combat.the.creature.by.pitting.it.against.a.bioengineered..70.foot.boa.constrictor..It.s.two.great.snakes.that.snake.great.together.                                                                                                                                                                                                                                                                                                                                                                                     <dbl> …
## $ overview_After.being.committed.for.17.years..Michael.Myers..now.a.grown.man.and.still.very.dangerous..escapes.from.the.mental.institution..where.he.was.committed.as.a.10.year.old..and.he.immediately.returns.to.Haddonfield..where.he.wants.to.find.his.baby.sister..Laurie..Anyone.who.crosses.his.path.is.in.mortal.danger                                                                                                                                                                                                                                                                                                                                                                                                                               <dbl> …
## $ overview_After.being.fired.from.the.rodeo..three.clowns.and.a.giant.chicken.get.involved.with.mind.altering.drugs.that.send.them.on.a.bloody.rampage.across.Kansas..Pursued.by.a.U.S..Marshall.from.Utah.who.specializes.in.clown.cases..they.become.dangerously.entangled.with.a.swindling.cult.leader.whose.truck..full.of.scammed.cash..they.have.stolen.to.pull.their.trailer..It.s.a.crazed.festival.of.guns..puppets..blood..rubber.noses.and.dark.humor....By.the.way..none.of.this.is.nearly.as.interesting.as.it.sounds..                                                                                                                                                                                                                           <dbl> …
## $ overview_Amy.has.moved.east.from.Kansas..determined.to.start.a.new.life..She.thinks.she.has.found.the.perfect.small.quiet.town..a.great.neighborhood.on.a.quiet.street..As.she.moves.into.her.brand.new.apartment..eager.to.start.a.dream.job..happy.to.befriend.her.neighbors..she.finds.out.that.not.everything.is.as.it.seems..especially.at.the.house.across.the.street.                                                                                                                                                                                                                                                                                                                                                                                 <dbl> …
## $ overview_An.adaptation.of.the.Icelandic.ghost.story.of..The.Deacon.of.Dark.River...set.in.1970s.France.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      <dbl> …
## $ overview_An.archaelogist.falls.under.the.spell.of.a.statue.with.a.curse.on.it.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               <dbl> …
## $ overview_An.independent.Australian.horror.drama.that.explores.the.societal.norms.that.break.down.among.a.small.group.of.survivors.in.a.post.apocalyptic.world..Ravenous.hordes.of.infected.zombies.terrorize.the.survivors..but.it.is.the.horror.within.their.own.sanctuary.that.they.must.fear.the.most.                                                                                                                                                                                                                                                                                                                                                                                                                                                    <dbl> …
## $ overview_An.overworked.American.ambassador.working.in.the.UK.attempts.to.spend.more.time.with.his.wife.by.visiting.a.countryside.mansion..but.soon.the.trip.turns.into.a.nightmare.with.his.wife.haunted.by.a.stalker.seemingly.from.her.past.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               <dbl> …
## $ overview_Ash.Williams.and.his.girlfriend.Linda.find.a.log.cabin.in.the.woods.with.a.voice.recording.from.an.archeologist.who.had.recorded.himself.reciting.ancient.chants.from..The.Book.of.the.Dead...As.they.play.the.recording.an.evil.power.is.unleashed.taking.over.Linda.s.body.                                                                                                                                                                                                                                                                                                                                                                                                                                                                       <dbl> …
## $ overview_Based.on.the.book.by.Jorge.Montenegro..the.film.is.composed.of.four.segments.of.the.famous.fables...El.cadejo....La.sucia....La.fiesta.de.ánimas...and..La.taconuda..                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               <dbl> …
## $ overview_Beautiful..mysterious...the.occult.history.Jack.Angel.uncovers.in.the.city.of.Bath.is.astonishing..But.the.discoveries.become.increasingly.creepy.and.disturbing...and.the..final.revelation...                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     <dbl> …
## $ overview_Clean.cut..handsome.looking.psychiatric.patient.Daryl.Gleeson.finds.himself.hopelessly.falling.in.love.with.restaurant.owner.Brooke.Daniels..after.having.instinctively.rescued.her.little.son.Mikey.from.a.traffic.accident..When.she.doesn.t.return.his.love.he.snaps.and.begins.to.stalk.her..eliminating.all.who.stand.in.his.way...                                                                                                                                                                                                                                                                                                                                                                                                            <dbl> …
## $ overview_Dead.Girls.is.a.horror.anthology.featuring.three.stories.of.terror.connected.through.the.pages.of.dead.girls..diaries..which.chronicle.each.girl.s.act.of.vengeance.against.the.people.who.have.wronged.or.abused.them.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             <dbl> …
## $ overview_Emmy.who.lives.in.the.UK..comes.to.India.to.organised.a.real.life.game..The.game.is.launched.in.India..5.players.from.5.states.participate.in.it.to.win.Rs..2.Crore..Three.young.boys.and.two.beautiful.girls.become.the.part.of.the.game.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          <dbl> …
## $ overview_FIEND.FATALE.is.an..epic..horror.adventure.proof.of.concept.short.film..about.five.sisters..a.vampire..zombie..werewolf..demon..and.mermaid..cloned.from.the.DNA.of.extinct.monsters..thrust.into.the.modern.world.and.facing.off.against.the.government..terrorists..and.themselves.                                                                                                                                                                                                                                                                                                                                                                                                                                                               <dbl> …
## $ overview_Five.young.adults.venture.into.a.bog.to.excavate.some.bodies..After.a.while.they.find.that.bodies.that.have.been.buried.in.the.bog.have.risen.from.the.dead.and.seek.to.pick.them.off.one.by.one.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   <dbl> …
## $ overview_For.fame..Raya.and.his.friends.came.to.the.haunted.area..When.making.video..one.of.them.died..They.decided.to.flee.but.the.ghost.is.coming.with.them.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               <dbl> …
## $ overview_For.four.college.students..a.weekend.camping.trip.turns.into.a.living.nightmare.as.they.stumble.upon.something.in.the.woods.that.should.not.be.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     <dbl> …
## $ overview_Guests.of.a.country.inn.begin.disappearing.and.dying..after.a.mentally.unstable.man.takes.over.his.terminally.ill.mother.s.bed.and.breakfast.operation.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             <dbl> …
## $ overview_Haunted.by.recent.events.and.on.the.run..a.man.finds.himself.the.unwitting.pawn.of.a.possessed.evangelical.radio.station.and.like.his.unfortunate.predecessor.must.ask.himself.whether.it.is.better.to.reign.in.hell.than.serve.in.heaven.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          <dbl> …
## $ overview_In.October.of.2020.food.blogger.Jeff.Blake.and.his.half.brother.Andy.Baker.hit.the.road.on.a.food.tour.that.had.the.potential.to.change.their.lives..They.were.never.seen.again..This.is.their.footage.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             <dbl> …
## $ overview_In.the.15th.century..a.young.goatherd.living.alone.in.a.mountain.hut.feels.a.dark.presence.in.the.woods.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            <dbl> …
## $ overview_It.s.always.just.a.party..until.somebody.gets.killed..Audrey.Small..Julie.Sherwood..and.three.of.her.closest.friends.are.preparing.to.throw.a.costume.party..Midsummer.Nightmares..which.everyone.calls..the.social.event.of.the.season..But.there.is.someone.in.their.midsts.who.would.prefer.to.make.this.faux.bloodbath.a.real.one..Can.Audrey.and.her.friends.survive.this.mad.individual.s.evil.plan..Or.will.they.all..one.by.one..be.picked.off....                                                                                                                                                                                                                                                                                          <dbl> …
## $ overview_Italy..1952..The.young.official.Furio.Momenté.is.sent.to.Veneto.region..near.Polesine..to.investigate.a.shocking.and.mysterious.case..a.minor.has.in.fact.killed.one.of.his.peers.claiming.to.have.killed.the.devil.himself.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        <dbl> …
## $ overview_More.than.89..less.than.91.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         <dbl> …
## $ overview_No.overview.found.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  <dbl> …
## $ overview_Phantom.Lake.County.is.known.for.its.share.of.strange.occurances..from.monster.attacks.to.alien.invasions..However..things.will.never.be.the.same.when..after.experiencing.inexplicable.earthquakes.emanating.from.a.local.system.of.caves..the.area.is.overrun.by.grossly.mutated.creatures.of.unusual.size..a.phenomenon.that.may.point.to.the.return.of.a.long.since.forgotten.menace..Thankfully..the.Phantom.Lake.Kids.are..on.the.job..and.determined.to.solve.the.real.mystery..What.s.the.deal.with.Butch.s.magic.hat.                                                                                                                                                                                                                      <dbl> …
## $ overview_Rama..Garin..Farel..Quincy.and.Celsi.survive.from.the.spirit.terror.at.the.Ayunan.Island.resort.which.harbored.a.terrible.history.of.the.slaughter.of.a.family.and.resort.employees..Rama.is.frantic..her.lover.is.in.a.mental.hospital..the.body.of.Hana..his.sister..and.Fira.have.not.been.found..Rama.returns.to.Ayunan.Island.to.look.for.them..But.obstacles.lay.ahead.                                                                                                                                                                                                                                                                                                                                                                       <dbl> …
## $ overview_Sandra.and.Jorge.s.trip.was.expected.to.be.pleasant.but.things.often.don.t.go.as.planned..Sometimes.the.middle.of.the.road.becomes.the.beginning.of.an.utterly.different.story.and.you.never.arrive.to.your.intended.destination.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   <dbl> …
## $ overview_Six.friends.plan.for.a.trek.to.an.undisclosed.mountain.in.the.Western.Ghats.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        <dbl> …
## $ overview_Stop.motion.animated.short.film.in.which.a.puppet.on.a.trike.captures.a.puppet.bird.man.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            <dbl> …
## $ overview_The.actor.Koheiji.is.terribly.in.love.with.the.wife.of.his.best.friend..the.playwright.Takuro..to.get.her..he.would.even.kill.Takuro.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               <dbl> …
## $ overview_The.Church.sends.in.a.team.to.investigate.the.tragic.deaths.of.a.young.group.found.in.the.crypt.of.a.Convent.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       <dbl> …
## $ overview_The.Dead.Man.returns..but.it.s.too.late.to.save.us..We.are.already.dead.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            <dbl> …
## $ overview_The.Kingdom.Of.Shadows.is.a.mystical.cinematic.experience.which.stirs.from.the.darkness.the.spirits.of.our.ancestors.and.reawakens.the.horror.of.unresolved.crimes.and.denied.desires.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              <dbl> …
## $ overview_The.sequel.is.set.just.weeks.after.Annie.Barlow.s.deadly.confrontation.with.the.Judas.Killer..In.this.elevated.sequel..we.meet.June..a.woman.whose.carefully.constructed.life.is.beginning.to.unravel.due.to.lucid.nightmares.so.awful.they.disturb.her.waking.life                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 <dbl> …
## $ overview_The.spaceship.AAB.Gamma.is.dispatched.from.FAFC.headquarters.in.Japan.to.make.a.landing.on.the.planet.Mars.and.investigate.reports.of.UFOs.in.the.area..As.they.near.the.red.planet..they.encounter.a.mysterious.UFO.that.coats.the.ship.s.hull.with.unusual.spores..Taking.one.of.the.specimens.back.to.earth..it.soon.develops.and.grows.into.a.giant.chicken.lizard.alien.monster.that.tramples.Japan.                                                                                                                                                                                                                                                                                                                                           <dbl> …
## $ overview_The.staff.of.a.black.hair.salon.fend.off.a.strange.new.monster..white.women.intent.on.sucking.the.lifeblood.from.black.culture.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     <dbl> …
## $ overview_The.story.of.a.student.named.Nikko..Ben.Joshua...engaged.to.Donna..Nia.Ramadhani...Things.are.fine.until.one.day.Nikko.meets.a.waiters.named.Livi..Nadilla.Ernesta..at.a.cafe..Nikko.decides.to.have.an.affair.with.Livi..meanwhile..this.girl.doesn.t.know..that.Nikko.is.engaged..One.day..Livi.was.pregnant.and.he.asked.for.Nikko.s.responsibility..Unable.to.accept.this.fact..Livi.was.accidentally.killed.by.Niko.and.then.dumped.his.body.from.the.Ancol.bridge..Then.the.spirits.of.Livi.continue.to.haunt.Nikko.                                                                                                                                                                                                                          <dbl> …
## $ overview_The.story.of.Michael.and.Richard.Henderson..two.stepbrothers.from.West.Virginia.who.saw.an.opportunity.in.the.burgeoning.VHS.market.in.the.1980s.and.made.their.own.backyard.horror.movies...The.Curse.of.Stabberman..and..Cannibal.Swim.Club...These.films.would.ve.been.long.forgotten..but.a.recent.resurgence.in.horror.fans.collecting.rare.VHS.tapes.has.put.the.Henderson.Brothers.back.in.the.spotlight..Thanks.to.their.biggest.fan..they.re.sitting.down.for.their.first.on.camera.interview.and.looking.back.on.their.movies...but.they.might.not.be.as.good.as.they.remembered.                                                                                                                                                         <dbl> …
## $ overview_Three.friends.making.a.web.series.about.their.town.discover.that.their.neighbors.are.being.killed.and.replaced.by.creatures.who.are.perfect.copies.of.their.victims.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                <dbl> …
## $ overview_To.keep.her.former.fiancé.from.leaving.her..a.neurotic.young.woman..Louise.Allbritton..fakes.a.riding.accident..and.feigns.paralysis.of.her.lower.body..When.her.graceful.nurse..Grace.Kelly..discovers.her.treachery..she.resorts.to.a.final.and.desperate.act.of.revenge.                                                                                                                                                                                                                                                                                                                                                                                                                                                                         <dbl> …
## $ overview_Two.inexplicably.coherent.zombies.awake.amidst.a.zombie.attack.and.decide.to.take.a.road.trip.to.find.the.one.s.lost.love..unaware.they.are.being.chased.by.the.agents.of.a.ruthless.company.with.its.own.agenda.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   <dbl> …
## $ overview_Two.people.Matt.and.Kate.wake.up.in.a.closed.down.prison..They.have.no.idea.how.they.got.there..or.why.they.are.there..In.the.prison.they.encounter.several.terrors.as.they.look.for.an.exit.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       <dbl> …
## $ overview_Up.Route.tells.the.unnerving.tale.of.a.hitchhiker.who.is.picked.up.by.a.strange.man.and.his..even.stranger..potted.plan                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             <dbl> …
## $ overview_Zombie.short.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       <dbl> …

Evaluate Models

# List the top tuning candidates, ranked by RMSE
xgboost_tune %>%
  tune::show_best(metric = "rmse")
## # A tibble: 5 × 12
##   trees min_n tree_depth learn_rate loss_reduction sample_size .metric
##   <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>  
## 1  1309    24          2    0.00694       2.65e+ 0       0.905 rmse   
## 2   431     9         10    0.0100        7.91e-10       0.427 rmse   
## 3  1020    11          5    0.00253       4.65e- 5       0.672 rmse   
## 4  1926    32          7    0.0768        5.00e- 8       0.553 rmse   
## 5   121    34         15    0.212         1.08e- 1       0.110 rmse   
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## #   .config <chr>
# Plug the best-RMSE hyperparameters into the tunable workflow
xgboost_fw <- xgboost_tune %>%
  tune::select_best(metric = "rmse") %>%
  tune::finalize_workflow(xgboost_workflow, parameters = .)

# Final fit on the training portion of the split, evaluated on the test portion
data_fit <- xgboost_fw %>%
  tune::last_fit(data_split)

# Test-set metrics of the finalized model
data_fit %>%
  tune::collect_metrics()
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 rmse    standard       0.949 Preprocessor1_Model1
## 2 rsq     standard       0.227 Preprocessor1_Model1
# Predicted vs. observed vote_average for the final fit's held-out predictions
tune::collect_predictions(data_fit) %>%
  ggplot(aes(vote_average, .pred)) +
  # Use `color`, not `fill`: the default point shape has no fill, so
  # `fill = "midnightblue"` was silently ignored and points were drawn in black
  geom_point(alpha = 0.3, color = "midnightblue") +
  # Dashed identity line (default slope 1, intercept 0): perfect predictions
  geom_abline(lty = 2, color = "gray50") +
  # Same scale on both axes so the identity line sits at 45 degrees
  coord_fixed()

Experimenting with different preprocessing techniques

# Train/test split, stratified on the outcome; seeded for reproducibility
set.seed(123)
data_split <- data %>%
  initial_split(strata = vote_average)
data_train <- data_split %>% training()
data_test <- data_split %>% testing()

# Cross-validation folds on the training set, again stratified on the outcome
set.seed(234)
data_folds <- data_train %>%
  vfold_cv(strata = vote_average)
## Warning: The number of observations in each quantile is below the recommended threshold of 20.
## • Stratification will use 3 breaks instead.
# Text preprocessing recipe: predict vote_average from the overview text,
# represented as term-frequency features (step_tf)

movie_rec <- recipe(vote_average ~ overview, data = data_train)
# Split each overview into tokens
movie_rec <- step_tokenize(movie_rec, overview)
# Cap the vocabulary at 100 tokens
movie_rec <- step_tokenfilter(movie_rec, overview, max_tokens = 100)
# Turn the retained tokens into one term-frequency column each
movie_rec <- step_tf(movie_rec, overview)
# Model specification: linear SVM for regression, fitted with the kernlab engine

library(parsnip)

svm_spec <- set_engine(svm_linear(mode = "regression"), "kernlab")
# Bundle the text recipe and the SVM specification into a single workflow

movie_workflow_svm <- workflow()
movie_workflow_svm <- add_recipe(movie_workflow_svm, movie_rec)
movie_workflow_svm <- add_model(movie_workflow_svm, svm_spec)
# Train the SVM workflow on the training set (seeded for reproducibility)

library(kernlab)
## Warning: package 'kernlab' was built under R version 4.4.1
set.seed(345)
movie_fit_svm <- fit(movie_workflow_svm, data = data_train)
##  Setting default kernel parameters
# Evaluating the model on the test set

library(yardstick)

# Predict for the test rows and attach the predictions to the test data
movie_predictions_svm <- bind_cols(predict(movie_fit_svm, data_test), data_test)

# Evaluating performance: compare predictions (.pred) with the true vote_average
movie_metrics_svm <- metrics(
  movie_predictions_svm,
  truth = vote_average,
  estimate = .pred
)

movie_metrics_svm
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard      2.31  
## 2 rsq     standard      0.0119
## 3 mae     standard      1.52

Summary

In this analysis I had initially used a basic linear regression model and added improvements throughout to enhance the predictive power of that model. I specifically used a text preprocessing pipeline with step_tokenize, step_tokenfilter, and step_tf to process the “overview” text data. I then transformed the text into term frequency values, so I could capture relevant info from the descriptions of the movies themselves, which might have contained patterns helpful for predicting vote_average, which was my target variable.

I then proceeded to replace my basic linear regression model with the SVM linear model, which is better at handling complex relationships within data. After training and evaluating both models, I observed that my changes had an impact on the Root Mean Squared Error (RMSE). The RMSE improved, showing that the SVM model might have been better at minimizing prediction errors compared to the basic linear regression model. This new model also explained more of the variance in vote_average than the basic linear regression model I first used.