Goal: Split our data and build a model. Click [here for the data](https://github.com/rfordatascience/tidytuesday/tree/master/data/2022/2022-11-01).
# Download the TidyTuesday (2022-11-01) horror movies CSV and print a
# per-column summary of the resulting data frame.
horror_movies <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-11-01/horror_movies.csv"
)
skimr::skim(horror_movies)
| Name | horror_movies |
| Number of rows | 32540 |
| Number of columns | 20 |
| _______________________ | |
| Column type frequency: | |
| character | 10 |
| Date | 1 |
| logical | 1 |
| numeric | 8 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| original_title | 0 | 1.00 | 1 | 191 | 0 | 30296 | 0 |
| title | 0 | 1.00 | 1 | 191 | 0 | 29563 | 0 |
| original_language | 0 | 1.00 | 2 | 2 | 0 | 97 | 0 |
| overview | 1286 | 0.96 | 1 | 1000 | 0 | 31020 | 0 |
| tagline | 19835 | 0.39 | 1 | 237 | 0 | 12513 | 0 |
| poster_path | 4474 | 0.86 | 30 | 32 | 0 | 28048 | 0 |
| status | 0 | 1.00 | 7 | 15 | 0 | 4 | 0 |
| backdrop_path | 18995 | 0.42 | 29 | 32 | 0 | 13536 | 0 |
| genre_names | 0 | 1.00 | 6 | 144 | 0 | 772 | 0 |
| collection_name | 30234 | 0.07 | 4 | 56 | 0 | 815 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| release_date | 0 | 1 | 1950-01-01 | 2022-12-31 | 2012-12-09 | 10999 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| adult | 0 | 1 | 0 | FAL: 32540 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1.00 | 445910.83 | 305744.67 | 17 | 146494.8 | 426521.00 | 707534.00 | 1033095.00 | ▇▆▆▅▅ |
| popularity | 0 | 1.00 | 4.01 | 37.51 | 0 | 0.6 | 0.84 | 2.24 | 5088.58 | ▇▁▁▁▁ |
| vote_count | 0 | 1.00 | 62.69 | 420.89 | 0 | 0.0 | 2.00 | 11.00 | 16900.00 | ▇▁▁▁▁ |
| vote_average | 0 | 1.00 | 3.34 | 2.88 | 0 | 0.0 | 4.00 | 5.70 | 10.00 | ▇▂▆▃▁ |
| budget | 0 | 1.00 | 543126.59 | 4542667.81 | 0 | 0.0 | 0.00 | 0.00 | 200000000.00 | ▇▁▁▁▁ |
| revenue | 0 | 1.00 | 1349746.73 | 14430479.15 | 0 | 0.0 | 0.00 | 0.00 | 701842551.00 | ▇▁▁▁▁ |
| runtime | 0 | 1.00 | 62.14 | 41.00 | 0 | 14.0 | 80.00 | 91.00 | 683.00 | ▇▁▁▁▁ |
| collection | 30234 | 0.07 | 481534.88 | 324498.16 | 656 | 155421.0 | 471259.00 | 759067.25 | 1033032.00 | ▇▅▅▅▅ |
# Build the modelling data set from the raw movies table.
data <- horror_movies %>%
  # Keep released movies only
  filter(status == "Released") %>%
  # log1p(x) == log(x + 1), so zero ratings stay finite
  mutate(vote_average = log1p(vote_average)) %>%
  # One row per (movie, genre): genre_names is a ", "-separated list
  separate_rows(genre_names, sep = ", ") %>%
  # Columns used downstream
  select(id, vote_average, genre_names, overview, runtime) %>%
  # Drop rows with any missing value
  na.omit()
# Scatter plot of (log-transformed) vote_average against runtime.
ggplot(data, aes(x = runtime, y = vote_average)) +
  geom_point()
# Summarise vote_average for every runtime / vote_average combination.
# NOTE(review): vote_average is itself a grouping variable, so each group
# holds a single vote_average value and mean_group equals it -- confirm
# whether grouping by runtime alone was intended.
data2 <- data %>%
  group_by(runtime, vote_average) %>%
  summarise(mean_group = mean(vote_average))
# Plot the grouped means against runtime; colour encodes runtime and the
# point shape encodes binned vote_average.
ggplot(
  data2,
  aes(
    x = runtime,
    y = mean_group,
    color = runtime,
    shape = vote_average,
    group = runtime,
    label = round(mean_group, 2)
  )
) +
  geom_point() +
  scale_shape_binned()
# Binarize the numeric columns (the text columns are dropped first) so they
# can be fed to the correlation funnel, then inspect the result.
data_binarized_tbl <- data %>%
  select(-c(overview, genre_names)) %>%
  binarize()
glimpse(data_binarized_tbl)
## Rows: 62,252
## Columns: 11
## $ `id__-Inf_105927` <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ id__105927_387814 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ id__387814_654747 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ id__654747_Inf <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `vote_average__-Inf_1.70474809223843` <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_average__1.70474809223843_1.93152141160321 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ vote_average__1.93152141160321_Inf <dbl> 1, 1, 1, 1, 1, 1, 1, 1…
## $ `runtime__-Inf_24` <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ runtime__24_84 <dbl> 0, 0, 0, 0, 0, 0, 0, 0…
## $ runtime__84_93 <dbl> 0, 0, 1, 1, 1, 0, 0, 0…
## $ runtime__93_Inf <dbl> 1, 1, 0, 0, 0, 1, 1, 1…
# Correlate every binarized feature with the lowest vote_average bin.
data_corr_tbl <- correlate(
  data_binarized_tbl,
  target = `vote_average__-Inf_1.70474809223843`
)
data_corr_tbl
## # A tibble: 11 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 vote_average -Inf_1.70474809223843 1
## 2 vote_average 1.70474809223843_1.93152141160321 -0.585
## 3 vote_average 1.93152141160321_Inf -0.579
## 4 id 654747_Inf 0.237
## 5 id -Inf_105927 -0.233
## 6 runtime -Inf_24 0.188
## 7 runtime 93_Inf -0.178
## 8 runtime 84_93 -0.0929
## 9 runtime 24_84 0.0778
## 10 id 105927_387814 -0.0419
## 11 id 387814_654747 0.0386
# Visualise the feature/target correlations as a funnel chart.
plot_correlation_funnel(data_corr_tbl)
Split Data
# Work on a 100-row random subset (presumably to keep tuning fast).
# The seed must be set *before* sampling; otherwise the subset -- and every
# split, fold and metric derived from it -- changes on each run.
set.seed(123)
data <- sample_n(data, 100)
# Split into train and test data set
set.seed(1234)
data_split <- rsample::initial_split(data)
data_train <- training(data_split)
data_test <- testing(data_split)
# Further split training data set for cross-validation
set.seed(2345)
data_cv <- rsample::vfold_cv(data_train)
data_cv
## # 10-fold cross-validation
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [67/8]> Fold01
## 2 <split [67/8]> Fold02
## 3 <split [67/8]> Fold03
## 4 <split [67/8]> Fold04
## 5 <split [67/8]> Fold05
## 6 <split [68/7]> Fold06
## 7 <split [68/7]> Fold07
## 8 <split [68/7]> Fold08
## 9 <split [68/7]> Fold09
## 10 <split [68/7]> Fold10
library(usemodels)
## Warning: package 'usemodels' was built under R version 4.4.1
# Print boilerplate xgboost recipe / spec / workflow code for this formula.
usemodels::use_xgboost(formula = vote_average ~ ., data = data_train)
## xgboost_recipe <-
## recipe(formula = vote_average ~ ., data = data_train) %>%
## step_zv(all_predictors())
##
## xgboost_spec <-
## boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
## loss_reduction = tune(), sample_size = tune()) %>%
## set_mode("classification") %>%
## set_engine("xgboost")
##
## xgboost_workflow <-
## workflow() %>%
## add_recipe(xgboost_recipe) %>%
## add_model(xgboost_spec)
##
## set.seed(6804)
## xgboost_tune <-
## tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
# Preprocessing recipe for the xgboost model.
# `vote_average ~ .` would otherwise treat `id` and the free-text `overview`
# as predictors; one-hot encoding `overview` creates one dummy column per
# distinct description, which carries no generalisable signal. Both columns
# are kept in the data but given a non-predictor role.
xgboost_recipe <-
  recipe(formula = vote_average ~ ., data = data_train) %>%
  update_role(id, overview, new_role = "id") %>%
  # Pool genres occurring in fewer than 5% of rows into "other"
  step_other(genre_names, threshold = 0.05) %>%
  # One-hot encode the remaining nominal predictors (genre_names)
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  step_YeoJohnson(all_numeric_predictors())
# Boosted-tree regression spec (xgboost engine); all six hyperparameters
# are left as tune() placeholders to be filled in by tune_grid().
xgboost_spec <-
  boost_tree(
    mode = "regression",
    trees = tune(),
    min_n = tune(),
    tree_depth = tune(),
    learn_rate = tune(),
    loss_reduction = tune(),
    sample_size = tune()
  ) %>%
  set_engine("xgboost")
# Bundle the model spec and preprocessing recipe into a single workflow.
xgboost_workflow <-
  workflow() %>%
  add_model(xgboost_spec) %>%
  add_recipe(xgboost_recipe)
# Tune the workflow over the cross-validation folds, evaluating 5 candidate
# hyperparameter combinations.
set.seed(6804)
xgboost_tune <- xgboost_workflow %>%
  tune_grid(resamples = data_cv, grid = 5)
# Inspect the preprocessed training data. juice() is superseded in recipes;
# bake(new_data = NULL) returns the same processed training set.
xgboost_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()
## Rows: 75
## Columns: 84
## $ id <dbl> …
## $ runtime <dbl> …
## $ vote_average <dbl> …
## $ genre_names_Adventure <dbl> …
## $ genre_names_Comedy <dbl> …
## $ genre_names_Drama <dbl> …
## $ genre_names_Horror <dbl> …
## $ genre_names_Thriller <dbl> …
## $ genre_names_other <dbl> …
## $ overview_X2013.Chinese.horror.film.directed.by.Qiu.Chuji...After.being.kidnapped..a.young.woman.awakens.three.months.later.on.the.side.of.a.road.with.no.memory.of.her.abduction..Subsequently..she.begins.to.experience.psychological.episodes.that.mirror.her.abductor.s.persona. <dbl> …
## $ overview_A.30.year.old.woman.who.works.as.an.exclusive.prostitute.likes.to.get.lost.in.her.daydreams..However..just.one.wrong.step.and.she.finds.herself.amidst.a.nightmare.instead..Can.she.escape. <dbl> …
## $ overview_A.brother.and.sister.are.sent.to.their.grandparents..remote.Pennsylvania.farm.for.a.week..where.they.discover.that.the.elderly.couple.is.involved.in.something.deeply.disturbing. <dbl> …
## $ overview_A.collection.of.horrifying.psychic.images.submitted.by.general.posts...It.was.really.there..Cursed.video...Fear.grows..Earth.bound.spirits..grudges..floating.spirits..guardian.spirits.....Full.of.mysterious.images.that.cannot.be.elucidated.by.science..including..Company.trip....Hydrogen.sulfide....Eyes.that.sneak.in....Spontaneous.combustion....Summer.Kawahara....House.Studio....Series.surveillance.camera.rental.office..and..Night.pond.. <dbl> …
## $ overview_A.couple.are.watching.television.together..Over.time..the.shows.become.more.bizarre..a.news.report.about.the.arrest.of.two.terrorists.reveals.they.have.the.same.faces.as.the.two.spectators..in.another..a.female.strips.for.the.man.and.finally.a.politician.demonises.the.couple.and.declares.them.enemies.of.the.people..Panicked..the.couple.phone.a.television.exorcist. <dbl> …
## $ overview_A.disturbed.young.woman.must.confront.her.worst.fears.when.she.finds.herself.trapped.alone.in.a.New.York.City.loft.during.the.2003.blackout. <dbl> …
## $ overview_A.group.of.four.bored.gamers.delve.into.the.world.of.Creepypasta..They.are.surprised.when.they.learn.that.the.story.of.a.cursed.Nintendo.cartridge.is.far.from.the.fringe.and.decide.to.seek.it.out.for.themselves. <dbl> …
## $ overview_A.group.of.friends.is.torn.apart.when.they.become.part.of.two.human.hunter.s.sick.game..The.men.give.them.ten.minutes.to.hide..and.then.the.real.hunt.begins. <dbl> …
## $ overview_A.group.of.motorcyclists.on.a..treasure.hunt..are.terrorized.by.a.gang.of.murderous.psychopaths. <dbl> …
## $ overview_A.journalist.desperately.tries.to.find.his.missing.fiancee.and.finally.uncover.the.truth.behind.a.sinister.folklore..leading.him.down.a.dangerous.road.of.discovery. <dbl> …
## $ overview_A.married.woman..who.s.been.getting.seductive.phone.calls.from.a.lesbian..and.a.man..who.believes.he.might.be.a.werewolf..are.about.to.find.out.who.they.deep.down.really.are. <dbl> …
## $ overview_A.mysterious.egg.hatches.into.a.demon..Scientists.try.to.find.the.source.of.the.egg.for.fear.the.world.will.be.overwhelmed.by.a.horde.of.the.monsters. <dbl> …
## $ overview_A.pregnant.woman.is.abandoned.by.her.lover..Enraged..she.goes.to.a.witch.doctor.and.uses.black.magic.to.have.her.ex.and.his.family.killed..only.for.another.woman.claiming.to.be.her.lover.s.secret.mistress.to.claim.his.inheritance.and.move.into.his.house.with.her.children..Not.long.after.that..inexplicable.things.happen.to.that.family.and..one.by.one..they.begin.to.die. <dbl> …
## $ overview_A.reconstruction..made.from.still.photographs..of.the.lost.1927.Tod.Browning.film.London.After.Midnight..1927..starring.Lon.Chaney. <dbl> …
## $ overview_A.retired.couple..Bernard.and.Helen.Martin..inherit.a.house.in.rural.France..Bernard.s.father.had.liberated.this.same.village.from.the.Nazis.during.the.Second.World.War..in.a.rage.fueled.killing.spree..This.peaceful.couple.quickly.become.the.target.of.a.cruel.gang.of.street.kids..who.terrorise.the.village..Plugged.into.their.devices.and.devoid.of.empathy..they.are.a.new.breed.of.technological.psychopath....Bernard.and.Helen.s.lives.become.a.living.hell.as.they.are.harassed.and.tormented.by.the.gang..When.pushed.beyond.breaking.point..right.or.wrong.no.longer.matters..survival.is.everything..Can.Bernard.live.up.to.his.father.s.legend..And.could.they.live.with.the.consequences..This.is.the.old.generation.vs.the.new. <dbl> …
## $ overview_A.school.girl.visits.a.house.to.take.a.koto.lesson..She.meets.her.teacher.and.her.son.and.they.seem.to.be.playing..HIDE.and.SEEK..in.the.house..Koto.lesson.starts.but.the.girl.soon.realizes.that.there.s.something.very.odd.about.the.teacher. <dbl> …
## $ overview_A.search.and.recovery.team.heads.into.Victor.Crowley.s.haunted.swamp.to.pick.up.the.pieces..and.Marybeth.learns.the.secret.to.ending.the.voodoo.curse.that.has.left.Victor.Crowley.terrorizing.Honey.Island.Swamp.for.decades. <dbl> …
## $ overview_A.serial.killer.that.hates.woman.because.he.can.t.attract.them.goes.on.a.killing.spree. <dbl> …
## $ overview_A.short.prequel.of.the.post.apocalyptic.vampire.epic..Stake.Land..This.is.where.we.discover.the.origin.of.the.strong.and.silent..vampire.exterminator.known.simply.as.Mister. <dbl> …
## $ overview_A.sorority.mixer.at.a.local.bowling.alley.goes.terribly.wrong.when.the.five.women.who.own.the.building.turn.out.to.not.be.what.they.seem. <dbl> …
## $ overview_A.strange.Romanian.trying.to.enter.Canada.is.confronted.by.an.overzealous.customs.officer.and.his.curiously.cooperative.colleague. <dbl> …
## $ overview_A.Viet.Cong.soldier.stationed.in.the.claustrophobic.tunnels.of.Cu.Chi.during.the.Vietnam.War.finds.himself.haunted.by.the.ghost.of.a.fallen.comrade.after.the.burial.ceremony.is.compromised. <dbl> …
## $ overview_A.woman.in.a.red.dress.sets.herself.up.for.a.romantic.and.floral.bath.and.then.drowns. <dbl> …
## $ overview_A.young.couple.and.their.daughter.move.into.a.rambling.old.house..Soon..an.increasingly.alarming.string.of.events.and.supernatural.disturbances.connects.the.house..and.them..with.a.series.of.unsolved.murders.committed.three.years.earlier..They.are.the.only.living.witnesses..but.for.how.long. <dbl> …
## $ overview_A.young.couple..physiologist.Agla.and.filmmaker.Gunnar..wake.up.at.a.glacier.drilling.camp.only.to.find.the.camp.mysteriously.abandoned.and.their.co.workers.gone..When.searching.for.the.lost.team.they.realize.they.re.up.against.an.unknown.deadly.force. <dbl> …
## $ overview_A.young.girl.tormented.by.the.tragedies.of.her.past.is.brought.in.for.questioning.by.the.police.over.the.death.of.a.man..who.she.claims.to.be.a.demon..Detective.Beckett.realizes.this.is.the.same.girl.he.made.a.broken.promise.to.seven.years.ago.that.he.d.find.the.monster.that.raped.and.murdered.her.12.year.old.sister. <dbl> …
## $ overview_A.young.woman.s.faith.is.put.to.the.ultimate.test.when.she.is.forced.to.uncover.the.truth.behind.her.husband.s.horrific.visions. <dbl> …
## $ overview_A.young.woman.hired.to.cater.the.post.funeral.gathering.for.accused.killer.Hank.Boyd.discovers.his.crimes.and.death.may.not.be.what.they.seem. <dbl> …
## $ overview_A.young.woman.visits.the.mysterious.property.she.has.inherited..While.hoping.to.learn.more.about.the.deaths.of.her.mother.and.sister..she.is.haunted.by.ghosts.and.so.must.uncover.the.truth.behind.the.curse.of.the.house..or.become.the.next.victim. <dbl> …
## $ overview_After.an.overly.ambitious.businessman.transports.an.80.foot.python.to.the.United.States..the.beast.escapes.and.starts.to.leave.behind.a.trail.of.human.victims..An.FBI.agent.and.a.snake.specialist.come.up.with.a.plot.to.combat.the.creature.by.pitting.it.against.a.bioengineered..70.foot.boa.constrictor..It.s.two.great.snakes.that.snake.great.together. <dbl> …
## $ overview_After.being.committed.for.17.years..Michael.Myers..now.a.grown.man.and.still.very.dangerous..escapes.from.the.mental.institution..where.he.was.committed.as.a.10.year.old..and.he.immediately.returns.to.Haddonfield..where.he.wants.to.find.his.baby.sister..Laurie..Anyone.who.crosses.his.path.is.in.mortal.danger <dbl> …
## $ overview_After.being.fired.from.the.rodeo..three.clowns.and.a.giant.chicken.get.involved.with.mind.altering.drugs.that.send.them.on.a.bloody.rampage.across.Kansas..Pursued.by.a.U.S..Marshall.from.Utah.who.specializes.in.clown.cases..they.become.dangerously.entangled.with.a.swindling.cult.leader.whose.truck..full.of.scammed.cash..they.have.stolen.to.pull.their.trailer..It.s.a.crazed.festival.of.guns..puppets..blood..rubber.noses.and.dark.humor....By.the.way..none.of.this.is.nearly.as.interesting.as.it.sounds.. <dbl> …
## $ overview_Amy.has.moved.east.from.Kansas..determined.to.start.a.new.life..She.thinks.she.has.found.the.perfect.small.quiet.town..a.great.neighborhood.on.a.quiet.street..As.she.moves.into.her.brand.new.apartment..eager.to.start.a.dream.job..happy.to.befriend.her.neighbors..she.finds.out.that.not.everything.is.as.it.seems..especially.at.the.house.across.the.street. <dbl> …
## $ overview_An.adaptation.of.the.Icelandic.ghost.story.of..The.Deacon.of.Dark.River...set.in.1970s.France. <dbl> …
## $ overview_An.archaelogist.falls.under.the.spell.of.a.statue.with.a.curse.on.it. <dbl> …
## $ overview_An.independent.Australian.horror.drama.that.explores.the.societal.norms.that.break.down.among.a.small.group.of.survivors.in.a.post.apocalyptic.world..Ravenous.hordes.of.infected.zombies.terrorize.the.survivors..but.it.is.the.horror.within.their.own.sanctuary.that.they.must.fear.the.most. <dbl> …
## $ overview_An.overworked.American.ambassador.working.in.the.UK.attempts.to.spend.more.time.with.his.wife.by.visiting.a.countryside.mansion..but.soon.the.trip.turns.into.a.nightmare.with.his.wife.haunted.by.a.stalker.seemingly.from.her.past. <dbl> …
## $ overview_Ash.Williams.and.his.girlfriend.Linda.find.a.log.cabin.in.the.woods.with.a.voice.recording.from.an.archeologist.who.had.recorded.himself.reciting.ancient.chants.from..The.Book.of.the.Dead...As.they.play.the.recording.an.evil.power.is.unleashed.taking.over.Linda.s.body. <dbl> …
## $ overview_Based.on.the.book.by.Jorge.Montenegro..the.film.is.composed.of.four.segments.of.the.famous.fables...El.cadejo....La.sucia....La.fiesta.de.ánimas...and..La.taconuda.. <dbl> …
## $ overview_Beautiful..mysterious...the.occult.history.Jack.Angel.uncovers.in.the.city.of.Bath.is.astonishing..But.the.discoveries.become.increasingly.creepy.and.disturbing...and.the..final.revelation... <dbl> …
## $ overview_Clean.cut..handsome.looking.psychiatric.patient.Daryl.Gleeson.finds.himself.hopelessly.falling.in.love.with.restaurant.owner.Brooke.Daniels..after.having.instinctively.rescued.her.little.son.Mikey.from.a.traffic.accident..When.she.doesn.t.return.his.love.he.snaps.and.begins.to.stalk.her..eliminating.all.who.stand.in.his.way... <dbl> …
## $ overview_Dead.Girls.is.a.horror.anthology.featuring.three.stories.of.terror.connected.through.the.pages.of.dead.girls..diaries..which.chronicle.each.girl.s.act.of.vengeance.against.the.people.who.have.wronged.or.abused.them. <dbl> …
## $ overview_Emmy.who.lives.in.the.UK..comes.to.India.to.organised.a.real.life.game..The.game.is.launched.in.India..5.players.from.5.states.participate.in.it.to.win.Rs..2.Crore..Three.young.boys.and.two.beautiful.girls.become.the.part.of.the.game. <dbl> …
## $ overview_FIEND.FATALE.is.an..epic..horror.adventure.proof.of.concept.short.film..about.five.sisters..a.vampire..zombie..werewolf..demon..and.mermaid..cloned.from.the.DNA.of.extinct.monsters..thrust.into.the.modern.world.and.facing.off.against.the.government..terrorists..and.themselves. <dbl> …
## $ overview_Five.young.adults.venture.into.a.bog.to.excavate.some.bodies..After.a.while.they.find.that.bodies.that.have.been.buried.in.the.bog.have.risen.from.the.dead.and.seek.to.pick.them.off.one.by.one. <dbl> …
## $ overview_For.fame..Raya.and.his.friends.came.to.the.haunted.area..When.making.video..one.of.them.died..They.decided.to.flee.but.the.ghost.is.coming.with.them. <dbl> …
## $ overview_For.four.college.students..a.weekend.camping.trip.turns.into.a.living.nightmare.as.they.stumble.upon.something.in.the.woods.that.should.not.be. <dbl> …
## $ overview_Guests.of.a.country.inn.begin.disappearing.and.dying..after.a.mentally.unstable.man.takes.over.his.terminally.ill.mother.s.bed.and.breakfast.operation. <dbl> …
## $ overview_Haunted.by.recent.events.and.on.the.run..a.man.finds.himself.the.unwitting.pawn.of.a.possessed.evangelical.radio.station.and.like.his.unfortunate.predecessor.must.ask.himself.whether.it.is.better.to.reign.in.hell.than.serve.in.heaven. <dbl> …
## $ overview_In.October.of.2020.food.blogger.Jeff.Blake.and.his.half.brother.Andy.Baker.hit.the.road.on.a.food.tour.that.had.the.potential.to.change.their.lives..They.were.never.seen.again..This.is.their.footage. <dbl> …
## $ overview_In.the.15th.century..a.young.goatherd.living.alone.in.a.mountain.hut.feels.a.dark.presence.in.the.woods. <dbl> …
## $ overview_It.s.always.just.a.party..until.somebody.gets.killed..Audrey.Small..Julie.Sherwood..and.three.of.her.closest.friends.are.preparing.to.throw.a.costume.party..Midsummer.Nightmares..which.everyone.calls..the.social.event.of.the.season..But.there.is.someone.in.their.midsts.who.would.prefer.to.make.this.faux.bloodbath.a.real.one..Can.Audrey.and.her.friends.survive.this.mad.individual.s.evil.plan..Or.will.they.all..one.by.one..be.picked.off.... <dbl> …
## $ overview_Italy..1952..The.young.official.Furio.Momenté.is.sent.to.Veneto.region..near.Polesine..to.investigate.a.shocking.and.mysterious.case..a.minor.has.in.fact.killed.one.of.his.peers.claiming.to.have.killed.the.devil.himself. <dbl> …
## $ overview_More.than.89..less.than.91. <dbl> …
## $ overview_No.overview.found. <dbl> …
## $ overview_Phantom.Lake.County.is.known.for.its.share.of.strange.occurances..from.monster.attacks.to.alien.invasions..However..things.will.never.be.the.same.when..after.experiencing.inexplicable.earthquakes.emanating.from.a.local.system.of.caves..the.area.is.overrun.by.grossly.mutated.creatures.of.unusual.size..a.phenomenon.that.may.point.to.the.return.of.a.long.since.forgotten.menace..Thankfully..the.Phantom.Lake.Kids.are..on.the.job..and.determined.to.solve.the.real.mystery..What.s.the.deal.with.Butch.s.magic.hat. <dbl> …
## $ overview_Rama..Garin..Farel..Quincy.and.Celsi.survive.from.the.spirit.terror.at.the.Ayunan.Island.resort.which.harbored.a.terrible.history.of.the.slaughter.of.a.family.and.resort.employees..Rama.is.frantic..her.lover.is.in.a.mental.hospital..the.body.of.Hana..his.sister..and.Fira.have.not.been.found..Rama.returns.to.Ayunan.Island.to.look.for.them..But.obstacles.lay.ahead. <dbl> …
## $ overview_Sandra.and.Jorge.s.trip.was.expected.to.be.pleasant.but.things.often.don.t.go.as.planned..Sometimes.the.middle.of.the.road.becomes.the.beginning.of.an.utterly.different.story.and.you.never.arrive.to.your.intended.destination. <dbl> …
## $ overview_Six.friends.plan.for.a.trek.to.an.undisclosed.mountain.in.the.Western.Ghats. <dbl> …
## $ overview_Stop.motion.animated.short.film.in.which.a.puppet.on.a.trike.captures.a.puppet.bird.man. <dbl> …
## $ overview_The.actor.Koheiji.is.terribly.in.love.with.the.wife.of.his.best.friend..the.playwright.Takuro..to.get.her..he.would.even.kill.Takuro. <dbl> …
## $ overview_The.Church.sends.in.a.team.to.investigate.the.tragic.deaths.of.a.young.group.found.in.the.crypt.of.a.Convent. <dbl> …
## $ overview_The.Dead.Man.returns..but.it.s.too.late.to.save.us..We.are.already.dead. <dbl> …
## $ overview_The.Kingdom.Of.Shadows.is.a.mystical.cinematic.experience.which.stirs.from.the.darkness.the.spirits.of.our.ancestors.and.reawakens.the.horror.of.unresolved.crimes.and.denied.desires. <dbl> …
## $ overview_The.sequel.is.set.just.weeks.after.Annie.Barlow.s.deadly.confrontation.with.the.Judas.Killer..In.this.elevated.sequel..we.meet.June..a.woman.whose.carefully.constructed.life.is.beginning.to.unravel.due.to.lucid.nightmares.so.awful.they.disturb.her.waking.life <dbl> …
## $ overview_The.spaceship.AAB.Gamma.is.dispatched.from.FAFC.headquarters.in.Japan.to.make.a.landing.on.the.planet.Mars.and.investigate.reports.of.UFOs.in.the.area..As.they.near.the.red.planet..they.encounter.a.mysterious.UFO.that.coats.the.ship.s.hull.with.unusual.spores..Taking.one.of.the.specimens.back.to.earth..it.soon.develops.and.grows.into.a.giant.chicken.lizard.alien.monster.that.tramples.Japan. <dbl> …
## $ overview_The.staff.of.a.black.hair.salon.fend.off.a.strange.new.monster..white.women.intent.on.sucking.the.lifeblood.from.black.culture. <dbl> …
## $ overview_The.story.of.a.student.named.Nikko..Ben.Joshua...engaged.to.Donna..Nia.Ramadhani...Things.are.fine.until.one.day.Nikko.meets.a.waiters.named.Livi..Nadilla.Ernesta..at.a.cafe..Nikko.decides.to.have.an.affair.with.Livi..meanwhile..this.girl.doesn.t.know..that.Nikko.is.engaged..One.day..Livi.was.pregnant.and.he.asked.for.Nikko.s.responsibility..Unable.to.accept.this.fact..Livi.was.accidentally.killed.by.Niko.and.then.dumped.his.body.from.the.Ancol.bridge..Then.the.spirits.of.Livi.continue.to.haunt.Nikko. <dbl> …
## $ overview_The.story.of.Michael.and.Richard.Henderson..two.stepbrothers.from.West.Virginia.who.saw.an.opportunity.in.the.burgeoning.VHS.market.in.the.1980s.and.made.their.own.backyard.horror.movies...The.Curse.of.Stabberman..and..Cannibal.Swim.Club...These.films.would.ve.been.long.forgotten..but.a.recent.resurgence.in.horror.fans.collecting.rare.VHS.tapes.has.put.the.Henderson.Brothers.back.in.the.spotlight..Thanks.to.their.biggest.fan..they.re.sitting.down.for.their.first.on.camera.interview.and.looking.back.on.their.movies...but.they.might.not.be.as.good.as.they.remembered. <dbl> …
## $ overview_Three.friends.making.a.web.series.about.their.town.discover.that.their.neighbors.are.being.killed.and.replaced.by.creatures.who.are.perfect.copies.of.their.victims. <dbl> …
## $ overview_To.keep.her.former.fiancé.from.leaving.her..a.neurotic.young.woman..Louise.Allbritton..fakes.a.riding.accident..and.feigns.paralysis.of.her.lower.body..When.her.graceful.nurse..Grace.Kelly..discovers.her.treachery..she.resorts.to.a.final.and.desperate.act.of.revenge. <dbl> …
## $ overview_Two.inexplicably.coherent.zombies.awake.amidst.a.zombie.attack.and.decide.to.take.a.road.trip.to.find.the.one.s.lost.love..unaware.they.are.being.chased.by.the.agents.of.a.ruthless.company.with.its.own.agenda. <dbl> …
## $ overview_Two.people.Matt.and.Kate.wake.up.in.a.closed.down.prison..They.have.no.idea.how.they.got.there..or.why.they.are.there..In.the.prison.they.encounter.several.terrors.as.they.look.for.an.exit. <dbl> …
## $ overview_Up.Route.tells.the.unnerving.tale.of.a.hitchhiker.who.is.picked.up.by.a.strange.man.and.his..even.stranger..potted.plan <dbl> …
## $ overview_Zombie.short. <dbl> …
# List the top tuning candidates ranked by RMSE.
xgboost_tune %>%
  tune::show_best(metric = "rmse")
## # A tibble: 5 × 12
## trees min_n tree_depth learn_rate loss_reduction sample_size .metric
## <int> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 1309 24 2 0.00694 2.65e+ 0 0.905 rmse
## 2 431 9 10 0.0100 7.91e-10 0.427 rmse
## 3 1020 11 5 0.00253 4.65e- 5 0.672 rmse
## 4 1926 32 7 0.0768 5.00e- 8 0.553 rmse
## 5 121 34 15 0.212 1.08e- 1 0.110 rmse
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## # .config <chr>
# Plug the best (lowest-RMSE) hyperparameters into the workflow, fit once on
# the training set, evaluate on the test set, and report the metrics.
xgboost_fw <- xgboost_tune %>%
  tune::select_best(metric = "rmse") %>%
  tune::finalize_workflow(x = xgboost_workflow, parameters = .)
data_fit <- tune::last_fit(xgboost_fw, split = data_split)
tune::collect_metrics(data_fit)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 rmse standard 0.949 Preprocessor1_Model1
## 2 rsq standard 0.227 Preprocessor1_Model1
# Observed vs. predicted vote_average on the test set; the dashed identity
# line marks perfect predictions.
tune::collect_predictions(data_fit) %>%
  ggplot(aes(vote_average, .pred)) +
  # `fill` has no effect on the default (solid) point shape, so the points
  # were drawn black; `color` is what actually tints them.
  geom_point(alpha = 0.3, color = "midnightblue") +
  geom_abline(lty = 2, color = "gray50") +
  coord_fixed()
# Re-split the data, this time stratifying on the outcome so train and test
# have similar vote_average distributions.
set.seed(123)
data_split <- rsample::initial_split(data, strata = vote_average)
data_train <- rsample::training(data_split)
data_test <- rsample::testing(data_split)
# Stratified cross-validation folds of the new training set
set.seed(234)
data_folds <- rsample::vfold_cv(data_train, strata = vote_average)
## Warning: The number of observations in each quantile is below the recommended threshold of 20.
## • Stratification will use 3 breaks instead.
# Text preprocessing: tokenize `overview`, keep at most 100 tokens, and
# convert them to term-frequency features.
movie_rec <-
  recipe(formula = vote_average ~ overview, data = data_train) %>%
  step_tokenize(overview) %>%
  step_tokenfilter(overview, max_tokens = 100) %>%
  step_tf(overview)
# Linear SVM regression specification using the kernlab engine
library(parsnip)
svm_spec <-
  svm_linear() %>%
  set_mode("regression") %>%
  set_engine("kernlab")
# Combine the SVM spec with the text recipe in one workflow
movie_workflow_svm <-
  workflow() %>%
  add_model(svm_spec) %>%
  add_recipe(movie_rec)
# Fit workflow to the training data
library(kernlab)
## Warning: package 'kernlab' was built under R version 4.4.1
# Fit the SVM workflow on the training set (seeded for reproducibility)
set.seed(345)
movie_fit_svm <- fit(movie_workflow_svm, data = data_train)
## Setting default kernel parameters
# Predict on the test set and attach the predictions to the test rows
library(yardstick)
movie_predictions_svm <- bind_cols(
  predict(movie_fit_svm, data_test),
  data_test
)
# Evaluate performance: compare predictions (.pred) with the observed
# vote_average
movie_metrics_svm <- metrics(
  movie_predictions_svm,
  truth = vote_average,
  estimate = .pred
)
movie_metrics_svm
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 2.31
## 2 rsq standard 0.0119
## 3 mae standard 1.52
In this analysis, I initially used a basic linear regression model and added improvements throughout to enhance the predictive power of that model. Specifically, I used a text preprocessing pipeline with step_tokenize, step_tokenfilter, and step_tf to process the “overview” text data. This transformed the text into term frequency values, so I could capture relevant information from the descriptions of the movies themselves, which might contain patterns helpful for predicting vote_average, my target variable.
I then replaced my basic linear regression model with a linear SVM model, which is better at handling complex relationships within data. After training and evaluating both models, I observed that my changes had an impact on the Root Mean Squared Error (RMSE). The RMSE improved, suggesting that the SVM model might be better at minimizing prediction errors than the basic linear regression model. The new model also captured more of the variance in vote_average than the basic linear regression model I first used.