library(dplyr)
## Warning: package 'dplyr' was built under R version 3.1.3
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.1.3
require(ggplot2)
## Loading required package: ggplot2
academy <- read.csv("~/IS 607/academy_awards_project3.csv", header=T)
by_category <- group_by(academy,Category,Nominee, Won )
tb1 <- filter (by_category, Category=="Best Picture", Won=="YES")
tb1
## Source: local data frame [83 x 5]
## Groups: Category, Nominee, Won
##
## Year Category Nominee
## 1 2010 (83rd) Best Picture The King's Speech
## 2 2009 (82nd) Best Picture The Hurt Locker
## 3 2008 (81st) Best Picture Slumdog Millionaire
## 4 2007 (80th) Best Picture No Country for Old Men
## 5 2006 (79th) Best Picture The Departed
## 6 2005 (78th) Best Picture Crash
## 7 2004 (77th) Best Picture Million Dollar Baby
## 8 2003 (76th) Best Picture The Lord of the Rings: The Return of the King
## 9 2002 (75th) Best Picture Chicago
## 10 2001 (74th) Best Picture A Beautiful Mind
## .. ... ... ...
## Variables not shown: Additional.Info (fctr), Won (fctr)
tb2 <- filter(academy, Nominee==tb1$Nominee, Won=="YES")
## Warning in is.na(e1) | is.na(e2): longer object length is not a multiple
## of shorter object length
## Warning in `==.default`(structure(c(1929L, 1938L, 1952L, 872L, 1906L,
## 818L, : longer object length is not a multiple of shorter object length
summarise(tb2, Category = n_distinct(Category),
counts =n())
## Category counts
## 1 4 5
Downloaded Group file
Awards <- read.csv("~/IS 607/Awards_File.csv", header=T)
head(Awards)
## movie_id movie_name year category_id category_name won
## 1 1 BIUTIFUL 2010 1 ACTOR -- LEADING ROLE 0
## 2 2 TRUE GRIT 2010 1 ACTOR -- LEADING ROLE 0
## 3 2 TRUE GRIT 2010 4 ACTRESS -- SUPPORTING ROLE 0
## 4 2 TRUE GRIT 2010 6 ART DIRECTION 0
## 5 2 TRUE GRIT 2010 7 CINEMATOGRAPHY 0
## 6 2 TRUE GRIT 2010 8 COSTUME DESIGN 0
#to find category_id of "Best Picture"
distinct(select(Awards, category_id, category_name))
## category_id category_name
## 1 1 ACTOR -- LEADING ROLE
## 2 4 ACTRESS -- SUPPORTING ROLE
## 3 6 ART DIRECTION
## 4 7 CINEMATOGRAPHY
## 5 8 COSTUME DESIGN
## 6 9 DIRECTING
## 7 16 BEST PICTURE
## 8 19 SOUND
## 9 20 SOUND EDITING
## 10 22 WRITING
## 11 12 FILM EDITING
## 12 14 MUSIC (SCORING)
## 13 2 ACTOR -- SUPPORTING ROLE
## 14 15 MUSIC (SONG)
## 15 3 ACTRESS -- LEADING ROLE
## 16 5 ANIMATED FEATURE FILM
## 17 21 VISUAL EFFECTS
## 18 10 DOCUMENTARY (FEATURE)
## 19 11 DOCUMENTARY (SHORT SUBJECT)
## 20 13 MAKEUP
## 21 17 SHORT FILM (ANIMATED)
## 22 18 SHORT FILM (LIVE ACTION)
## 23 23 DOCUMENTARY (OTHER)
Make dataframe for analysis
awards_df <- read.csv("~/IS 607/Awards_File.csv", stringsAsFactors=F)
awards_modified <- awards_df[,c(1,3,5,6)]
head(awards_modified)
## movie_id year category_name won
## 1 1 2010 ACTOR -- LEADING ROLE 0
## 2 2 2010 ACTOR -- LEADING ROLE 0
## 3 2 2010 ACTRESS -- SUPPORTING ROLE 0
## 4 2 2010 ART DIRECTION 0
## 5 2 2010 CINEMATOGRAPHY 0
## 6 2 2010 COSTUME DESIGN 0
# Fix the duplicate in category ( From Sekhar Mekala http://rpubs.com/msekhar12/68495)
awards_modified <- awards_modified %>%
group_by(movie_id,year,category_name)%>%
summarise(won=max(won))
# Make wide dataframe (From Sekhar Mekala http://rpubs.com/msekhar12/68495)
awards_re_modified <- spread((awards_modified),category_name,won)
head(awards_re_modified)
## Source: local data frame [6 x 25]
##
## movie_id year ACTOR -- LEADING ROLE ACTOR -- SUPPORTING ROLE
## 1 1 2010 0 NA
## 2 2 2010 0 NA
## 3 3 2010 0 NA
## 4 4 2010 1 0
## 5 5 2010 0 NA
## 6 6 2010 NA 1
## Variables not shown: ACTRESS -- LEADING ROLE (int), ACTRESS -- SUPPORTING
## ROLE (int), ANIMATED FEATURE FILM (int), ART DIRECTION (int), BEST
## PICTURE (int), CINEMATOGRAPHY (int), COSTUME DESIGN (int), DIRECTING
## (int), DOCUMENTARY (FEATURE) (int), DOCUMENTARY (OTHER) (int),
## DOCUMENTARY (SHORT SUBJECT) (int), FILM EDITING (int), MAKEUP (int),
## MUSIC (SCORING) (int), MUSIC (SONG) (int), SHORT FILM (ANIMATED) (int),
## SHORT FILM (LIVE ACTION) (int), SOUND (int), SOUND EDITING (int), VISUAL
## EFFECTS (int), WRITING (int)
#Preparing data for Decision tree
library(party)
## Warning: package 'party' was built under R version 3.1.3
## Loading required package: grid
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 3.1.3
## Loading required package: modeltools
## Warning: package 'modeltools' was built under R version 3.1.3
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.1.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.1.3
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.1.3
library(rpart)
## Warning: package 'rpart' was built under R version 3.1.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.1.3
df <- awards_re_modified[-2]
str(df)
## Classes 'tbl_df' and 'data.frame': 4145 obs. of 24 variables:
## $ movie_id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ ACTOR -- LEADING ROLE : int 0 0 0 1 0 NA NA NA NA NA ...
## $ ACTOR -- SUPPORTING ROLE : int NA NA NA 0 NA 1 0 0 0 NA ...
## $ ACTRESS -- LEADING ROLE : int NA NA NA NA NA NA 0 NA 0 0 ...
## $ ACTRESS -- SUPPORTING ROLE : int NA 0 NA 0 NA 1 NA NA NA NA ...
## $ ANIMATED FEATURE FILM : int NA NA NA NA NA NA NA NA NA NA ...
## $ ART DIRECTION : int NA 0 NA 0 NA NA NA NA NA NA ...
## $ BEST PICTURE : int NA 0 0 1 0 0 0 NA 0 NA ...
## $ CINEMATOGRAPHY : int NA 0 0 0 NA NA NA NA NA NA ...
## $ COSTUME DESIGN : int NA 0 NA 0 NA NA NA NA NA NA ...
## $ DIRECTING : int NA 0 0 1 NA 0 NA NA NA NA ...
## $ DOCUMENTARY (FEATURE) : int NA NA NA NA NA NA NA NA NA NA ...
## $ DOCUMENTARY (OTHER) : int NA NA NA NA NA NA NA NA NA NA ...
## $ DOCUMENTARY (SHORT SUBJECT): int NA NA NA NA NA NA NA NA NA NA ...
## $ FILM EDITING : int NA NA 1 0 0 0 NA NA NA NA ...
## $ MAKEUP : int NA NA NA NA NA NA NA NA NA NA ...
## $ MUSIC (SCORING) : int NA NA 1 0 0 NA NA NA NA NA ...
## $ MUSIC (SONG) : int NA NA NA NA 0 NA NA NA NA NA ...
## $ SHORT FILM (ANIMATED) : int NA NA NA NA NA NA NA NA NA NA ...
## $ SHORT FILM (LIVE ACTION) : int NA NA NA NA NA NA NA NA NA NA ...
## $ SOUND : int NA 0 0 0 NA NA NA NA NA NA ...
## $ SOUND EDITING : int NA 0 NA NA NA NA NA NA NA NA ...
## $ VISUAL EFFECTS : int NA NA NA NA NA NA NA NA NA NA ...
## $ WRITING : int NA 0 1 1 0 0 0 NA 0 NA ...
df_ctree <- ctree( df\("16" ~ df\)“12” + df\("9" + df\)“22”, data=df) print(df_ctree)
#The process of building a classical decision tree starts with a binary outcome variable (YES/NO in Won) and predictor variables (Categories).
df$best <- factor(df$"BEST PICTURE", levels=c(0,1), labels=c("no","yes"))
df$edit <- factor(df$"FILM EDITING", levels=c(0,1), labels=c("no","yes"))
df$directing <- factor(df$"DIRECTING", levels=c(0,1), labels=c("no","yes"))
df$writing <- factor(df$"WRITING", levels=c(0,1), labels=c("no","yes"))
df$actor <- factor(df$"ACTOR -- LEADING ROLE", levels=c(0,1), labels=c("no","yes"))
df$actress <- factor(df$"ACTRESS -- LEADING ROLE", levels=c(0,1), labels=c("no","yes"))
set.seed(1234)
train <- sample(nrow(df), 0.7*nrow(df))
df.train <- df[train,]
df.validate <- df[-train,]
# The train sample (70%) will be used to create classification schemes using logistic regresion, a decision tree, a conditional decision tree,a random forest, and a support vector machine.
table(df.train$best)
##
## no yes
## 286 60
table(df.train$edit)
##
## no yes
## 217 44
table(df.train$directing)
##
## no yes
## 217 62
table(df.train$writing)
##
## no yes
## 498 124
table(df.train$actor)
##
## no yes
## 216 62
table(df.train$actress)
##
## no yes
## 232 64
# The validation sample (30%) will be used to evaluate the effectiveness of these schemes. By using the same example, we will compare the results of each approach.
table(df.validate$best)
##
## no yes
## 115 23
table(df.validate$edit)
##
## no yes
## 91 33
table(df.validate$directing)
##
## no yes
## 107 23
table(df.validate$writing)
##
## no yes
## 195 53
table(df.validate$actor)
##
## no yes
## 94 23
table(df.validate$actress)
##
## no yes
## 90 20
Grow tree and pruning
set.seed(1234)
dtree <- rpart(best ~ edit+directing+writing+actor+actress, data=df.train, method="class")
dtree$cptable
## CP nsplit rel error xerror xstd
## 1 0.5344828 0 1.0000000 1.0000000 0.11860344
## 2 0.0100000 1 0.4655172 0.4655172 0.08566331
plotcp(dtree)
dtree.pruned <- prune(dtree, cp=0.38)
prp(dtree.pruned, type=4, extra=104, fallen.leaves=TRUE, main="Best Picture")
plot(dtree.pruned, uniform=TRUE,
main="BEST PICTURE")
text(dtree.pruned, use.n=TRUE, all=TRUE, cex=.8)