Based on decisioin tree predicting model, “DIRECTING” is the best predictor of “Best Picture.”

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.1.3
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.1.3
require(ggplot2)
## Loading required package: ggplot2
academy <- read.csv("~/IS 607/academy_awards_project3.csv", header=T)

by_category <- group_by(academy,Category,Nominee, Won )
tb1 <- filter (by_category, Category=="Best Picture",  Won=="YES")
tb1
## Source: local data frame [83 x 5]
## Groups: Category, Nominee, Won
## 
##           Year     Category                                       Nominee
## 1  2010 (83rd) Best Picture                             The King's Speech
## 2  2009 (82nd) Best Picture                               The Hurt Locker
## 3  2008 (81st) Best Picture                           Slumdog Millionaire
## 4  2007 (80th) Best Picture                        No Country for Old Men
## 5  2006 (79th) Best Picture                                  The Departed
## 6  2005 (78th) Best Picture                                         Crash
## 7  2004 (77th) Best Picture                           Million Dollar Baby
## 8  2003 (76th) Best Picture The Lord of the Rings: The Return of the King
## 9  2002 (75th) Best Picture                                       Chicago
## 10 2001 (74th) Best Picture                              A Beautiful Mind
## ..         ...          ...                                           ...
## Variables not shown: Additional.Info (fctr), Won (fctr)
tb2 <- filter(academy, Nominee==tb1$Nominee, Won=="YES")
## Warning in is.na(e1) | is.na(e2): longer object length is not a multiple
## of shorter object length
## Warning in `==.default`(structure(c(1929L, 1938L, 1952L, 872L, 1906L,
## 818L, : longer object length is not a multiple of shorter object length
summarise(tb2, Category = n_distinct(Category),
          counts =n()) 
##   Category counts
## 1        4      5

Downloaded Group file

Awards <- read.csv("~/IS 607/Awards_File.csv", header=T)


head(Awards)
##   movie_id movie_name year category_id              category_name won
## 1        1   BIUTIFUL 2010           1      ACTOR -- LEADING ROLE   0
## 2        2  TRUE GRIT 2010           1      ACTOR -- LEADING ROLE   0
## 3        2  TRUE GRIT 2010           4 ACTRESS -- SUPPORTING ROLE   0
## 4        2  TRUE GRIT 2010           6              ART DIRECTION   0
## 5        2  TRUE GRIT 2010           7             CINEMATOGRAPHY   0
## 6        2  TRUE GRIT 2010           8             COSTUME DESIGN   0
#to find category_id of "Best Picture"

distinct(select(Awards, category_id, category_name))
##    category_id               category_name
## 1            1       ACTOR -- LEADING ROLE
## 2            4  ACTRESS -- SUPPORTING ROLE
## 3            6               ART DIRECTION
## 4            7              CINEMATOGRAPHY
## 5            8              COSTUME DESIGN
## 6            9                   DIRECTING
## 7           16                BEST PICTURE
## 8           19                       SOUND
## 9           20               SOUND EDITING
## 10          22                     WRITING
## 11          12                FILM EDITING
## 12          14             MUSIC (SCORING)
## 13           2    ACTOR -- SUPPORTING ROLE
## 14          15                MUSIC (SONG)
## 15           3     ACTRESS -- LEADING ROLE
## 16           5       ANIMATED FEATURE FILM
## 17          21              VISUAL EFFECTS
## 18          10       DOCUMENTARY (FEATURE)
## 19          11 DOCUMENTARY (SHORT SUBJECT)
## 20          13                      MAKEUP
## 21          17       SHORT FILM (ANIMATED)
## 22          18    SHORT FILM (LIVE ACTION)
## 23          23         DOCUMENTARY (OTHER)

Make dataframe for analysis

awards_df <- read.csv("~/IS 607/Awards_File.csv", stringsAsFactors=F)
awards_modified <- awards_df[,c(1,3,5,6)]


head(awards_modified)
##   movie_id year              category_name won
## 1        1 2010      ACTOR -- LEADING ROLE   0
## 2        2 2010      ACTOR -- LEADING ROLE   0
## 3        2 2010 ACTRESS -- SUPPORTING ROLE   0
## 4        2 2010              ART DIRECTION   0
## 5        2 2010             CINEMATOGRAPHY   0
## 6        2 2010             COSTUME DESIGN   0
# Fix the duplicate in category ( From Sekhar Mekala http://rpubs.com/msekhar12/68495)

awards_modified <- awards_modified %>%
  group_by(movie_id,year,category_name)%>%
  
  summarise(won=max(won))
# Make wide dataframe (From Sekhar Mekala http://rpubs.com/msekhar12/68495)

awards_re_modified <- spread((awards_modified),category_name,won)

head(awards_re_modified)
## Source: local data frame [6 x 25]
## 
##   movie_id year ACTOR -- LEADING ROLE ACTOR -- SUPPORTING ROLE
## 1        1 2010                     0                       NA
## 2        2 2010                     0                       NA
## 3        3 2010                     0                       NA
## 4        4 2010                     1                        0
## 5        5 2010                     0                       NA
## 6        6 2010                    NA                        1
## Variables not shown: ACTRESS -- LEADING ROLE (int), ACTRESS -- SUPPORTING
##   ROLE (int), ANIMATED FEATURE FILM (int), ART DIRECTION (int), BEST
##   PICTURE (int), CINEMATOGRAPHY (int), COSTUME DESIGN (int), DIRECTING
##   (int), DOCUMENTARY (FEATURE) (int), DOCUMENTARY (OTHER) (int),
##   DOCUMENTARY (SHORT SUBJECT) (int), FILM EDITING (int), MAKEUP (int),
##   MUSIC (SCORING) (int), MUSIC (SONG) (int), SHORT FILM (ANIMATED) (int),
##   SHORT FILM (LIVE ACTION) (int), SOUND (int), SOUND EDITING (int), VISUAL
##   EFFECTS (int), WRITING (int)
#Preparing data for Decision tree

library(party)
## Warning: package 'party' was built under R version 3.1.3
## Loading required package: grid
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 3.1.3
## Loading required package: modeltools
## Warning: package 'modeltools' was built under R version 3.1.3
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.1.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.1.3
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.1.3
library(rpart)
## Warning: package 'rpart' was built under R version 3.1.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.1.3
df <- awards_re_modified[-2]

        
str(df)
## Classes 'tbl_df' and 'data.frame':   4145 obs. of  24 variables:
##  $ movie_id                   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ ACTOR -- LEADING ROLE      : int  0 0 0 1 0 NA NA NA NA NA ...
##  $ ACTOR -- SUPPORTING ROLE   : int  NA NA NA 0 NA 1 0 0 0 NA ...
##  $ ACTRESS -- LEADING ROLE    : int  NA NA NA NA NA NA 0 NA 0 0 ...
##  $ ACTRESS -- SUPPORTING ROLE : int  NA 0 NA 0 NA 1 NA NA NA NA ...
##  $ ANIMATED FEATURE FILM      : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ ART DIRECTION              : int  NA 0 NA 0 NA NA NA NA NA NA ...
##  $ BEST PICTURE               : int  NA 0 0 1 0 0 0 NA 0 NA ...
##  $ CINEMATOGRAPHY             : int  NA 0 0 0 NA NA NA NA NA NA ...
##  $ COSTUME DESIGN             : int  NA 0 NA 0 NA NA NA NA NA NA ...
##  $ DIRECTING                  : int  NA 0 0 1 NA 0 NA NA NA NA ...
##  $ DOCUMENTARY (FEATURE)      : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ DOCUMENTARY (OTHER)        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ DOCUMENTARY (SHORT SUBJECT): int  NA NA NA NA NA NA NA NA NA NA ...
##  $ FILM EDITING               : int  NA NA 1 0 0 0 NA NA NA NA ...
##  $ MAKEUP                     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ MUSIC (SCORING)            : int  NA NA 1 0 0 NA NA NA NA NA ...
##  $ MUSIC (SONG)               : int  NA NA NA NA 0 NA NA NA NA NA ...
##  $ SHORT FILM (ANIMATED)      : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ SHORT FILM (LIVE ACTION)   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ SOUND                      : int  NA 0 0 0 NA NA NA NA NA NA ...
##  $ SOUND EDITING              : int  NA 0 NA NA NA NA NA NA NA NA ...
##  $ VISUAL EFFECTS             : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ WRITING                    : int  NA 0 1 1 0 0 0 NA 0 NA ...

df_ctree <- ctree( df\("16" ~ df\)“12” + df\("9" + df\)“22”, data=df) print(df_ctree)

#The process of building a classical decision tree starts with a binary outcome variable (YES/NO in Won) and predictor variables (Categories).


df$best <- factor(df$"BEST PICTURE", levels=c(0,1), labels=c("no","yes"))
df$edit <- factor(df$"FILM EDITING", levels=c(0,1), labels=c("no","yes"))
df$directing <- factor(df$"DIRECTING", levels=c(0,1), labels=c("no","yes"))
df$writing <- factor(df$"WRITING", levels=c(0,1), labels=c("no","yes"))
df$actor <- factor(df$"ACTOR -- LEADING ROLE", levels=c(0,1), labels=c("no","yes"))
df$actress <- factor(df$"ACTRESS -- LEADING ROLE", levels=c(0,1), labels=c("no","yes"))


set.seed(1234)
train <- sample(nrow(df), 0.7*nrow(df))
df.train <- df[train,]
df.validate <- df[-train,]



# The train sample (70%) will be used to create classification schemes using logistic regresion, a decision tree, a conditional decision tree,a random forest, and a support vector machine.



table(df.train$best)
## 
##  no yes 
## 286  60
table(df.train$edit)
## 
##  no yes 
## 217  44
table(df.train$directing)
## 
##  no yes 
## 217  62
table(df.train$writing)
## 
##  no yes 
## 498 124
table(df.train$actor)
## 
##  no yes 
## 216  62
table(df.train$actress)
## 
##  no yes 
## 232  64
# The validation sample (30%) will be used to evaluate the effectiveness of these schemes. By using the same example, we will compare the results of each approach.

table(df.validate$best)
## 
##  no yes 
## 115  23
table(df.validate$edit)
## 
##  no yes 
##  91  33
table(df.validate$directing)
## 
##  no yes 
## 107  23
table(df.validate$writing)
## 
##  no yes 
## 195  53
table(df.validate$actor)
## 
##  no yes 
##  94  23
table(df.validate$actress)
## 
##  no yes 
##  90  20

Grow tree and pruning

set.seed(1234)
dtree <- rpart(best ~ edit+directing+writing+actor+actress, data=df.train, method="class")


dtree$cptable
##          CP nsplit rel error    xerror       xstd
## 1 0.5344828      0 1.0000000 1.0000000 0.11860344
## 2 0.0100000      1 0.4655172 0.4655172 0.08566331
plotcp(dtree)

dtree.pruned <- prune(dtree, cp=0.38)
prp(dtree.pruned, type=4, extra=104, fallen.leaves=TRUE, main="Best Picture")

plot(dtree.pruned, uniform=TRUE, 
    main="BEST PICTURE")
 text(dtree.pruned, use.n=TRUE, all=TRUE, cex=.8)