project3_decision tree

Based on decisioin tree predicting model, “DIRECTING” is the best predictor of “Best Picture.”

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.1.3

## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)

## Warning: package 'tidyr' was built under R version 3.1.3

require(ggplot2)

## Loading required package: ggplot2

academy <- read.csv("~/IS 607/academy_awards_project3.csv", header=T)

by_category <- group_by(academy,Category,Nominee, Won )
tb1 <- filter (by_category, Category=="Best Picture",  Won=="YES")
tb1

## Source: local data frame [83 x 5]
## Groups: Category, Nominee, Won
## 
##           Year     Category                                       Nominee
## 1  2010 (83rd) Best Picture                             The King's Speech
## 2  2009 (82nd) Best Picture                               The Hurt Locker
## 3  2008 (81st) Best Picture                           Slumdog Millionaire
## 4  2007 (80th) Best Picture                        No Country for Old Men
## 5  2006 (79th) Best Picture                                  The Departed
## 6  2005 (78th) Best Picture                                         Crash
## 7  2004 (77th) Best Picture                           Million Dollar Baby
## 8  2003 (76th) Best Picture The Lord of the Rings: The Return of the King
## 9  2002 (75th) Best Picture                                       Chicago
## 10 2001 (74th) Best Picture                              A Beautiful Mind
## ..         ...          ...                                           ...
## Variables not shown: Additional.Info (fctr), Won (fctr)

tb2 <- filter(academy, Nominee==tb1$Nominee, Won=="YES")

## Warning in is.na(e1) | is.na(e2): longer object length is not a multiple
## of shorter object length

## Warning in `==.default`(structure(c(1929L, 1938L, 1952L, 872L, 1906L,
## 818L, : longer object length is not a multiple of shorter object length

summarise(tb2, Category = n_distinct(Category),
          counts =n())

##   Category counts
## 1        4      5

Downloaded Group file

Awards <- read.csv("~/IS 607/Awards_File.csv", header=T)


head(Awards)

##   movie_id movie_name year category_id              category_name won
## 1        1   BIUTIFUL 2010           1      ACTOR -- LEADING ROLE   0
## 2        2  TRUE GRIT 2010           1      ACTOR -- LEADING ROLE   0
## 3        2  TRUE GRIT 2010           4 ACTRESS -- SUPPORTING ROLE   0
## 4        2  TRUE GRIT 2010           6              ART DIRECTION   0
## 5        2  TRUE GRIT 2010           7             CINEMATOGRAPHY   0
## 6        2  TRUE GRIT 2010           8             COSTUME DESIGN   0

#to find category_id of "Best Picture"

distinct(select(Awards, category_id, category_name))

##    category_id               category_name
## 1            1       ACTOR -- LEADING ROLE
## 2            4  ACTRESS -- SUPPORTING ROLE
## 3            6               ART DIRECTION
## 4            7              CINEMATOGRAPHY
## 5            8              COSTUME DESIGN
## 6            9                   DIRECTING
## 7           16                BEST PICTURE
## 8           19                       SOUND
## 9           20               SOUND EDITING
## 10          22                     WRITING
## 11          12                FILM EDITING
## 12          14             MUSIC (SCORING)
## 13           2    ACTOR -- SUPPORTING ROLE
## 14          15                MUSIC (SONG)
## 15           3     ACTRESS -- LEADING ROLE
## 16           5       ANIMATED FEATURE FILM
## 17          21              VISUAL EFFECTS
## 18          10       DOCUMENTARY (FEATURE)
## 19          11 DOCUMENTARY (SHORT SUBJECT)
## 20          13                      MAKEUP
## 21          17       SHORT FILM (ANIMATED)
## 22          18    SHORT FILM (LIVE ACTION)
## 23          23         DOCUMENTARY (OTHER)

Make dataframe for analysis

awards_df <- read.csv("~/IS 607/Awards_File.csv", stringsAsFactors=F)
awards_modified <- awards_df[,c(1,3,5,6)]


head(awards_modified)

##   movie_id year              category_name won
## 1        1 2010      ACTOR -- LEADING ROLE   0
## 2        2 2010      ACTOR -- LEADING ROLE   0
## 3        2 2010 ACTRESS -- SUPPORTING ROLE   0
## 4        2 2010              ART DIRECTION   0
## 5        2 2010             CINEMATOGRAPHY   0
## 6        2 2010             COSTUME DESIGN   0

# Fix the duplicate in category ( From Sekhar Mekala http://rpubs.com/msekhar12/68495)

awards_modified <- awards_modified %>%
  group_by(movie_id,year,category_name)%>%
  
  summarise(won=max(won))
# Make wide dataframe (From Sekhar Mekala http://rpubs.com/msekhar12/68495)

awards_re_modified <- spread((awards_modified),category_name,won)

head(awards_re_modified)

## Source: local data frame [6 x 25]
## 
##   movie_id year ACTOR -- LEADING ROLE ACTOR -- SUPPORTING ROLE
## 1        1 2010                     0                       NA
## 2        2 2010                     0                       NA
## 3        3 2010                     0                       NA
## 4        4 2010                     1                        0
## 5        5 2010                     0                       NA
## 6        6 2010                    NA                        1
## Variables not shown: ACTRESS -- LEADING ROLE (int), ACTRESS -- SUPPORTING
##   ROLE (int), ANIMATED FEATURE FILM (int), ART DIRECTION (int), BEST
##   PICTURE (int), CINEMATOGRAPHY (int), COSTUME DESIGN (int), DIRECTING
##   (int), DOCUMENTARY (FEATURE) (int), DOCUMENTARY (OTHER) (int),
##   DOCUMENTARY (SHORT SUBJECT) (int), FILM EDITING (int), MAKEUP (int),
##   MUSIC (SCORING) (int), MUSIC (SONG) (int), SHORT FILM (ANIMATED) (int),
##   SHORT FILM (LIVE ACTION) (int), SOUND (int), SOUND EDITING (int), VISUAL
##   EFFECTS (int), WRITING (int)

#Preparing data for Decision tree

library(party)

## Warning: package 'party' was built under R version 3.1.3

## Loading required package: grid
## Loading required package: mvtnorm

## Warning: package 'mvtnorm' was built under R version 3.1.3

## Loading required package: modeltools

## Warning: package 'modeltools' was built under R version 3.1.3

## Loading required package: stats4
## Loading required package: strucchange

## Warning: package 'strucchange' was built under R version 3.1.3

## Loading required package: zoo

## Warning: package 'zoo' was built under R version 3.1.3

## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: sandwich

## Warning: package 'sandwich' was built under R version 3.1.3

library(rpart)

## Warning: package 'rpart' was built under R version 3.1.3

library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 3.1.3

df <- awards_re_modified[-2]

        
str(df)

## Classes 'tbl_df' and 'data.frame':   4145 obs. of  24 variables:
##  $ movie_id                   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ ACTOR -- LEADING ROLE      : int  0 0 0 1 0 NA NA NA NA NA ...
##  $ ACTOR -- SUPPORTING ROLE   : int  NA NA NA 0 NA 1 0 0 0 NA ...
##  $ ACTRESS -- LEADING ROLE    : int  NA NA NA NA NA NA 0 NA 0 0 ...
##  $ ACTRESS -- SUPPORTING ROLE : int  NA 0 NA 0 NA 1 NA NA NA NA ...
##  $ ANIMATED FEATURE FILM      : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ ART DIRECTION              : int  NA 0 NA 0 NA NA NA NA NA NA ...
##  $ BEST PICTURE               : int  NA 0 0 1 0 0 0 NA 0 NA ...
##  $ CINEMATOGRAPHY             : int  NA 0 0 0 NA NA NA NA NA NA ...
##  $ COSTUME DESIGN             : int  NA 0 NA 0 NA NA NA NA NA NA ...
##  $ DIRECTING                  : int  NA 0 0 1 NA 0 NA NA NA NA ...
##  $ DOCUMENTARY (FEATURE)      : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ DOCUMENTARY (OTHER)        : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ DOCUMENTARY (SHORT SUBJECT): int  NA NA NA NA NA NA NA NA NA NA ...
##  $ FILM EDITING               : int  NA NA 1 0 0 0 NA NA NA NA ...
##  $ MAKEUP                     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ MUSIC (SCORING)            : int  NA NA 1 0 0 NA NA NA NA NA ...
##  $ MUSIC (SONG)               : int  NA NA NA NA 0 NA NA NA NA NA ...
##  $ SHORT FILM (ANIMATED)      : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ SHORT FILM (LIVE ACTION)   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ SOUND                      : int  NA 0 0 0 NA NA NA NA NA NA ...
##  $ SOUND EDITING              : int  NA 0 NA NA NA NA NA NA NA NA ...
##  $ VISUAL EFFECTS             : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ WRITING                    : int  NA 0 1 1 0 0 0 NA 0 NA ...

df_ctree <- ctree( df\("16" ~ df\)“12” + df\("9" + df\)“22”, data=df) print(df_ctree)

#The process of building a classical decision tree starts with a binary outcome variable (YES/NO in Won) and predictor variables (Categories).


df$best <- factor(df$"BEST PICTURE", levels=c(0,1), labels=c("no","yes"))
df$edit <- factor(df$"FILM EDITING", levels=c(0,1), labels=c("no","yes"))
df$directing <- factor(df$"DIRECTING", levels=c(0,1), labels=c("no","yes"))
df$writing <- factor(df$"WRITING", levels=c(0,1), labels=c("no","yes"))
df$actor <- factor(df$"ACTOR -- LEADING ROLE", levels=c(0,1), labels=c("no","yes"))
df$actress <- factor(df$"ACTRESS -- LEADING ROLE", levels=c(0,1), labels=c("no","yes"))


set.seed(1234)
train <- sample(nrow(df), 0.7*nrow(df))
df.train <- df[train,]
df.validate <- df[-train,]



# The train sample (70%) will be used to create classification schemes using logistic regresion, a decision tree, a conditional decision tree,a random forest, and a support vector machine.



table(df.train$best)

## 
##  no yes 
## 286  60

table(df.train$edit)

## 
##  no yes 
## 217  44

table(df.train$directing)

## 
##  no yes 
## 217  62

table(df.train$writing)

## 
##  no yes 
## 498 124

table(df.train$actor)

## 
##  no yes 
## 216  62

table(df.train$actress)

## 
##  no yes 
## 232  64

# The validation sample (30%) will be used to evaluate the effectiveness of these schemes. By using the same example, we will compare the results of each approach.

table(df.validate$best)

## 
##  no yes 
## 115  23

table(df.validate$edit)

## 
##  no yes 
##  91  33

table(df.validate$directing)

## 
##  no yes 
## 107  23

table(df.validate$writing)

## 
##  no yes 
## 195  53

table(df.validate$actor)

## 
##  no yes 
##  94  23

table(df.validate$actress)

## 
##  no yes 
##  90  20

Grow tree and pruning

set.seed(1234)
dtree <- rpart(best ~ edit+directing+writing+actor+actress, data=df.train, method="class")


dtree$cptable

##          CP nsplit rel error    xerror       xstd
## 1 0.5344828      0 1.0000000 1.0000000 0.11860344
## 2 0.0100000      1 0.4655172 0.4655172 0.08566331

plotcp(dtree)

dtree.pruned <- prune(dtree, cp=0.38)
prp(dtree.pruned, type=4, extra=104, fallen.leaves=TRUE, main="Best Picture")

plot(dtree.pruned, uniform=TRUE, 
    main="BEST PICTURE")
 text(dtree.pruned, use.n=TRUE, all=TRUE, cex=.8)

project3_decision tree

Jamey Etherton

Thursday, March 26, 2015

Based on decisioin tree predicting model, “DIRECTING” is the best predictor of “Best Picture.”