I have used a simple point-based approach. If the Best Picture movie was also nominated in another category, that category gets one point; if the Best Picture movie also won in that category, the category gets two points instead. The categories with the most points are the best indicators (predictors) of the Best Picture movie.
Based on that method, the DIRECTING category is the best indicator with 139 points. The WRITING category is the second-best predictor with 130 points.
Required libraries
library(dplyr)
library(tidyr)
require(ggplot2)
Load the CSV file and clean up some duplicate rows
# Load the raw awards table. The path is machine-specific; adjust it when
# running on another machine.
df.MovieAward <- read.csv(
  "C:/Senthil/MSDataAnalytics/GitHub/607Project3/Final_Analysis_File/Awards_File.csv",
  header = TRUE
)

# Collapse duplicate rows to one row per movie/year/category, keeping
# max(won) for each key so a nomination counts as a win if any of its
# duplicate rows records one. (Presumably plain unique() was not enough
# because duplicates can disagree on `won` -- confirm against the raw file.)
# summarize() drops only the last grouping level (category_name), so the
# result is still grouped by movie_id, movie_name, year and category_id.
df.MovieAward <- df.MovieAward %>%
  group_by(movie_id, movie_name, year, category_id, category_name) %>%
  summarize(won = max(won))

# Preview the cleaned table.
head(df.MovieAward)
## Source: local data frame [6 x 6]
## Groups: movie_id, movie_name, year, category_id
##
## movie_id movie_name year category_id category_name won
## 1 1 BIUTIFUL 2010 1 ACTOR -- LEADING ROLE 0
## 2 2 TRUE GRIT 2010 1 ACTOR -- LEADING ROLE 0
## 3 2 TRUE GRIT 2010 4 ACTRESS -- SUPPORTING ROLE 0
## 4 2 TRUE GRIT 2010 6 ART DIRECTION 0
## 5 2 TRUE GRIT 2010 7 CINEMATOGRAPHY 0
## 6 2 TRUE GRIT 2010 8 COSTUME DESIGN 0
Separate the movies that won in the BEST PICTURE category
# Best Picture winners only: rows of df.MovieAward whose category is
# "BEST PICTURE" and that are marked as won.
df.BPWonMovie <- filter(
  df.MovieAward,
  won == 1,
  category_name == "BEST PICTURE"
)
# df.MovieAward is still a grouped table here, so select() keeps its grouping
# columns (movie_name, category_id) in addition to the three listed below.
df.BPWonMovie <- select(df.BPWonMovie, year, movie_id, category_name)
df.BPWonMovie <- arrange(df.BPWonMovie, year)

# Preview the winners.
head(df.BPWonMovie)
## Source: local data frame [6 x 5]
## Groups: movie_id, movie_name, year, category_id
##
## movie_name category_id year movie_id category_name
## 1 THE KING'S SPEECH 16 2010 4 BEST PICTURE
## 2 THE HURT LOCKER 16 2009 57 BEST PICTURE
## 3 SLUMDOG MILLIONAIRE 16 2008 125 BEST PICTURE
## 4 NO COUNTRY FOR OLD MEN 16 2007 158 BEST PICTURE
## 5 THE DEPARTED 16 2006 213 BEST PICTURE
## 6 CRASH 16 2005 265 BEST PICTURE
A category gets one point if the BEST PICTURE movie is also nominated in that category. If the movie also wins that category, the category gets two points instead.
# Score every other nomination of each Best Picture winner.
#
# 1. Joining on (movie_id, year) keeps only the award rows that belong to a
#    Best Picture winner. Columns present in both tables get the suffixes
#    .x (from df.MovieAward) and .y (from df.BPWonMovie).
# 2. The BEST PICTURE row itself is dropped, since it is the outcome being
#    predicted rather than a predictor.
# 3. Rows are sorted by year once; sorting both before and after filter() is
#    unnecessary because filter() keeps the row order.
# 4. Scoring: 1 point for a nomination that did not win (won == 0),
#    2 points otherwise.
x <- merge(df.MovieAward, df.BPWonMovie, by = c("movie_id", "year")) %>%
  filter(category_name.x != "BEST PICTURE") %>%
  arrange(year) %>%
  mutate(point = ifelse(won == 0, 1, 2))

# Preview the scored rows.
head(tbl_df(x))
## Source: local data frame [6 x 10]
##
## movie_id year movie_name.x category_id.x
## 1 4075 1930 ALL QUIET ON THE WESTERN FRONT 7
## 2 4075 1930 ALL QUIET ON THE WESTERN FRONT 22
## 3 4039 1931 CIMARRON 22
## 4 4039 1931 CIMARRON 1
## 5 4039 1931 CIMARRON 9
## 6 4039 1931 CIMARRON 3
## Variables not shown: category_name.x (fctr), won (int), movie_name.y
## (fctr), category_id.y (int), category_name.y (fctr), point (dbl)
Conclusion based on data from 1928 to 2010: Based on the chart and data, the DIRECTING category can be used as a good indicator to predict the BEST PICTURE movie. The next best indicator is WRITING.
# Total the points per (year, category), then spread the categories into
# columns: one row per year, one column per category.
df.wideformat <- group_by(x, year, category_name.x)
df.wideformat <- summarize(df.wideformat, Point = sum(point))
df.wideformat <- spread(df.wideformat, category_name.x, Point)

# A category with no row for a given year comes out of spread() as NA;
# count it as zero points.
df.wideformat[is.na(df.wideformat)] <- 0
df.wideformat <- arrange(df.wideformat, year)

# Strip spaces and hyphens from the column names.
colnames(df.wideformat) <- gsub(" ", "", colnames(df.wideformat)) %>%
  gsub("-", "", .)

# Sum every column over all years. The `year` column is summed along with the
# categories, so its row is removed before ranking by total points.
df.result <- data.frame(
  category_name = colnames(df.wideformat),
  points = colSums(df.wideformat)
) %>%
  filter(category_name != "year") %>%
  arrange(desc(points))

df.result
## category_name points
## 1 DIRECTING 139
## 2 WRITING 130
## 3 FILMEDITING 101
## 4 ACTORLEADINGROLE 79
## 5 CINEMATOGRAPHY 78
## 6 MUSIC(SCORING) 72
## 7 ARTDIRECTION 70
## 8 SOUND 67
## 9 ACTORSUPPORTINGROLE 60
## 10 COSTUMEDESIGN 51
## 11 ACTRESSSUPPORTINGROLE 40
## 12 ACTRESSLEADINGROLE 37
## 13 MAKEUP 14
## 14 MUSIC(SONG) 13
## 15 SOUNDEDITING 10
## 16 VISUALEFFECTS 9
# Bar chart of total points per category over all years. The fill colour
# repeats the x-axis category, so the legend is redundant and is hidden.
# guides(fill = "none") replaces guides(fill = FALSE), which is deprecated in
# ggplot2 >= 3.3.4.
ggplot(
  data = df.result,
  aes(x = category_name, y = points, fill = category_name)
) +
  geom_bar(stat = "identity") +
  theme(
    text = element_text(size = 20),
    # Category labels are drawn rotated by 90 degrees.
    axis.text.x = element_text(angle = 90, vjust = 1)
  ) +
  guides(fill = "none")

Conclusion based on data from 1981 to 2010: Based on the chart and data, the DIRECTING category can be used as a good indicator to predict the BEST PICTURE movie. The next best indicator is WRITING.
# Same ranking, restricted to the rows of df.wideformat with year >= 1981:
# sum each column over those rows, drop the summed `year` entry, and sort
# the categories by total points.
df.result1 <- data.frame(
  category_name = colnames(df.wideformat),
  points = df.wideformat %>%
    filter(year >= 1981) %>%
    colSums()
) %>%
  filter(category_name != "year") %>%
  arrange(desc(points))

df.result1
## category_name points
## 1 DIRECTING 53
## 2 WRITING 51
## 3 FILMEDITING 45
## 4 SOUND 34
## 5 CINEMATOGRAPHY 31
## 6 MUSIC(SCORING) 30
## 7 ARTDIRECTION 28
## 8 COSTUMEDESIGN 26
## 9 ACTORLEADINGROLE 24
## 10 ACTORSUPPORTINGROLE 21
## 11 ACTRESSLEADINGROLE 15
## 12 MAKEUP 14
## 13 ACTRESSSUPPORTINGROLE 11
## 14 SOUNDEDITING 9
## 15 MUSIC(SONG) 8
## 16 VISUALEFFECTS 8
# Bar chart of total points per category for the year >= 1981 subset. The
# fill colour repeats the x-axis category, so the legend is redundant and is
# hidden. guides(fill = "none") replaces guides(fill = FALSE), which is
# deprecated in ggplot2 >= 3.3.4.
ggplot(
  data = df.result1,
  aes(x = category_name, y = points, fill = category_name)
) +
  geom_bar(stat = "identity") +
  theme(
    text = element_text(size = 20),
    # Category labels are drawn rotated by 90 degrees.
    axis.text.x = element_text(angle = 90, vjust = 1)
  ) +
  guides(fill = "none")
