Project

I have used a simple point-based approach. If the Best Picture movie was also nominated in another category, that category gets one point; if the Best Picture movie also won in that category, the category gets two points instead. The categories with the most points are the best indicators (predictors) of the Best Picture movie.

Based on that method, the DIRECTING category is the best indicator with 139 points. The WRITING category is the second-best predictor with 130 points.

Required libraries

# Packages used throughout the analysis. library() is used for all of them so
# that a missing package stops the script immediately instead of failing later
# (require() only returns FALSE when the package cannot be loaded).
library(dplyr)
library(tidyr)
library(ggplot2)

Load the CSV file and clean up some duplicates

# Read the raw awards table.
# NOTE(review): the path is absolute and machine-specific; point it at your
# local copy of Awards_File.csv before running.
df.MovieAward <- read.csv(
  "C:/Senthil/MSDataAnalytics/GitHub/607Project3/Final_Analysis_File/Awards_File.csv",
  header = TRUE
)

# Clean up duplicated rows: keep a single row per
# movie / year / category combination, carrying the largest `won` value.
# ungroup() drops the grouping that summarize() leaves behind, so later
# select() / arrange() calls operate on the whole table.
df.MovieAward <- df.MovieAward %>%
  group_by(movie_id, movie_name, year, category_id, category_name) %>%
  summarize(won = max(won)) %>%
  ungroup()

head(df.MovieAward)

## Source: local data frame [6 x 6]
## Groups: movie_id, movie_name, year, category_id
## 
##   movie_id movie_name year category_id              category_name won
## 1        1   BIUTIFUL 2010           1      ACTOR -- LEADING ROLE   0
## 2        2  TRUE GRIT 2010           1      ACTOR -- LEADING ROLE   0
## 3        2  TRUE GRIT 2010           4 ACTRESS -- SUPPORTING ROLE   0
## 4        2  TRUE GRIT 2010           6              ART DIRECTION   0
## 5        2  TRUE GRIT 2010           7             CINEMATOGRAPHY   0
## 6        2  TRUE GRIT 2010           8             COSTUME DESIGN   0

Separate movies that won in BEST PICTURE category

# Rows of df.MovieAward that won the BEST PICTURE category, ordered by year.
# ungroup() comes first because df.MovieAward may still carry the grouping from
# the clean-up step; on a grouped table select() keeps the grouping columns in
# addition to the requested ones, and arrange() may sort within groups rather
# than over the whole table.
df.BPWonMovie <- df.MovieAward %>%
  ungroup() %>%
  filter(won == 1 & category_name == "BEST PICTURE") %>%
  select(year, movie_id, category_name) %>%
  arrange(year)

head(df.BPWonMovie)

## Source: local data frame [6 x 5]
## Groups: movie_id, movie_name, year, category_id
## 
##               movie_name category_id year movie_id category_name
## 1      THE KING'S SPEECH          16 2010        4  BEST PICTURE
## 2        THE HURT LOCKER          16 2009       57  BEST PICTURE
## 3    SLUMDOG MILLIONAIRE          16 2008      125  BEST PICTURE
## 4 NO COUNTRY FOR OLD MEN          16 2007      158  BEST PICTURE
## 5           THE DEPARTED          16 2006      213  BEST PICTURE
## 6                  CRASH          16 2005      265  BEST PICTURE

Create a wide-format table in which every category becomes a variable, and fill the cells with 0. BPExists stands for BEST_PICTURE_movie_Exists in that category.

# Year-by-category grid: one row per year, one column per category. Every
# year/category pair present in the data gets BPExists = 0; pairs that are
# absent come out of spread() as NA and are set to 0 as well.
df.wideformat <- df.MovieAward %>%
  group_by(year, category_name) %>%
  summarise(BPExists = 0) %>%
  spread(key = category_name, value = BPExists)
df.wideformat[is.na(df.wideformat)] <- 0
head(df.wideformat)

## Source: local data frame [6 x 24]
## 
##   year ACTOR -- LEADING ROLE ACTOR -- SUPPORTING ROLE
## 1 1928                     0                        0
## 2 1929                     0                        0
## 3 1930                     0                        0
## 4 1931                     0                        0
## 5 1932                     0                        0
## 6 1933                     0                        0
## Variables not shown: ACTRESS -- LEADING ROLE (dbl), ACTRESS -- SUPPORTING
##   ROLE (dbl), ANIMATED FEATURE FILM (dbl), ART DIRECTION (dbl), BEST
##   PICTURE (dbl), CINEMATOGRAPHY (dbl), COSTUME DESIGN (dbl), DIRECTING
##   (dbl), DOCUMENTARY (FEATURE) (dbl), DOCUMENTARY (OTHER) (dbl),
##   DOCUMENTARY (SHORT SUBJECT) (dbl), FILM EDITING (dbl), MAKEUP (dbl),
##   MUSIC (SCORING) (dbl), MUSIC (SONG) (dbl), SHORT FILM (ANIMATED) (dbl),
##   SHORT FILM (LIVE ACTION) (dbl), SOUND (dbl), SOUND EDITING (dbl), VISUAL
##   EFFECTS (dbl), WRITING (dbl)

A category gets one point if the BEST PICTURE movie was also nominated in that category. If that nomination also won, the category gets two points instead.

# Join every nomination to the Best Picture winner of the same movie and year,
# drop the BEST PICTURE rows themselves, and score what is left:
# 1 point when won == 0, otherwise 2 points.
# The rows are sorted by year once, after filtering; the original sorted twice
# with the same key, which produced the same ordering.
x <- merge(df.MovieAward, df.BPWonMovie, by = c("movie_id", "year")) %>%
  filter(category_name.x != "BEST PICTURE") %>%
  arrange(year) %>%
  mutate(point = ifelse(won == 0, 1, 2))

# as_tibble() replaces the deprecated tbl_df() for the compact preview.
head(as_tibble(x))

## Source: local data frame [6 x 10]
## 
##   movie_id year                   movie_name.x category_id.x
## 1     4075 1930 ALL QUIET ON THE WESTERN FRONT             7
## 2     4075 1930 ALL QUIET ON THE WESTERN FRONT            22
## 3     4039 1931                       CIMARRON            22
## 4     4039 1931                       CIMARRON             1
## 5     4039 1931                       CIMARRON             9
## 6     4039 1931                       CIMARRON             3
## Variables not shown: category_name.x (fctr), won (int), movie_name.y
##   (fctr), category_id.y (int), category_name.y (fctr), point (dbl)

Conclusion based on data from 1928 to 2010: Based on the chart and data, the DIRECTING category can be used as a good indicator to predict the BEST PICTURE movie. The next best indicator is WRITING.

# Sum the points per year and category, then pivot so each category is a
# column; year/category pairs with no points become 0.
df.wideformat <- x %>%
  group_by(year, category_name.x) %>%
  summarise(Point = sum(point)) %>%
  spread(key = category_name.x, value = Point)
df.wideformat[is.na(df.wideformat)] <- 0
df.wideformat <- arrange(df.wideformat, year)

# Strip spaces and hyphens from the column names.
colnames(df.wideformat) <- gsub("[ -]", "", colnames(df.wideformat))
# sort(colSums(df.wideformat[, -1]), decreasing = TRUE)

# Total points per category over all years, highest first. colSums() also
# totals the `year` column, so that row is removed again afterwards.
df.result <- data.frame(
  category_name = colnames(df.wideformat),
  points = colSums(df.wideformat)
) %>%
  filter(category_name != "year") %>%
  arrange(desc(points))
df.result

##            category_name points
## 1              DIRECTING    139
## 2                WRITING    130
## 3            FILMEDITING    101
## 4       ACTORLEADINGROLE     79
## 5         CINEMATOGRAPHY     78
## 6         MUSIC(SCORING)     72
## 7           ARTDIRECTION     70
## 8                  SOUND     67
## 9    ACTORSUPPORTINGROLE     60
## 10         COSTUMEDESIGN     51
## 11 ACTRESSSUPPORTINGROLE     40
## 12    ACTRESSLEADINGROLE     37
## 13                MAKEUP     14
## 14           MUSIC(SONG)     13
## 15          SOUNDEDITING     10
## 16         VISUALEFFECTS      9

# Bar chart of the total points per category in df.result.
# The fill legend would only repeat the x-axis labels, so it is hidden;
# guides(fill = "none") replaces the deprecated guides(fill = FALSE).
ggplot(data = df.result, aes(x = category_name, y = points, fill = category_name)) +
  geom_bar(stat = "identity") +
  theme(
    text = element_text(size = 20),
    axis.text.x = element_text(angle = 90, vjust = 1)
  ) +
  guides(fill = "none")

Conclusion based on data from 1981 to 2010: Based on the chart and data, the DIRECTING category can be used as a good indicator to predict the BEST PICTURE movie. The next best indicator is WRITING.

# Same ranking as df.result, restricted to the years 1981 and later.
# colSums() also totals the `year` column, so that row is removed afterwards.
df.result1 <- data.frame(
  category_name = colnames(df.wideformat),
  points = df.wideformat %>% filter(year >= 1981) %>% colSums()
) %>%
  filter(category_name != "year") %>%
  arrange(desc(points))
df.result1

##            category_name points
## 1              DIRECTING     53
## 2                WRITING     51
## 3            FILMEDITING     45
## 4                  SOUND     34
## 5         CINEMATOGRAPHY     31
## 6         MUSIC(SCORING)     30
## 7           ARTDIRECTION     28
## 8          COSTUMEDESIGN     26
## 9       ACTORLEADINGROLE     24
## 10   ACTORSUPPORTINGROLE     21
## 11    ACTRESSLEADINGROLE     15
## 12                MAKEUP     14
## 13 ACTRESSSUPPORTINGROLE     11
## 14          SOUNDEDITING      9
## 15           MUSIC(SONG)      8
## 16         VISUALEFFECTS      8

# Bar chart of the total points per category in df.result1 (years >= 1981).
# The fill legend would only repeat the x-axis labels, so it is hidden;
# guides(fill = "none") replaces the deprecated guides(fill = FALSE).
ggplot(data = df.result1, aes(x = category_name, y = points, fill = category_name)) +
  geom_bar(stat = "identity") +
  theme(
    text = element_text(size = 20),
    axis.text.x = element_text(angle = 90, vjust = 1)
  ) +
  guides(fill = "none")

Project_3

Senthil Dhanapal

Tuesday, March 24, 2015

Based on that method, the DIRECTING category is the best indicator with 139 points. The WRITING category is the second-best predictor with 130 points.

Required libraries

Load the CSV file and clean up some duplicates

Separate movies that won in BEST PICTURE category

Create a wide-format table in which every category becomes a variable, and fill the cells with 0. BPExists stands for BEST_PICTURE_movie_Exists in that category.

A category gets one point if the BEST PICTURE movie was also nominated in that category. If that nomination also won, the category gets two points instead.

Conclusion based on data from 1928 to 2010: Based on the chart and data, the DIRECTING category can be used as a good indicator to predict the BEST PICTURE movie. The next best indicator is WRITING.

Conclusion based on data from 1981 to 2010: Based on the chart and data, the DIRECTING category can be used as a good indicator to predict the BEST PICTURE movie. The next best indicator is WRITING.