require(dplyr)
require(stringr)

https://raw.githubusercontent.com/fangseup88/IS607_Project3/gh-pages/Data/project_view_1981_2014.csv

https://raw.githubusercontent.com/mkds/IS607_Project3/gh-pages/Data/project_view_year_numeric_CLEANED.csv

#load csv
file <- read.csv("https://raw.githubusercontent.com/fangseup88/IS607_Project3/gh-pages/Data/project_view_1981_2014.csv")
head(file)
##   Year                   Category            Nominee Won
## 1 1981    ACTOR IN A LEADING ROLE              Reds   no
## 2 1981    ACTOR IN A LEADING ROLE    On Golden Pond  yes
## 3 1981    ACTOR IN A LEADING ROLE     Atlantic City   no
## 4 1981    ACTOR IN A LEADING ROLE            Arthur   no
## 5 1981    ACTOR IN A LEADING ROLE Absence of Malice   no
## 6 1981 ACTOR IN A SUPPORTING ROLE Only When I Laugh   no
# clean up white spaces from columns
file$Nominee <- gsub("^\\s+|\\s+$", "", file$Nominee)
file$Category <- gsub("^\\s+|\\s+$", "", file$Category)
file$Won <- gsub("^\\s+|\\s+$", "", file$Won)
file$Nominee <- str_replace(file$Nominee,"'","")


# Construct output  df with distinct categories and their total
output <- count(file,Category)
colnames(output)[2] <- "category_nomination"

# add category won  column
best_picture_nominee <- subset(file, Category == "BEST PICTURE") %>% select(Nominee,Won)
for(i in 1: nrow(output))
{
  #calculate 
  output$won_category[i] <-  subset(file, Category == output$Category[i]) %>%  select(Won) %>%  filter(Won != 'no')%>% nrow
  cat_nominee <- subset(file, Category == output$Category[i]) %>% select(Nominee) 
  output$bp_nomination[i] <- inner_join(cat_nominee,best_picture_nominee,by="Nominee") %>%  nrow()
  output$bp_won[i] <- inner_join(cat_nominee,best_picture_nominee,by="Nominee") %>%  select(Won) %>%  filter(Won != 'no')%>% nrow
  output$bp_lost[i] <- inner_join(cat_nominee,best_picture_nominee,by="Nominee") %>%  select(Won) %>%  filter(Won == 'no')%>% nrow
}
output
## Source: local data frame [11 x 6]
## 
##                        Category category_nomination won_category
##                           (chr)               (int)        (int)
## 1       ACTOR IN A LEADING ROLE                 170           34
## 2    ACTOR IN A SUPPORTING ROLE                 170           34
## 3     ACTRESS IN A LEADING ROLE                 170           34
## 4  ACTRESS IN A SUPPORTING ROLE                 170           34
## 5                  BEST PICTURE                 192           33
## 6                CINEMATOGRAPHY                 166           33
## 7                COSTUME DESIGN                 170           34
## 8                     DIRECTING                 167           33
## 9                  FILM EDITING                 166           33
## 10                SOUND EDITING                  61           16
## 11                 SOUND MIXING                  58           12
## Variables not shown: bp_nomination (int), bp_won (int), bp_lost (int)

Observed Frequencies

observed_freq <- output %>% select(Category,bp_won,category_nomination)
observed_freq$Total <- observed_freq$bp_won + observed_freq$category_nomination
observed_freq$percentage <- observed_freq$bp_won/observed_freq$Total
#colnames(observed_freq) <- c("Row variable (Nominated)","Won Best Picture","Nominated for Row Variable","Total","% Best Picture wins")
observed_freq
## Source: local data frame [11 x 5]
## 
##                        Category bp_won category_nomination Total
##                           (chr)  (int)               (int) (int)
## 1       ACTOR IN A LEADING ROLE     19                 170   189
## 2    ACTOR IN A SUPPORTING ROLE     22                 170   192
## 3     ACTRESS IN A LEADING ROLE     10                 170   180
## 4  ACTRESS IN A SUPPORTING ROLE     11                 170   181
## 5                  BEST PICTURE     33                 192   225
## 6                CINEMATOGRAPHY     23                 166   189
## 7                COSTUME DESIGN     18                 170   188
## 8                     DIRECTING     31                 167   198
## 9                  FILM EDITING     32                 166   198
## 10                SOUND EDITING      5                  61    66
## 11                 SOUND MIXING      7                  58    65
## Variables not shown: percentage (dbl)

Expected Frequencies

exp_freq <- observed_freq %>% select(Category)
sum_won_best_picture <- sum(observed_freq$bp_won)
sum_category_nomination <- sum(observed_freq$category_nomination)
sum_total <- sum(observed_freq$Total)
for(i in 1:nrow(exp_freq))
{
  val <- observed_freq %>% subset(Category == exp_freq$Category[i])
  exp_freq$bp_won[i] <- val$Total * (sum_won_best_picture / sum_total)
  exp_freq$category_nomination[i] <- val$Total * (sum_category_nomination / sum_total)
}
exp_freq$Total <-  exp_freq$bp_won +  exp_freq$category_nomination
exp_freq
## Source: local data frame [11 x 4]
## 
##                        Category    bp_won category_nomination Total
##                           (chr)     (dbl)               (dbl) (dbl)
## 1       ACTOR IN A LEADING ROLE 21.314270           167.68573   189
## 2    ACTOR IN A SUPPORTING ROLE 21.652592           170.34741   192
## 3     ACTRESS IN A LEADING ROLE 20.299305           159.70069   180
## 4  ACTRESS IN A SUPPORTING ROLE 20.412079           160.58792   181
## 5                  BEST PICTURE 25.374131           199.62587   225
## 6                CINEMATOGRAPHY 21.314270           167.68573   189
## 7                COSTUME DESIGN 21.201497           166.79850   188
## 8                     DIRECTING 22.329236           175.67076   198
## 9                  FILM EDITING 22.329236           175.67076   198
## 10                SOUND EDITING  7.443079            58.55692    66
## 11                 SOUND MIXING  7.330305            57.66970    65

Chi-Sqaure analysis - Angus Huang

Calculating Chi-Sqaure test to see the if there is any significant difference across the category. As p-value is less than 0.05 (0.02623 & 0.008 as shown below), we can conclude there is significant difference across the category of best pictures won.

library (tidyr)
library (reshape2)


n3data <- output [,c("Category","won_category","bp_won")]
n3data
## Source: local data frame [11 x 3]
## 
##                        Category won_category bp_won
##                           (chr)        (int)  (int)
## 1       ACTOR IN A LEADING ROLE           34     19
## 2    ACTOR IN A SUPPORTING ROLE           34     22
## 3     ACTRESS IN A LEADING ROLE           34     10
## 4  ACTRESS IN A SUPPORTING ROLE           34     11
## 5                  BEST PICTURE           33     33
## 6                CINEMATOGRAPHY           33     23
## 7                COSTUME DESIGN           34     18
## 8                     DIRECTING           33     31
## 9                  FILM EDITING           33     32
## 10                SOUND EDITING           16      5
## 11                 SOUND MIXING           12      7
n4data <-t(n3data)
n5data <- (n4data[2:3,1:11])
n5data <- matrix(as.numeric(unlist(n5data)),nrow = nrow(n5data))
colnames(n5data) <-c("LeadActor", "SupportActor","LeadActress","SupportActress","Best_Picture","Cinematograph","Costume_Design","Directing","Editing","sounds_Editing","Sounds_Mixing")

row.names (n5data) <- c("won_category", "bp_won")


ndata <- output [, c("won_category","bp_won")]


n2data <-output [,c("category_nomination","bp_won")]

pdata <- output [,c("Category","category_nomination","won_category","bp_won")]
pdata
## Source: local data frame [11 x 4]
## 
##                        Category category_nomination won_category bp_won
##                           (chr)               (int)        (int)  (int)
## 1       ACTOR IN A LEADING ROLE                 170           34     19
## 2    ACTOR IN A SUPPORTING ROLE                 170           34     22
## 3     ACTRESS IN A LEADING ROLE                 170           34     10
## 4  ACTRESS IN A SUPPORTING ROLE                 170           34     11
## 5                  BEST PICTURE                 192           33     33
## 6                CINEMATOGRAPHY                 166           33     23
## 7                COSTUME DESIGN                 170           34     18
## 8                     DIRECTING                 167           33     31
## 9                  FILM EDITING                 166           33     32
## 10                SOUND EDITING                  61           16      5
## 11                 SOUND MIXING                  58           12      7
barplot(n5data,beside = T, legend=T)

chisq.test(ndata)
## 
##  Pearson's Chi-squared test
## 
## data:  ndata
## X-squared = 20.336, df = 10, p-value = 0.02623
chisq.test(n2data)
## 
##  Pearson's Chi-squared test
## 
## data:  n2data
## X-squared = 23.786, df = 10, p-value = 0.00819