require(dplyr)
require(stringr)
“https://raw.githubusercontent.com/fangseup88/IS607_Project3/gh-pages/Data/project_view_1981_2014.csv”
#load csv
file <- read.csv("https://raw.githubusercontent.com/fangseup88/IS607_Project3/gh-pages/Data/project_view_1981_2014.csv")
head(file)
## Year Category Nominee Won
## 1 1981 ACTOR IN A LEADING ROLE Reds no
## 2 1981 ACTOR IN A LEADING ROLE On Golden Pond yes
## 3 1981 ACTOR IN A LEADING ROLE Atlantic City no
## 4 1981 ACTOR IN A LEADING ROLE Arthur no
## 5 1981 ACTOR IN A LEADING ROLE Absence of Malice no
## 6 1981 ACTOR IN A SUPPORTING ROLE Only When I Laugh no
# clean up white spaces from columns
file$Nominee <- gsub("^\\s+|\\s+$", "", file$Nominee)
file$Category <- gsub("^\\s+|\\s+$", "", file$Category)
file$Won <- gsub("^\\s+|\\s+$", "", file$Won)
file$Nominee <- str_replace(file$Nominee,"'","")
# Construct output df with distinct categories and their total
output <- count(file,Category)
colnames(output)[2] <- "category_nomination"
# add category won column
best_picture_nominee <- subset(file, Category == "BEST PICTURE") %>% select(Nominee,Won)
for(i in 1: nrow(output))
{
#calculate
output$won_category[i] <- subset(file, Category == output$Category[i]) %>% select(Won) %>% filter(Won != 'no')%>% nrow
cat_nominee <- subset(file, Category == output$Category[i]) %>% select(Nominee)
output$bp_nomination[i] <- inner_join(cat_nominee,best_picture_nominee,by="Nominee") %>% nrow()
output$bp_won[i] <- inner_join(cat_nominee,best_picture_nominee,by="Nominee") %>% select(Won) %>% filter(Won != 'no')%>% nrow
output$bp_lost[i] <- inner_join(cat_nominee,best_picture_nominee,by="Nominee") %>% select(Won) %>% filter(Won == 'no')%>% nrow
}
output
## Source: local data frame [11 x 6]
##
## Category category_nomination won_category
## (chr) (int) (int)
## 1 ACTOR IN A LEADING ROLE 170 34
## 2 ACTOR IN A SUPPORTING ROLE 170 34
## 3 ACTRESS IN A LEADING ROLE 170 34
## 4 ACTRESS IN A SUPPORTING ROLE 170 34
## 5 BEST PICTURE 192 33
## 6 CINEMATOGRAPHY 166 33
## 7 COSTUME DESIGN 170 34
## 8 DIRECTING 167 33
## 9 FILM EDITING 166 33
## 10 SOUND EDITING 61 16
## 11 SOUND MIXING 58 12
## Variables not shown: bp_nomination (int), bp_won (int), bp_lost (int)
observed_freq <- output %>% select(Category,bp_won,category_nomination)
observed_freq$Total <- observed_freq$bp_won + observed_freq$category_nomination
observed_freq$percentage <- observed_freq$bp_won/observed_freq$Total
#colnames(observed_freq) <- c("Row variable (Nominated)","Won Best Picture","Nominated for Row Variable","Total","% Best Picture wins")
observed_freq
## Source: local data frame [11 x 5]
##
## Category bp_won category_nomination Total
## (chr) (int) (int) (int)
## 1 ACTOR IN A LEADING ROLE 19 170 189
## 2 ACTOR IN A SUPPORTING ROLE 22 170 192
## 3 ACTRESS IN A LEADING ROLE 10 170 180
## 4 ACTRESS IN A SUPPORTING ROLE 11 170 181
## 5 BEST PICTURE 33 192 225
## 6 CINEMATOGRAPHY 23 166 189
## 7 COSTUME DESIGN 18 170 188
## 8 DIRECTING 31 167 198
## 9 FILM EDITING 32 166 198
## 10 SOUND EDITING 5 61 66
## 11 SOUND MIXING 7 58 65
## Variables not shown: percentage (dbl)
exp_freq <- observed_freq %>% select(Category)
sum_won_best_picture <- sum(observed_freq$bp_won)
sum_category_nomination <- sum(observed_freq$category_nomination)
sum_total <- sum(observed_freq$Total)
for(i in 1:nrow(exp_freq))
{
val <- observed_freq %>% subset(Category == exp_freq$Category[i])
exp_freq$bp_won[i] <- val$Total * (sum_won_best_picture / sum_total)
exp_freq$category_nomination[i] <- val$Total * (sum_category_nomination / sum_total)
}
exp_freq$Total <- exp_freq$bp_won + exp_freq$category_nomination
exp_freq
## Source: local data frame [11 x 4]
##
## Category bp_won category_nomination Total
## (chr) (dbl) (dbl) (dbl)
## 1 ACTOR IN A LEADING ROLE 21.314270 167.68573 189
## 2 ACTOR IN A SUPPORTING ROLE 21.652592 170.34741 192
## 3 ACTRESS IN A LEADING ROLE 20.299305 159.70069 180
## 4 ACTRESS IN A SUPPORTING ROLE 20.412079 160.58792 181
## 5 BEST PICTURE 25.374131 199.62587 225
## 6 CINEMATOGRAPHY 21.314270 167.68573 189
## 7 COSTUME DESIGN 21.201497 166.79850 188
## 8 DIRECTING 22.329236 175.67076 198
## 9 FILM EDITING 22.329236 175.67076 198
## 10 SOUND EDITING 7.443079 58.55692 66
## 11 SOUND MIXING 7.330305 57.66970 65
library (tidyr)
library (reshape2)
n3data <- output [,c("Category","won_category","bp_won")]
n3data
## Source: local data frame [11 x 3]
##
## Category won_category bp_won
## (chr) (int) (int)
## 1 ACTOR IN A LEADING ROLE 34 19
## 2 ACTOR IN A SUPPORTING ROLE 34 22
## 3 ACTRESS IN A LEADING ROLE 34 10
## 4 ACTRESS IN A SUPPORTING ROLE 34 11
## 5 BEST PICTURE 33 33
## 6 CINEMATOGRAPHY 33 23
## 7 COSTUME DESIGN 34 18
## 8 DIRECTING 33 31
## 9 FILM EDITING 33 32
## 10 SOUND EDITING 16 5
## 11 SOUND MIXING 12 7
n4data <-t(n3data)
n5data <- (n4data[2:3,1:11])
n5data <- matrix(as.numeric(unlist(n5data)),nrow = nrow(n5data))
colnames(n5data) <-c("LeadActor", "SupportActor","LeadActress","SupportActress","Best_Picture","Cinematograph","Costume_Design","Directing","Editing","sounds_Editing","Sounds_Mixing")
row.names (n5data) <- c("won_category", "bp_won")
ndata <- output [, c("won_category","bp_won")]
n2data <-output [,c("category_nomination","bp_won")]
pdata <- output [,c("Category","category_nomination","won_category","bp_won")]
pdata
## Source: local data frame [11 x 4]
##
## Category category_nomination won_category bp_won
## (chr) (int) (int) (int)
## 1 ACTOR IN A LEADING ROLE 170 34 19
## 2 ACTOR IN A SUPPORTING ROLE 170 34 22
## 3 ACTRESS IN A LEADING ROLE 170 34 10
## 4 ACTRESS IN A SUPPORTING ROLE 170 34 11
## 5 BEST PICTURE 192 33 33
## 6 CINEMATOGRAPHY 166 33 23
## 7 COSTUME DESIGN 170 34 18
## 8 DIRECTING 167 33 31
## 9 FILM EDITING 166 33 32
## 10 SOUND EDITING 61 16 5
## 11 SOUND MIXING 58 12 7
barplot(n5data,beside = T, legend=T)
chisq.test(ndata)
##
## Pearson's Chi-squared test
##
## data: ndata
## X-squared = 20.336, df = 10, p-value = 0.02623
chisq.test(n2data)
##
## Pearson's Chi-squared test
##
## data: n2data
## X-squared = 23.786, df = 10, p-value = 0.00819