library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(RCurl)
## Loading required package: bitops
##
## Attaching package: 'RCurl'
##
## The following object is masked from 'package:tidyr':
##
## complete
library(aod)
library(reshape2)
all_df<- read.csv('project_view_year_numeric_CLEANED.csv', stringsAsFactors=FALSE)
#We will use a subset of the dataset with categories "FILM EDITING", "CINEMATOGRAPHY", "DIRECTING" and "BEST PICTURE".
part_df2<- subset(all_df, Category %in% c("FILM EDITING", "BEST PICTURE", "CINEMATOGRAPHY", "DIRECTING"))
#Using dcast function from the reshape2 library to transform the variables "Won" and categories "CINEMATOGRAPHY", "DIRECTING", "BEST PICTURE" and "FILM EDITING" into variables "BEST PICTURE_no", "BEST PICTURE_yes", "FILM EDITING_no" and "FILM EDITING_yes", DIRECTING_YES, DIRECTING_NO, CINEMATOGRAPHY_YES, CINEMATOGRAPHY_NO
dichot2<-dcast(part_df2, Year+Nominee ~ Category+Won)
## Using Won as value column: use value.var to override.
head(dichot2)
## Year Nominee BEST PICTURE_no BEST PICTURE_yes
## 1 1934 Cleopatra <NA> <NA>
## 2 1934 Eskimo <NA> <NA>
## 3 1934 It Happened One Night <NA> <NA>
## 4 1934 One Night of Love <NA> <NA>
## 5 1934 Operator 13 <NA> <NA>
## 6 1934 The Affairs of Cellini <NA> <NA>
## CINEMATOGRAPHY_no CINEMATOGRAPHY_yes DIRECTING_no DIRECTING_yes
## 1 <NA> yes <NA> <NA>
## 2 <NA> <NA> <NA> <NA>
## 3 <NA> <NA> <NA> yes
## 4 <NA> <NA> no <NA>
## 5 no <NA> <NA> <NA>
## 6 no <NA> <NA> <NA>
## FILM EDITING_no FILM EDITING_yes
## 1 no <NA>
## 2 <NA> yes
## 3 <NA> <NA>
## 4 no <NA>
## 5 <NA> <NA>
## 6 <NA> <NA>
dtframe<- dichot2
#Changing the variable names
thecolnames<-c("Year", "Nominee","BestPic_NO", "BestPic_YES", "Cinemato_NO", "Cinemato_YES", "Directing_NO", "Directing_YES", "FilmEdit_NO", "FilmEdit_YES")
names(dtframe)<-thecolnames
head(dtframe)
## Year Nominee BestPic_NO BestPic_YES Cinemato_NO
## 1 1934 Cleopatra <NA> <NA> <NA>
## 2 1934 Eskimo <NA> <NA> <NA>
## 3 1934 It Happened One Night <NA> <NA> <NA>
## 4 1934 One Night of Love <NA> <NA> <NA>
## 5 1934 Operator 13 <NA> <NA> no
## 6 1934 The Affairs of Cellini <NA> <NA> no
## Cinemato_YES Directing_NO Directing_YES FilmEdit_NO FilmEdit_YES
## 1 yes <NA> <NA> no <NA>
## 2 <NA> <NA> <NA> <NA> yes
## 3 <NA> <NA> yes <NA> <NA>
## 4 <NA> no <NA> no <NA>
## 5 <NA> <NA> <NA> <NA> <NA>
## 6 <NA> <NA> <NA> <NA> <NA>
dtframe<-as.data.frame(dtframe)
# Taking care of Missing Values
# 1- Creating two columns on nomination status: Nominee_BP and Nominee_FE. If the values of BestPic_NO and BestPic_YES are not available, Nominee_BP is NO, otherwise, Nominee_BP is YES.
#If the values of FilmEdit_NO and FilmEdit_YES are not available, Nominee_FE is NO, otherwise, Nominee_FE is YES.
for (i in 1:nrow(dtframe))
{
ifelse (((is.na(dtframe[i,"BestPic_NO"])) && (is.na(dtframe[i,"BestPic_YES"]))), dtframe[i,"Nominee_BP"]<-"NO", dtframe[i,"Nominee_BP"]<-"YES")
ifelse (((is.na(dtframe[i,"Cinemato_NO"])) && (is.na(dtframe[i,"Cinemato_YES"]))), dtframe[i,"Nominee_CI"]<-"NO", dtframe[i,"Nominee_CI"]<-"YES")
ifelse (((is.na(dtframe[i,"Directing_NO"])) && (is.na(dtframe[i,"Directing_YES"]))), dtframe[i,"Nominee_DI"]<-"NO", dtframe[i,"Nominee_DI"]<-"YES")
ifelse (((is.na(dtframe[i,"FilmEdit_NO"])) && (is.na(dtframe[i,"FilmEdit_YES"]))), dtframe[i,"Nominee_FE"]<-"NO", dtframe[i,"Nominee_FE"]<-"YES")
}
dtframe2<-dtframe
# Taking care of Missing Values
# 2- Transforming missing values to NO
for (i in 1:nrow(dtframe2))
{
if (is.na(dtframe2[i,"BestPic_NO"])) dtframe2[i,"BestPic_NO"]<-"NO"
if (is.na(dtframe2[i,"BestPic_YES"])) dtframe2[i,"BestPic_YES"]<-"NO"
if (is.na(dtframe2[i,"Cinemato_NO"])) dtframe2[i,"Cinemato_NO"]<-"NO"
if (is.na(dtframe2[i,"Cinemato_YES"])) dtframe2[i,"Cinemato_YES"]<-"NO"
if (is.na(dtframe2[i,"Directing_NO"])) dtframe2[i,"Directing_NO"]<-"NO"
if (is.na(dtframe2[i,"Directing_YES"])) dtframe2[i,"Directing_YES"]<-"NO"
if (is.na(dtframe2[i,"FilmEdit_NO"])) dtframe2[i,"FilmEdit_NO"]<-"NO"
if (is.na(dtframe2[i,"FilmEdit_YES"])) dtframe2[i,"FilmEdit_YES"] <-"NO"
}
head(dtframe2)
## Year Nominee BestPic_NO BestPic_YES Cinemato_NO
## 1 1934 Cleopatra NO NO NO
## 2 1934 Eskimo NO NO NO
## 3 1934 It Happened One Night NO NO NO
## 4 1934 One Night of Love NO NO NO
## 5 1934 Operator 13 NO NO no
## 6 1934 The Affairs of Cellini NO NO no
## Cinemato_YES Directing_NO Directing_YES FilmEdit_NO FilmEdit_YES
## 1 yes NO NO no NO
## 2 NO NO NO NO yes
## 3 NO NO yes NO NO
## 4 NO no NO no NO
## 5 NO NO NO NO NO
## 6 NO NO NO NO NO
## Nominee_BP Nominee_CI Nominee_DI Nominee_FE
## 1 NO YES NO YES
## 2 NO NO NO YES
## 3 NO NO YES NO
## 4 NO NO YES YES
## 5 NO YES NO NO
## 6 NO YES NO NO
#some fields are in upper case, the other in lower case. We are going to change everything to upper case
dtframe2$BestPic_NO <- toupper(dtframe2$BestPic_NO)
dtframe2$BestPic_YES <- toupper(dtframe2$BestPic_YES)
dtframe2$FilmEdit_NO <- toupper(dtframe2$FilmEdit_NO)
dtframe2$FilmEdit_YES <- toupper(dtframe2$FilmEdit_YES)
dtframe2$Cinemato_NO <- toupper(dtframe2$Cinemato_NO)
dtframe2$Cinemato_YES <- toupper(dtframe2$Cinemato_YES)
dtframe2$Directing_NO <- toupper(dtframe2$Directing_NO)
dtframe2$Directing_YES <- toupper(dtframe2$Directing_YES)
dtframe2$Nominee_BP <- toupper(dtframe2$Nominee_BP)
dtframe2$Nominee_BP <- toupper(dtframe2$Nominee_BP)
dtframe2$Nominee_CI <- toupper(dtframe2$Nominee_CI)
dtframe2$Nominee_DI <- toupper(dtframe2$Nominee_DI)
str(dtframe2)
## 'data.frame': 775 obs. of 14 variables:
## $ Year : int 1934 1934 1934 1934 1934 1934 1934 1935 1935 1935 ...
## $ Nominee : chr "Cleopatra" "Eskimo" "It Happened One Night" "One Night of Love" ...
## $ BestPic_NO : chr "NO" "NO" "NO" "NO" ...
## $ BestPic_YES : chr "NO" "NO" "NO" "NO" ...
## $ Cinemato_NO : chr "NO" "NO" "NO" "NO" ...
## $ Cinemato_YES : chr "YES" "NO" "NO" "NO" ...
## $ Directing_NO : chr "NO" "NO" "NO" "NO" ...
## $ Directing_YES: chr "NO" "NO" "YES" "NO" ...
## $ FilmEdit_NO : chr "NO" "NO" "NO" "NO" ...
## $ FilmEdit_YES : chr "NO" "YES" "NO" "NO" ...
## $ Nominee_BP : chr "NO" "NO" "NO" "NO" ...
## $ Nominee_CI : chr "YES" "NO" "NO" "NO" ...
## $ Nominee_DI : chr "NO" "NO" "YES" "YES" ...
## $ Nominee_FE : chr "YES" "YES" "NO" "YES" ...
#For our modelisation, we will keep two variables:"BestPic_YES"and "FilmEdit_YES", from "BestPic_NO", "BestPic_YES", "FilmEdit_NO", "FilmEdit_YES". We will rename those variables to Won_BP (won BEST PICTURE) and Won_FE (won "FILM EDITING")
dtframe2$Won_BP<-dtframe2$BestPic_YES
dtframe2$Won_CI<-dtframe2$Cinemato_YES
dtframe2$Won_DI<-dtframe2$Directing_YES
dtframe2$Won_FE<-dtframe2$FilmEdit_YES
dtframe2$FilmEdit_YES<-NULL
dtframe2$BestPic_YES<-NULL
dtframe2$BestPic_NO<-NULL
dtframe2$FilmEdit_NO<-NULL
dtframe2$Cinemato_YES<-NULL
dtframe2$Directing_YES<-NULL
dtframe2$Cinemato_NO<-NULL
dtframe2$Directing_NO<-NULL
#estimates a logistic regression model using the glm (generalized linear model) function
#Changing character columns into factors
dtframe2$Won_FE <- as.factor(dtframe2$Won_FE)
dtframe2$Won_BP <- as.factor(dtframe2$Won_BP)
dtframe2$Nominee_BP <- as.factor(dtframe2$Nominee_BP)
dtframe2$Nominee_FE <- as.factor(dtframe2$Nominee_FE)
dtframe2$Won_DI <- as.factor(dtframe2$Won_DI)
dtframe2$Won_CI <- as.factor(dtframe2$Won_CI)
dtframe2$Nominee_CI <- as.factor(dtframe2$Nominee_CI)
dtframe2$Nominee_DI <- as.factor(dtframe2$Nominee_DI)
#Building the model
#Won_BP and Nominee_BP are correlated, we add Nominee_BP in our model, but we are not expecting any findings from that variable.
model2<-glm(Won_BP~Won_FE+Nominee_FE+Won_DI+Won_CI+Nominee_CI+Nominee_DI, data=dtframe2,family="binomial")
#Using the command summary to get the results
summary(model2)
##
## Call:
## glm(formula = Won_BP ~ Won_FE + Nominee_FE + Won_DI + Won_CI +
## Nominee_CI + Nominee_DI, family = "binomial", data = dtframe2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.02583 -0.17929 -0.11974 -0.05924 3.14377
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.344450 0.810000 -7.833 4.78e-15 ***
## Won_FEYES 1.085520 0.455630 2.382 0.01720 *
## Nominee_FEYES 1.409967 0.485800 2.902 0.00370 **
## Won_DIYES 3.541594 0.405369 8.737 < 2e-16 ***
## Won_CIYES -0.691716 0.650994 -1.063 0.28798
## Nominee_CIYES -0.009677 0.458376 -0.021 0.98316
## Nominee_DIYES 2.221843 0.767962 2.893 0.00381 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 470.09 on 774 degrees of freedom
## Residual deviance: 210.45 on 768 degrees of freedom
## AIC: 224.45
##
## Number of Fisher Scoring iterations: 8
In the output from the summary command, the first thing we see is the call, a reminder about what the model we ran was, what options we specified, etc.
Next we see the deviance residuals, which are a measure of model fit. This part of output shows the distribution of the deviance residuals for individual cases used in the model.
The next part of the output shows the coefficients, their standard errors, the z-statistic (Wald z-statistic), and the associated p-values.
Won_DI, Nominee_DI, Won_FE and Nominee_FE are statistically significant at the:
1- 5% level for Won_FE, p-value 0.01720,
2- 1% level for Nominee_DI and Nominee_FE, p-values 0.00381 and 0.00370
3- 0% level for Won_DI, p-value < 2e-16.
The data we have seems to indicate that “FILM EDITING’ and”DIRECTING" are influential in predicting “BEST PICTURE”.
## Confidence Intervals using standard errors
confint.default(model2)
## 2.5 % 97.5 %
## (Intercept) -7.9320215 -4.7568782
## Won_FEYES 0.1925022 1.9785370
## Nominee_FEYES 0.4578173 2.3621176
## Won_DIYES 2.7470845 4.3361036
## Won_CIYES -1.9676401 0.5842078
## Nominee_CIYES -0.9080775 0.8887242
## Nominee_DIYES 0.7166656 3.7270211
## odds ratios only
exp(coef(model2))
## (Intercept) Won_FEYES Nominee_FEYES Won_DIYES Won_CIYES
## 0.001756469 2.960977905 4.095821963 34.521904879 0.500716009
## Nominee_CIYES Nominee_DIYES
## 0.990370036 9.224318870