Logit model for Project3

library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(RCurl)

## Loading required package: bitops
## 
## Attaching package: 'RCurl'
## 
## The following object is masked from 'package:tidyr':
## 
##     complete

library(aod)
library(reshape2)

# Load the cleaned nominations table, keeping text columns as character.
all_df <- read.csv(
  file = "project_view_year_numeric_CLEANED.csv",
  stringsAsFactors = FALSE
)

# Keep only the rows of the four categories used in this analysis:
# "FILM EDITING", "BEST PICTURE", "CINEMATOGRAPHY" and "DIRECTING".
model_categories <- c(
  "FILM EDITING", "BEST PICTURE", "CINEMATOGRAPHY", "DIRECTING"
)
part_df2 <- subset(x = all_df, subset = Category %in% model_categories)

# Reshape with dcast() from reshape2 to one row per (Year, Nominee) and one
# column per Category/Won combination: "BEST PICTURE_no", "BEST PICTURE_yes",
# "CINEMATOGRAPHY_no", "CINEMATOGRAPHY_yes", "DIRECTING_no", "DIRECTING_yes",
# "FILM EDITING_no" and "FILM EDITING_yes". A cell holds the Won value of the
# matching row, or NA when the film has no row for that combination.
# value.var is spelled out so dcast() does not have to guess the value column
# (it used to guess "Won" and print a message about it).
dichot2 <- dcast(part_df2, Year + Nominee ~ Category + Won, value.var = "Won")

head(dichot2)

##   Year                Nominee BEST PICTURE_no BEST PICTURE_yes
## 1 1934              Cleopatra            <NA>             <NA>
## 2 1934                 Eskimo            <NA>             <NA>
## 3 1934  It Happened One Night            <NA>             <NA>
## 4 1934      One Night of Love            <NA>             <NA>
## 5 1934            Operator 13            <NA>             <NA>
## 6 1934 The Affairs of Cellini            <NA>             <NA>
##   CINEMATOGRAPHY_no CINEMATOGRAPHY_yes DIRECTING_no DIRECTING_yes
## 1              <NA>                yes         <NA>          <NA>
## 2              <NA>               <NA>         <NA>          <NA>
## 3              <NA>               <NA>         <NA>           yes
## 4              <NA>               <NA>           no          <NA>
## 5                no               <NA>         <NA>          <NA>
## 6                no               <NA>         <NA>          <NA>
##   FILM EDITING_no FILM EDITING_yes
## 1              no             <NA>
## 2            <NA>              yes
## 3            <NA>             <NA>
## 4              no             <NA>
## 5            <NA>             <NA>
## 6            <NA>             <NA>

# Copy the reshaped table under shorter column names.
# The names are applied by position, so they follow the column order shown
# for dichot2: Year, Nominee, then the no/yes pair of each category.
thecolnames <- c(
  "Year", "Nominee",
  "BestPic_NO", "BestPic_YES",
  "Cinemato_NO", "Cinemato_YES",
  "Directing_NO", "Directing_YES",
  "FilmEdit_NO", "FilmEdit_YES"
)
dtframe <- setNames(dichot2, thecolnames)

head(dtframe)

##   Year                Nominee BestPic_NO BestPic_YES Cinemato_NO
## 1 1934              Cleopatra       <NA>        <NA>        <NA>
## 2 1934                 Eskimo       <NA>        <NA>        <NA>
## 3 1934  It Happened One Night       <NA>        <NA>        <NA>
## 4 1934      One Night of Love       <NA>        <NA>        <NA>
## 5 1934            Operator 13       <NA>        <NA>          no
## 6 1934 The Affairs of Cellini       <NA>        <NA>          no
##   Cinemato_YES Directing_NO Directing_YES FilmEdit_NO FilmEdit_YES
## 1          yes         <NA>          <NA>          no         <NA>
## 2         <NA>         <NA>          <NA>        <NA>          yes
## 3         <NA>         <NA>           yes        <NA>         <NA>
## 4         <NA>           no          <NA>          no         <NA>
## 5         <NA>         <NA>          <NA>        <NA>         <NA>
## 6         <NA>         <NA>          <NA>        <NA>         <NA>

dtframe <- as.data.frame(dtframe)

# Taking care of missing values, step 1: nomination status.
# For each of the four categories, add a Nominee_* column (Nominee_BP,
# Nominee_CI, Nominee_DI, Nominee_FE) that is "NO" when both the *_NO and the
# *_YES cell of that category are missing, and "YES" otherwise.
# This works on whole columns. The earlier row loop over `1:nrow(dtframe)`
# iterated over c(1, 0) on an empty table and called ifelse() on scalars only
# for the assignments hidden in its arguments.
nominee_cols <- c(
  BestPic = "Nominee_BP",
  Cinemato = "Nominee_CI",
  Directing = "Nominee_DI",
  FilmEdit = "Nominee_FE"
)
for (prefix in names(nominee_cols)) {
  not_nominated <- is.na(dtframe[[paste0(prefix, "_NO")]]) &
    is.na(dtframe[[paste0(prefix, "_YES")]])
  # Start from "YES" and overwrite, so the column is always character.
  status <- rep("YES", nrow(dtframe))
  status[not_nominated] <- "NO"
  dtframe[[nominee_cols[[prefix]]]] <- status
}

dtframe2 <- dtframe

# Taking care of missing values, step 2: every missing cell of the
# per-category *_NO / *_YES columns becomes "NO".
# Done column by column with logical indexing instead of a row loop over
# `1:nrow(dtframe2)`, which iterates over c(1, 0) when the table is empty.
outcome_cols <- c(
  "BestPic_NO", "BestPic_YES",
  "Cinemato_NO", "Cinemato_YES",
  "Directing_NO", "Directing_YES",
  "FilmEdit_NO", "FilmEdit_YES"
)
for (col in outcome_cols) {
  dtframe2[[col]][is.na(dtframe2[[col]])] <- "NO"
}
head(dtframe2)

##   Year                Nominee BestPic_NO BestPic_YES Cinemato_NO
## 1 1934              Cleopatra         NO          NO          NO
## 2 1934                 Eskimo         NO          NO          NO
## 3 1934  It Happened One Night         NO          NO          NO
## 4 1934      One Night of Love         NO          NO          NO
## 5 1934            Operator 13         NO          NO          no
## 6 1934 The Affairs of Cellini         NO          NO          no
##   Cinemato_YES Directing_NO Directing_YES FilmEdit_NO FilmEdit_YES
## 1          yes           NO            NO          no           NO
## 2           NO           NO            NO          NO          yes
## 3           NO           NO           yes          NO           NO
## 4           NO           no            NO          no           NO
## 5           NO           NO            NO          NO           NO
## 6           NO           NO            NO          NO           NO
##   Nominee_BP Nominee_CI Nominee_DI Nominee_FE
## 1         NO        YES         NO        YES
## 2         NO         NO         NO        YES
## 3         NO         NO        YES         NO
## 4         NO         NO        YES        YES
## 5         NO        YES         NO         NO
## 6         NO        YES         NO         NO

# Some cells are upper case (the filled-in "NO"/"YES") and others lower case
# (the original "yes"/"no"), so every flag column is converted to upper case.
# Nominee_FE is part of the list: the earlier version upper-cased Nominee_BP
# twice and never converted Nominee_FE.
flag_cols <- c(
  "BestPic_NO", "BestPic_YES",
  "FilmEdit_NO", "FilmEdit_YES",
  "Cinemato_NO", "Cinemato_YES",
  "Directing_NO", "Directing_YES",
  "Nominee_BP", "Nominee_CI", "Nominee_DI", "Nominee_FE"
)
dtframe2[flag_cols] <- lapply(dtframe2[flag_cols], toupper)

str(dtframe2)

## 'data.frame':    775 obs. of  14 variables:
##  $ Year         : int  1934 1934 1934 1934 1934 1934 1934 1935 1935 1935 ...
##  $ Nominee      : chr  "Cleopatra" "Eskimo" "It Happened One Night" "One Night of Love" ...
##  $ BestPic_NO   : chr  "NO" "NO" "NO" "NO" ...
##  $ BestPic_YES  : chr  "NO" "NO" "NO" "NO" ...
##  $ Cinemato_NO  : chr  "NO" "NO" "NO" "NO" ...
##  $ Cinemato_YES : chr  "YES" "NO" "NO" "NO" ...
##  $ Directing_NO : chr  "NO" "NO" "NO" "NO" ...
##  $ Directing_YES: chr  "NO" "NO" "YES" "NO" ...
##  $ FilmEdit_NO  : chr  "NO" "NO" "NO" "NO" ...
##  $ FilmEdit_YES : chr  "NO" "YES" "NO" "NO" ...
##  $ Nominee_BP   : chr  "NO" "NO" "NO" "NO" ...
##  $ Nominee_CI   : chr  "YES" "NO" "NO" "NO" ...
##  $ Nominee_DI   : chr  "NO" "NO" "YES" "YES" ...
##  $ Nominee_FE   : chr  "YES" "YES" "NO" "YES" ...

# For the model we keep only the "won" indicator of each category. The four
# *_YES columns are copied to Won_BP (BEST PICTURE), Won_CI (CINEMATOGRAPHY),
# Won_DI (DIRECTING) and Won_FE (FILM EDITING); the eight original
# *_NO / *_YES columns are then removed.
won_source <- c(
  Won_BP = "BestPic_YES",
  Won_CI = "Cinemato_YES",
  Won_DI = "Directing_YES",
  Won_FE = "FilmEdit_YES"
)
for (won_col in names(won_source)) {
  dtframe2[[won_col]] <- dtframe2[[won_source[[won_col]]]]
}

raw_cols <- c(
  "FilmEdit_YES", "BestPic_YES", "BestPic_NO", "FilmEdit_NO",
  "Cinemato_YES", "Directing_YES", "Cinemato_NO", "Directing_NO"
)
dtframe2 <- dtframe2[, setdiff(names(dtframe2), raw_cols), drop = FALSE]
# Preparing for the logistic regression fitted with glm() (generalized
# linear model): the character Won_* and Nominee_* columns become factors.
factor_cols <- c(
  "Won_FE", "Won_BP", "Nominee_BP", "Nominee_FE",
  "Won_DI", "Won_CI", "Nominee_CI", "Nominee_DI"
)
dtframe2[factor_cols] <- lapply(dtframe2[factor_cols], as.factor)
# Building the model: binomial glm of Won_BP on the won/nominated flags of
# FILM EDITING, DIRECTING and CINEMATOGRAPHY.
# NOTE(review): Won_BP and Nominee_BP are correlated; an earlier comment said
# Nominee_BP was added to the model, but the formula does not contain it --
# confirm which is intended.
model2 <- glm(
  formula = Won_BP ~ Won_FE + Nominee_FE + Won_DI + Won_CI + Nominee_CI +
    Nominee_DI,
  family = "binomial",
  data = dtframe2
)


# Display the results of the fit with summary()
summary(model2)

## 
## Call:
## glm(formula = Won_BP ~ Won_FE + Nominee_FE + Won_DI + Won_CI + 
##     Nominee_CI + Nominee_DI, family = "binomial", data = dtframe2)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.02583  -0.17929  -0.11974  -0.05924   3.14377  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -6.344450   0.810000  -7.833 4.78e-15 ***
## Won_FEYES      1.085520   0.455630   2.382  0.01720 *  
## Nominee_FEYES  1.409967   0.485800   2.902  0.00370 ** 
## Won_DIYES      3.541594   0.405369   8.737  < 2e-16 ***
## Won_CIYES     -0.691716   0.650994  -1.063  0.28798    
## Nominee_CIYES -0.009677   0.458376  -0.021  0.98316    
## Nominee_DIYES  2.221843   0.767962   2.893  0.00381 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 470.09  on 774  degrees of freedom
## Residual deviance: 210.45  on 768  degrees of freedom
## AIC: 224.45
## 
## Number of Fisher Scoring iterations: 8

In the output from the summary command, the first thing we see is the call: a reminder of which model we ran, which options we specified, etc.

Next we see the deviance residuals, which are a measure of model fit. This part of output shows the distribution of the deviance residuals for individual cases used in the model.

The next part of the output shows the coefficients, their standard errors, the z-statistic (Wald z-statistic), and the associated p-values.

Won_DI, Nominee_DI, Won_FE and Nominee_FE are statistically significant at the:

1- 5% level for Won_FE, p-value 0.01720,

2- 1% level for Nominee_DI and Nominee_FE, p-values 0.00381 and 0.00370

3- 0.1% level for Won_DI, p-value < 2e-16.

The data we have seem to indicate that "FILM EDITING" and "DIRECTING" are influential in predicting "BEST PICTURE".

# Confidence intervals (95%) based on the coefficient standard errors
confint.default(object = model2, level = 0.95)

##                    2.5 %     97.5 %
## (Intercept)   -7.9320215 -4.7568782
## Won_FEYES      0.1925022  1.9785370
## Nominee_FEYES  0.4578173  2.3621176
## Won_DIYES      2.7470845  4.3361036
## Won_CIYES     -1.9676401  0.5842078
## Nominee_CIYES -0.9080775  0.8887242
## Nominee_DIYES  0.7166656  3.7270211

# Odds ratios only: exponentiate the fitted coefficients
log_odds <- coef(model2)
exp(log_odds)

##   (Intercept)     Won_FEYES Nominee_FEYES     Won_DIYES     Won_CIYES 
##   0.001756469   2.960977905   4.095821963  34.521904879   0.500716009 
## Nominee_CIYES Nominee_DIYES 
##   0.990370036   9.224318870

Logit model for Project3

Edwige

October 25, 2015