Loading packages we’ll need

#Installs required package for reading a csv data file
if(!require(readr)) install.packages("readr",repos = "http://cran.us.r-project.org")
if(!require(e1071)) install.packages("e1071",repos = "http://cran.us.r-project.org")
if(!require(caret)) install.packages("caret",repos = "http://cran.us.r-project.org")

seed1 = 1234 # setting seed for random draws

Loading and reading data

library(readr)
seg.df <- read.csv("/Users/ayandacollins/Desktop/MISDI/Marketing Analytics/moviedatasegmented.csv")
#picking pre-release variables
which( colnames(seg.df)=="ReleaseTime")
## [1] 5
which( colnames(seg.df)=="Genre")
## [1] 6
which( colnames(seg.df)=="MPAA")
## [1] 7
which( colnames(seg.df)=="Production")
## [1] 8
which( colnames(seg.df)=="Distribution")
## [1] 9
which( colnames(seg.df)=="PriorPopular")
## [1] 10
which( colnames(seg.df)=="Budget")
## [1] 11
which( colnames(seg.df)=="Screens")
## [1] 12
which( colnames(seg.df)=="Duration")
## [1] 17
which( colnames(seg.df)=="Segment")
## [1] 2
seg.df.num <- seg.df[,c(2,5:12,17)]
seg.df.num$ReleaseTime <- as.factor(seg.df.num $ReleaseTime)
seg.df.num$Production <- as.factor(seg.df.num $Production)
seg.df.num$Distribution <- as.factor(seg.df.num $Distribution)
seg.df.num$PriorPopular <- as.factor(seg.df.num $PriorPopular)
#View data frame
#View(seg.df.num)

#Getting an overview of the data
summary(seg.df.num)
##    Segment          ReleaseTime    Genre               MPAA          
##  Length:309         0:179       Length:309         Length:309        
##  Class :character   1:130       Class :character   Class :character  
##  Mode  :character               Mode  :character   Mode  :character  
##                                                                      
##                                                                      
##                                                                      
##  Production Distribution PriorPopular     Budget             Screens    
##  0:244      0:150        0:142        Min.   :    10000   Min.   :   1  
##  1: 65      1:159        1:167        1st Qu.: 10000000   1st Qu.:1656  
##                                       Median : 28000000   Median :2934  
##                                       Mean   : 54159460   Mean   :2551  
##                                       3rd Qu.: 62000000   3rd Qu.:3585  
##                                       Max.   :356000000   Max.   :4662  
##     Duration    
##  Min.   : 79.0  
##  1st Qu.: 96.0  
##  Median :106.0  
##  Mean   :108.7  
##  3rd Qu.:119.0  
##  Max.   :181.0
#Examining the structure of the data
str(seg.df.num)
## 'data.frame':    309 obs. of  10 variables:
##  $ Segment     : chr  "flop" "flop" "flop" "blockbuster" ...
##  $ ReleaseTime : Factor w/ 2 levels "0","1": 1 1 2 1 2 2 1 2 1 1 ...
##  $ Genre       : chr  "Romance" "Romance" "Romance" "Romance" ...
##  $ MPAA        : chr  "PG-13" "PG-13" "PG-13" "R" ...
##  $ Production  : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 2 1 1 ...
##  $ Distribution: Factor w/ 2 levels "0","1": 1 1 1 2 2 2 2 2 2 1 ...
##  $ PriorPopular: Factor w/ 2 levels "0","1": 2 2 2 2 1 1 2 1 1 1 ...
##  $ Budget      : int  8000000 9000000 14000000 55000000 33000000 10000000 55000000 110000000 31000000 8000000 ...
##  $ Screens     : int  870 572 317 3768 3008 142 3714 3478 3444 35 ...
##  $ Duration    : int  105 123 111 105 98 122 118 116 89 95 ...
# splitting into train and test; roughly 2/3 proportion to train, the rest to test
train.prop  <- 0.65
train.cases <- sample(nrow(seg.df.num), nrow(seg.df.num)*train.prop)
seg.df.train <- seg.df.num[train.cases, ]

#Viewing training data
#View(seg.df.train)
#Viewing test data
seg.df.test  <- seg.df.num[-train.cases, ]
#View(seg.df.test)

That done, let’s turn to our actual Naïve Bayes implementation.

Running Naïve Bayes

Start as usual by loading the appropriate library.

#load library 
library(e1071)

Now run Naïve Bayes on training data.

(seg.nb <- naiveBayes(Segment ~ ., data=seg.df.train))
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##     average blockbuster        flop 
##       0.530       0.215       0.255 
## 
## Conditional probabilities:
##              ReleaseTime
## Y                     0         1
##   average     0.6509434 0.3490566
##   blockbuster 0.3953488 0.6046512
##   flop        0.6078431 0.3921569
## 
##              Genre
## Y                 Action  Adventure     Comedy      Drama     Horror    Romance
##   average     0.17924528 0.14150943 0.15094340 0.20754717 0.07547170 0.06603774
##   blockbuster 0.25581395 0.44186047 0.02325581 0.09302326 0.04651163 0.04651163
##   flop        0.03921569 0.11764706 0.25490196 0.15686275 0.09803922 0.21568627
##              Genre
## Y               Thriller
##   average     0.17924528
##   blockbuster 0.09302326
##   flop        0.11764706
## 
##              MPAA
## Y                      G         NR         PG      PG-13          R
##   average     0.00000000 0.00000000 0.15094340 0.42452830 0.42452830
##   blockbuster 0.02325581 0.00000000 0.16279070 0.62790698 0.18604651
##   flop        0.00000000 0.05882353 0.11764706 0.29411765 0.52941176
## 
##              Production
## Y                      0          1
##   average     0.76415094 0.23584906
##   blockbuster 0.55813953 0.44186047
##   flop        0.98039216 0.01960784
## 
##              Distribution
## Y                      0          1
##   average     0.41509434 0.58490566
##   blockbuster 0.06976744 0.93023256
##   flop        0.86274510 0.13725490
## 
##              PriorPopular
## Y                     0         1
##   average     0.4245283 0.5754717
##   blockbuster 0.2093023 0.7906977
##   flop        0.7254902 0.2745098
## 
##              Budget
## Y                  [,1]     [,2]
##   average      42585678 39802994
##   blockbuster 139639535 84078572
##   flop         15513219 16803953
## 
##              Screens
## Y                  [,1]     [,2]
##   average     2870.1887 797.1292
##   blockbuster 3862.4186 735.4409
##   flop         948.4118 944.9153
## 
##              Duration
## Y                 [,1]     [,2]
##   average     108.8679 15.18964
##   blockbuster 122.5116 18.88735
##   flop        100.6471 12.31880
# Change factors to numeric 
#seg.df.num$ReleaseTime <- ifelse(seg.df.num$ReleaseTime == "1", 1,0) # Holiday 1, Not holiday 0
#seg.df.num$Production <- ifelse(seg.df.num$Production == "1", 1,0)
#seg.df.num$Distribution <- ifelse(seg.df.num$Distribution   == "1", 1,0)
#seg.df.num$PriorPopular <- ifelse(seg.df.num$PriorPopular   == "1", 1,0)

Figuring out segment membership

# posterior probabilities of segment membership.  
seg.nb.class <- predict(seg.nb, seg.df.test, type="raw")
head(seg.nb.class)
##          average  blockbuster         flop
## [1,] 0.013347918 4.506563e-07 9.866516e-01
## [2,] 0.002106615 5.790013e-08 9.978933e-01
## [3,] 0.960858033 3.826105e-02 8.809198e-04
## [4,] 0.970570335 1.690468e-02 1.252498e-02
## [5,] 0.050434844 3.977294e-06 9.495612e-01
## [6,] 0.860300984 1.396733e-01 2.572442e-05
prob_out <- seg.nb.class # renaming it back
colnames(prob_out) <- c( "blockbuster", "average", "flop") 
maxval <- apply(prob_out, 1, max)
prob_out <- cbind(prob_out, maxval) # figure out maximum condnl probability
prob_out <- as.data.frame(prob_out)  # change matrix to data.frame

# creates column called 'predval' with predicted segment
prob_out$predval <- "blockbuster"
prob_out$predval <- ifelse(prob_out$average == prob_out$maxval, "average" , prob_out$predval)
prob_out$predval <- ifelse(prob_out$flop == prob_out$maxval, "flop" , prob_out$predval)

# view a few rows of prob_out
head(prob_out) 
seg.nb.class <- prob_out$predval 
# frequencies in predicted data
prop.table(table(seg.nb.class))
## seg.nb.class
##     average blockbuster        flop 
##   0.1743119   0.4036697   0.4220183
# compare to known segments
mean(seg.df.test$Segment==seg.nb.class)
## [1] 0.3302752
#confusion matrix
library(caret)
confusionMatrix(factor(seg.nb.class), factor(seg.df.test$Segment))
## Confusion Matrix and Statistics
## 
##              Reference
## Prediction    average blockbuster flop
##   average           5          14    0
##   blockbuster      36           7    1
##   flop             22           0   24
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3303          
##                  95% CI : (0.2432, 0.4269)
##     No Information Rate : 0.578           
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0758          
##                                           
##  Mcnemar's Test P-Value : 3.762e-07       
## 
## Statistics by Class:
## 
##                      Class: average Class: blockbuster Class: flop
## Sensitivity                 0.07937            0.33333      0.9600
## Specificity                 0.69565            0.57955      0.7381
## Pos Pred Value              0.26316            0.15909      0.5217
## Neg Pred Value              0.35556            0.78462      0.9841
## Prevalence                  0.57798            0.19266      0.2294
## Detection Rate              0.04587            0.06422      0.2202
## Detection Prevalence        0.17431            0.40367      0.4220
## Balanced Accuracy           0.38751            0.45644      0.8490
# summary data for proposed segments in the test data
aggregate(seg.df.test[-7], list(seg.nb.class), function(x) mean(as.numeric(x)))
## Warning in mean(as.numeric(x)): NAs introduced by coercion

## Warning in mean(as.numeric(x)): NAs introduced by coercion

## Warning in mean(as.numeric(x)): NAs introduced by coercion

## Warning in mean(as.numeric(x)): NAs introduced by coercion

## Warning in mean(as.numeric(x)): NAs introduced by coercion

## Warning in mean(as.numeric(x)): NAs introduced by coercion

## Warning in mean(as.numeric(x)): NAs introduced by coercion

## Warning in mean(as.numeric(x)): NAs introduced by coercion

## Warning in mean(as.numeric(x)): NAs introduced by coercion
# summary data for the known segments in the test data 
aggregate(seg.df.test[-7], list(seg.df.test$Segment), function(x) mean(as.numeric(x)))  
## Warning in mean(as.numeric(x)): NAs introduced by coercion

## Warning in mean(as.numeric(x)): NAs introduced by coercion

## Warning in mean(as.numeric(x)): NAs introduced by coercion

## Warning in mean(as.numeric(x)): NAs introduced by coercion

## Warning in mean(as.numeric(x)): NAs introduced by coercion

## Warning in mean(as.numeric(x)): NAs introduced by coercion

## Warning in mean(as.numeric(x)): NAs introduced by coercion

## Warning in mean(as.numeric(x)): NAs introduced by coercion

## Warning in mean(as.numeric(x)): NAs introduced by coercion