#Installs required package for reading a csv data file
if(!require(readr)) install.packages("readr",repos = "http://cran.us.r-project.org")
if(!require(e1071)) install.packages("e1071",repos = "http://cran.us.r-project.org")
if(!require(caret)) install.packages("caret",repos = "http://cran.us.r-project.org")
seed1 = 1234 # setting seed for random draws
library(readr)
seg.df <- read.csv("/Users/ayandacollins/Desktop/MISDI/Marketing Analytics/moviedatasegmented.csv")
#picking pre-release variables
which( colnames(seg.df)=="ReleaseTime")
## [1] 5
which( colnames(seg.df)=="Genre")
## [1] 6
which( colnames(seg.df)=="MPAA")
## [1] 7
which( colnames(seg.df)=="Production")
## [1] 8
which( colnames(seg.df)=="Distribution")
## [1] 9
which( colnames(seg.df)=="PriorPopular")
## [1] 10
which( colnames(seg.df)=="Budget")
## [1] 11
which( colnames(seg.df)=="Screens")
## [1] 12
which( colnames(seg.df)=="Duration")
## [1] 17
which( colnames(seg.df)=="Segment")
## [1] 2
seg.df.num <- seg.df[,c(2,5:12,17)]
seg.df.num$ReleaseTime <- as.factor(seg.df.num $ReleaseTime)
seg.df.num$Production <- as.factor(seg.df.num $Production)
seg.df.num$Distribution <- as.factor(seg.df.num $Distribution)
seg.df.num$PriorPopular <- as.factor(seg.df.num $PriorPopular)
#View data frame
#View(seg.df.num)
#Getting an overview of the data
summary(seg.df.num)
## Segment ReleaseTime Genre MPAA
## Length:309 0:179 Length:309 Length:309
## Class :character 1:130 Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## Production Distribution PriorPopular Budget Screens
## 0:244 0:150 0:142 Min. : 10000 Min. : 1
## 1: 65 1:159 1:167 1st Qu.: 10000000 1st Qu.:1656
## Median : 28000000 Median :2934
## Mean : 54159460 Mean :2551
## 3rd Qu.: 62000000 3rd Qu.:3585
## Max. :356000000 Max. :4662
## Duration
## Min. : 79.0
## 1st Qu.: 96.0
## Median :106.0
## Mean :108.7
## 3rd Qu.:119.0
## Max. :181.0
#Examining the structure of the data
str(seg.df.num)
## 'data.frame': 309 obs. of 10 variables:
## $ Segment : chr "flop" "flop" "flop" "blockbuster" ...
## $ ReleaseTime : Factor w/ 2 levels "0","1": 1 1 2 1 2 2 1 2 1 1 ...
## $ Genre : chr "Romance" "Romance" "Romance" "Romance" ...
## $ MPAA : chr "PG-13" "PG-13" "PG-13" "R" ...
## $ Production : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 2 1 1 ...
## $ Distribution: Factor w/ 2 levels "0","1": 1 1 1 2 2 2 2 2 2 1 ...
## $ PriorPopular: Factor w/ 2 levels "0","1": 2 2 2 2 1 1 2 1 1 1 ...
## $ Budget : int 8000000 9000000 14000000 55000000 33000000 10000000 55000000 110000000 31000000 8000000 ...
## $ Screens : int 870 572 317 3768 3008 142 3714 3478 3444 35 ...
## $ Duration : int 105 123 111 105 98 122 118 116 89 95 ...
# splitting into train and test; roughly 2/3 proportion to train, the rest to test
train.prop <- 0.65
train.cases <- sample(nrow(seg.df.num), nrow(seg.df.num)*train.prop)
seg.df.train <- seg.df.num[train.cases, ]
#Viewing training data
#View(seg.df.train)
#Viewing test data
seg.df.test <- seg.df.num[-train.cases, ]
#View(seg.df.test)
That done, let’s turn to our actual Naïve Bayes implementation.
Start as usual by loading the appropriate library.
#load library
library(e1071)
Now run Naïve Bayes on training data.
(seg.nb <- naiveBayes(Segment ~ ., data=seg.df.train))
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## average blockbuster flop
## 0.530 0.215 0.255
##
## Conditional probabilities:
## ReleaseTime
## Y 0 1
## average 0.6509434 0.3490566
## blockbuster 0.3953488 0.6046512
## flop 0.6078431 0.3921569
##
## Genre
## Y Action Adventure Comedy Drama Horror Romance
## average 0.17924528 0.14150943 0.15094340 0.20754717 0.07547170 0.06603774
## blockbuster 0.25581395 0.44186047 0.02325581 0.09302326 0.04651163 0.04651163
## flop 0.03921569 0.11764706 0.25490196 0.15686275 0.09803922 0.21568627
## Genre
## Y Thriller
## average 0.17924528
## blockbuster 0.09302326
## flop 0.11764706
##
## MPAA
## Y G NR PG PG-13 R
## average 0.00000000 0.00000000 0.15094340 0.42452830 0.42452830
## blockbuster 0.02325581 0.00000000 0.16279070 0.62790698 0.18604651
## flop 0.00000000 0.05882353 0.11764706 0.29411765 0.52941176
##
## Production
## Y 0 1
## average 0.76415094 0.23584906
## blockbuster 0.55813953 0.44186047
## flop 0.98039216 0.01960784
##
## Distribution
## Y 0 1
## average 0.41509434 0.58490566
## blockbuster 0.06976744 0.93023256
## flop 0.86274510 0.13725490
##
## PriorPopular
## Y 0 1
## average 0.4245283 0.5754717
## blockbuster 0.2093023 0.7906977
## flop 0.7254902 0.2745098
##
## Budget
## Y [,1] [,2]
## average 42585678 39802994
## blockbuster 139639535 84078572
## flop 15513219 16803953
##
## Screens
## Y [,1] [,2]
## average 2870.1887 797.1292
## blockbuster 3862.4186 735.4409
## flop 948.4118 944.9153
##
## Duration
## Y [,1] [,2]
## average 108.8679 15.18964
## blockbuster 122.5116 18.88735
## flop 100.6471 12.31880
# Change factors to numeric
#seg.df.num$ReleaseTime <- ifelse(seg.df.num$ReleaseTime == "1", 1,0) # Holiday 1, Not holiday 0
#seg.df.num$Production <- ifelse(seg.df.num$Production == "1", 1,0)
#seg.df.num$Distribution <- ifelse(seg.df.num$Distribution == "1", 1,0)
#seg.df.num$PriorPopular <- ifelse(seg.df.num$PriorPopular == "1", 1,0)
Figuring out segment membership
# posterior probabilities of segment membership.
seg.nb.class <- predict(seg.nb, seg.df.test, type="raw")
head(seg.nb.class)
## average blockbuster flop
## [1,] 0.013347918 4.506563e-07 9.866516e-01
## [2,] 0.002106615 5.790013e-08 9.978933e-01
## [3,] 0.960858033 3.826105e-02 8.809198e-04
## [4,] 0.970570335 1.690468e-02 1.252498e-02
## [5,] 0.050434844 3.977294e-06 9.495612e-01
## [6,] 0.860300984 1.396733e-01 2.572442e-05
prob_out <- seg.nb.class # renaming it back
colnames(prob_out) <- c( "blockbuster", "average", "flop")
maxval <- apply(prob_out, 1, max)
prob_out <- cbind(prob_out, maxval) # figure out maximum condnl probability
prob_out <- as.data.frame(prob_out) # change matrix to data.frame
# creates column called 'predval' with predicted segment
prob_out$predval <- "blockbuster"
prob_out$predval <- ifelse(prob_out$average == prob_out$maxval, "average" , prob_out$predval)
prob_out$predval <- ifelse(prob_out$flop == prob_out$maxval, "flop" , prob_out$predval)
# view a few rows of prob_out
head(prob_out)
seg.nb.class <- prob_out$predval
# frequencies in predicted data
prop.table(table(seg.nb.class))
## seg.nb.class
## average blockbuster flop
## 0.1743119 0.4036697 0.4220183
# compare to known segments
mean(seg.df.test$Segment==seg.nb.class)
## [1] 0.3302752
#confusion matrix
library(caret)
confusionMatrix(factor(seg.nb.class), factor(seg.df.test$Segment))
## Confusion Matrix and Statistics
##
## Reference
## Prediction average blockbuster flop
## average 5 14 0
## blockbuster 36 7 1
## flop 22 0 24
##
## Overall Statistics
##
## Accuracy : 0.3303
## 95% CI : (0.2432, 0.4269)
## No Information Rate : 0.578
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0758
##
## Mcnemar's Test P-Value : 3.762e-07
##
## Statistics by Class:
##
## Class: average Class: blockbuster Class: flop
## Sensitivity 0.07937 0.33333 0.9600
## Specificity 0.69565 0.57955 0.7381
## Pos Pred Value 0.26316 0.15909 0.5217
## Neg Pred Value 0.35556 0.78462 0.9841
## Prevalence 0.57798 0.19266 0.2294
## Detection Rate 0.04587 0.06422 0.2202
## Detection Prevalence 0.17431 0.40367 0.4220
## Balanced Accuracy 0.38751 0.45644 0.8490
# summary data for proposed segments in the test data
aggregate(seg.df.test[-7], list(seg.nb.class), function(x) mean(as.numeric(x)))
## Warning in mean(as.numeric(x)): NAs introduced by coercion
## Warning in mean(as.numeric(x)): NAs introduced by coercion
## Warning in mean(as.numeric(x)): NAs introduced by coercion
## Warning in mean(as.numeric(x)): NAs introduced by coercion
## Warning in mean(as.numeric(x)): NAs introduced by coercion
## Warning in mean(as.numeric(x)): NAs introduced by coercion
## Warning in mean(as.numeric(x)): NAs introduced by coercion
## Warning in mean(as.numeric(x)): NAs introduced by coercion
## Warning in mean(as.numeric(x)): NAs introduced by coercion
# summary data for the known segments in the test data
aggregate(seg.df.test[-7], list(seg.df.test$Segment), function(x) mean(as.numeric(x)))
## Warning in mean(as.numeric(x)): NAs introduced by coercion
## Warning in mean(as.numeric(x)): NAs introduced by coercion
## Warning in mean(as.numeric(x)): NAs introduced by coercion
## Warning in mean(as.numeric(x)): NAs introduced by coercion
## Warning in mean(as.numeric(x)): NAs introduced by coercion
## Warning in mean(as.numeric(x)): NAs introduced by coercion
## Warning in mean(as.numeric(x)): NAs introduced by coercion
## Warning in mean(as.numeric(x)): NAs introduced by coercion
## Warning in mean(as.numeric(x)): NAs introduced by coercion