Packages used.
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 3.6.2
## naivebayes 0.9.6 loaded
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Load the draft data set from the working directory.
# stringsAsFactors = TRUE is spelled out because the read.csv() default
# changed to FALSE in R 4.0.0; without it Name, Pos and Status load as
# character columns and the factor-based str() output shown below is not
# reproduced.
df <- read.csv("draft_nb.csv", stringsAsFactors = TRUE)
# Print the structure of the loaded data frame (column types, first values).
str(df)
## 'data.frame': 908 obs. of 5 variables:
## $ Name : Factor w/ 908 levels "A'Shawn Robinson",..: 2 3 4 5 6 7 8 9 10 11 ...
## $ Pos : Factor w/ 9 levels "ATH","DB","DL",..: 2 6 9 5 8 7 3 2 3 4 ...
## $ Votes : int 1 1 3 3 4 1 2 2 4 1 ...
## $ Avg_rk: num 120 150 92 75.7 59 ...
## $ Status: Factor w/ 2 levels "Drafted","UD": 2 2 1 2 2 2 2 2 1 2 ...
Data preview
# Contingency table: draft status (rows) by number of votes (columns).
with(df, table(Status, Votes))
## Votes
## Status 1 2 3 4
## Drafted 36 32 45 146
## UD 236 100 132 181
# Contingency table: draft status (rows) by position (columns).
with(df, table(Status, Pos))
## Pos
## Status ATH DB DL LB OL QB RB TE WR
## Drafted 20 44 64 23 34 15 31 5 23
## UD 49 97 99 82 99 53 54 24 92
Visualization: Density Graph
# Overlaid density curves of Votes, one translucent fill per draft status.
ggplot(df, aes(x = Votes, fill = Status)) +
  geom_density(alpha = 0.08, colour = "black") +
  ggtitle("Draft Status by Votes")
Data Partition: Splitting the data into testing and training sets
# Fix the RNG seed so the random split below is reproducible.
set.seed(1234)
# Draw a group label (1 or 2) for every row with probabilities 0.8 and 0.2.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
ind <- sample(2, nrow(df), replace = TRUE, prob = c(0.8, 0.2))
# Rows labelled 1 form the training set; rows labelled 2 form the test set.
train <- df[ind == 1, ]
test <- df[ind == 2, ]
Creating the Naive Bayes Model
# Fit a naive Bayes classifier for Status on the training set, using every
# other column as a predictor. usekernel = TRUE is written out in full
# because `T` can be reassigned; per the naivebayes documentation it requests
# kernel density estimates for the numeric predictors.
# NOTE(review): `.` also pulls in `Name`, which str(df) reports as having one
# level per row (908 levels in 908 obs.); the zero-probability warning printed
# for this call is raised for that feature. Consider whether an identifier
# column belongs in the formula - confirm before changing, since every result
# reported further down was produced with it included.
mod <- naive_bayes(Status ~ ., data = train, usekernel = TRUE)
## Warning: naive_bayes(): Feature Name - zero probabilities are present.
## Consider Laplace smoothing.
Preview the mean and standard deviation of ‘Votes’ for drafted players in the training set.
# Mean and standard deviation of Votes among training rows whose Status is
# "Drafted"; the unnamed summaries keep their expression text as column names.
summarise(
  filter(train, Status == "Drafted"),
  mean(Votes),
  sd(Votes)
)
## mean(Votes) sd(Votes)
## 1 3.145729 1.084263
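Plot the fitted model to review the class-conditional distribution of each predictor. Note that these per-predictor plots do not, by themselves, check for collinearity between predictors.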
# Draw the plots produced by the plot method for the fitted model `mod`
# (the object returned by naive_bayes() above).
plot(mod)
Predict class probabilities for the training data and attach them to the training rows
# Predicted class probabilities (one column per Status level) for each
# training row.
p <- predict(mod, train, type = "prob")
## Warning: predict.naive_bayes(): More features in the newdata are provided
## as there are probability tables in the object. Calculation is performed
## based on features to be found in the tables.
# Bind the probabilities to the training rows once and reuse that object for
# the preview, instead of repeating the identical cbind() call.
probs <- cbind(p, train)
head(probs)
## Drafted UD Name Pos Votes Avg_rk Status
## 1 0.04683969 0.9531603 A.J. Leggett DB 1 120.00 UD
## 2 0.02625782 0.9737422 Aaron Bailey QB 1 150.00 UD
## 3 0.40306098 0.5969390 Aaron Burbridge WR 3 92.00 Drafted
## 4 0.09671831 0.9032817 Abdul Bello OL 3 75.67 UD
## 6 0.06014145 0.9398585 Adam Choice RB 1 109.00 UD
## 7 0.12335405 0.8766459 Adam McLean DL 2 105.00 UD
Construct the first confusion matrix (training data) and compute the misclassification rate
# Predicted class labels for the training rows.
p1 <- predict(mod, newdata = train)
## Warning: predict.naive_bayes(): More features in the newdata are provided
## as there are probability tables in the object. Calculation is performed
## based on features to be found in the tables.
# Cross-tabulate predictions (rows) against the actual Status (columns),
# then print the table.
tab1 <- table(p1, train$Status)
tab1
##
## p1 Drafted UD
## Drafted 147 30
## UD 52 488
# Misclassification rate: one minus the share of counts on the diagonal.
1 - sum(diag(tab1)) / sum(tab1)
## [1] 0.1143654
Construct the second confusion matrix (test data) and compute the misclassification rate
# Predicted class labels for the held-out test rows.
p2 <- predict(mod, newdata = test)
## Warning: predict.naive_bayes(): More features in the newdata are provided
## as there are probability tables in the object. Calculation is performed
## based on features to be found in the tables.
# Cross-tabulate predictions (rows) against the actual Status (columns),
# then print the table.
tab2 <- table(p2, test$Status)
tab2
##
## p2 Drafted UD
## Drafted 22 18
## UD 38 113
# Misclassification rate: one minus the share of counts on the diagonal.
1 - sum(diag(tab2)) / sum(tab2)
## [1] 0.2931937