R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

installing required library

library(reshape2)
library(class)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt
library(naivebayes)
## naivebayes 0.9.6 loaded
## 
## Attaching package: 'naivebayes'
## The following object is masked from 'package:data.table':
## 
##     tables

We will implement Naive Bayes to identify flower species based on sepal & petal lenght & width

getwd()
## [1] "C:/Users/prasnaya/Desktop/Personal/python/Data_Science/DS With R-Saharan/Projects"
data<-read.csv("locations_data.csv")
str(data)
## 'data.frame':    2184 obs. of  7 variables:
##  $ month   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ day     : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ weekday : Factor w/ 7 levels "friday","monday",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ daytype : Factor w/ 2 levels "weekday","weekend": 1 1 1 1 1 1 1 1 1 1 ...
##  $ hour    : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ hourtype: Factor w/ 4 levels "afternoon","evening",..: 4 4 4 4 4 4 3 3 3 3 ...
##  $ location: Factor w/ 7 levels "appointment",..: 3 3 3 3 3 3 3 3 3 4 ...

Data is very homogeneous all species are together,we will use the random function to reshuffle the iris data

set.seed(1000)
rand_run<- runif(nrow(data))
data<-data[order(rand_run),]


head(data)

we have location data for one person based on time of the day , daytype & month

scatter <- ggplot(data=subset(data,daytype=="weekend"), aes(x = weekday, y = hourtype)) 
scatter + geom_point(aes(color=location, shape=daytype)) +
  xlab("days") +  ylab("hourtype") +
  ggtitle("location")

scatter <- ggplot(data=subset(data,daytype=="weekday"), aes(x = weekday, y = hourtype)) 
scatter + geom_point(aes(color=location, shape=daytype)) +
  xlab("days") +  ylab("hourtype") +
  ggtitle("location")

If we notice the scatter plot , we see in weekdays night location is always at home,morning mojority times he is office but some time he is campus and appointment

data_barchat <- data %>% 
  count(daytype,hourtype,location) %>%
  rename(avg_count = n)
bar1 <- ggplot(data=data_barchat, aes(x=location, y=avg_count, fill=hourtype))
bar1 + geom_bar(stat="identity", position="dodge") + 
  scale_fill_manual(values=c("orange", "blue", "darkgreen", "purple"),
                    name="Measurements"
                    )

bar1 <- ggplot(data=data_barchat, aes(x=location, y=avg_count, fill=daytype))
bar1 + geom_bar(stat="identity", position="dodge") + 
  scale_fill_manual(values=c("orange", "blue", "darkgreen", "purple"),
                    name="Measurements"
                    )

ggtitle("location")
## $title
## [1] "location"
## 
## attr(,"class")
## [1] "labels"
nrow(data)
## [1] 2184
training_count <-nrow(data)*.75
training_count
## [1] 1638
training_data<-data[1:training_count,]
str(training_data)
## 'data.frame':    1638 obs. of  7 variables:
##  $ month   : int  2 1 3 3 2 3 1 3 3 3 ...
##  $ day     : int  22 24 24 11 15 22 30 24 6 24 ...
##  $ weekday : Factor w/ 7 levels "friday","monday",..: 7 6 1 3 7 7 2 1 2 1 ...
##  $ daytype : Factor w/ 2 levels "weekday","weekend": 1 1 1 2 1 1 1 1 1 1 ...
##  $ hour    : int  14 15 6 12 21 4 0 8 11 20 ...
##  $ hourtype: Factor w/ 4 levels "afternoon","evening",..: 1 1 3 1 2 4 4 3 3 2 ...
##  $ location: Factor w/ 7 levels "appointment",..: 4 4 3 3 3 3 3 2 4 3 ...
testing_data_label<-data[(training_count+1):2184,6]

testing_data_label1<-data[2174:2184,6]

testing_data<-data[(training_count+1):2184,1:4]

testing_data1<-data[2174:2184,1:4]

str(testing_data)
## 'data.frame':    546 obs. of  4 variables:
##  $ month  : int  2 3 2 2 2 3 1 2 3 1 ...
##  $ day    : int  25 8 11 4 9 4 28 22 1 26 ...
##  $ weekday: Factor w/ 7 levels "friday","monday",..: 3 7 3 3 5 3 3 7 7 5 ...
##  $ daytype: Factor w/ 2 levels "weekday","weekend": 2 1 2 2 1 2 2 1 1 1 ...
location_modell <- naive_bayes(location~daytype, data = training_data)
## Warning: naive_bayes(): Feature daytype - zero probabilities are present.
## Consider Laplace smoothing.
location_modell
## 
## ================================ Naive Bayes ================================= 
##  
##  Call: 
## naive_bayes.formula(formula = location ~ daytype, data = training_data)
## 
## ------------------------------------------------------------------------------ 
##  
## Laplace smoothing: 0
## 
## ------------------------------------------------------------------------------ 
##  
##  A priori probabilities: 
## 
## appointment      campus        home      office  restaurant       store 
## 0.003052503 0.031135531 0.727716728 0.176434676 0.039682540 0.018925519 
##     theater 
## 0.003052503 
## 
## ------------------------------------------------------------------------------ 
##  
##  Tables: 
## 
## ------------------------------------------------------------------------------ 
##  ::: daytype (Bernoulli) 
## ------------------------------------------------------------------------------ 
##          
## daytype   appointment    campus      home    office restaurant     store
##   weekday   0.2000000 1.0000000 0.6224832 1.0000000  0.8000000 0.7096774
##   weekend   0.8000000 0.0000000 0.3775168 0.0000000  0.2000000 0.2903226
##          
## daytype     theater
##   weekday 0.0000000
##   weekend 1.0000000
## 
## ------------------------------------------------------------------------------
location_model2 <- naive_bayes(location~daytype + hourtype, data = training_data)
## Warning: naive_bayes(): Feature daytype - zero probabilities are present.
## Consider Laplace smoothing.
## Warning: naive_bayes(): Feature hourtype - zero probabilities are present.
## Consider Laplace smoothing.
location_model2
## 
## ================================ Naive Bayes ================================= 
##  
##  Call: 
## naive_bayes.formula(formula = location ~ daytype + hourtype, 
##     data = training_data)
## 
## ------------------------------------------------------------------------------ 
##  
## Laplace smoothing: 0
## 
## ------------------------------------------------------------------------------ 
##  
##  A priori probabilities: 
## 
## appointment      campus        home      office  restaurant       store 
## 0.003052503 0.031135531 0.727716728 0.176434676 0.039682540 0.018925519 
##     theater 
## 0.003052503 
## 
## ------------------------------------------------------------------------------ 
##  
##  Tables: 
## 
## ------------------------------------------------------------------------------ 
##  ::: daytype (Bernoulli) 
## ------------------------------------------------------------------------------ 
##          
## daytype   appointment    campus      home    office restaurant     store
##   weekday   0.2000000 1.0000000 0.6224832 1.0000000  0.8000000 0.7096774
##   weekend   0.8000000 0.0000000 0.3775168 0.0000000  0.2000000 0.2903226
##          
## daytype     theater
##   weekday 0.0000000
##   weekend 1.0000000
## 
## ------------------------------------------------------------------------------ 
##  ::: hourtype (Categorical) 
## ------------------------------------------------------------------------------ 
##            
## hourtype    appointment     campus       home     office restaurant
##   afternoon  0.80000000 0.52941176 0.11409396 0.67474048 0.52307692
##   evening    0.20000000 0.00000000 0.19463087 0.03806228 0.15384615
##   morning    0.00000000 0.47058824 0.22651007 0.28719723 0.30769231
##   night      0.00000000 0.00000000 0.46476510 0.00000000 0.01538462
##            
## hourtype         store    theater
##   afternoon 0.06451613 0.00000000
##   evening   0.90322581 1.00000000
##   morning   0.00000000 0.00000000
##   night     0.03225806 0.00000000
## 
## ------------------------------------------------------------------------------

With adding addition features help us to get probability based on time of the day more accurately , still we see we have few probability of 0 which should not be the case . We will add Laplace to do correction on probabilty having 0 value

location_model3 <- naive_bayes(location~daytype + hourtype, data = training_data,laplace = 1)

location_model3
## 
## ================================ Naive Bayes ================================= 
##  
##  Call: 
## naive_bayes.formula(formula = location ~ daytype + hourtype, 
##     data = training_data, laplace = 1)
## 
## ------------------------------------------------------------------------------ 
##  
## Laplace smoothing: 1
## 
## ------------------------------------------------------------------------------ 
##  
##  A priori probabilities: 
## 
## appointment      campus        home      office  restaurant       store 
## 0.003052503 0.031135531 0.727716728 0.176434676 0.039682540 0.018925519 
##     theater 
## 0.003052503 
## 
## ------------------------------------------------------------------------------ 
##  
##  Tables: 
## 
## ------------------------------------------------------------------------------ 
##  ::: daytype (Bernoulli) 
## ------------------------------------------------------------------------------ 
##          
## daytype   appointment      campus        home      office  restaurant
##   weekday 0.285714286 0.981132075 0.622278057 0.996563574 0.791044776
##   weekend 0.714285714 0.018867925 0.377721943 0.003436426 0.208955224
##          
## daytype         store     theater
##   weekday 0.696969697 0.142857143
##   weekend 0.303030303 0.857142857
## 
## ------------------------------------------------------------------------------ 
##  ::: hourtype (Categorical) 
## ------------------------------------------------------------------------------ 
##            
## hourtype    appointment      campus        home      office  restaurant
##   afternoon 0.555555556 0.509090909 0.114548495 0.668941980 0.507246377
##   evening   0.222222222 0.018181818 0.194816054 0.040955631 0.159420290
##   morning   0.111111111 0.454545455 0.226588629 0.286689420 0.304347826
##   night     0.111111111 0.018181818 0.464046823 0.003412969 0.028985507
##            
## hourtype          store     theater
##   afternoon 0.085714286 0.111111111
##   evening   0.828571429 0.666666667
##   morning   0.028571429 0.111111111
##   night     0.057142857 0.111111111
## 
## ------------------------------------------------------------------------------
location_model3
## 
## ================================ Naive Bayes ================================= 
##  
##  Call: 
## naive_bayes.formula(formula = location ~ daytype + hourtype, 
##     data = training_data, laplace = 1)
## 
## ------------------------------------------------------------------------------ 
##  
## Laplace smoothing: 1
## 
## ------------------------------------------------------------------------------ 
##  
##  A priori probabilities: 
## 
## appointment      campus        home      office  restaurant       store 
## 0.003052503 0.031135531 0.727716728 0.176434676 0.039682540 0.018925519 
##     theater 
## 0.003052503 
## 
## ------------------------------------------------------------------------------ 
##  
##  Tables: 
## 
## ------------------------------------------------------------------------------ 
##  ::: daytype (Bernoulli) 
## ------------------------------------------------------------------------------ 
##          
## daytype   appointment      campus        home      office  restaurant
##   weekday 0.285714286 0.981132075 0.622278057 0.996563574 0.791044776
##   weekend 0.714285714 0.018867925 0.377721943 0.003436426 0.208955224
##          
## daytype         store     theater
##   weekday 0.696969697 0.142857143
##   weekend 0.303030303 0.857142857
## 
## ------------------------------------------------------------------------------ 
##  ::: hourtype (Categorical) 
## ------------------------------------------------------------------------------ 
##            
## hourtype    appointment      campus        home      office  restaurant
##   afternoon 0.555555556 0.509090909 0.114548495 0.668941980 0.507246377
##   evening   0.222222222 0.018181818 0.194816054 0.040955631 0.159420290
##   morning   0.111111111 0.454545455 0.226588629 0.286689420 0.304347826
##   night     0.111111111 0.018181818 0.464046823 0.003412969 0.028985507
##            
## hourtype          store     theater
##   afternoon 0.085714286 0.111111111
##   evening   0.828571429 0.666666667
##   morning   0.028571429 0.111111111
##   night     0.057142857 0.111111111
## 
## ------------------------------------------------------------------------------
test_result1<-predict(location_model3,testing_data)
## Warning: predict.naive_bayes(): Only 1 feature(s) out of 2 defined in the naive_bayes object "location_model3" are used for prediction.
## Warning: predict.naive_bayes(): More features in the newdata are provided
## as there are probability tables in the object. Calculation is performed
## based on features to be found in the tables.
test_result2<-predict(location_model2,testing_data)
## Warning: predict.naive_bayes(): Only 1 feature(s) out of 2 defined in the naive_bayes object "location_model2" are used for prediction.
## Warning: predict.naive_bayes(): More features in the newdata are provided
## as there are probability tables in the object. Calculation is performed
## based on features to be found in the tables.
testing_data_label1
##  [1] afternoon night     night     evening   morning   night     evening  
##  [8] evening   morning   afternoon evening  
## Levels: afternoon evening morning night
test_result3<-predict(location_modell,testing_data1)
## Warning: predict.naive_bayes(): More features in the newdata are provided
## as there are probability tables in the object. Calculation is performed
## based on features to be found in the tables.
test_result3
##  [1] home home home home home home home home home home home
## Levels: appointment campus home office restaurant store theater
testing_data_label1
##  [1] afternoon night     night     evening   morning   night     evening  
##  [8] evening   morning   afternoon evening  
## Levels: afternoon evening morning night
library(gmodels)


CrossTable(test_result3, testing_data_label1,
           prop.chisq = FALSE, prop.t = FALSE, prop.r = FALSE,
           dnn = c('predicted', 'actual'))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |-------------------------|
## 
##  
## Total Observations in Table:  11 
## 
##  
##              | testing_data_label1 
## test_result3 | afternoon |   evening |   morning |     night | Row Total | 
## -------------|-----------|-----------|-----------|-----------|-----------|
##         home |         2 |         4 |         2 |         3 |        11 | 
## -------------|-----------|-----------|-----------|-----------|-----------|
## Column Total |         2 |         4 |         2 |         3 |        11 | 
## -------------|-----------|-----------|-----------|-----------|-----------|
## 
##