# Load the online-shoppers sessions data set. `stringsAsFactors = TRUE` is
# spelled out because R >= 4.0 no longer converts strings to factors by
# default, and later steps call class()/levels() on VisitorType expecting a
# factor.
shop <- read.csv("online_shoppers_intention.csv", stringsAsFactors = TRUE)
# View() opens a data viewer, so it is only called in an interactive session.
if (interactive()) {
  View(shop)
}
# Compact overview of the column types and the first few values.
str(shop)
## 'data.frame':    12330 obs. of  18 variables:
##  $ Administrative         : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ Administrative_Duration: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Informational          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Informational_Duration : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ProductRelated         : int  1 2 1 2 10 19 1 0 2 3 ...
##  $ ProductRelated_Duration: num  0 64 0 2.67 627.5 ...
##  $ BounceRates            : num  0.2 0 0.2 0.05 0.02 ...
##  $ ExitRates              : num  0.2 0.1 0.2 0.14 0.05 ...
##  $ PageValues             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SpecialDay             : num  0 0 0 0 0 0 0.4 0 0.8 0.4 ...
##  $ Month                  : Factor w/ 10 levels "Aug","Dec","Feb",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ OperatingSystems       : int  1 2 4 3 3 2 2 1 2 2 ...
##  $ Browser                : int  1 2 1 2 3 2 4 2 2 4 ...
##  $ Region                 : int  1 1 9 2 1 1 3 1 2 1 ...
##  $ TrafficType            : int  1 2 3 4 4 3 3 5 3 2 ...
##  $ VisitorType            : Factor w/ 3 levels "New_Visitor",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Weekend                : logi  FALSE FALSE FALSE FALSE TRUE FALSE ...
##  $ Revenue                : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# dplyr's transposed overview: one line per column with its type and values.
shop %>% glimpse()
## Observations: 12,330
## Variables: 18
## $ Administrative          <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,...
## $ Administrative_Duration <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Informational           <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Informational_Duration  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ProductRelated          <int> 1, 2, 1, 2, 10, 19, 1, 0, 2, 3, 3, 16,...
## $ ProductRelated_Duration <dbl> 0.000000, 64.000000, 0.000000, 2.66666...
## $ BounceRates             <dbl> 0.200000000, 0.000000000, 0.200000000,...
## $ ExitRates               <dbl> 0.200000000, 0.100000000, 0.200000000,...
## $ PageValues              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ SpecialDay              <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4, 0.0...
## $ Month                   <fct> Feb, Feb, Feb, Feb, Feb, Feb, Feb, Feb...
## $ OperatingSystems        <int> 1, 2, 4, 3, 3, 2, 2, 1, 2, 2, 1, 1, 1,...
## $ Browser                 <int> 1, 2, 1, 2, 3, 2, 4, 2, 2, 4, 1, 1, 1,...
## $ Region                  <int> 1, 1, 9, 2, 1, 1, 3, 1, 2, 1, 3, 4, 1,...
## $ TrafficType             <int> 1, 2, 3, 4, 4, 3, 3, 5, 3, 2, 3, 3, 3,...
## $ VisitorType             <fct> Returning_Visitor, Returning_Visitor, ...
## $ Weekend                 <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALS...
## $ Revenue                 <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
# List the column names of the data frame.
colnames(shop)
##  [1] "Administrative"          "Administrative_Duration"
##  [3] "Informational"           "Informational_Duration" 
##  [5] "ProductRelated"          "ProductRelated_Duration"
##  [7] "BounceRates"             "ExitRates"              
##  [9] "PageValues"              "SpecialDay"             
## [11] "Month"                   "OperatingSystems"       
## [13] "Browser"                 "Region"                 
## [15] "TrafficType"             "VisitorType"            
## [17] "Weekend"                 "Revenue"
#install.packages("ggplot2")

library(ggplot2)
# Cross-tabulations are evaluated with `with(shop, ...)` rather than after
# `attach(shop)`: attach() copies the columns onto the search path, and that
# copy silently goes stale as soon as `shop` itself is modified.

# Sessions by weekend flag and month.
with(shop, table(Weekend, Month))
##        Month
## Weekend  Aug  Dec  Feb  Jul June  Mar  May  Nov  Oct  Sep
##   FALSE  337 1361  156  328  241 1426 2649 2207  405  352
##   TRUE    96  366   28  104   47  481  715  791  144   96
# Sessions by weekend flag and SpecialDay value.
with(shop, table(Weekend, SpecialDay))
##        SpecialDay
## Weekend    0  0.2  0.4  0.6  0.8    1
##   FALSE 8371  178  243  351  319    0
##   TRUE  2708    0    0    0    6  154
# Sessions by SpecialDay value and traffic type.
with(shop, table(SpecialDay, TrafficType))
##           TrafficType
## SpecialDay    1    2    3    4    5    6    7    8    9   10   11   12
##        0   2290 3695 1725  850  240  357   37  343   42  450  219    1
##        0.2   25   27   48   35    4    7    0    0    0    0    3    0
##        0.4   28   40   67   43    3   13    1    0    0    0    9    0
##        0.6   36   66  122   48    4   27    1    0    0    0    5    0
##        0.8   48   53   61   59    4   29    0    0    0    0    7    0
##        1     24   32   29   34    5   11    1    0    0    0    4    0
##           TrafficType
## SpecialDay   13   14   15   16   17   18   19   20
##        0    575   11   33    3    0    9    9  190
##        0.2   25    0    1    0    0    0    3    0
##        0.4   31    1    1    0    0    0    2    4
##        0.6   37    0    2    0    0    0    2    1
##        0.8   61    1    0    0    0    1    0    1
##        1      9    0    1    0    1    0    1    2
# in special day ----  freq month
# Histogram of SpecialDay (rows with SpecialDay == 0 dropped), filled by month.
shop %>%
  filter(SpecialDay != 0) %>%
  ggplot(aes(x = SpecialDay, fill = Month)) +
  geom_histogram(position = "dodge") +
  ggtitle("Histogram  for special day")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Box plots of ProductRelated per visitor type, one panel per month; only
# rows with ProductRelated below 300 are drawn.
shop %>%
  filter(ProductRelated < 300) %>%
  ggplot(aes(x = VisitorType, y = ProductRelated)) +
  geom_boxplot() +
  facet_wrap(~Month) +
  ggtitle("box plot for visitor type ~ productrelated ~ by month")

# Sessions by visitor type and weekend flag.
with(shop, table(VisitorType, Weekend))
##                    Weekend
## VisitorType         FALSE TRUE
##   New_Visitor        1215  479
##   Other                78    7
##   Returning_Visitor  8169 2382
# Session counts per weekend flag, with one dodged bar per visitor type.
ggplot(data = shop, mapping = aes(x = Weekend, fill = VisitorType)) +
  geom_bar(position = "dodge") +
  ggtitle("bar for weekend")

# ExitRates against BounceRates as points, one panel per month.
ggplot(data = shop, mapping = aes(x = BounceRates, y = ExitRates)) +
  geom_point() +
  facet_wrap(~Month) +
  ggtitle("scatter plot for bounce rate ~ exit rate ~ by month")

# Density of SpecialDay, one semi-transparent curve per month. Columns are
# named bare inside aes() so ggplot2 looks them up in `data`; writing
# `shop$...` there bypasses the plot data and leaks into the axis and legend
# labels.
ggplot(data = shop, aes(x = SpecialDay, fill = Month)) +
  geom_density(alpha = 0.4) +
  ggtitle("density plot for special day ~ month")

#install.packages("corrgram")
library(corrgram)
## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust gclus
# Correlogram of `shop`: shaded cells below the diagonal, pie glyphs above,
# variable names on the diagonal, with variables reordered (order = TRUE).
corrgram(
  shop,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "shop data"
)

# Violin plot of ExitRates against PageValues. Columns are named bare inside
# aes() so ggplot2 resolves them in `shop` and labels the axes by column name
# instead of "shop$...".
# NOTE(review): PageValues is numeric, so geom_violin() receives no discrete
# grouping here -- confirm whether a binned or grouped x was intended.
ggplot(shop, aes(x = PageValues, y = ExitRates)) +
  geom_violin() +
  ggtitle("voilin chart for pagevalue and exit rate")

#plot(prcomp(scale(shop[,1:10])),type = "l")



# Fix the RNG seed so the row shuffle below is reproducible.
set.seed(678)

# Shuffle the rows of `shop`. sample.int() on the row count draws the same
# permutation as sample(1:nrow(shop)) but yields an empty index for a
# zero-row data frame, where `1:nrow(shop)` would expand to c(1, 0).
shuffel_index <- sample.int(nrow(shop))
shop <- shop[shuffel_index, ]

#3. Clean
library(dplyr)
# Row and column counts of the shuffled data frame.
c(nrow(shop), ncol(shop))
## [1] 12330    18
#install.packages("caTools")
library("caTools")

# Check the class of VisitorType on the data frame itself rather than through
# attach(shop), which would stack another copy of every column on the search
# path and mask identically named objects.
class(shop$VisitorType)
## [1] "factor"
# Working copy of the shuffled data used for modelling.
clean_shop <- shop
# View() opens a data viewer, so it is only called in an interactive session.
if (interactive()) {
  View(clean_shop)
}
# 80/20 train/test split. sample.split() balances the labels of the vector it
# is given across both subsets, so it is fed the class label that the tree
# models (Browser) rather than the continuous ExitRates column.
# NOTE(review): with ExitRates the held-out share did not match SplitRatio
# (the many near-unique values are rounded per value) -- confirm the new
# split sizes with table(split).
split <- sample.split(clean_shop$Browser, SplitRatio = 0.8)

# Rows flagged TRUE form the training set, the rest the test set.
training_set <- subset(clean_shop, split == TRUE)

test_set <- subset(clean_shop, split == FALSE)
library(rpart)
#install.packages('rpart.plot')
library(rpart.plot)
# Classification tree with Browser as the response and every other column of
# the training set as a predictor.
fit <- rpart(
  formula = Browser ~ .,
  data = training_set,
  method = "class"
)
# Draw the fitted tree; the value returned by rpart.plot() is kept in `a`.
a <- rpart.plot(fit, box.palette = "blue")

library(RColorBrewer)
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
#install.packages("rattle")
# Alternative rendering of the same fitted tree via rattle.
rattle::fancyRpartPlot(fit)

# Factor levels of the VisitorType column.
levels(shop[["VisitorType"]])
## [1] "New_Visitor"       "Other"             "Returning_Visitor"
#5. Make a Prediction
# Predicted Browser class for every row of the held-out test set.
predict_unseen <- predict(fit, newdata = test_set, type = "class")
#6. Create a Confusion Matrix
# Rows: Browser values observed in the test set; columns: predicted classes.
tab_mat <- table(test_set$Browser, predict_unseen)
tab_mat
##     predict_unseen
##        1   2   3   4   5   6   7   8   9  10  11  12  13
##   1  333   1   0   0   0   0   0   0   0   0   0   0   0
##   2   62 909   0   0   0   0   0   0   0   0   0   0   1
##   3    0  14   0   0   0   0   0   0   0   0   0   0   0
##   4    0 117   0   0   0   0   0   0   0   0   0   0   0
##   5    6  62   0   0   0   0   0   0   0   0   0   0   0
##   6    1  25   0   0   0   0   0   0   0   0   0   0   0
##   7    0  10   0   0   0   0   0   0   0   0   0   0   0
##   8   21   0   0   0   0   0   0   0   0   0   0   0   0
##   10   0  27   0   0   0   0   0   0   0   0   0   0   0
##   11   2   0   0   0   0   0   0   0   0   0   0   0   0
##   12   0   3   0   0   0   0   0   0   0   0   0   0   0
##   13   2   1   0   0   0   0   0   0   0   0   0   0   8
#ACCURACY --------------------
# Share of test rows whose predicted class equals the observed one.
# tab_mat need not be square: a Browser value that is absent from the test
# set has no row, while the prediction factor still carries a column for it,
# so diag() would pair unrelated labels. Cells are therefore matched by
# label instead of by position.
shared_labels <- intersect(rownames(tab_mat), colnames(tab_mat))
sum(tab_mat[cbind(shared_labels, shared_labels)]) / sum(tab_mat)
# Recomputed by hand from the confusion matrix recorded above:
# (333 + 909 + 8) / 1605.
## [1] 0.7788162