# Load the online-shoppers session data.
# `stringsAsFactors = TRUE` is spelled out because the read.csv() default
# changed to FALSE in R 4.0.0; the rest of the script expects Month and
# VisitorType to be factors (see the str() output and the later
# class()/levels() calls on VisitorType).
shop <- read.csv("online_shoppers_intention.csv", stringsAsFactors = TRUE)
# The spreadsheet viewer is only useful in an interactive session and may
# fail when the script is run via Rscript/knitr without a display.
if (interactive()) {
  View(shop)
}
# Column types and the first few values of every column.
str(shop)
## 'data.frame': 12330 obs. of 18 variables:
## $ Administrative : int 0 0 0 0 0 0 0 1 0 0 ...
## $ Administrative_Duration: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Informational : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Informational_Duration : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ProductRelated : int 1 2 1 2 10 19 1 0 2 3 ...
## $ ProductRelated_Duration: num 0 64 0 2.67 627.5 ...
## $ BounceRates : num 0.2 0 0.2 0.05 0.02 ...
## $ ExitRates : num 0.2 0.1 0.2 0.14 0.05 ...
## $ PageValues : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SpecialDay : num 0 0 0 0 0 0 0.4 0 0.8 0.4 ...
## $ Month : Factor w/ 10 levels "Aug","Dec","Feb",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ OperatingSystems : int 1 2 4 3 3 2 2 1 2 2 ...
## $ Browser : int 1 2 1 2 3 2 4 2 2 4 ...
## $ Region : int 1 1 9 2 1 1 3 1 2 1 ...
## $ TrafficType : int 1 2 3 4 4 3 3 5 3 2 ...
## $ VisitorType : Factor w/ 3 levels "New_Visitor",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Weekend : logi FALSE FALSE FALSE FALSE TRUE FALSE ...
## $ Revenue : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Compact one-line-per-column preview of the data frame.
shop %>% glimpse()
## Observations: 12,330
## Variables: 18
## $ Administrative <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,...
## $ Administrative_Duration <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Informational <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Informational_Duration <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ ProductRelated <int> 1, 2, 1, 2, 10, 19, 1, 0, 2, 3, 3, 16,...
## $ ProductRelated_Duration <dbl> 0.000000, 64.000000, 0.000000, 2.66666...
## $ BounceRates <dbl> 0.200000000, 0.000000000, 0.200000000,...
## $ ExitRates <dbl> 0.200000000, 0.100000000, 0.200000000,...
## $ PageValues <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ SpecialDay <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4, 0.0...
## $ Month <fct> Feb, Feb, Feb, Feb, Feb, Feb, Feb, Feb...
## $ OperatingSystems <int> 1, 2, 4, 3, 3, 2, 2, 1, 2, 2, 1, 1, 1,...
## $ Browser <int> 1, 2, 1, 2, 3, 2, 4, 2, 2, 4, 1, 1, 1,...
## $ Region <int> 1, 1, 9, 2, 1, 1, 3, 1, 2, 1, 3, 4, 1,...
## $ TrafficType <int> 1, 2, 3, 4, 4, 3, 3, 5, 3, 2, 3, 3, 3,...
## $ VisitorType <fct> Returning_Visitor, Returning_Visitor, ...
## $ Weekend <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALS...
## $ Revenue <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FAL...
# Column names of the data frame.
colnames(shop)
## [1] "Administrative" "Administrative_Duration"
## [3] "Informational" "Informational_Duration"
## [5] "ProductRelated" "ProductRelated_Duration"
## [7] "BounceRates" "ExitRates"
## [9] "PageValues" "SpecialDay"
## [11] "Month" "OperatingSystems"
## [13] "Browser" "Region"
## [15] "TrafficType" "VisitorType"
## [17] "Weekend" "Revenue"
#install.packages("ggplot2")
library(ggplot2)
# Cross-tabulations of the session attributes.
# with() evaluates each table() call inside `shop`, so the columns can be
# named bare (which keeps the "Weekend"/"Month" labels in the printed table)
# without attach()-ing the data frame. attach() would leave a copy of the
# columns on the search path that no longer matches `shop` once the rows are
# shuffled further down.
with(shop, table(Weekend, Month))
## Month
## Weekend Aug Dec Feb Jul June Mar May Nov Oct Sep
## FALSE 337 1361 156 328 241 1426 2649 2207 405 352
## TRUE 96 366 28 104 47 481 715 791 144 96
with(shop, table(Weekend, SpecialDay))
## SpecialDay
## Weekend 0 0.2 0.4 0.6 0.8 1
## FALSE 8371 178 243 351 319 0
## TRUE 2708 0 0 0 6 154
with(shop, table(SpecialDay, TrafficType))
## TrafficType
## SpecialDay 1 2 3 4 5 6 7 8 9 10 11 12
## 0 2290 3695 1725 850 240 357 37 343 42 450 219 1
## 0.2 25 27 48 35 4 7 0 0 0 0 3 0
## 0.4 28 40 67 43 3 13 1 0 0 0 9 0
## 0.6 36 66 122 48 4 27 1 0 0 0 5 0
## 0.8 48 53 61 59 4 29 0 0 0 0 7 0
## 1 24 32 29 34 5 11 1 0 0 0 4 0
## TrafficType
## SpecialDay 13 14 15 16 17 18 19 20
## 0 575 11 33 3 0 9 9 190
## 0.2 25 0 1 0 0 0 3 0
## 0.4 31 1 1 0 0 0 2 4
## 0.6 37 0 2 0 0 0 2 1
## 0.8 61 1 0 0 0 1 0 1
## 1 9 0 1 0 1 0 1 2
# Month frequencies for sessions with a non-zero SpecialDay value.
shop %>%
  filter(SpecialDay != 0) %>%
  ggplot(aes(x = SpecialDay, fill = Month)) +
  geom_histogram(position = "dodge") +
  ggtitle("Histogram for special day")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# ProductRelated per visitor type, one panel per month; sessions with
# ProductRelated >= 300 are dropped before plotting.
shop %>%
  filter(ProductRelated < 300) %>%
  ggplot(aes(x = VisitorType, y = ProductRelated)) +
  geom_boxplot() +
  facet_wrap(~Month) +
  ggtitle("box plot for visitor type ~ productrelated ~ by month")

with(shop, table(VisitorType, Weekend))
## Weekend
## VisitorType FALSE TRUE
## New_Visitor 1215 479
## Other 78 7
## Returning_Visitor 8169 2382
# Session counts by weekend flag, with one bar per visitor type placed
# side by side.
ggplot(data = shop, mapping = aes(x = Weekend, fill = VisitorType)) +
  geom_bar(position = "dodge") +
  labs(title = "bar for weekend")

# ExitRates against BounceRates, drawn as points in one panel per month.
ggplot(data = shop, mapping = aes(x = BounceRates, y = ExitRates)) +
  geom_point() +
  facet_wrap(~Month) +
  labs(title = "scatter plot for bounce rate ~ exit rate ~ by month")

# Density of SpecialDay with one semi-transparent curve per month.
# Columns are referenced by bare name inside aes() so that ggplot2 looks them
# up in `data`. Writing `shop$SpecialDay` bypasses the data argument, labels
# the axis/legend "shop$SpecialDay"/"shop$Month", and is flagged by ggplot2 as
# discouraged because it breaks as soon as the data is filtered or faceted.
ggplot(data = shop, aes(x = SpecialDay, fill = Month)) +
  geom_density(alpha = 0.4) +
  ggtitle("density plot for special day ~ month")

#install.packages("corrgram")
library(corrgram)
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
# Correlogram of the data frame: shaded cells in the lower triangle, pie
# glyphs in the upper triangle and the variable names in the text panels.
# order = TRUE asks corrgram to reorder the variables.
corrgram(
  shop,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "shop data"
)

# Violin plot of ExitRates against PageValues.
# Columns are named bare inside aes() so they are taken from `shop` via the
# data argument; `shop$...` inside aes() bypasses that lookup and produces
# "shop$PageValues"-style axis titles.
# NOTE(review): PageValues is a continuous column (num in the str() output);
# geom_violin() normally wants a discrete x or an explicit group -- confirm
# that a single violin over all PageValues is what is intended here.
ggplot(shop, aes(x = PageValues, y = ExitRates)) +
  geom_violin() +
  ggtitle("voilin chart for pagevalue and exit rate")

#plot(prcomp(scale(shop[,1:10])),type = "l")
# Shuffle the rows of `shop` reproducibly (fixed seed) before splitting.
set.seed(678)
# seq_len() rather than 1:nrow(): for a data frame with zero rows, 1:0 is
# c(1, 0) and would index a bogus NA row. For one or more rows the permutation
# drawn is identical to the one from sample(1:nrow(shop)).
shuffel_index <- sample(seq_len(nrow(shop)))
shop <- shop[shuffel_index, ]
#3. Clean
library(dplyr)
# Row and column counts after the shuffle (shuffling must not change them).
c(nrow(shop), ncol(shop))
## [1] 12330 18
#install.packages("caTools")
library("caTools")
# The column is referenced explicitly through `shop` instead of attach()-ing
# the data frame a second time: a repeated attach() only stacks another copy
# of every column on the search path and emits the "objects are masked"
# messages, while nothing below needs bare column names.
class(shop$VisitorType)
## [1] "factor"
# Working copy used for modelling; no cleaning step is applied to it here.
clean_shop <- shop
# The spreadsheet viewer is only useful in an interactive session.
if (interactive()) {
  View(clean_shop)
}
# Train/test partition via caTools::sample.split() with SplitRatio = 0.8:
# rows flagged TRUE go to the training set, rows flagged FALSE to the test set.
# NOTE(review): the split vector is built from ExitRates, whereas the model
# fitted below predicts Browser. The confusion matrix printed further down
# totals 1605 test rows out of 12330 (~13%), not the nominal 20% -- confirm
# whether the split should be driven by the target column instead.
split <- sample.split(clean_shop$ExitRates, SplitRatio = 0.8)
training_set <- clean_shop[split, ]
test_set <- clean_shop[!split, ]
library(rpart)
#install.packages('rpart.plot')
library(rpart.plot)
# Classification tree predicting Browser from every other column of the
# training set, followed by a plot of the fitted tree with a blue palette.
fit <- rpart(formula = Browser ~ ., data = training_set, method = "class")
a <- rpart.plot(fit, box.palette = "blue")

library(RColorBrewer)
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
#install.packages("rattle")
# Draw the fitted tree again with fancyRpartPlot() (rattle is loaded above).
fancyRpartPlot(fit)

# List the factor levels of the VisitorType column.
levels(shop$VisitorType)
## [1] "New_Visitor" "Other" "Returning_Visitor"
#5. Make a Prediction
# Predicted Browser class for every row of the held-out test set.
predict_unseen <- predict(fit, newdata = test_set, type = "class")
#6. Create a Confusion Matrix
# Confusion matrix: rows are the actual Browser values of the test set,
# columns are the predicted classes.
# Both vectors are converted to factors over the same set of labels so the
# table is square and row i / column i always refer to the same class.
# Tabulating the raw integer column drops classes that do not occur in the
# test set (Browser 9 here), which shifts every later row by one against the
# columns; diag() then pairs row "10" with column "9" and so on, and the
# correct class-13 predictions are left out of the accuracy.
browser_levels <- union(
  levels(predict_unseen),
  # Actual values the tree never saw in training are kept as extra classes
  # instead of being turned into NA and silently dropped from the table.
  as.character(sort(unique(test_set$Browser)))
)
tab_mat <- table(
  factor(test_set$Browser, levels = browser_levels),
  predict_unseen = factor(predict_unseen, levels = browser_levels)
)
tab_mat
## predict_unseen
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 1 333 1 0 0 0 0 0 0 0 0 0 0 0
## 2 62 909 0 0 0 0 0 0 0 0 0 0 1
## 3 0 14 0 0 0 0 0 0 0 0 0 0 0
## 4 0 117 0 0 0 0 0 0 0 0 0 0 0
## 5 6 62 0 0 0 0 0 0 0 0 0 0 0
## 6 1 25 0 0 0 0 0 0 0 0 0 0 0
## 7 0 10 0 0 0 0 0 0 0 0 0 0 0
## 8 21 0 0 0 0 0 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0 0 0 0 0 0 0
## 10 0 27 0 0 0 0 0 0 0 0 0 0 0
## 11 2 0 0 0 0 0 0 0 0 0 0 0 0
## 12 0 3 0 0 0 0 0 0 0 0 0 0 0
## 13 2 1 0 0 0 0 0 0 0 0 0 0 8
#ACCURACY --------------------
# Share of test rows on the diagonal, i.e. predicted class == actual class
# (333 + 909 + 8 = 1250 of 1605 rows).
sum(diag(tab_mat)) / sum(tab_mat)
## [1] 0.7788162