DATA ACQUISTION
# Dataset ----> https://www.kaggle.com/roshansharma/online-shoppers-intention
shop<-read.csv("C:/Users/navee/OneDrive/Documents/DA 5030/online_shoppers_intention.csv")
summary(shop)
## Administrative Administrative_Duration Informational
## Min. : 0.000 Min. : -1.00 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 1.000 Median : 8.00 Median : 0.000
## Mean : 2.318 Mean : 80.91 Mean : 0.504
## 3rd Qu.: 4.000 3rd Qu.: 93.50 3rd Qu.: 0.000
## Max. :27.000 Max. :3398.75 Max. :24.000
## NA's :14 NA's :14 NA's :14
## Informational_Duration ProductRelated ProductRelated_Duration
## Min. : -1.00 Min. : 0.00 Min. : -1.0
## 1st Qu.: 0.00 1st Qu.: 7.00 1st Qu.: 185.0
## Median : 0.00 Median : 18.00 Median : 599.8
## Mean : 34.51 Mean : 31.76 Mean : 1196.0
## 3rd Qu.: 0.00 3rd Qu.: 38.00 3rd Qu.: 1466.5
## Max. :2549.38 Max. :705.00 Max. :63973.5
## NA's :14 NA's :14 NA's :14
## BounceRates ExitRates PageValues SpecialDay
## Min. :0.000000 Min. :0.00000 Min. : 0.000 Min. :0.00000
## 1st Qu.:0.000000 1st Qu.:0.01429 1st Qu.: 0.000 1st Qu.:0.00000
## Median :0.003119 Median :0.02512 Median : 0.000 Median :0.00000
## Mean :0.022152 Mean :0.04300 Mean : 5.889 Mean :0.06143
## 3rd Qu.:0.016684 3rd Qu.:0.05000 3rd Qu.: 0.000 3rd Qu.:0.00000
## Max. :0.200000 Max. :0.20000 Max. :361.764 Max. :1.00000
## NA's :14 NA's :14
## Month OperatingSystems Browser Region
## May :3364 Min. :1.000 Min. : 1.000 Min. :1.000
## Nov :2998 1st Qu.:2.000 1st Qu.: 2.000 1st Qu.:1.000
## Mar :1907 Median :2.000 Median : 2.000 Median :3.000
## Dec :1727 Mean :2.124 Mean : 2.357 Mean :3.147
## Oct : 549 3rd Qu.:3.000 3rd Qu.: 2.000 3rd Qu.:4.000
## Sep : 448 Max. :8.000 Max. :13.000 Max. :9.000
## (Other):1337
## TrafficType VisitorType Weekend Revenue
## Min. : 1.00 New_Visitor : 1694 Mode :logical Mode :logical
## 1st Qu.: 2.00 Other : 85 FALSE:9462 FALSE:10422
## Median : 2.00 Returning_Visitor:10551 TRUE :2868 TRUE :1908
## Mean : 4.07
## 3rd Qu.: 4.00
## Max. :20.00
##
# Loading all the required packages
library(funModeling)
## Warning: package 'funModeling' was built under R version 3.5.3
## Loading required package: Hmisc
## Warning: package 'Hmisc' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
## funModeling v.1.7 :)
## Examples and tutorials at livebook.datascienceheroes.com
library(GGally)
## Warning: package 'GGally' was built under R version 3.5.3
##
## Attaching package: 'GGally'
## The following object is masked from 'package:funModeling':
##
## range01
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble 2.0.1 v purrr 0.2.5
## v tidyr 0.8.2 v dplyr 0.8.3
## v readr 1.3.1 v stringr 1.3.1
## v tibble 2.0.1 v forcats 0.3.0
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'dplyr' was built under R version 3.5.3
## -- Conflicts ---------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::src() masks Hmisc::src()
## x dplyr::summarize() masks Hmisc::summarize()
library(MLmetrics)
## Warning: package 'MLmetrics' was built under R version 3.5.3
##
## Attaching package: 'MLmetrics'
## The following object is masked from 'package:base':
##
## Recall
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
##
## Attaching package: 'caret'
## The following objects are masked from 'package:MLmetrics':
##
## MAE, RMSE
## The following object is masked from 'package:purrr':
##
## lift
## The following object is masked from 'package:survival':
##
## cluster
library(C50)
## Warning: package 'C50' was built under R version 3.5.3
library(class)
## Warning: package 'class' was built under R version 3.5.3
library(e1071)
## Warning: package 'e1071' was built under R version 3.5.3
##
## Attaching package: 'e1071'
## The following object is masked from 'package:Hmisc':
##
## impute
library(SuperLearner)
## Warning: package 'SuperLearner' was built under R version 3.5.3
## Loading required package: nnls
## Super Learner
## Version: 2.0-24
## Package created on 2018-08-10
library(ranger)
## Warning: package 'ranger' was built under R version 3.5.3
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:purrr':
##
## cross
## The following object is masked from 'package:ggplot2':
##
## alpha
library(ipred)
## Warning: package 'ipred' was built under R version 3.5.3
library(arm)
## Warning: package 'arm' was built under R version 3.5.3
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
## Loading required package: lme4
## Warning: package 'lme4' was built under R version 3.5.3
##
## arm (Version 1.10-1, built: 2018-4-12)
## Working directory is C:/Users/navee/OneDrive/Documents/DA 5030/assignments
HAVING A LOOK AT THE DATA
## MISSING VALUE TREATMENT
summary(shop)
## Administrative Administrative_Duration Informational
## Min. : 0.000 Min. : -1.00 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 1.000 Median : 8.00 Median : 0.000
## Mean : 2.318 Mean : 80.91 Mean : 0.504
## 3rd Qu.: 4.000 3rd Qu.: 93.50 3rd Qu.: 0.000
## Max. :27.000 Max. :3398.75 Max. :24.000
## NA's :14 NA's :14 NA's :14
## Informational_Duration ProductRelated ProductRelated_Duration
## Min. : -1.00 Min. : 0.00 Min. : -1.0
## 1st Qu.: 0.00 1st Qu.: 7.00 1st Qu.: 185.0
## Median : 0.00 Median : 18.00 Median : 599.8
## Mean : 34.51 Mean : 31.76 Mean : 1196.0
## 3rd Qu.: 0.00 3rd Qu.: 38.00 3rd Qu.: 1466.5
## Max. :2549.38 Max. :705.00 Max. :63973.5
## NA's :14 NA's :14 NA's :14
## BounceRates ExitRates PageValues SpecialDay
## Min. :0.000000 Min. :0.00000 Min. : 0.000 Min. :0.00000
## 1st Qu.:0.000000 1st Qu.:0.01429 1st Qu.: 0.000 1st Qu.:0.00000
## Median :0.003119 Median :0.02512 Median : 0.000 Median :0.00000
## Mean :0.022152 Mean :0.04300 Mean : 5.889 Mean :0.06143
## 3rd Qu.:0.016684 3rd Qu.:0.05000 3rd Qu.: 0.000 3rd Qu.:0.00000
## Max. :0.200000 Max. :0.20000 Max. :361.764 Max. :1.00000
## NA's :14 NA's :14
## Month OperatingSystems Browser Region
## May :3364 Min. :1.000 Min. : 1.000 Min. :1.000
## Nov :2998 1st Qu.:2.000 1st Qu.: 2.000 1st Qu.:1.000
## Mar :1907 Median :2.000 Median : 2.000 Median :3.000
## Dec :1727 Mean :2.124 Mean : 2.357 Mean :3.147
## Oct : 549 3rd Qu.:3.000 3rd Qu.: 2.000 3rd Qu.:4.000
## Sep : 448 Max. :8.000 Max. :13.000 Max. :9.000
## (Other):1337
## TrafficType VisitorType Weekend Revenue
## Min. : 1.00 New_Visitor : 1694 Mode :logical Mode :logical
## 1st Qu.: 2.00 Other : 85 FALSE:9462 FALSE:10422
## Median : 2.00 Returning_Visitor:10551 TRUE :2868 TRUE :1908
## Mean : 4.07
## 3rd Qu.: 4.00
## Max. :20.00
##
# We can see that there are a few NA's present. 14 for each of the first 8 columns to be precise
sum(is.na(shop[1:8]))
## [1] 112
shop<-na.omit(shop) # Since there are only 112 NA's in the whole dataset we can exclude them
summary(shop)
## Administrative Administrative_Duration Informational
## Min. : 0.000 Min. : -1.00 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 1.000 Median : 8.00 Median : 0.000
## Mean : 2.318 Mean : 80.91 Mean : 0.504
## 3rd Qu.: 4.000 3rd Qu.: 93.50 3rd Qu.: 0.000
## Max. :27.000 Max. :3398.75 Max. :24.000
##
## Informational_Duration ProductRelated ProductRelated_Duration
## Min. : -1.00 Min. : 0.00 Min. : -1.0
## 1st Qu.: 0.00 1st Qu.: 7.00 1st Qu.: 185.0
## Median : 0.00 Median : 18.00 Median : 599.8
## Mean : 34.51 Mean : 31.76 Mean : 1196.0
## 3rd Qu.: 0.00 3rd Qu.: 38.00 3rd Qu.: 1466.5
## Max. :2549.38 Max. :705.00 Max. :63973.5
##
## BounceRates ExitRates PageValues SpecialDay
## Min. :0.000000 Min. :0.00000 Min. : 0.000 Min. :0.0000
## 1st Qu.:0.000000 1st Qu.:0.01429 1st Qu.: 0.000 1st Qu.:0.0000
## Median :0.003119 Median :0.02512 Median : 0.000 Median :0.0000
## Mean :0.022152 Mean :0.04300 Mean : 5.896 Mean :0.0615
## 3rd Qu.:0.016684 3rd Qu.:0.05000 3rd Qu.: 0.000 3rd Qu.:0.0000
## Max. :0.200000 Max. :0.20000 Max. :361.764 Max. :1.0000
##
## Month OperatingSystems Browser Region
## May :3363 Min. :1.000 Min. : 1.000 Min. :1.000
## Nov :2998 1st Qu.:2.000 1st Qu.: 2.000 1st Qu.:1.000
## Mar :1894 Median :2.000 Median : 2.000 Median :3.000
## Dec :1727 Mean :2.124 Mean : 2.358 Mean :3.148
## Oct : 549 3rd Qu.:3.000 3rd Qu.: 2.000 3rd Qu.:4.000
## Sep : 448 Max. :8.000 Max. :13.000 Max. :9.000
## (Other):1337
## TrafficType VisitorType Weekend Revenue
## Min. : 1.00 New_Visitor : 1694 Mode :logical Mode :logical
## 1st Qu.: 2.00 Other : 85 FALSE:9451 FALSE:10408
## Median : 2.00 Returning_Visitor:10537 TRUE :2865 TRUE :1908
## Mean : 4.07
## 3rd Qu.: 4.00
## Max. :20.00
##
OUTLIER TREATMENT
# Creating a duplicate dataset at every possible step to preserve the original dataset
shop_out = shop
shop_out$Revenue<-as.factor(ifelse(shop_out$Revenue==TRUE,1,0))
summary(shop_out)
## Administrative Administrative_Duration Informational
## Min. : 0.000 Min. : -1.00 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 1.000 Median : 8.00 Median : 0.000
## Mean : 2.318 Mean : 80.91 Mean : 0.504
## 3rd Qu.: 4.000 3rd Qu.: 93.50 3rd Qu.: 0.000
## Max. :27.000 Max. :3398.75 Max. :24.000
##
## Informational_Duration ProductRelated ProductRelated_Duration
## Min. : -1.00 Min. : 0.00 Min. : -1.0
## 1st Qu.: 0.00 1st Qu.: 7.00 1st Qu.: 185.0
## Median : 0.00 Median : 18.00 Median : 599.8
## Mean : 34.51 Mean : 31.76 Mean : 1196.0
## 3rd Qu.: 0.00 3rd Qu.: 38.00 3rd Qu.: 1466.5
## Max. :2549.38 Max. :705.00 Max. :63973.5
##
## BounceRates ExitRates PageValues SpecialDay
## Min. :0.000000 Min. :0.00000 Min. : 0.000 Min. :0.0000
## 1st Qu.:0.000000 1st Qu.:0.01429 1st Qu.: 0.000 1st Qu.:0.0000
## Median :0.003119 Median :0.02512 Median : 0.000 Median :0.0000
## Mean :0.022152 Mean :0.04300 Mean : 5.896 Mean :0.0615
## 3rd Qu.:0.016684 3rd Qu.:0.05000 3rd Qu.: 0.000 3rd Qu.:0.0000
## Max. :0.200000 Max. :0.20000 Max. :361.764 Max. :1.0000
##
## Month OperatingSystems Browser Region
## May :3363 Min. :1.000 Min. : 1.000 Min. :1.000
## Nov :2998 1st Qu.:2.000 1st Qu.: 2.000 1st Qu.:1.000
## Mar :1894 Median :2.000 Median : 2.000 Median :3.000
## Dec :1727 Mean :2.124 Mean : 2.358 Mean :3.148
## Oct : 549 3rd Qu.:3.000 3rd Qu.: 2.000 3rd Qu.:4.000
## Sep : 448 Max. :8.000 Max. :13.000 Max. :9.000
## (Other):1337
## TrafficType VisitorType Weekend Revenue
## Min. : 1.00 New_Visitor : 1694 Mode :logical 0:10408
## 1st Qu.: 2.00 Other : 85 FALSE:9451 1: 1908
## Median : 2.00 Returning_Visitor:10537 TRUE :2865
## Mean : 4.07
## 3rd Qu.: 4.00
## Max. :20.00
##
#If the analysis to be conducted does contain a grouping variable, then data should be assessed for outliers separately within each group
shop_1 <- shop_out[ which(shop_out$Revenue==1),] #dataframe with revenue=1
shop_0 <- shop_out[ which(shop_out$Revenue==0),] #dataframe with revenue=0
# Calculating Mahalanobis distances to detect outliers in the data, Mahalanobis distance can only be used for numeric Data
MD<- mahalanobis(shop_1[1:9],colMeans(shop_1[1:9]),cov(shop_1[1:9]))
shop_1$MD<- round(MD,2)
# creating a function to calcuate a threshold for outliers
bench<- function(x){
y<- summary(x)[5] + 1.5*IQR(x)
return(y)
}
shop_1$out<- ifelse(shop_1$MD > bench(shop_1$MD),1,0)
table(shop_1$out)
##
## 0 1
## 1708 200
shop_1 <- shop_1[ which(shop_1$out==0),]
# Applying the same process for dataframe shop_0
MD0<- mahalanobis(shop_0[1:9],colMeans(shop_0[1:9]),cov(shop_0[1:9]))
shop_0$MD0<- MD0
shop_0$out<- ifelse(shop_0$MD0 > bench(shop_0$MD0),1,0)
table(shop_0$out)
##
## 0 1
## 9630 778
shop_0<- shop_0[ which(shop_0$out==0),]
shop_0$MD0<-NULL
shop_0$out <-NULL
head(shop_0)
## Administrative Administrative_Duration Informational
## 1 0 0 0
## 2 0 0 0
## 3 0 -1 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## Informational_Duration ProductRelated ProductRelated_Duration
## 1 0 1 0.000000
## 2 0 2 64.000000
## 3 -1 1 -1.000000
## 4 0 2 2.666667
## 5 0 10 627.500000
## 6 0 19 154.216667
## BounceRates ExitRates PageValues SpecialDay Month OperatingSystems
## 1 0.20000000 0.2000000 0 0 Feb 1
## 2 0.00000000 0.1000000 0 0 Feb 2
## 3 0.20000000 0.2000000 0 0 Feb 4
## 4 0.05000000 0.1400000 0 0 Feb 3
## 5 0.02000000 0.0500000 0 0 Feb 3
## 6 0.01578947 0.0245614 0 0 Feb 2
## Browser Region TrafficType VisitorType Weekend Revenue
## 1 1 1 1 Returning_Visitor FALSE 0
## 2 2 1 2 Returning_Visitor FALSE 0
## 3 1 9 3 Returning_Visitor FALSE 0
## 4 2 2 4 Returning_Visitor FALSE 0
## 5 3 1 4 Returning_Visitor TRUE 0
## 6 2 1 3 Returning_Visitor FALSE 0
shop_1$MD<-NULL
shop_1$out <-NULL
shop_out<- rbind(shop_1,shop_0)
shop_out <- shop_out[sample(nrow(shop_out)),] #Randomizing the data based on rows, inorder to avoid improper split in test and train datasets
shop_out<-as.data.frame(shop_out)
# I SHALL NOT BE USING THE DATASET "shop_out"(outlier treated dataset) FOR ANALYSIS/BUILDING MODELS AS THE DATA IS ASSUMED TO BE WITHOUT OUTLIERS IN A STUDY CONDUCTED/PAPER,
# WHICH I HAVE REFFERED - Sakar, C.O., Polat, S.O., Katircioglu, M. et al. Neural Comput & Applic (2018)
DATA EXPLORATION & VISUALIZATION
library(funModeling) #Package gives us colourful frequency plots
shop_num <- shop[1:10]
plot_num(shop_num) # Histogram of all the continuous variables
shop_cat <- shop[11:18]
shop_cat$OperatingSystems<- as.factor(shop_cat$OperatingSystems)
shop_cat$Browser<- as.factor(shop_cat$Browser)
shop_cat$Region<- as.factor(shop_cat$Region)
shop_cat$TrafficType<- as.factor(shop_cat$TrafficType)
shop_cat$Weekend<- as.factor(shop$Weekend)
shop_cat$Revenue<- as.factor(shop_cat$Revenue)
print(freq(shop_cat)) # Frequency & frequency plots of catergorical variables
## Month frequency percentage cumulative_perc
## 1 May 3363 27.31 27.31
## 2 Nov 2998 24.34 51.65
## 3 Mar 1894 15.38 67.03
## 4 Dec 1727 14.02 81.05
## 5 Oct 549 4.46 85.51
## 6 Sep 448 3.64 89.15
## 7 Aug 433 3.52 92.67
## 8 Jul 432 3.51 96.18
## 9 June 288 2.34 98.52
## 10 Feb 184 1.49 100.00
## OperatingSystems frequency percentage cumulative_perc
## 1 2 6593 53.53 53.53
## 2 1 2582 20.96 74.49
## 3 3 2552 20.72 95.21
## 4 4 478 3.88 99.09
## 5 8 79 0.64 99.73
## 6 6 19 0.15 99.88
## 7 7 7 0.06 99.94
## 8 5 6 0.05 100.00
## Browser frequency percentage cumulative_perc
## 1 2 7951 64.56 64.56
## 2 1 2459 19.97 84.53
## 3 4 735 5.97 90.50
## 4 5 467 3.79 94.29
## 5 6 174 1.41 95.70
## 6 10 163 1.32 97.02
## 7 8 135 1.10 98.12
## 8 3 105 0.85 98.97
## 9 13 61 0.50 99.47
## 10 7 49 0.40 99.87
## 11 12 10 0.08 99.95
## 12 11 6 0.05 100.00
## 13 9 1 0.01 100.00
## Region frequency percentage cumulative_perc
## 1 1 4774 38.76 38.76
## 2 3 2402 19.50 58.26
## 3 4 1179 9.57 67.83
## 4 2 1134 9.21 77.04
## 5 6 804 6.53 83.57
## 6 7 761 6.18 89.75
## 7 9 511 4.15 93.90
## 8 8 434 3.52 97.42
## 9 5 317 2.57 100.00
## TrafficType frequency percentage cumulative_perc
## 1 2 3909 31.74 31.74
## 2 1 2444 19.84 51.58
## 3 3 2051 16.65 68.23
## 4 4 1069 8.68 76.91
## 5 13 737 5.98 82.89
## 6 10 450 3.65 86.54
## 7 6 444 3.61 90.15
## 8 8 343 2.78 92.93
## 9 5 260 2.11 95.04
## 10 11 247 2.01 97.05
## 11 20 198 1.61 98.66
## 12 9 42 0.34 99.00
## 13 7 40 0.32 99.32
## 14 15 37 0.30 99.62
## 15 19 17 0.14 99.76
## 16 14 13 0.11 99.87
## 17 18 10 0.08 99.95
## 18 16 3 0.02 99.97
## 19 12 1 0.01 99.98
## 20 17 1 0.01 100.00
## VisitorType frequency percentage cumulative_perc
## 1 Returning_Visitor 10537 85.56 85.56
## 2 New_Visitor 1694 13.75 99.31
## 3 Other 85 0.69 100.00
## Weekend frequency percentage cumulative_perc
## 1 FALSE 9451 76.74 76.74
## 2 TRUE 2865 23.26 100.00
## Revenue frequency percentage cumulative_perc
## 1 FALSE 10408 84.51 84.51
## 2 TRUE 1908 15.49 100.00
##
## [1] "Variables processed: Month, OperatingSystems, Browser, Region, TrafficType, VisitorType, Weekend, Revenue"
dat<- shop
dat$Month<- as.numeric(dat$Month)
dat$VisitorType<- as.numeric(dat$VisitorType)
M<-cor(dat)
ggcorr(M)
plot(shop_out$BounceRates,shop_out$Revenue,xlab = "BounceRates",ylab="Revenue") # We can notice when the bounce rate is close to zero, The customer is more likely to develop revnue.
#Bounce rate is an Internet marketing term used in web traffic analysis. It represents the percentage of visitors who enter the site and then leave ("bounce") rather than continuing to view other pages within the same site.
plot(shop_out$ExitRates,shop_out$Revenue,xlab = "ExitRates",ylab = "Revenue")
dat1<- shop_out %>% filter(shop_out$Revenue==1)
ggplot(dat1, aes(dat1$Month, ..count..)) + geom_bar(aes(fill = dat1$Revenue), position = "dodge") + labs(x="Month",y="Revenue")
MODELLING THE DATA
set.seed(100)
shop1=shop
shop1$Revenue<- as.factor(shop1$Revenue)
shop1$Weekend<- as.factor(shop1$Weekend)
index <- createDataPartition(shop1$Revenue, p=0.75, list=FALSE)
train <-shop1[ index,]
test <- shop1[-index,]
# C4.5 Decision trees
library(RWeka)
## Warning: package 'RWeka' was built under R version 3.5.3
fit<-J48(Revenue ~.,data=train)
summary(fit)
##
## === Summary ===
##
## Correctly Classified Instances 8709 94.2839 %
## Incorrectly Classified Instances 528 5.7161 %
## Kappa statistic 0.7611
## Mean absolute error 0.0987
## Root mean squared error 0.2222
## Relative absolute error 37.6929 %
## Root relative squared error 61.4006 %
## Total Number of Instances 9237
##
## === Confusion Matrix ===
##
## a b <-- classified as
## 7693 113 | a = FALSE
## 415 1016 | b = TRUE
p_tree<-predict(fit,test[,1:17])
confusionMatrix(p_tree,test$Revenue)
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 2476 205
## TRUE 126 272
##
## Accuracy : 0.8925
## 95% CI : (0.881, 0.9032)
## No Information Rate : 0.8451
## P-Value [Acc > NIR] : 1.577e-14
##
## Kappa : 0.5597
##
## Mcnemar's Test P-Value : 1.809e-05
##
## Sensitivity : 0.9516
## Specificity : 0.5702
## Pos Pred Value : 0.9235
## Neg Pred Value : 0.6834
## Prevalence : 0.8451
## Detection Rate : 0.8042
## Detection Prevalence : 0.8707
## Balanced Accuracy : 0.7609
##
## 'Positive' Class : FALSE
##
a1<-Accuracy(p_tree,test$Revenue)
a1
## [1] 0.8924976
# C5.0 Boosted trees
dtree<-C5.0(train,train$Revenue)
plot(dtree)
p_dtree<-predict(dtree,test)
confusionMatrix(table(p_dtree,test$Revenue))
## Confusion Matrix and Statistics
##
##
## p_dtree FALSE TRUE
## FALSE 2602 0
## TRUE 0 477
##
## Accuracy : 1
## 95% CI : (0.9988, 1)
## No Information Rate : 0.8451
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.8451
## Detection Rate : 0.8451
## Detection Prevalence : 0.8451
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : FALSE
##
Accuracy(p_dtree,test$Revenue)
## [1] 1
a2<-Accuracy(p_dtree,test$Revenue)
a2
## [1] 1
# C5.0 Decision tree produces 100% accuracy
set.seed(101)
shop2=shop
summary(shop2)
## Administrative Administrative_Duration Informational
## Min. : 0.000 Min. : -1.00 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 1.000 Median : 8.00 Median : 0.000
## Mean : 2.318 Mean : 80.91 Mean : 0.504
## 3rd Qu.: 4.000 3rd Qu.: 93.50 3rd Qu.: 0.000
## Max. :27.000 Max. :3398.75 Max. :24.000
##
## Informational_Duration ProductRelated ProductRelated_Duration
## Min. : -1.00 Min. : 0.00 Min. : -1.0
## 1st Qu.: 0.00 1st Qu.: 7.00 1st Qu.: 185.0
## Median : 0.00 Median : 18.00 Median : 599.8
## Mean : 34.51 Mean : 31.76 Mean : 1196.0
## 3rd Qu.: 0.00 3rd Qu.: 38.00 3rd Qu.: 1466.5
## Max. :2549.38 Max. :705.00 Max. :63973.5
##
## BounceRates ExitRates PageValues SpecialDay
## Min. :0.000000 Min. :0.00000 Min. : 0.000 Min. :0.0000
## 1st Qu.:0.000000 1st Qu.:0.01429 1st Qu.: 0.000 1st Qu.:0.0000
## Median :0.003119 Median :0.02512 Median : 0.000 Median :0.0000
## Mean :0.022152 Mean :0.04300 Mean : 5.896 Mean :0.0615
## 3rd Qu.:0.016684 3rd Qu.:0.05000 3rd Qu.: 0.000 3rd Qu.:0.0000
## Max. :0.200000 Max. :0.20000 Max. :361.764 Max. :1.0000
##
## Month OperatingSystems Browser Region
## May :3363 Min. :1.000 Min. : 1.000 Min. :1.000
## Nov :2998 1st Qu.:2.000 1st Qu.: 2.000 1st Qu.:1.000
## Mar :1894 Median :2.000 Median : 2.000 Median :3.000
## Dec :1727 Mean :2.124 Mean : 2.358 Mean :3.148
## Oct : 549 3rd Qu.:3.000 3rd Qu.: 2.000 3rd Qu.:4.000
## Sep : 448 Max. :8.000 Max. :13.000 Max. :9.000
## (Other):1337
## TrafficType VisitorType Weekend Revenue
## Min. : 1.00 New_Visitor : 1694 Mode :logical Mode :logical
## 1st Qu.: 2.00 Other : 85 FALSE:9451 FALSE:10408
## Median : 2.00 Returning_Visitor:10537 TRUE :2865 TRUE :1908
## Mean : 4.07
## 3rd Qu.: 4.00
## Max. :20.00
##
str(shop2)
## 'data.frame': 12316 obs. of 18 variables:
## $ Administrative : int 0 0 0 0 0 0 0 1 0 0 ...
## $ Administrative_Duration: num 0 0 -1 0 0 0 -1 -1 0 0 ...
## $ Informational : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Informational_Duration : num 0 0 -1 0 0 0 -1 -1 0 0 ...
## $ ProductRelated : int 1 2 1 2 10 19 1 1 2 3 ...
## $ ProductRelated_Duration: num 0 64 -1 2.67 627.5 ...
## $ BounceRates : num 0.2 0 0.2 0.05 0.02 ...
## $ ExitRates : num 0.2 0.1 0.2 0.14 0.05 ...
## $ PageValues : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SpecialDay : num 0 0 0 0 0 0 0.4 0 0.8 0.4 ...
## $ Month : Factor w/ 10 levels "Aug","Dec","Feb",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ OperatingSystems : int 1 2 4 3 3 2 2 1 2 2 ...
## $ Browser : int 1 2 1 2 3 2 4 2 2 4 ...
## $ Region : int 1 1 9 2 1 1 3 1 2 1 ...
## $ TrafficType : int 1 2 3 4 4 3 3 5 3 2 ...
## $ VisitorType : Factor w/ 3 levels "New_Visitor",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Weekend : logi FALSE FALSE FALSE FALSE TRUE FALSE ...
## $ Revenue : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## - attr(*, "na.action")= 'omit' Named int 1066 1133 1134 1135 1136 1137 1474 1475 1476 1477 ...
## ..- attr(*, "names")= chr "1066" "1133" "1134" "1135" ...
shop2$Month=as.numeric(shop1$Month)
shop2$VisitorType=as.numeric(shop1$VisitorType)
shop2$Weekend=as.numeric(shop1$Weekend)
shop2$Revenue=as.numeric(shop1$Revenue)
index1 <- createDataPartition(shop1$Revenue, p=0.75, list=FALSE)
train1 <-shop2[ index1,]
test1 <- shop2[-index1,]
train1_lab<-train1$Revenue
test1_lab<-test1$Revenue
normalize <- function(x) { # Since KNN is a distance Based algorithm Normalization of feature values is a must.
return ((x - min(x)) / (max(x) - min(x)))
}
shop2[1:6]<-lapply(shop2[1:6],normalize)
knn_pred<-knn(train1,test1,train1_lab,k=111)
confusionMatrix(table(knn_pred,test1$Revenue))
## Confusion Matrix and Statistics
##
##
## knn_pred 1 2
## 1 2602 475
## 2 0 2
##
## Accuracy : 0.8457
## 95% CI : (0.8325, 0.8583)
## No Information Rate : 0.8451
## P-Value [Acc > NIR] : 0.4725
##
## Kappa : 0.0071
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.000000
## Specificity : 0.004193
## Pos Pred Value : 0.845629
## Neg Pred Value : 1.000000
## Prevalence : 0.845080
## Detection Rate : 0.845080
## Detection Prevalence : 0.999350
## Balanced Accuracy : 0.502096
##
## 'Positive' Class : 1
##
a3<-Accuracy(knn_pred,test1$Revenue)
a3
## [1] 0.8457291
set.seed(102)
shop3=shop
index2 <- createDataPartition(shop3$Revenue, p=0.75, list=FALSE)
train2 <-shop3[ index2,]
test2<- shop3[-index2,]
lg_mod<-glm(Revenue ~. ,data=train2,family = binomial(link = "logit"))
anova(lg_mod,test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Revenue
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 9236 7965.1
## Administrative 1 145.80 9235 7819.3 < 2.2e-16 ***
## Administrative_Duration 1 0.60 9234 7818.7 0.4370413
## Informational 1 12.54 9233 7806.1 0.0003981 ***
## Informational_Duration 1 0.10 9232 7806.0 0.7550210
## ProductRelated 1 72.08 9231 7733.9 < 2.2e-16 ***
## ProductRelated_Duration 1 5.65 9230 7728.3 0.0174933 *
## BounceRates 1 279.88 9229 7448.4 < 2.2e-16 ***
## ExitRates 1 218.53 9228 7229.9 < 2.2e-16 ***
## PageValues 1 1575.70 9227 5654.2 < 2.2e-16 ***
## SpecialDay 1 25.27 9226 5628.9 4.991e-07 ***
## Month 9 205.28 9217 5423.6 < 2.2e-16 ***
## OperatingSystems 1 4.50 9216 5419.1 0.0339777 *
## Browser 1 1.07 9215 5418.1 0.3001190
## Region 1 0.46 9214 5417.6 0.4964312
## TrafficType 1 0.04 9213 5417.6 0.8345991
## VisitorType 2 10.07 9211 5407.5 0.0064975 **
## Weekend 1 1.91 9210 5405.6 0.1666200
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lg_mod<-glm(Revenue~ Administrative + Informational + ProductRelated + BounceRates + ExitRates + PageValues + SpecialDay +Month + VisitorType,data=train2,family = binomial(link = "logit"))
#Retreiving only the Influential Variables after removing variables based on P-value
anova(lg_mod,test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Revenue
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 9236 7965.1
## Administrative 1 145.80 9235 7819.3 < 2.2e-16 ***
## Informational 1 13.04 9234 7806.2 0.0003044 ***
## ProductRelated 1 72.15 9233 7734.1 < 2.2e-16 ***
## BounceRates 1 279.66 9232 7454.4 < 2.2e-16 ***
## ExitRates 1 211.80 9231 7242.6 < 2.2e-16 ***
## PageValues 1 1582.34 9230 5660.3 < 2.2e-16 ***
## SpecialDay 1 25.73 9229 5634.5 3.925e-07 ***
## Month 9 204.51 9220 5430.0 < 2.2e-16 ***
## VisitorType 2 11.00 9218 5419.0 0.0040793 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lg_pred <- predict(lg_mod,newdata = test2,type = "response")
lg_pred=as.numeric(lg_pred)
lg_pred=as.factor(round(lg_pred,0))
test2$Revenue=as.numeric(test2$Revenue)
confusionMatrix(table(lg_pred,test2$Revenue))
## Confusion Matrix and Statistics
##
##
## lg_pred 0 1
## 0 2539 297
## 1 63 180
##
## Accuracy : 0.8831
## 95% CI : (0.8712, 0.8942)
## No Information Rate : 0.8451
## P-Value [Acc > NIR] : 9.048e-10
##
## Kappa : 0.4416
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9758
## Specificity : 0.3774
## Pos Pred Value : 0.8953
## Neg Pred Value : 0.7407
## Prevalence : 0.8451
## Detection Rate : 0.8246
## Detection Prevalence : 0.9211
## Balanced Accuracy : 0.6766
##
## 'Positive' Class : 0
##
a4<- Accuracy(lg_pred,test2$Revenue)
a4
## [1] 0.8830789
4.NAIVE BAYES
set.seed(104)
x=train
y=train$Revenue
model = naiveBayes(x,y)
p<- predict(model,test,type="class")
confusionMatrix(p,test$Revenue)
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 2501 10
## TRUE 101 467
##
## Accuracy : 0.9639
## 95% CI : (0.9567, 0.9703)
## No Information Rate : 0.8451
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8723
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9612
## Specificity : 0.9790
## Pos Pred Value : 0.9960
## Neg Pred Value : 0.8222
## Prevalence : 0.8451
## Detection Rate : 0.8123
## Detection Prevalence : 0.8155
## Balanced Accuracy : 0.9701
##
## 'Positive' Class : FALSE
##
a5<- Accuracy(p,test$Revenue)
a5
## [1] 0.9639493
COMPARING MODEL ACCURACIES
X<-c("C4.5-Tree","C5.0-Tree","KNN","Logistic","Naive-Bayes")
Y<-round(c(a1,a2,a3,a4,a5),2)
X_name <- "model"
Y_name <- "accuracy"
df <- data.frame(X,Y)
names(df) <- c(X_name,Y_name)
ggplot(df,aes(x=model,y=accuracy,fill=model))+geom_bar(stat = "identity") + geom_text(aes(label=accuracy),position=position_dodge(width=0.9), vjust=-0.25)
ENSEMBLE MODEL
## Reference - https://www.datacamp.com/community/tutorials/ensemble-r-machine-learning
set.seed(667)
shop3=shop
shop3$Revenue<- as.numeric(shop3$Revenue)
shop3$Weekend<-as.factor(shop3$Weekend)
index3 <- createDataPartition(shop3$Revenue, p=0.75, list=FALSE)
train3 <-shop3[ index3,]
test3 <- shop3[-index3,]
xtrain <- data.frame(train3[,1:17])
xtest <- data.frame(test3[,1:17])
y_lab=(train3[,18])
ytest_lab=as.numeric((test3[,18]))
## With random forest, Logistic regression and Bagging
# Fit the ensemble model
model <- SuperLearner(y_lab,xtrain,family=binomial(),SL.library=list("SL.ranger","SL.ipredbagg","SL.bayesglm"))
# Bayes GLM model is simply an implementation of logistic regression. At least in this case, where we are classifying a 0-1 problem, Ranger for random Forest, ipredbagg for Bagging
print(model)
##
## Call:
## SuperLearner(Y = y_lab, X = xtrain, family = binomial(), SL.library = list("SL.ranger",
## "SL.ipredbagg", "SL.bayesglm"))
##
##
## Risk Coef
## SL.ranger_All 0.07178292 0.6695822
## SL.ipredbagg_All 0.07468293 0.2935592
## SL.bayesglm_All 0.08678850 0.0368586
predictions <- predict.SuperLearner(model, newdata=as.data.frame(xtest))
head(predictions$library.predict) # Predictions of Individual libraries
## SL.ranger_All SL.ipredbagg_All SL.bayesglm_All
## [1,] 0.00000 0.04265707 0.001067033
## [2,] 0.00600 0.04265707 0.030741309
## [3,] 0.00000 0.04265707 0.021526787
## [4,] 0.00016 0.04265707 0.020389081
## [5,] 0.02200 0.04265707 0.016927636
## [6,] 0.00000 0.04265707 0.010549686
#Recoding the Probabilities
conv.preds <- ifelse(predictions$pred>=0.5,1,0)
cm <- Accuracy(conv.preds,ytest_lab)
cm
## [1] 0.9142579
#To understand each model's specific contribution to the model and the variation, we can use SuperLearner's internal V-fold cross-validation function --> CV.SuperLearner()
v.model <- CV.SuperLearner(y_lab,xtrain,V=5,SL.library=list("SL.ranger","SL.ipredbagg","SL.bayesglm"))
summary(v.model)
##
## Call:
## CV.SuperLearner(Y = y_lab, X = xtrain, V = 5, SL.library = list("SL.ranger",
## "SL.ipredbagg", "SL.bayesglm"))
##
## Risk is based on: Mean Squared Error
##
## All risk estimates are based on V = 5
##
## Algorithm Ave se Min Max
## Super Learner 0.071223 0.0017570 0.068369 0.074229
## Discrete SL 0.071918 0.0017568 0.068830 0.074525
## SL.ranger_All 0.071918 0.0017568 0.068830 0.074525
## SL.ipredbagg_All 0.073623 0.0018591 0.070834 0.077592
## SL.bayesglm_All 0.096237 0.0024065 0.093952 0.103946
plot(v.model)
## Tuning Hyper Parameters.The model Performance is great but maybe we can try and Improve the performance by a bit.
#For this I will changing the nbagg parameter to 250 from the default of 25
SL.ipredbagg.tune <- function(...){
SL.ipredbagg(..., nbagg=250)
}
set.seed(150)
# Tuning the model
cv.model.tune <- CV.SuperLearner(y_lab,xtrain,V=5,SL.library=list("SL.ranger","SL.ksvm","SL.ipredbagg","SL.bayesglm","SL.ipredbagg.tune"))
# summary statistics
summary(cv.model.tune)
##
## Call:
## CV.SuperLearner(Y = y_lab, X = xtrain, V = 5, SL.library = list("SL.ranger",
## "SL.ksvm", "SL.ipredbagg", "SL.bayesglm", "SL.ipredbagg.tune"))
##
## Risk is based on: Mean Squared Error
##
## All risk estimates are based on V = 5
##
## Algorithm Ave se Min Max
## Super Learner 0.071308 0.0017802 0.066704 0.075322
## Discrete SL 0.071809 0.0017575 0.067673 0.075486
## SL.ranger_All 0.071809 0.0017575 0.067673 0.075486
## SL.ksvm_All 0.086555 0.0024075 0.082180 0.097047
## SL.ipredbagg_All 0.073679 0.0018641 0.068151 0.078030
## SL.bayesglm_All 0.095736 0.0023166 0.092588 0.104010
## SL.ipredbagg.tune_All 0.073738 0.0018640 0.068128 0.078421
plot(cv.model.tune)
# We can see very negligible improvement the tuned ipredbagg, But let's leave it alone and see if the SuperLearner finds it relevant
set.seed(155)
model.tune<-SuperLearner(y_lab,xtrain,SL.library=list("SL.ranger","SL.ksvm","SL.ipredbagg","SL.bayesglm","SL.ipredbagg.tune"))
model.tune
##
## Call:
## SuperLearner(Y = y_lab, X = xtrain, SL.library = list("SL.ranger", "SL.ksvm",
## "SL.ipredbagg", "SL.bayesglm", "SL.ipredbagg.tune"))
##
##
## Risk Coef
## SL.ranger_All 0.07138059 0.63700483
## SL.ksvm_All 0.08601019 0.05823424
## SL.ipredbagg_All 0.07354597 0.00000000
## SL.bayesglm_All 0.09579675 0.00000000
## SL.ipredbagg.tune_All 0.07348829 0.30476094
predictions.tune <- predict.SuperLearner(model.tune, newdata=as.data.frame(xtest))
conv.preds.tune <- ifelse(predictions.tune$pred>=0.5,1,0)
Accuracy(conv.preds.tune,ytest_lab)
## [1] 0.9132835
# We can see no improvement with the tuned model hence we can assume the first model to be adequate