DATA ACQUISTION

# Dataset ----> https://www.kaggle.com/roshansharma/online-shoppers-intention

shop<-read.csv("C:/Users/navee/OneDrive/Documents/DA 5030/online_shoppers_intention.csv")
summary(shop)
##  Administrative   Administrative_Duration Informational   
##  Min.   : 0.000   Min.   :  -1.00         Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.:   0.00         1st Qu.: 0.000  
##  Median : 1.000   Median :   8.00         Median : 0.000  
##  Mean   : 2.318   Mean   :  80.91         Mean   : 0.504  
##  3rd Qu.: 4.000   3rd Qu.:  93.50         3rd Qu.: 0.000  
##  Max.   :27.000   Max.   :3398.75         Max.   :24.000  
##  NA's   :14       NA's   :14              NA's   :14      
##  Informational_Duration ProductRelated   ProductRelated_Duration
##  Min.   :  -1.00        Min.   :  0.00   Min.   :   -1.0        
##  1st Qu.:   0.00        1st Qu.:  7.00   1st Qu.:  185.0        
##  Median :   0.00        Median : 18.00   Median :  599.8        
##  Mean   :  34.51        Mean   : 31.76   Mean   : 1196.0        
##  3rd Qu.:   0.00        3rd Qu.: 38.00   3rd Qu.: 1466.5        
##  Max.   :2549.38        Max.   :705.00   Max.   :63973.5        
##  NA's   :14             NA's   :14       NA's   :14             
##   BounceRates         ExitRates         PageValues        SpecialDay     
##  Min.   :0.000000   Min.   :0.00000   Min.   :  0.000   Min.   :0.00000  
##  1st Qu.:0.000000   1st Qu.:0.01429   1st Qu.:  0.000   1st Qu.:0.00000  
##  Median :0.003119   Median :0.02512   Median :  0.000   Median :0.00000  
##  Mean   :0.022152   Mean   :0.04300   Mean   :  5.889   Mean   :0.06143  
##  3rd Qu.:0.016684   3rd Qu.:0.05000   3rd Qu.:  0.000   3rd Qu.:0.00000  
##  Max.   :0.200000   Max.   :0.20000   Max.   :361.764   Max.   :1.00000  
##  NA's   :14         NA's   :14                                           
##      Month      OperatingSystems    Browser           Region     
##  May    :3364   Min.   :1.000    Min.   : 1.000   Min.   :1.000  
##  Nov    :2998   1st Qu.:2.000    1st Qu.: 2.000   1st Qu.:1.000  
##  Mar    :1907   Median :2.000    Median : 2.000   Median :3.000  
##  Dec    :1727   Mean   :2.124    Mean   : 2.357   Mean   :3.147  
##  Oct    : 549   3rd Qu.:3.000    3rd Qu.: 2.000   3rd Qu.:4.000  
##  Sep    : 448   Max.   :8.000    Max.   :13.000   Max.   :9.000  
##  (Other):1337                                                    
##   TrafficType               VisitorType     Weekend         Revenue       
##  Min.   : 1.00   New_Visitor      : 1694   Mode :logical   Mode :logical  
##  1st Qu.: 2.00   Other            :   85   FALSE:9462      FALSE:10422    
##  Median : 2.00   Returning_Visitor:10551   TRUE :2868      TRUE :1908     
##  Mean   : 4.07                                                            
##  3rd Qu.: 4.00                                                            
##  Max.   :20.00                                                            
## 
# Loading all the required packages

library(funModeling)
## Warning: package 'funModeling' was built under R version 3.5.3
## Loading required package: Hmisc
## Warning: package 'Hmisc' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## funModeling v.1.7 :)
## Examples and tutorials at livebook.datascienceheroes.com
library(GGally)
## Warning: package 'GGally' was built under R version 3.5.3
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:funModeling':
## 
##     range01
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble  2.0.1     v purrr   0.2.5
## v tidyr   0.8.2     v dplyr   0.8.3
## v readr   1.3.1     v stringr 1.3.1
## v tibble  2.0.1     v forcats 0.3.0
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'dplyr' was built under R version 3.5.3
## -- Conflicts ---------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter()    masks stats::filter()
## x dplyr::lag()       masks stats::lag()
## x dplyr::src()       masks Hmisc::src()
## x dplyr::summarize() masks Hmisc::summarize()
library(MLmetrics)
## Warning: package 'MLmetrics' was built under R version 3.5.3
## 
## Attaching package: 'MLmetrics'
## The following object is masked from 'package:base':
## 
##     Recall
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:MLmetrics':
## 
##     MAE, RMSE
## The following object is masked from 'package:purrr':
## 
##     lift
## The following object is masked from 'package:survival':
## 
##     cluster
library(C50)
## Warning: package 'C50' was built under R version 3.5.3
library(class)
## Warning: package 'class' was built under R version 3.5.3
library(e1071)
## Warning: package 'e1071' was built under R version 3.5.3
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:Hmisc':
## 
##     impute
library(SuperLearner)
## Warning: package 'SuperLearner' was built under R version 3.5.3
## Loading required package: nnls
## Super Learner
## Version: 2.0-24
## Package created on 2018-08-10
library(ranger)
## Warning: package 'ranger' was built under R version 3.5.3
library(kernlab)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:purrr':
## 
##     cross
## The following object is masked from 'package:ggplot2':
## 
##     alpha
library(ipred)
## Warning: package 'ipred' was built under R version 3.5.3
library(arm)
## Warning: package 'arm' was built under R version 3.5.3
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## Loading required package: lme4
## Warning: package 'lme4' was built under R version 3.5.3
## 
## arm (Version 1.10-1, built: 2018-4-12)
## Working directory is C:/Users/navee/OneDrive/Documents/DA 5030/assignments

HAVING A LOOK AT THE DATA

## MISSING VALUE TREATMENT

summary(shop)
##  Administrative   Administrative_Duration Informational   
##  Min.   : 0.000   Min.   :  -1.00         Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.:   0.00         1st Qu.: 0.000  
##  Median : 1.000   Median :   8.00         Median : 0.000  
##  Mean   : 2.318   Mean   :  80.91         Mean   : 0.504  
##  3rd Qu.: 4.000   3rd Qu.:  93.50         3rd Qu.: 0.000  
##  Max.   :27.000   Max.   :3398.75         Max.   :24.000  
##  NA's   :14       NA's   :14              NA's   :14      
##  Informational_Duration ProductRelated   ProductRelated_Duration
##  Min.   :  -1.00        Min.   :  0.00   Min.   :   -1.0        
##  1st Qu.:   0.00        1st Qu.:  7.00   1st Qu.:  185.0        
##  Median :   0.00        Median : 18.00   Median :  599.8        
##  Mean   :  34.51        Mean   : 31.76   Mean   : 1196.0        
##  3rd Qu.:   0.00        3rd Qu.: 38.00   3rd Qu.: 1466.5        
##  Max.   :2549.38        Max.   :705.00   Max.   :63973.5        
##  NA's   :14             NA's   :14       NA's   :14             
##   BounceRates         ExitRates         PageValues        SpecialDay     
##  Min.   :0.000000   Min.   :0.00000   Min.   :  0.000   Min.   :0.00000  
##  1st Qu.:0.000000   1st Qu.:0.01429   1st Qu.:  0.000   1st Qu.:0.00000  
##  Median :0.003119   Median :0.02512   Median :  0.000   Median :0.00000  
##  Mean   :0.022152   Mean   :0.04300   Mean   :  5.889   Mean   :0.06143  
##  3rd Qu.:0.016684   3rd Qu.:0.05000   3rd Qu.:  0.000   3rd Qu.:0.00000  
##  Max.   :0.200000   Max.   :0.20000   Max.   :361.764   Max.   :1.00000  
##  NA's   :14         NA's   :14                                           
##      Month      OperatingSystems    Browser           Region     
##  May    :3364   Min.   :1.000    Min.   : 1.000   Min.   :1.000  
##  Nov    :2998   1st Qu.:2.000    1st Qu.: 2.000   1st Qu.:1.000  
##  Mar    :1907   Median :2.000    Median : 2.000   Median :3.000  
##  Dec    :1727   Mean   :2.124    Mean   : 2.357   Mean   :3.147  
##  Oct    : 549   3rd Qu.:3.000    3rd Qu.: 2.000   3rd Qu.:4.000  
##  Sep    : 448   Max.   :8.000    Max.   :13.000   Max.   :9.000  
##  (Other):1337                                                    
##   TrafficType               VisitorType     Weekend         Revenue       
##  Min.   : 1.00   New_Visitor      : 1694   Mode :logical   Mode :logical  
##  1st Qu.: 2.00   Other            :   85   FALSE:9462      FALSE:10422    
##  Median : 2.00   Returning_Visitor:10551   TRUE :2868      TRUE :1908     
##  Mean   : 4.07                                                            
##  3rd Qu.: 4.00                                                            
##  Max.   :20.00                                                            
## 
# We can see that there are a few NA's present. 14 for each of the first 8 columns to be precise

sum(is.na(shop[1:8]))
## [1] 112
shop<-na.omit(shop) # Since there are only 112 NA's in the whole dataset we can exclude them

summary(shop)
##  Administrative   Administrative_Duration Informational   
##  Min.   : 0.000   Min.   :  -1.00         Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.:   0.00         1st Qu.: 0.000  
##  Median : 1.000   Median :   8.00         Median : 0.000  
##  Mean   : 2.318   Mean   :  80.91         Mean   : 0.504  
##  3rd Qu.: 4.000   3rd Qu.:  93.50         3rd Qu.: 0.000  
##  Max.   :27.000   Max.   :3398.75         Max.   :24.000  
##                                                           
##  Informational_Duration ProductRelated   ProductRelated_Duration
##  Min.   :  -1.00        Min.   :  0.00   Min.   :   -1.0        
##  1st Qu.:   0.00        1st Qu.:  7.00   1st Qu.:  185.0        
##  Median :   0.00        Median : 18.00   Median :  599.8        
##  Mean   :  34.51        Mean   : 31.76   Mean   : 1196.0        
##  3rd Qu.:   0.00        3rd Qu.: 38.00   3rd Qu.: 1466.5        
##  Max.   :2549.38        Max.   :705.00   Max.   :63973.5        
##                                                                 
##   BounceRates         ExitRates         PageValues        SpecialDay    
##  Min.   :0.000000   Min.   :0.00000   Min.   :  0.000   Min.   :0.0000  
##  1st Qu.:0.000000   1st Qu.:0.01429   1st Qu.:  0.000   1st Qu.:0.0000  
##  Median :0.003119   Median :0.02512   Median :  0.000   Median :0.0000  
##  Mean   :0.022152   Mean   :0.04300   Mean   :  5.896   Mean   :0.0615  
##  3rd Qu.:0.016684   3rd Qu.:0.05000   3rd Qu.:  0.000   3rd Qu.:0.0000  
##  Max.   :0.200000   Max.   :0.20000   Max.   :361.764   Max.   :1.0000  
##                                                                         
##      Month      OperatingSystems    Browser           Region     
##  May    :3363   Min.   :1.000    Min.   : 1.000   Min.   :1.000  
##  Nov    :2998   1st Qu.:2.000    1st Qu.: 2.000   1st Qu.:1.000  
##  Mar    :1894   Median :2.000    Median : 2.000   Median :3.000  
##  Dec    :1727   Mean   :2.124    Mean   : 2.358   Mean   :3.148  
##  Oct    : 549   3rd Qu.:3.000    3rd Qu.: 2.000   3rd Qu.:4.000  
##  Sep    : 448   Max.   :8.000    Max.   :13.000   Max.   :9.000  
##  (Other):1337                                                    
##   TrafficType               VisitorType     Weekend         Revenue       
##  Min.   : 1.00   New_Visitor      : 1694   Mode :logical   Mode :logical  
##  1st Qu.: 2.00   Other            :   85   FALSE:9451      FALSE:10408    
##  Median : 2.00   Returning_Visitor:10537   TRUE :2865      TRUE :1908     
##  Mean   : 4.07                                                            
##  3rd Qu.: 4.00                                                            
##  Max.   :20.00                                                            
## 

OUTLIER TREATMENT

# Creating a duplicate dataset at every possible step to preserve the original dataset

shop_out = shop

shop_out$Revenue<-as.factor(ifelse(shop_out$Revenue==TRUE,1,0))
summary(shop_out)
##  Administrative   Administrative_Duration Informational   
##  Min.   : 0.000   Min.   :  -1.00         Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.:   0.00         1st Qu.: 0.000  
##  Median : 1.000   Median :   8.00         Median : 0.000  
##  Mean   : 2.318   Mean   :  80.91         Mean   : 0.504  
##  3rd Qu.: 4.000   3rd Qu.:  93.50         3rd Qu.: 0.000  
##  Max.   :27.000   Max.   :3398.75         Max.   :24.000  
##                                                           
##  Informational_Duration ProductRelated   ProductRelated_Duration
##  Min.   :  -1.00        Min.   :  0.00   Min.   :   -1.0        
##  1st Qu.:   0.00        1st Qu.:  7.00   1st Qu.:  185.0        
##  Median :   0.00        Median : 18.00   Median :  599.8        
##  Mean   :  34.51        Mean   : 31.76   Mean   : 1196.0        
##  3rd Qu.:   0.00        3rd Qu.: 38.00   3rd Qu.: 1466.5        
##  Max.   :2549.38        Max.   :705.00   Max.   :63973.5        
##                                                                 
##   BounceRates         ExitRates         PageValues        SpecialDay    
##  Min.   :0.000000   Min.   :0.00000   Min.   :  0.000   Min.   :0.0000  
##  1st Qu.:0.000000   1st Qu.:0.01429   1st Qu.:  0.000   1st Qu.:0.0000  
##  Median :0.003119   Median :0.02512   Median :  0.000   Median :0.0000  
##  Mean   :0.022152   Mean   :0.04300   Mean   :  5.896   Mean   :0.0615  
##  3rd Qu.:0.016684   3rd Qu.:0.05000   3rd Qu.:  0.000   3rd Qu.:0.0000  
##  Max.   :0.200000   Max.   :0.20000   Max.   :361.764   Max.   :1.0000  
##                                                                         
##      Month      OperatingSystems    Browser           Region     
##  May    :3363   Min.   :1.000    Min.   : 1.000   Min.   :1.000  
##  Nov    :2998   1st Qu.:2.000    1st Qu.: 2.000   1st Qu.:1.000  
##  Mar    :1894   Median :2.000    Median : 2.000   Median :3.000  
##  Dec    :1727   Mean   :2.124    Mean   : 2.358   Mean   :3.148  
##  Oct    : 549   3rd Qu.:3.000    3rd Qu.: 2.000   3rd Qu.:4.000  
##  Sep    : 448   Max.   :8.000    Max.   :13.000   Max.   :9.000  
##  (Other):1337                                                    
##   TrafficType               VisitorType     Weekend        Revenue  
##  Min.   : 1.00   New_Visitor      : 1694   Mode :logical   0:10408  
##  1st Qu.: 2.00   Other            :   85   FALSE:9451      1: 1908  
##  Median : 2.00   Returning_Visitor:10537   TRUE :2865               
##  Mean   : 4.07                                                      
##  3rd Qu.: 4.00                                                      
##  Max.   :20.00                                                      
## 
#If the analysis to be conducted does contain a grouping variable, then data should be assessed for outliers separately within each group

shop_1 <- shop_out[ which(shop_out$Revenue==1),] #dataframe with revenue=1
shop_0 <- shop_out[ which(shop_out$Revenue==0),] #dataframe with revenue=0

# Calculating Mahalanobis distances to detect outliers in the data, Mahalanobis distance can only be used for numeric Data

MD<- mahalanobis(shop_1[1:9],colMeans(shop_1[1:9]),cov(shop_1[1:9]))
shop_1$MD<- round(MD,2)

# creating a function to calcuate a threshold for outliers

bench<- function(x){
  y<- summary(x)[5] + 1.5*IQR(x)
  return(y)
}


shop_1$out<- ifelse(shop_1$MD > bench(shop_1$MD),1,0)
table(shop_1$out)
## 
##    0    1 
## 1708  200
shop_1 <- shop_1[ which(shop_1$out==0),]

# Applying the same process for dataframe shop_0

MD0<- mahalanobis(shop_0[1:9],colMeans(shop_0[1:9]),cov(shop_0[1:9]))
shop_0$MD0<- MD0
shop_0$out<- ifelse(shop_0$MD0 > bench(shop_0$MD0),1,0)
table(shop_0$out)
## 
##    0    1 
## 9630  778
shop_0<- shop_0[ which(shop_0$out==0),]

shop_0$MD0<-NULL
shop_0$out <-NULL

head(shop_0)
##   Administrative Administrative_Duration Informational
## 1              0                       0             0
## 2              0                       0             0
## 3              0                      -1             0
## 4              0                       0             0
## 5              0                       0             0
## 6              0                       0             0
##   Informational_Duration ProductRelated ProductRelated_Duration
## 1                      0              1                0.000000
## 2                      0              2               64.000000
## 3                     -1              1               -1.000000
## 4                      0              2                2.666667
## 5                      0             10              627.500000
## 6                      0             19              154.216667
##   BounceRates ExitRates PageValues SpecialDay Month OperatingSystems
## 1  0.20000000 0.2000000          0          0   Feb                1
## 2  0.00000000 0.1000000          0          0   Feb                2
## 3  0.20000000 0.2000000          0          0   Feb                4
## 4  0.05000000 0.1400000          0          0   Feb                3
## 5  0.02000000 0.0500000          0          0   Feb                3
## 6  0.01578947 0.0245614          0          0   Feb                2
##   Browser Region TrafficType       VisitorType Weekend Revenue
## 1       1      1           1 Returning_Visitor   FALSE       0
## 2       2      1           2 Returning_Visitor   FALSE       0
## 3       1      9           3 Returning_Visitor   FALSE       0
## 4       2      2           4 Returning_Visitor   FALSE       0
## 5       3      1           4 Returning_Visitor    TRUE       0
## 6       2      1           3 Returning_Visitor   FALSE       0
shop_1$MD<-NULL
shop_1$out <-NULL


shop_out<- rbind(shop_1,shop_0)

shop_out <- shop_out[sample(nrow(shop_out)),] #Randomizing the data based on rows, inorder to avoid improper split in test and train datasets 

shop_out<-as.data.frame(shop_out)

# I SHALL NOT BE USING THE DATASET "shop_out"(outlier treated dataset) FOR ANALYSIS/BUILDING MODELS AS THE DATA IS ASSUMED TO BE WITHOUT OUTLIERS IN A STUDY CONDUCTED/PAPER,
# WHICH I HAVE REFFERED - Sakar, C.O., Polat, S.O., Katircioglu, M. et al. Neural Comput & Applic (2018)

DATA EXPLORATION & VISUALIZATION

library(funModeling) #Package gives us colourful frequency plots  

shop_num <- shop[1:10] 

plot_num(shop_num) # Histogram of all the continuous variables

shop_cat <- shop[11:18]

shop_cat$OperatingSystems<- as.factor(shop_cat$OperatingSystems)
shop_cat$Browser<- as.factor(shop_cat$Browser)
shop_cat$Region<- as.factor(shop_cat$Region)
shop_cat$TrafficType<- as.factor(shop_cat$TrafficType)
shop_cat$Weekend<- as.factor(shop$Weekend)
shop_cat$Revenue<- as.factor(shop_cat$Revenue)

print(freq(shop_cat)) # Frequency & frequency plots of catergorical variables

##    Month frequency percentage cumulative_perc
## 1    May      3363      27.31           27.31
## 2    Nov      2998      24.34           51.65
## 3    Mar      1894      15.38           67.03
## 4    Dec      1727      14.02           81.05
## 5    Oct       549       4.46           85.51
## 6    Sep       448       3.64           89.15
## 7    Aug       433       3.52           92.67
## 8    Jul       432       3.51           96.18
## 9   June       288       2.34           98.52
## 10   Feb       184       1.49          100.00

##   OperatingSystems frequency percentage cumulative_perc
## 1                2      6593      53.53           53.53
## 2                1      2582      20.96           74.49
## 3                3      2552      20.72           95.21
## 4                4       478       3.88           99.09
## 5                8        79       0.64           99.73
## 6                6        19       0.15           99.88
## 7                7         7       0.06           99.94
## 8                5         6       0.05          100.00

##    Browser frequency percentage cumulative_perc
## 1        2      7951      64.56           64.56
## 2        1      2459      19.97           84.53
## 3        4       735       5.97           90.50
## 4        5       467       3.79           94.29
## 5        6       174       1.41           95.70
## 6       10       163       1.32           97.02
## 7        8       135       1.10           98.12
## 8        3       105       0.85           98.97
## 9       13        61       0.50           99.47
## 10       7        49       0.40           99.87
## 11      12        10       0.08           99.95
## 12      11         6       0.05          100.00
## 13       9         1       0.01          100.00

##   Region frequency percentage cumulative_perc
## 1      1      4774      38.76           38.76
## 2      3      2402      19.50           58.26
## 3      4      1179       9.57           67.83
## 4      2      1134       9.21           77.04
## 5      6       804       6.53           83.57
## 6      7       761       6.18           89.75
## 7      9       511       4.15           93.90
## 8      8       434       3.52           97.42
## 9      5       317       2.57          100.00

##    TrafficType frequency percentage cumulative_perc
## 1            2      3909      31.74           31.74
## 2            1      2444      19.84           51.58
## 3            3      2051      16.65           68.23
## 4            4      1069       8.68           76.91
## 5           13       737       5.98           82.89
## 6           10       450       3.65           86.54
## 7            6       444       3.61           90.15
## 8            8       343       2.78           92.93
## 9            5       260       2.11           95.04
## 10          11       247       2.01           97.05
## 11          20       198       1.61           98.66
## 12           9        42       0.34           99.00
## 13           7        40       0.32           99.32
## 14          15        37       0.30           99.62
## 15          19        17       0.14           99.76
## 16          14        13       0.11           99.87
## 17          18        10       0.08           99.95
## 18          16         3       0.02           99.97
## 19          12         1       0.01           99.98
## 20          17         1       0.01          100.00

##         VisitorType frequency percentage cumulative_perc
## 1 Returning_Visitor     10537      85.56           85.56
## 2       New_Visitor      1694      13.75           99.31
## 3             Other        85       0.69          100.00

##   Weekend frequency percentage cumulative_perc
## 1   FALSE      9451      76.74           76.74
## 2    TRUE      2865      23.26          100.00

##   Revenue frequency percentage cumulative_perc
## 1   FALSE     10408      84.51           84.51
## 2    TRUE      1908      15.49          100.00
## 
## [1] "Variables processed: Month, OperatingSystems, Browser, Region, TrafficType, VisitorType, Weekend, Revenue"
dat<- shop

dat$Month<- as.numeric(dat$Month)
dat$VisitorType<- as.numeric(dat$VisitorType)

M<-cor(dat)
ggcorr(M)

plot(shop_out$BounceRates,shop_out$Revenue,xlab = "BounceRates",ylab="Revenue") # We can notice when the bounce rate is close to zero, The customer is more likely to develop revnue.

#Bounce rate is an Internet marketing term used in web traffic analysis. It represents the percentage of visitors who enter the site and then leave ("bounce") rather than continuing to view other pages within the same site.

plot(shop_out$ExitRates,shop_out$Revenue,xlab = "ExitRates",ylab = "Revenue")

dat1<- shop_out %>% filter(shop_out$Revenue==1)
ggplot(dat1, aes(dat1$Month, ..count..)) + geom_bar(aes(fill = dat1$Revenue), position = "dodge") + labs(x="Month",y="Revenue")

MODELLING THE DATA

  1. DECISION TREE MODEL
set.seed(100)

shop1=shop   

shop1$Revenue<- as.factor(shop1$Revenue)
shop1$Weekend<- as.factor(shop1$Weekend)

index <- createDataPartition(shop1$Revenue, p=0.75, list=FALSE)
train <-shop1[ index,]
test <- shop1[-index,]

# C4.5 Decision trees

library(RWeka)
## Warning: package 'RWeka' was built under R version 3.5.3
fit<-J48(Revenue ~.,data=train)
summary(fit)
## 
## === Summary ===
## 
## Correctly Classified Instances        8709               94.2839 %
## Incorrectly Classified Instances       528                5.7161 %
## Kappa statistic                          0.7611
## Mean absolute error                      0.0987
## Root mean squared error                  0.2222
## Relative absolute error                 37.6929 %
## Root relative squared error             61.4006 %
## Total Number of Instances             9237     
## 
## === Confusion Matrix ===
## 
##     a    b   <-- classified as
##  7693  113 |    a = FALSE
##   415 1016 |    b = TRUE
p_tree<-predict(fit,test[,1:17])
confusionMatrix(p_tree,test$Revenue)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE  2476  205
##      TRUE    126  272
##                                          
##                Accuracy : 0.8925         
##                  95% CI : (0.881, 0.9032)
##     No Information Rate : 0.8451         
##     P-Value [Acc > NIR] : 1.577e-14      
##                                          
##                   Kappa : 0.5597         
##                                          
##  Mcnemar's Test P-Value : 1.809e-05      
##                                          
##             Sensitivity : 0.9516         
##             Specificity : 0.5702         
##          Pos Pred Value : 0.9235         
##          Neg Pred Value : 0.6834         
##              Prevalence : 0.8451         
##          Detection Rate : 0.8042         
##    Detection Prevalence : 0.8707         
##       Balanced Accuracy : 0.7609         
##                                          
##        'Positive' Class : FALSE          
## 
a1<-Accuracy(p_tree,test$Revenue)
a1
## [1] 0.8924976
# C5.0 Boosted trees

dtree<-C5.0(train,train$Revenue)
plot(dtree)

p_dtree<-predict(dtree,test)

confusionMatrix(table(p_dtree,test$Revenue))
## Confusion Matrix and Statistics
## 
##        
## p_dtree FALSE TRUE
##   FALSE  2602    0
##   TRUE      0  477
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9988, 1)
##     No Information Rate : 0.8451     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.8451     
##          Detection Rate : 0.8451     
##    Detection Prevalence : 0.8451     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : FALSE      
## 
Accuracy(p_dtree,test$Revenue)
## [1] 1
a2<-Accuracy(p_dtree,test$Revenue)
a2
## [1] 1
# C5.0 Decision tree produces 100% accuracy
  1. K - NEAREST NEIGHBOUR (KNN)
set.seed(101)

shop2=shop
summary(shop2)
##  Administrative   Administrative_Duration Informational   
##  Min.   : 0.000   Min.   :  -1.00         Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.:   0.00         1st Qu.: 0.000  
##  Median : 1.000   Median :   8.00         Median : 0.000  
##  Mean   : 2.318   Mean   :  80.91         Mean   : 0.504  
##  3rd Qu.: 4.000   3rd Qu.:  93.50         3rd Qu.: 0.000  
##  Max.   :27.000   Max.   :3398.75         Max.   :24.000  
##                                                           
##  Informational_Duration ProductRelated   ProductRelated_Duration
##  Min.   :  -1.00        Min.   :  0.00   Min.   :   -1.0        
##  1st Qu.:   0.00        1st Qu.:  7.00   1st Qu.:  185.0        
##  Median :   0.00        Median : 18.00   Median :  599.8        
##  Mean   :  34.51        Mean   : 31.76   Mean   : 1196.0        
##  3rd Qu.:   0.00        3rd Qu.: 38.00   3rd Qu.: 1466.5        
##  Max.   :2549.38        Max.   :705.00   Max.   :63973.5        
##                                                                 
##   BounceRates         ExitRates         PageValues        SpecialDay    
##  Min.   :0.000000   Min.   :0.00000   Min.   :  0.000   Min.   :0.0000  
##  1st Qu.:0.000000   1st Qu.:0.01429   1st Qu.:  0.000   1st Qu.:0.0000  
##  Median :0.003119   Median :0.02512   Median :  0.000   Median :0.0000  
##  Mean   :0.022152   Mean   :0.04300   Mean   :  5.896   Mean   :0.0615  
##  3rd Qu.:0.016684   3rd Qu.:0.05000   3rd Qu.:  0.000   3rd Qu.:0.0000  
##  Max.   :0.200000   Max.   :0.20000   Max.   :361.764   Max.   :1.0000  
##                                                                         
##      Month      OperatingSystems    Browser           Region     
##  May    :3363   Min.   :1.000    Min.   : 1.000   Min.   :1.000  
##  Nov    :2998   1st Qu.:2.000    1st Qu.: 2.000   1st Qu.:1.000  
##  Mar    :1894   Median :2.000    Median : 2.000   Median :3.000  
##  Dec    :1727   Mean   :2.124    Mean   : 2.358   Mean   :3.148  
##  Oct    : 549   3rd Qu.:3.000    3rd Qu.: 2.000   3rd Qu.:4.000  
##  Sep    : 448   Max.   :8.000    Max.   :13.000   Max.   :9.000  
##  (Other):1337                                                    
##   TrafficType               VisitorType     Weekend         Revenue       
##  Min.   : 1.00   New_Visitor      : 1694   Mode :logical   Mode :logical  
##  1st Qu.: 2.00   Other            :   85   FALSE:9451      FALSE:10408    
##  Median : 2.00   Returning_Visitor:10537   TRUE :2865      TRUE :1908     
##  Mean   : 4.07                                                            
##  3rd Qu.: 4.00                                                            
##  Max.   :20.00                                                            
## 
str(shop2)
## 'data.frame':    12316 obs. of  18 variables:
##  $ Administrative         : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ Administrative_Duration: num  0 0 -1 0 0 0 -1 -1 0 0 ...
##  $ Informational          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Informational_Duration : num  0 0 -1 0 0 0 -1 -1 0 0 ...
##  $ ProductRelated         : int  1 2 1 2 10 19 1 1 2 3 ...
##  $ ProductRelated_Duration: num  0 64 -1 2.67 627.5 ...
##  $ BounceRates            : num  0.2 0 0.2 0.05 0.02 ...
##  $ ExitRates              : num  0.2 0.1 0.2 0.14 0.05 ...
##  $ PageValues             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SpecialDay             : num  0 0 0 0 0 0 0.4 0 0.8 0.4 ...
##  $ Month                  : Factor w/ 10 levels "Aug","Dec","Feb",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ OperatingSystems       : int  1 2 4 3 3 2 2 1 2 2 ...
##  $ Browser                : int  1 2 1 2 3 2 4 2 2 4 ...
##  $ Region                 : int  1 1 9 2 1 1 3 1 2 1 ...
##  $ TrafficType            : int  1 2 3 4 4 3 3 5 3 2 ...
##  $ VisitorType            : Factor w/ 3 levels "New_Visitor",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Weekend                : logi  FALSE FALSE FALSE FALSE TRUE FALSE ...
##  $ Revenue                : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  - attr(*, "na.action")= 'omit' Named int  1066 1133 1134 1135 1136 1137 1474 1475 1476 1477 ...
##   ..- attr(*, "names")= chr  "1066" "1133" "1134" "1135" ...
shop2$Month=as.numeric(shop1$Month)
shop2$VisitorType=as.numeric(shop1$VisitorType)
shop2$Weekend=as.numeric(shop1$Weekend)
shop2$Revenue=as.numeric(shop1$Revenue)

index1 <- createDataPartition(shop1$Revenue, p=0.75, list=FALSE)
train1 <-shop2[ index1,]
test1 <- shop2[-index1,]

train1_lab<-train1$Revenue
test1_lab<-test1$Revenue

normalize <- function(x) {                    # Since KNN is a distance Based algorithm Normalization of feature values is a must.
 return ((x - min(x)) / (max(x) - min(x)))
 }
  
shop2[1:6]<-lapply(shop2[1:6],normalize)

knn_pred<-knn(train1,test1,train1_lab,k=111)

confusionMatrix(table(knn_pred,test1$Revenue))
## Confusion Matrix and Statistics
## 
##         
## knn_pred    1    2
##        1 2602  475
##        2    0    2
##                                           
##                Accuracy : 0.8457          
##                  95% CI : (0.8325, 0.8583)
##     No Information Rate : 0.8451          
##     P-Value [Acc > NIR] : 0.4725          
##                                           
##                   Kappa : 0.0071          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 1.000000        
##             Specificity : 0.004193        
##          Pos Pred Value : 0.845629        
##          Neg Pred Value : 1.000000        
##              Prevalence : 0.845080        
##          Detection Rate : 0.845080        
##    Detection Prevalence : 0.999350        
##       Balanced Accuracy : 0.502096        
##                                           
##        'Positive' Class : 1               
## 
a3<-Accuracy(knn_pred,test1$Revenue)
a3
## [1] 0.8457291
  1. LOGISTIC REGRESSION
set.seed(102)

shop3=shop

index2 <- createDataPartition(shop3$Revenue, p=0.75, list=FALSE)
train2 <-shop3[ index2,]
test2<- shop3[-index2,]

lg_mod<-glm(Revenue ~. ,data=train2,family = binomial(link = "logit"))

anova(lg_mod,test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Revenue
## 
## Terms added sequentially (first to last)
## 
## 
##                         Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                     9236     7965.1              
## Administrative           1   145.80      9235     7819.3 < 2.2e-16 ***
## Administrative_Duration  1     0.60      9234     7818.7 0.4370413    
## Informational            1    12.54      9233     7806.1 0.0003981 ***
## Informational_Duration   1     0.10      9232     7806.0 0.7550210    
## ProductRelated           1    72.08      9231     7733.9 < 2.2e-16 ***
## ProductRelated_Duration  1     5.65      9230     7728.3 0.0174933 *  
## BounceRates              1   279.88      9229     7448.4 < 2.2e-16 ***
## ExitRates                1   218.53      9228     7229.9 < 2.2e-16 ***
## PageValues               1  1575.70      9227     5654.2 < 2.2e-16 ***
## SpecialDay               1    25.27      9226     5628.9 4.991e-07 ***
## Month                    9   205.28      9217     5423.6 < 2.2e-16 ***
## OperatingSystems         1     4.50      9216     5419.1 0.0339777 *  
## Browser                  1     1.07      9215     5418.1 0.3001190    
## Region                   1     0.46      9214     5417.6 0.4964312    
## TrafficType              1     0.04      9213     5417.6 0.8345991    
## VisitorType              2    10.07      9211     5407.5 0.0064975 ** 
## Weekend                  1     1.91      9210     5405.6 0.1666200    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lg_mod<-glm(Revenue~ Administrative + Informational + ProductRelated + BounceRates + ExitRates + PageValues + SpecialDay +Month + VisitorType,data=train2,family = binomial(link = "logit"))

#Retreiving only the Influential Variables after removing variables based on P-value

anova(lg_mod,test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Revenue
## 
## Terms added sequentially (first to last)
## 
## 
##                Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                            9236     7965.1              
## Administrative  1   145.80      9235     7819.3 < 2.2e-16 ***
## Informational   1    13.04      9234     7806.2 0.0003044 ***
## ProductRelated  1    72.15      9233     7734.1 < 2.2e-16 ***
## BounceRates     1   279.66      9232     7454.4 < 2.2e-16 ***
## ExitRates       1   211.80      9231     7242.6 < 2.2e-16 ***
## PageValues      1  1582.34      9230     5660.3 < 2.2e-16 ***
## SpecialDay      1    25.73      9229     5634.5 3.925e-07 ***
## Month           9   204.51      9220     5430.0 < 2.2e-16 ***
## VisitorType     2    11.00      9218     5419.0 0.0040793 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lg_pred <- predict(lg_mod,newdata = test2,type = "response")

lg_pred=as.numeric(lg_pred)

lg_pred=as.factor(round(lg_pred,0))
                  
test2$Revenue=as.numeric(test2$Revenue)
confusionMatrix(table(lg_pred,test2$Revenue))
## Confusion Matrix and Statistics
## 
##        
## lg_pred    0    1
##       0 2539  297
##       1   63  180
##                                           
##                Accuracy : 0.8831          
##                  95% CI : (0.8712, 0.8942)
##     No Information Rate : 0.8451          
##     P-Value [Acc > NIR] : 9.048e-10       
##                                           
##                   Kappa : 0.4416          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9758          
##             Specificity : 0.3774          
##          Pos Pred Value : 0.8953          
##          Neg Pred Value : 0.7407          
##              Prevalence : 0.8451          
##          Detection Rate : 0.8246          
##    Detection Prevalence : 0.9211          
##       Balanced Accuracy : 0.6766          
##                                           
##        'Positive' Class : 0               
## 
a4<- Accuracy(lg_pred,test2$Revenue)
a4
## [1] 0.8830789

4.NAIVE BAYES

set.seed(104)

x=train
y=train$Revenue

model = naiveBayes(x,y)
p<- predict(model,test,type="class")

confusionMatrix(p,test$Revenue)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE  2501   10
##      TRUE    101  467
##                                           
##                Accuracy : 0.9639          
##                  95% CI : (0.9567, 0.9703)
##     No Information Rate : 0.8451          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8723          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9612          
##             Specificity : 0.9790          
##          Pos Pred Value : 0.9960          
##          Neg Pred Value : 0.8222          
##              Prevalence : 0.8451          
##          Detection Rate : 0.8123          
##    Detection Prevalence : 0.8155          
##       Balanced Accuracy : 0.9701          
##                                           
##        'Positive' Class : FALSE           
## 
a5<- Accuracy(p,test$Revenue)
a5
## [1] 0.9639493

COMPARING MODEL ACCURACIES

X<-c("C4.5-Tree","C5.0-Tree","KNN","Logistic","Naive-Bayes")
Y<-round(c(a1,a2,a3,a4,a5),2)

X_name <- "model"
Y_name <- "accuracy"

df <- data.frame(X,Y)
names(df) <- c(X_name,Y_name)


ggplot(df,aes(x=model,y=accuracy,fill=model))+geom_bar(stat = "identity") + geom_text(aes(label=accuracy),position=position_dodge(width=0.9), vjust=-0.25)

ENSEMBLE MODEL

## Reference - https://www.datacamp.com/community/tutorials/ensemble-r-machine-learning

set.seed(667)

shop3=shop

shop3$Revenue<- as.numeric(shop3$Revenue)
shop3$Weekend<-as.factor(shop3$Weekend)

index3 <- createDataPartition(shop3$Revenue, p=0.75, list=FALSE)
train3 <-shop3[ index3,]
test3 <- shop3[-index3,]

xtrain <- data.frame(train3[,1:17])
xtest <- data.frame(test3[,1:17])

y_lab=(train3[,18])
ytest_lab=as.numeric((test3[,18]))


## With random forest, Logistic regression and Bagging 

# Fit the ensemble model
model <- SuperLearner(y_lab,xtrain,family=binomial(),SL.library=list("SL.ranger","SL.ipredbagg","SL.bayesglm"))

# Bayes GLM model is simply an implementation of logistic regression. At least in this case, where we are classifying a 0-1 problem, Ranger for random Forest, ipredbagg for Bagging
print(model)
## 
## Call:  
## SuperLearner(Y = y_lab, X = xtrain, family = binomial(), SL.library = list("SL.ranger",  
##     "SL.ipredbagg", "SL.bayesglm")) 
## 
## 
##                        Risk      Coef
## SL.ranger_All    0.07178292 0.6695822
## SL.ipredbagg_All 0.07468293 0.2935592
## SL.bayesglm_All  0.08678850 0.0368586
predictions <- predict.SuperLearner(model, newdata=as.data.frame(xtest))

head(predictions$library.predict) # Predictions of Individual libraries
##      SL.ranger_All SL.ipredbagg_All SL.bayesglm_All
## [1,]       0.00000       0.04265707     0.001067033
## [2,]       0.00600       0.04265707     0.030741309
## [3,]       0.00000       0.04265707     0.021526787
## [4,]       0.00016       0.04265707     0.020389081
## [5,]       0.02200       0.04265707     0.016927636
## [6,]       0.00000       0.04265707     0.010549686
 #Recoding the Probabilities
conv.preds <- ifelse(predictions$pred>=0.5,1,0)

cm <- Accuracy(conv.preds,ytest_lab)
cm
## [1] 0.9142579
#To understand each model's specific contribution to the model and the variation, we can use SuperLearner's internal V-fold cross-validation function --> CV.SuperLearner()

v.model <- CV.SuperLearner(y_lab,xtrain,V=5,SL.library=list("SL.ranger","SL.ipredbagg","SL.bayesglm"))
summary(v.model)
## 
## Call:  
## CV.SuperLearner(Y = y_lab, X = xtrain, V = 5, SL.library = list("SL.ranger",  
##     "SL.ipredbagg", "SL.bayesglm")) 
## 
## Risk is based on: Mean Squared Error
## 
## All risk estimates are based on V =  5 
## 
##         Algorithm      Ave        se      Min      Max
##     Super Learner 0.071223 0.0017570 0.068369 0.074229
##       Discrete SL 0.071918 0.0017568 0.068830 0.074525
##     SL.ranger_All 0.071918 0.0017568 0.068830 0.074525
##  SL.ipredbagg_All 0.073623 0.0018591 0.070834 0.077592
##   SL.bayesglm_All 0.096237 0.0024065 0.093952 0.103946
plot(v.model)

## Tuning Hyper Parameters.The model Performance is great but maybe we can try and Improve the performance by a bit. 
#For this I will changing the nbagg parameter to 250 from the default of 25

SL.ipredbagg.tune <- function(...){
      SL.ipredbagg(..., nbagg=250)
    }

set.seed(150)

# Tuning the model
cv.model.tune <- CV.SuperLearner(y_lab,xtrain,V=5,SL.library=list("SL.ranger","SL.ksvm","SL.ipredbagg","SL.bayesglm","SL.ipredbagg.tune"))

# summary statistics
summary(cv.model.tune)
## 
## Call:  
## CV.SuperLearner(Y = y_lab, X = xtrain, V = 5, SL.library = list("SL.ranger",  
##     "SL.ksvm", "SL.ipredbagg", "SL.bayesglm", "SL.ipredbagg.tune")) 
## 
## Risk is based on: Mean Squared Error
## 
## All risk estimates are based on V =  5 
## 
##              Algorithm      Ave        se      Min      Max
##          Super Learner 0.071308 0.0017802 0.066704 0.075322
##            Discrete SL 0.071809 0.0017575 0.067673 0.075486
##          SL.ranger_All 0.071809 0.0017575 0.067673 0.075486
##            SL.ksvm_All 0.086555 0.0024075 0.082180 0.097047
##       SL.ipredbagg_All 0.073679 0.0018641 0.068151 0.078030
##        SL.bayesglm_All 0.095736 0.0023166 0.092588 0.104010
##  SL.ipredbagg.tune_All 0.073738 0.0018640 0.068128 0.078421
plot(cv.model.tune)

# We can see very negligible improvement the tuned ipredbagg, But let's leave it alone and see if the SuperLearner finds it relevant

set.seed(155)

model.tune<-SuperLearner(y_lab,xtrain,SL.library=list("SL.ranger","SL.ksvm","SL.ipredbagg","SL.bayesglm","SL.ipredbagg.tune"))
model.tune
## 
## Call:  
## SuperLearner(Y = y_lab, X = xtrain, SL.library = list("SL.ranger", "SL.ksvm",  
##     "SL.ipredbagg", "SL.bayesglm", "SL.ipredbagg.tune")) 
## 
## 
##                             Risk       Coef
## SL.ranger_All         0.07138059 0.63700483
## SL.ksvm_All           0.08601019 0.05823424
## SL.ipredbagg_All      0.07354597 0.00000000
## SL.bayesglm_All       0.09579675 0.00000000
## SL.ipredbagg.tune_All 0.07348829 0.30476094
predictions.tune <- predict.SuperLearner(model.tune, newdata=as.data.frame(xtest))

conv.preds.tune <- ifelse(predictions.tune$pred>=0.5,1,0)
                          
Accuracy(conv.preds.tune,ytest_lab)
## [1] 0.9132835
# We can see no improvement with the tuned model hence we can assume the first model to be adequate