Problem Definition

Kira Plastinina is a Russian brand that was sold through a now-defunct chain of retail stores in Russia, Ukraine, Kazakhstan, Belarus, China, the Philippines, and Armenia. The brand's Sales and Marketing team would like to understand their customers' behaviour using data collected over the past year; more specifically, they would like to learn the characteristics of distinct customer groups.

Defining the Metric of Success

The project will be considered successful if unsupervised learning algorithms applied to the given data yield distinct, interpretable customer groups and useful insights into customer behaviour.

Data Relevance

The dataset can be retrieved from http://bit.ly/EcommerceCustomersDataset

The dataset consists of 18 session-level variables: the numbers of Administrative, Informational and Product Related pages visited and the time spent on each page type; the Bounce Rates, Exit Rates and Page Values metrics; a Special Day indicator; the categorical attributes Month, Operating System, Browser, Region, Traffic Type, Visitor Type and Weekend; and the Revenue label indicating whether the session generated revenue.

Loading Libraries

library(caret)
## Warning: package 'caret' was built under R version 4.1.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.1.3
## Loading required package: lattice
library(Amelia)
## Warning: package 'Amelia' was built under R version 4.1.3
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 4.1.3
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.0, built: 2021-05-26)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(e1071)
## Warning: package 'e1071' was built under R version 4.1.3
library(factoextra, warn.conflicts = FALSE)
## Warning: package 'factoextra' was built under R version 4.1.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggplot2)
library(CatEncoders, warn.conflicts = FALSE)
## Warning: package 'CatEncoders' was built under R version 4.1.3
library(cluster)
library(purrr)
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:caret':
## 
##     lift

Loading the Data

df <- read.csv("http://bit.ly/EcommerceCustomersDataset")

Preview of the Dataset

head(df)

Data Cleaning

Completeness

# checking for missing values
colSums(is.na(df))
##          Administrative Administrative_Duration           Informational 
##                      14                      14                      14 
##  Informational_Duration          ProductRelated ProductRelated_Duration 
##                      14                      14                      14 
##             BounceRates               ExitRates              PageValues 
##                      14                      14                       0 
##              SpecialDay                   Month        OperatingSystems 
##                       0                       0                       0 
##                 Browser                  Region             TrafficType 
##                       0                       0                       0 
##             VisitorType                 Weekend                 Revenue 
##                       0                       0                       0
# dropping null values
df <- na.omit(df)
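
Before dropping rows, the pattern of missingness could also be inspected visually with missmap() from the already-loaded Amelia package; a minimal sketch (to be run on df before the na.omit() call above):

# optional sketch: visualise which rows and columns contain NAs
# (run on the raw data frame, i.e. before na.omit above)
missmap(df, main = "Missingness Map")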

Consistency

# removing duplicate rows
df <- df[!duplicated(df), ]

Accuracy

# casting categorical columns as factors
df$Month <- factor(df$Month)
df$OperatingSystems <- factor(df$OperatingSystems)
df$Browser <- factor(df$Browser)
df$Region <- factor(df$Region)
df$TrafficType <- factor(df$TrafficType)
df$VisitorType <- factor(df$VisitorType)
df$Weekend <- factor(df$Weekend)
df$Revenue <- factor(df$Revenue)
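
The same conversion can be written more compactly; a small sketch covering the same columns as above:

# compact alternative: cast all categorical columns in one pass
cat_cols <- c("Month", "OperatingSystems", "Browser", "Region",
              "TrafficType", "VisitorType", "Weekend", "Revenue")
df[cat_cols] <- lapply(df[cat_cols], factor)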

Dataset structure

str(df)
## 'data.frame':    12199 obs. of  18 variables:
##  $ Administrative         : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ Administrative_Duration: num  0 0 -1 0 0 0 -1 -1 0 0 ...
##  $ Informational          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Informational_Duration : num  0 0 -1 0 0 0 -1 -1 0 0 ...
##  $ ProductRelated         : int  1 2 1 2 10 19 1 1 2 3 ...
##  $ ProductRelated_Duration: num  0 64 -1 2.67 627.5 ...
##  $ BounceRates            : num  0.2 0 0.2 0.05 0.02 ...
##  $ ExitRates              : num  0.2 0.1 0.2 0.14 0.05 ...
##  $ PageValues             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SpecialDay             : num  0 0 0 0 0 0 0.4 0 0.8 0.4 ...
##  $ Month                  : Factor w/ 10 levels "Aug","Dec","Feb",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ OperatingSystems       : Factor w/ 8 levels "1","2","3","4",..: 1 2 4 3 3 2 2 1 2 2 ...
##  $ Browser                : Factor w/ 13 levels "1","2","3","4",..: 1 2 1 2 3 2 4 2 2 4 ...
##  $ Region                 : Factor w/ 9 levels "1","2","3","4",..: 1 1 9 2 1 1 3 1 2 1 ...
##  $ TrafficType            : Factor w/ 20 levels "1","2","3","4",..: 1 2 3 4 4 3 3 5 3 2 ...
##  $ VisitorType            : Factor w/ 3 levels "New_Visitor",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Weekend                : Factor w/ 2 levels "FALSE","TRUE": 1 1 1 1 2 1 1 2 1 1 ...
##  $ Revenue                : Factor w/ 2 levels "FALSE","TRUE": 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "na.action")= 'omit' Named int [1:14] 1066 1133 1134 1135 1136 1137 1474 1475 1476 1477 ...
##   ..- attr(*, "names")= chr [1:14] "1066" "1133" "1134" "1135" ...
# numerical summary of variables
summary(df)
##  Administrative  Administrative_Duration Informational    
##  Min.   : 0.00   Min.   :  -1.00         Min.   : 0.0000  
##  1st Qu.: 0.00   1st Qu.:   0.00         1st Qu.: 0.0000  
##  Median : 1.00   Median :   9.00         Median : 0.0000  
##  Mean   : 2.34   Mean   :  81.68         Mean   : 0.5088  
##  3rd Qu.: 4.00   3rd Qu.:  94.75         3rd Qu.: 0.0000  
##  Max.   :27.00   Max.   :3398.75         Max.   :24.0000  
##                                                           
##  Informational_Duration ProductRelated   ProductRelated_Duration
##  Min.   :  -1.00        Min.   :  0.00   Min.   :   -1.0        
##  1st Qu.:   0.00        1st Qu.:  8.00   1st Qu.:  193.6        
##  Median :   0.00        Median : 18.00   Median :  609.5        
##  Mean   :  34.84        Mean   : 32.06   Mean   : 1207.5        
##  3rd Qu.:   0.00        3rd Qu.: 38.00   3rd Qu.: 1477.6        
##  Max.   :2549.38        Max.   :705.00   Max.   :63973.5        
##                                                                 
##   BounceRates        ExitRates         PageValues        SpecialDay     
##  Min.   :0.00000   Min.   :0.00000   Min.   :  0.000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.01422   1st Qu.:  0.000   1st Qu.:0.00000  
##  Median :0.00293   Median :0.02500   Median :  0.000   Median :0.00000  
##  Mean   :0.02045   Mean   :0.04150   Mean   :  5.952   Mean   :0.06197  
##  3rd Qu.:0.01667   3rd Qu.:0.04848   3rd Qu.:  0.000   3rd Qu.:0.00000  
##  Max.   :0.20000   Max.   :0.20000   Max.   :361.764   Max.   :1.00000  
##                                                                         
##      Month      OperatingSystems    Browser         Region      TrafficType  
##  May    :3328   2      :6536     2      :7878   1      :4711   2      :3907  
##  Nov    :2983   1      :2548     1      :2426   3      :2382   1      :2383  
##  Mar    :1853   3      :2530     4      : 730   4      :1168   3      :2017  
##  Dec    :1706   4      : 478     5      : 466   2      :1127   4      :1066  
##  Oct    : 549   8      :  75     6      : 174   6      : 800   13     : 728  
##  Sep    : 448   6      :  19     10     : 163   7      : 758   10     : 450  
##  (Other):1332   (Other):  13     (Other): 362   (Other):1253   (Other):1648  
##             VisitorType     Weekend      Revenue     
##  New_Visitor      : 1693   FALSE:9343   FALSE:10291  
##  Other            :   81   TRUE :2856   TRUE : 1908  
##  Returning_Visitor:10425                             
##                                                      
##                                                      
##                                                      
## 

Univariate Analysis

Administrative

# mean
admin.mean <- mean(df$Administrative)
admin.mean
## [1] 2.340028
# mode
mode <- function(x){
  uniqx <- unique(x)
  uniqx[which.max(tabulate(match(x, uniqx)))]
}
admin.mode <- mode(df$Administrative)
admin.mode
## [1] 0
# Median
admin.median <- median(df$Administrative)
admin.median
## [1] 1
# Range
admin.range <- range(df$Administrative)
admin.range
## [1]  0 27
# standard deviation
admin.sd <- sd(df$Administrative)
admin.sd
## [1] 3.330851
# kurtosis
admin.kurt <- kurtosis(df$Administrative)
admin.kurt
## [1] 4.634854
# skewness
admin.skew <- skewness(df$Administrative)
admin.skew
## [1] 1.946009
# quantiles
admin.quant <- quantile(df$Administrative)
admin.quant
##   0%  25%  50%  75% 100% 
##    0    0    1    4   27
# Distribution
ggplot(data=df, aes(y=Administrative)) +
  geom_boxplot(outlier.colour = 'red') +
  labs(title="Adiministrative Feature")

Administrative Duration

# Mean
aduration.mean <- mean(df$Administrative_Duration)
aduration.mean
## [1] 81.68214
# Mode
aduration.mode <- mode(df$Administrative_Duration)
aduration.mode
## [1] 0
# Median
aduration.median <- median(df$Administrative_Duration)
aduration.median
## [1] 9
# Range
aduration.range <- range(df$Administrative_Duration)
aduration.range
## [1]   -1.00 3398.75
# Standard Deviation
aduration.sd <- sd(df$Administrative_Duration)
aduration.sd
## [1] 177.5282
# Quantiles
aduration.quant <- quantile(df$Administrative_Duration)
aduration.quant
##      0%     25%     50%     75%    100% 
##   -1.00    0.00    9.00   94.75 3398.75
# Kurtosis
aduration.kurt <- kurtosis(df$Administrative_Duration)
aduration.kurt
## [1] 50.08518
# Skewness
aduration.skew <- skewness(df$Administrative_Duration)
aduration.skew
## [1] 5.589523
# Distribution
ggplot(data=df, aes(y=Administrative_Duration)) +
  geom_boxplot(outlier.color = "red") +
  labs(title="Administrative Duration")

Informational

# mean
info.mean <- mean(df$Informational)
info.mean
## [1] 0.5088122
# Mode
info.mode <- mode(df$Informational)
info.mode
## [1] 0
# Median
info.median <- median(df$Informational)
info.median
## [1] 0
# Range
info.range <- range(df$Informational)
info.range
## [1]  0 24
# Standard deviation
info.sd <- sd(df$Informational)
info.sd
## [1] 1.275817
# Quantiles
info.quant <- quantile(df$Informational)
info.quant
##   0%  25%  50%  75% 100% 
##    0    0    0    0   24
# Skewness
info.skew <- skewness(df$Informational)
info.skew
## [1] 4.012958
# Kurtosis
info.kurt <- kurtosis(df$Informational)
info.kurt
## [1] 26.63768
# Distribution
ggplot(data=df, aes(y=Informational)) +
  geom_boxplot(outlier.colour = "red") +
  labs(title="Informational Boxplot")

Informational Duration

# mean
infod.mean <- mean(df$Informational_Duration)
infod.mean
## [1] 34.83734
# Mode
infod.mode <- mode(df$Informational_Duration)
infod.mode
## [1] 0
# Median
infod.median <- median(df$Informational_Duration)
infod.median
## [1] 0
# Range
infod.range <- range(df$Informational_Duration)
infod.range
## [1]   -1.000 2549.375
# Standard deviation
infod.sd <- sd(df$Informational_Duration)
infod.sd
## [1] 141.4585
# Quantiles
infod.quant <- quantile(df$Informational_Duration)
infod.quant
##       0%      25%      50%      75%     100% 
##   -1.000    0.000    0.000    0.000 2549.375
# Skewness
infod.skew <- skewness(df$Informational_Duration)
infod.skew
## [1] 7.536508
# Kurtosis
infod.kurt <- kurtosis(df$Informational_Duration)
infod.kurt
## [1] 75.45122
# Distribution
ggplot(data=df, aes(y=Informational_Duration)) +
  geom_boxplot(outlier.colour = "red") +
  labs(title="Informational_Duration Boxplot")

Bounce Rate

# Mean
bounce.mean <- mean(df$BounceRates)
bounce.mean
## [1] 0.02044674
# Mode
bounce.mode <- mode(df$BounceRates)
bounce.mode
## [1] 0
# Median
bounce.median <- median(df$BounceRates)
bounce.median
## [1] 0.002930403
# Range
bounce.range <- range(df$BounceRates)
bounce.range
## [1] 0.0 0.2
# Standard Deviation
bounce.sd <- sd(df$BounceRates)
bounce.sd
## [1] 0.0454025
# Quantiles
bounce.quant <- quantile(df$BounceRates)
bounce.quant
##          0%         25%         50%         75%        100% 
## 0.000000000 0.000000000 0.002930403 0.016666667 0.200000000
# Kurtosis
bounce.kurt <- kurtosis(df$BounceRates)
bounce.kurt
## [1] 9.253055
# Skewness
bounce.skew <- skewness(df$BounceRates)
bounce.skew
## [1] 3.152486
# Distribution
ggplot(df, aes(y=BounceRates)) +
  geom_boxplot(outlier.colour = "red") +
  labs(y="BounceRates")

Exit Rate

# Mean
exit.mean <- mean(df$ExitRates)
exit.mean
## [1] 0.04149678
# Mode
exit.mode <- mode(df$ExitRates)
exit.mode
## [1] 0.2
# Median
exit.median <- median(df$ExitRates)
exit.median
## [1] 0.025
# Range
exit.range <- range(df$ExitRates)
exit.range
## [1] 0.0 0.2
# Standard Deviation
exit.sd <- sd(df$ExitRates)
exit.sd
## [1] 0.04624716
# Quantiles
exit.quant <- quantile(df$ExitRates)
exit.quant
##         0%        25%        50%        75%       100% 
## 0.00000000 0.01422258 0.02500000 0.04848485 0.20000000
# Kurtosis
exit.kurt <- kurtosis(df$ExitRates)
exit.kurt
## [1] 4.623003
# Skewness
exit.skew <- skewness(df$ExitRates)
exit.skew
## [1] 2.232851
# Distribution
ggplot(df, aes(y=ExitRates)) +
  geom_boxplot(outlier.color = "red")

Page Value

# Mean
page.mean <- mean(df$PageValues)
page.mean
## [1] 5.9525
# Mode
page.mode <- mode(df$PageValues)
page.mode
## [1] 0
# Median
page.median <- median(df$PageValues)
page.median
## [1] 0
# Range
page.range <- range(df$PageValues)
page.range
## [1]   0.0000 361.7637
# Standard Deviation
page.sd <- sd(df$PageValues)
page.sd
## [1] 18.65779
# Quantiles
page.quant <- quantile(df$PageValues)
page.quant
##       0%      25%      50%      75%     100% 
##   0.0000   0.0000   0.0000   0.0000 361.7637
# Kurtosis
page.kurt <- kurtosis(df$PageValues)
page.kurt
## [1] 64.92917
# Skewness
page.skew <- skewness(df$PageValues)
page.skew
## [1] 6.347882
# Distribution
ggplot(df, aes(y=PageValues)) +
  geom_boxplot(outlier.colour = "red")

Special Day

# Mean
special.mean <- mean(df$SpecialDay)
special.mean
## [1] 0.06197229
# Mode
special.mode <- mode(df$SpecialDay)
special.mode
## [1] 0
# Median
special.median <- median(df$SpecialDay)
special.median
## [1] 0
# Range
special.range <- range(df$SpecialDay)
special.range
## [1] 0 1
# Standard Deviation
special.sd <- sd(df$SpecialDay)
special.sd
## [1] 0.1997106
# Quantiles
special.quant <- quantile(df$SpecialDay)
special.quant
##   0%  25%  50%  75% 100% 
##    0    0    0    0    1
# Kurtosis
special.kurt <- kurtosis(df$SpecialDay)
special.kurt
## [1] 9.783958
# Skewness
special.skew <- skewness(df$SpecialDay)
special.skew
## [1] 3.284077
# Distribution
ggplot(df, aes(y=SpecialDay)) +
  geom_boxplot(outlier.colour = "red")

Operating System

# Mode
op.mode <- mode(df$OperatingSystems)
op.mode
## [1] 2
## Levels: 1 2 3 4 5 6 7 8
# Distribution
ggplot(df, aes(OperatingSystems)) +
  geom_bar()

Most visitors use operating system 2; only the first four operating systems see meaningful use.

Browser

# Mode
browser.mode <- mode(df$Browser)
browser.mode
## [1] 2
## Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13
# Distribution
ggplot(df, aes(Browser)) +
  geom_bar()

Browser 2 is the most popular. Other than the first two, the remaining browsers are barely used by site visitors.

Traffic Type

# Mode
traffic.mode <- mode(df$TrafficType)
traffic.mode
## [1] 2
## Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
# Distribution
ggplot(df, aes(TrafficType)) +
  geom_bar()

Traffic type 2 is the most prevalent, while types 7, 9, 12, 14, 15, 16, 17 and 19 contribute very little to the pool.

Visitor Type

# Mode
visitor.mode <- mode(df$VisitorType)
visitor.mode
## [1] Returning_Visitor
## Levels: New_Visitor Other Returning_Visitor
# Distribution
ggplot(df, aes(VisitorType)) +
  geom_bar()

Returning visitors are by far the most common visitor type on the website.

Region

# Mode
region.mode <- mode(df$Region)
region.mode
## [1] 1
## Levels: 1 2 3 4 5 6 7 8 9
# Distributions
ggplot(df, aes(Region)) +
  geom_bar()

Region 1 contributes the most site visitors, well ahead of the other regions.

Month

# Mode
month.mode <- mode(df$Month)
month.mode
## [1] May
## Levels: Aug Dec Feb Jul June Mar May Nov Oct Sep
# Distributions
ggplot(df, aes(Month)) +
  geom_bar()

The website receives the most customers in March, May, November and December.

Weekend

# Mode
wknd.mode <- mode(df$Weekend)
wknd.mode
## [1] FALSE
## Levels: FALSE TRUE
# Distribution
ggplot(df, aes(Weekend)) +
  geom_bar()

Most sessions take place on weekdays rather than weekends.

Revenue

# Mode
revenue.mode <- mode(df$Revenue)
revenue.mode
## [1] FALSE
## Levels: FALSE TRUE
# Distribution
ggplot(df, aes(Revenue)) +
  geom_bar()

Most users do not bring in revenue through their activity on the site.
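
To quantify this imbalance, the class proportions can be computed directly; a small sketch using the counts already shown in the summary above:

# share of sessions with and without revenue
round(prop.table(table(df$Revenue)), 3)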

Bivariate Analysis

Examining how different variables relate to the Revenue label.

# Administrative sites and Revenue
ggplot(df, aes(Administrative, color=Revenue)) +
  geom_freqpoly(binwidth=1)

ggplot(df, aes(Administrative_Duration, color=Revenue)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df, aes(Informational, color=Revenue)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df, aes(Informational_Duration, color=Revenue)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df, aes(ProductRelated, color=Revenue)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df, aes(SpecialDay, color=Revenue)) +
  geom_freqpoly() +
  theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(df, aes(PageValues, color=Revenue)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Month vs generating revenue
ggplot(df, aes(Month, color=Revenue, fill=Revenue)) +
  geom_bar()

March, May, November and December are the months which generate significantly more revenue for the business.
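
A cross-tabulation (a sketch, not part of the original chunk) makes the month-by-month split explicit:

# revenue counts per month
table(Month = df$Month, Revenue = df$Revenue)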

# Day type vs Generating Revenue
ggplot(df, aes(Weekend, color=Revenue, fill=Revenue)) +
  geom_bar()

Weekdays generate slightly more Revenue than weekends.

# Operating systems vs Generating Revenue
ggplot(df, aes(OperatingSystems, color=Revenue, fill=Revenue)) +
  geom_bar()

Users of operating system 2 generated the most revenue for the site, followed by operating systems 1 and 3.

# Region vs generating revenue
ggplot(df, aes(Region, fill=Revenue, color=Revenue)) +
  geom_bar()

Region 1 produced the most revenue, while region 5 produced the least.

# Visitor type and revenue
ggplot(df, aes(VisitorType, color=Revenue, fill=Revenue)) +
  geom_bar()

Returning visitors generated far more revenue than new visitors.

# Bounce rates vs Revenue
ggplot(df, aes(BounceRates, color=Revenue)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Most sessions have very low bounce rates, meaning few visitors left the site without triggering any further requests; only a small minority of sessions show high bounce rates.

Correlations

cor(df[,unlist(lapply(df, is.numeric))])
##                         Administrative Administrative_Duration Informational
## Administrative              1.00000000              0.60040965    0.37528761
## Administrative_Duration     0.60040965              1.00000000    0.30143630
## Informational               0.37528761              0.30143630    1.00000000
## Informational_Duration      0.25478602              0.23718986    0.61867795
## ProductRelated              0.42819151              0.28678391    0.37260472
## ProductRelated_Duration     0.37102722              0.35351379    0.38608372
## BounceRates                -0.21366664             -0.13733340   -0.10950530
## ExitRates                  -0.31127413             -0.20202445   -0.15956681
## PageValues                  0.09692097              0.06616837    0.04739015
## SpecialDay                 -0.09707210             -0.07473689   -0.04937677
##                         Informational_Duration ProductRelated
## Administrative                      0.25478602     0.42819151
## Administrative_Duration             0.23718986     0.28678391
## Informational                       0.61867795     0.37260472
## Informational_Duration              1.00000000     0.27906195
## ProductRelated                      0.27906195     1.00000000
## ProductRelated_Duration             0.34658069     0.86030819
## BounceRates                        -0.07015947    -0.19351577
## ExitRates                          -0.10293268    -0.28616321
## PageValues                          0.03006416     0.05411549
## SpecialDay                         -0.03129304    -0.02593062
##                         ProductRelated_Duration BounceRates  ExitRates
## Administrative                       0.37102722 -0.21366664 -0.3112741
## Administrative_Duration              0.35351379 -0.13733340 -0.2020245
## Informational                        0.38608372 -0.10950530 -0.1595668
## Informational_Duration               0.34658069 -0.07015947 -0.1029327
## ProductRelated                       0.86030819 -0.19351577 -0.2861632
## ProductRelated_Duration              1.00000000 -0.17437550 -0.2453340
## BounceRates                         -0.17437550  1.00000000  0.9033582
## ExitRates                           -0.24533401  0.90335819  1.0000000
## PageValues                           0.05084062 -0.11599198 -0.1735715
## SpecialDay                          -0.03821065  0.08783999  0.1167838
##                          PageValues  SpecialDay
## Administrative           0.09692097 -0.09707210
## Administrative_Duration  0.06616837 -0.07473689
## Informational            0.04739015 -0.04937677
## Informational_Duration   0.03006416 -0.03129304
## ProductRelated           0.05411549 -0.02593062
## ProductRelated_Duration  0.05084062 -0.03821065
## BounceRates             -0.11599198  0.08783999
## ExitRates               -0.17357154  0.11678376
## PageValues               1.00000000 -0.06453271
## SpecialDay              -0.06453271  1.00000000

BounceRates and ExitRates are very strongly correlated (0.90), and each page-count feature is strongly correlated with the time spent on that page type, most notably ProductRelated and ProductRelated_Duration (0.86).
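
The strongest relationships can also be pulled out programmatically; a sketch, where the 0.6 cutoff is an arbitrary choice:

# list feature pairs with |r| > 0.6, keeping each pair once
cm <- cor(df[, sapply(df, is.numeric)])
cm[upper.tri(cm, diag = TRUE)] <- NA
idx <- which(abs(cm) > 0.6, arr.ind = TRUE)
data.frame(var1 = rownames(cm)[idx[, 1]],
           var2 = colnames(cm)[idx[, 2]],
           r = round(cm[idx], 2))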

Solution Implementation using Clustering

Data Preparation

# label encoding
month <- LabelEncoder.fit(df$Month)
df$Month <- transform(month, factor(df$Month))
wknd <- LabelEncoder.fit(df$Weekend)
df$Weekend <- transform(wknd, factor(df$Weekend))
visitor <- LabelEncoder.fit(df$VisitorType)
df$VisitorType <- transform(visitor, factor(df$VisitorType))
head(df)
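
Note that OperatingSystems, Browser, Region and TrafficType are still factors at this point. If the clustering steps below require an all-numeric data frame, they can be converted back to their integer codes; a sketch (not part of the original pipeline):

# convert the remaining factor columns back to their numeric codes
num_factors <- c("OperatingSystems", "Browser", "Region", "TrafficType")
df[num_factors] <- lapply(df[num_factors], function(f) as.integer(as.character(f)))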

K-Means Clustering

# separating features from Revenue labels
x <- df[, -18]
# normalizing
normalize <- function(x){
  return ((x-min(x)) / (max(x)-min(x)))
}
x$Administrative <- normalize(x$Administrative)
x$Administrative_Duration <- normalize(x$Administrative_Duration)
x$Informational <- normalize(x$Informational)
x$Informational_Duration <- normalize(x$Informational_Duration)
x$ProductRelated <- normalize(x$ProductRelated)
x$ProductRelated_Duration <- normalize(x$ProductRelated_Duration)
x$BounceRates <- normalize(x$BounceRates)
x$ExitRates <- normalize(x$ExitRates)
x$PageValues <- normalize(x$PageValues)
x$SpecialDay <- normalize(x$SpecialDay)
# finding optimum k
fviz_nbclust(x, kmeans, method="wss")

According to the elbow plot above, three clusters appear sufficient. This is cross-checked below, first by recomputing the within-cluster sum of squares by hand and then with the silhouette method.
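
A minimal sketch of the manual computation, using purrr (loaded earlier but otherwise unused) and assuming the normalised feature set x from the previous chunk:

# total within-cluster sum of squares for k = 1..10
set.seed(123)
wss <- map_dbl(1:10, function(k) kmeans(x, centers = k, nstart = 10)$tot.withinss)
plot(1:10, wss, type = "b",
     xlab = "Number of clusters k", ylab = "Total within-cluster SS")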

# silhouette method
fviz_nbclust(x, kmeans, method="silhouette")

Three clusters will be used.

# Using 3 clusters
k <- kmeans(x, centers=3, nstart=25)
# Number of records in each cluster
k$size
## [1] 1993 7784 2422
str(k)
## List of 9
##  $ cluster     : Named int [1:12199] 3 3 3 3 3 3 3 3 3 3 ...
##   ..- attr(*, "names")= chr [1:12199] "1" "2" "3" "4" ...
##  $ centers     : num [1:3, 1:17] 0.0781 0.0891 0.086 0.0233 0.0247 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "1" "2" "3"
##   .. ..$ : chr [1:17] "Administrative" "Administrative_Duration" "Informational" "Informational_Duration" ...
##  $ totss       : num 391846
##  $ withinss    : num [1:3] 57682 97131 32945
##  $ tot.withinss: num 187758
##  $ betweenss   : num 204089
##  $ size        : int [1:3] 1993 7784 2422
##  $ iter        : int 3
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
df$cluster <- as.factor(k$cluster)
head(df)
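
As a quick sanity check (a sketch, not part of the original write-up), the cluster assignments can be compared against the held-out Revenue label and visualised with factoextra:

# how the three clusters line up with the Revenue label
table(Cluster = k$cluster, Revenue = df$Revenue)
# 2-D projection of the clusters
fviz_cluster(k, data = x, geom = "point")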

Hierarchical Clustering

# copy of the dataset
copy <- df[, 1:17]
# scaling the data
copy$Administrative <- scale(copy$Administrative)
copy$Administrative_Duration <- scale(copy$Administrative_Duration)
copy$Informational <- scale(copy$Informational)
copy$Informational_Duration <- scale(copy$Informational_Duration)
copy$ProductRelated <- scale(copy$ProductRelated)
copy$ProductRelated_Duration <- scale(copy$ProductRelated_Duration)
copy$BounceRates <- scale(copy$BounceRates)
copy$ExitRates <- scale(copy$ExitRates)
copy$PageValues <- scale(copy$PageValues)
copy$SpecialDay <- scale(copy$SpecialDay)
# computing the distance
d <- dist(copy, method="euclidean")
# Clustering  algorithm deployment
model <- hclust(d, method="ward.D2")
# viewing the dendrogram
plot(model, cex=0.6, hang=-1)

# Ward's method
hc <- hclust(d, method="ward.D2")
# cut the tree into 4 groups
sub_grp <- cutree(hc, k=4)
table(sub_grp)
## sub_grp
##    1    2    3    4 
## 7008 1653 1959 1579
plot(hc, cex=2, hang=-1 )
rect.hclust(hc, k=4, border=2:5)
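
To relate the two solutions (a sketch, not part of the original analysis), the hierarchical groups can be cross-tabulated against the k-means clusters and the Revenue label:

# agreement between the hierarchical cut and the k-means clusters
table(Hierarchical = sub_grp, KMeans = k$cluster)
# how the hierarchical groups relate to Revenue
table(Hierarchical = sub_grp, Revenue = df$Revenue)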

Conclusion