1.)PROBLEM DEFINITION:
a.) Defining the question: What are the key characteristics of our customer groups? b.) Success metric: getting the key characteristics of our customer groups
2.)DATA SOURCING: The dataset has been provided by Kira Plastinina
3.)CHECKING THE DATA
# loading the dataset using the fread function
library(data.table)
#import data
df <- fread("C:\\Users\\Gakungi\\OneDrive\\Desktop\\R\\Datasets\\online_shoppers_intention.csv")
#previewing the first 6 rows dataset
head(df)
## Administrative Administrative_Duration Informational Informational_Duration
## 1: 0 0 0 0
## 2: 0 0 0 0
## 3: 0 -1 0 -1
## 4: 0 0 0 0
## 5: 0 0 0 0
## 6: 0 0 0 0
## ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues
## 1: 1 0.000000 0.20000000 0.2000000 0
## 2: 2 64.000000 0.00000000 0.1000000 0
## 3: 1 -1.000000 0.20000000 0.2000000 0
## 4: 2 2.666667 0.05000000 0.1400000 0
## 5: 10 627.500000 0.02000000 0.0500000 0
## 6: 19 154.216667 0.01578947 0.0245614 0
## SpecialDay Month OperatingSystems Browser Region TrafficType
## 1: 0 Feb 1 1 1 1
## 2: 0 Feb 2 2 1 2
## 3: 0 Feb 4 1 9 3
## 4: 0 Feb 3 2 2 4
## 5: 0 Feb 3 3 1 4
## 6: 0 Feb 2 2 1 3
## VisitorType Weekend Revenue
## 1: Returning_Visitor FALSE FALSE
## 2: Returning_Visitor FALSE FALSE
## 3: Returning_Visitor FALSE FALSE
## 4: Returning_Visitor FALSE FALSE
## 5: Returning_Visitor TRUE FALSE
## 6: Returning_Visitor FALSE FALSE
# previewing the last 6 rows of the dataset
tail(df)
## Administrative Administrative_Duration Informational Informational_Duration
## 1: 0 0 1 0
## 2: 3 145 0 0
## 3: 0 0 0 0
## 4: 0 0 0 0
## 5: 4 75 0 0
## 6: 0 0 0 0
## ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues
## 1: 16 503.000 0.000000000 0.03764706 0.00000
## 2: 53 1783.792 0.007142857 0.02903061 12.24172
## 3: 5 465.750 0.000000000 0.02133333 0.00000
## 4: 6 184.250 0.083333333 0.08666667 0.00000
## 5: 15 346.000 0.000000000 0.02105263 0.00000
## 6: 3 21.250 0.000000000 0.06666667 0.00000
## SpecialDay Month OperatingSystems Browser Region TrafficType
## 1: 0 Nov 2 2 1 1
## 2: 0 Dec 4 6 1 1
## 3: 0 Nov 3 2 1 8
## 4: 0 Nov 3 2 1 13
## 5: 0 Nov 2 2 3 11
## 6: 0 Nov 3 2 1 2
## VisitorType Weekend Revenue
## 1: Returning_Visitor FALSE FALSE
## 2: Returning_Visitor TRUE FALSE
## 3: Returning_Visitor TRUE FALSE
## 4: Returning_Visitor TRUE FALSE
## 5: Returning_Visitor FALSE FALSE
## 6: New_Visitor TRUE FALSE
# checking data types of the columns
str(df)
## Classes 'data.table' and 'data.frame': 12330 obs. of 18 variables:
## $ Administrative : int 0 0 0 0 0 0 0 1 0 0 ...
## $ Administrative_Duration: num 0 0 -1 0 0 0 -1 -1 0 0 ...
## $ Informational : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Informational_Duration : num 0 0 -1 0 0 0 -1 -1 0 0 ...
## $ ProductRelated : int 1 2 1 2 10 19 1 1 2 3 ...
## $ ProductRelated_Duration: num 0 64 -1 2.67 627.5 ...
## $ BounceRates : num 0.2 0 0.2 0.05 0.02 ...
## $ ExitRates : num 0.2 0.1 0.2 0.14 0.05 ...
## $ PageValues : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SpecialDay : num 0 0 0 0 0 0 0.4 0 0.8 0.4 ...
## $ Month : chr "Feb" "Feb" "Feb" "Feb" ...
## $ OperatingSystems : int 1 2 4 3 3 2 2 1 2 2 ...
## $ Browser : int 1 2 1 2 3 2 4 2 2 4 ...
## $ Region : int 1 1 9 2 1 1 3 1 2 1 ...
## $ TrafficType : int 1 2 3 4 4 3 3 5 3 2 ...
## $ VisitorType : chr "Returning_Visitor" "Returning_Visitor" "Returning_Visitor" "Returning_Visitor" ...
## $ Weekend : logi FALSE FALSE FALSE FALSE TRUE FALSE ...
## $ Revenue : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## - attr(*, ".internal.selfref")=<externalptr>
# checking the unique values in the month and special day certain columns
print(unique(df$Month))
## [1] "Feb" "Mar" "May" "Oct" "June" "Jul" "Aug" "Nov" "Sep" "Dec"
print(unique(df$SpecialDay))
## [1] 0.0 0.4 0.8 1.0 0.2 0.6
# checking the shape of our data
dim(df)
## [1] 12330 18
#we have 12330 rows and 18 columns
4.) DATA CLEANING
# checking for duplicates in the data
dup <- df[duplicated(df),]
dup
## Administrative Administrative_Duration Informational
## 1: 0 0 0
## 2: 0 0 0
## 3: 0 0 0
## 4: 0 0 0
## 5: 0 0 0
## ---
## 115: 0 0 0
## 116: 0 0 0
## 117: 0 0 0
## 118: 0 0 0
## 119: 0 0 0
## Informational_Duration ProductRelated ProductRelated_Duration BounceRates
## 1: 0 1 0 0.2
## 2: 0 1 0 0.2
## 3: 0 1 0 0.2
## 4: 0 1 0 0.2
## 5: 0 1 0 0.2
## ---
## 115: 0 1 0 0.2
## 116: 0 1 0 0.2
## 117: 0 1 0 0.2
## 118: 0 1 0 0.2
## 119: 0 1 0 0.2
## ExitRates PageValues SpecialDay Month OperatingSystems Browser Region
## 1: 0.2 0 0 Feb 1 1 1
## 2: 0.2 0 0 Feb 3 2 3
## 3: 0.2 0 0 Mar 1 1 1
## 4: 0.2 0 0 Mar 2 2 4
## 5: 0.2 0 0 Mar 3 2 3
## ---
## 115: 0.2 0 0 Dec 1 1 1
## 116: 0.2 0 0 Dec 1 1 4
## 117: 0.2 0 0 Dec 1 1 1
## 118: 0.2 0 0 Dec 1 13 9
## 119: 0.2 0 0 Dec 8 13 9
## TrafficType VisitorType Weekend Revenue
## 1: 3 Returning_Visitor FALSE FALSE
## 2: 3 Returning_Visitor FALSE FALSE
## 3: 1 Returning_Visitor TRUE FALSE
## 4: 1 Returning_Visitor FALSE FALSE
## 5: 1 Returning_Visitor FALSE FALSE
## ---
## 115: 2 New_Visitor FALSE FALSE
## 116: 1 Returning_Visitor TRUE FALSE
## 117: 3 Returning_Visitor FALSE FALSE
## 118: 20 Returning_Visitor FALSE FALSE
## 119: 20 Other FALSE FALSE
# there are duplicates in our data
# removing duplicates from the data
df[!duplicated(df), ]
## Administrative Administrative_Duration Informational
## 1: 0 0 0
## 2: 0 0 0
## 3: 0 -1 0
## 4: 0 0 0
## 5: 0 0 0
## ---
## 12207: 3 145 0
## 12208: 0 0 0
## 12209: 0 0 0
## 12210: 4 75 0
## 12211: 0 0 0
## Informational_Duration ProductRelated ProductRelated_Duration
## 1: 0 1 0.000000
## 2: 0 2 64.000000
## 3: -1 1 -1.000000
## 4: 0 2 2.666667
## 5: 0 10 627.500000
## ---
## 12207: 0 53 1783.791667
## 12208: 0 5 465.750000
## 12209: 0 6 184.250000
## 12210: 0 15 346.000000
## 12211: 0 3 21.250000
## BounceRates ExitRates PageValues SpecialDay Month OperatingSystems
## 1: 0.200000000 0.20000000 0.00000 0 Feb 1
## 2: 0.000000000 0.10000000 0.00000 0 Feb 2
## 3: 0.200000000 0.20000000 0.00000 0 Feb 4
## 4: 0.050000000 0.14000000 0.00000 0 Feb 3
## 5: 0.020000000 0.05000000 0.00000 0 Feb 3
## ---
## 12207: 0.007142857 0.02903061 12.24172 0 Dec 4
## 12208: 0.000000000 0.02133333 0.00000 0 Nov 3
## 12209: 0.083333333 0.08666667 0.00000 0 Nov 3
## 12210: 0.000000000 0.02105263 0.00000 0 Nov 2
## 12211: 0.000000000 0.06666667 0.00000 0 Nov 3
## Browser Region TrafficType VisitorType Weekend Revenue
## 1: 1 1 1 Returning_Visitor FALSE FALSE
## 2: 2 1 2 Returning_Visitor FALSE FALSE
## 3: 1 9 3 Returning_Visitor FALSE FALSE
## 4: 2 2 4 Returning_Visitor FALSE FALSE
## 5: 3 1 4 Returning_Visitor TRUE FALSE
## ---
## 12207: 6 1 1 Returning_Visitor TRUE FALSE
## 12208: 2 1 8 Returning_Visitor TRUE FALSE
## 12209: 2 1 13 Returning_Visitor TRUE FALSE
## 12210: 2 3 11 Returning_Visitor FALSE FALSE
## 12211: 2 1 2 New_Visitor TRUE FALSE
# converting the logical and factor columns to integers
df$Weekend <- as.integer(df$Weekend)
df$Revenue <- as.integer(df$Revenue)
df$VisitorType <- as.factor(df$VisitorType)
df$VisitorType <- unclass(df$VisitorType)
df$Month <- as.factor(df$Month)
df$Month <- unclass(df$Month)
df$Month <- as.numeric(df$Month)
df$VisitorType <- as.numeric(df$VisitorType)
# returnvisitor =3, new=1, other=2
# february=3, march=6, may=7, october=9, june=5, july=4, august=1, novemeber=8, september=10, december=2
# checking for missing values per column
colSums(is.na(df))
## Administrative Administrative_Duration Informational
## 14 14 14
## Informational_Duration ProductRelated ProductRelated_Duration
## 14 14 14
## BounceRates ExitRates PageValues
## 14 14 0
## SpecialDay Month OperatingSystems
## 0 0 0
## Browser Region TrafficType
## 0 0 0
## VisitorType Weekend Revenue
## 0 0 0
# we have missing values in our columns
# loading the mice library
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
init = mice(df, maxit=0)
meth = init$method
predM = init$predictorMatrix
# specifying the methods for imputing the missing values for the respective columns
meth[c("Administrative")]="norm"
meth[c("Administrative_Duration")]="norm"
meth[c("Informational")]="norm"
meth[c("Informational_Duration")]="norm"
meth[c("ProductRelated")]="norm"
meth[c("ProductRelated_Duration")]="norm"
meth[c("BounceRates")]="norm"
meth[c("ExitRates")]="norm"
#creating imputation
dfc = mice(df, method=meth, predictorMatrix=predM, m=5)
##
## iter imp variable
## 1 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
## 1 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
## 1 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
## 1 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
## 1 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
## 2 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
## 2 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
## 2 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
## 2 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
## 2 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
## 3 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
## 3 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
## 3 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
## 3 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
## 3 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
## 4 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
## 4 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
## 4 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
## 4 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
## 4 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
## 5 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
## 5 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
## 5 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
## 5 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
## 5 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates
# creating a dataset after imputation
dfd <- complete(dfc)
# checking for missing values in the imputed dataset
sapply(dfd, function(x) sum(is.na(x)))
## Administrative Administrative_Duration Informational
## 0 0 0
## Informational_Duration ProductRelated ProductRelated_Duration
## 0 0 0
## BounceRates ExitRates PageValues
## 0 0 0
## SpecialDay Month OperatingSystems
## 0 0 0
## Browser Region TrafficType
## 0 0 0
## VisitorType Weekend Revenue
## 0 0 0
# no missing values are present
# plotting boxplot to check for outliers
library(lattice)
boxplot(dfd$Administrative, xlab=c("Administrative"))
boxplot(dfd$Administrative_Duration, xlab=c("Administrative duration"))
boxplot(dfd$Informational, xlab=c("Informational"))
boxplot(dfd$Informational_Duration, xlab=c("Informational duration"))
boxplot(dfd$ProductRelated, xlab=c("Product related"))
boxplot(dfd$ProductRelated_Duration, xlab=c("Product related duration"))
boxplot(dfd$BounceRates, xlab=c("Bounce rates"))
boxplot(dfd$ExitRates, xlab=c("Exit rates"))
boxplot(dfd$PageValues, xlab=c("Page values"))
boxplot(dfd$SpecialDay, xlab=c("Special day"))
boxplot(dfd$OperatingSystems, xlab=c("Operating systems"))
boxplot(dfd$Browser, xlab=c("Browser"))
boxplot(dfd$Region, xlab=c("Region"))
boxplot(dfd$TrafficType, xlab=c("Traffic type"))
boxplot(dfd$Weekend, xlab=c("Weekend"))
boxplot(dfd$Revenue, xlab=c("Revenue"))
# all of our columns contain a significant number of outliers which will not be eliminated since they are crucial for the analysis and are assumed to be correct entries
5.) EXPLORATORY DATA ANALYSIS
a.) Univariate analysis
#checking the summary statistics
summary(dfd)
## Administrative Administrative_Duration Informational
## Min. :-3.291 Min. :-231.07 Min. :-1.3622
## 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.0000
## Median : 1.000 Median : 8.00 Median : 0.0000
## Mean : 2.319 Mean : 80.89 Mean : 0.5046
## 3rd Qu.: 4.000 3rd Qu.: 93.50 3rd Qu.: 0.0000
## Max. :27.000 Max. :3398.75 Max. :24.0000
## Informational_Duration ProductRelated ProductRelated_Duration
## Min. :-253.39 Min. : -4.653 Min. : -656.2
## 1st Qu.: 0.00 1st Qu.: 7.000 1st Qu.: 185.0
## Median : 0.00 Median : 18.000 Median : 600.5
## Mean : 34.54 Mean : 31.789 Mean : 1196.6
## 3rd Qu.: 0.00 3rd Qu.: 38.000 3rd Qu.: 1469.7
## Max. :2549.38 Max. :705.000 Max. :63973.5
## BounceRates ExitRates PageValues SpecialDay
## Min. :-0.089592 Min. :-0.05312 Min. : 0.000 Min. :0.00000
## 1st Qu.: 0.000000 1st Qu.: 0.01429 1st Qu.: 0.000 1st Qu.:0.00000
## Median : 0.003125 Median : 0.02513 Median : 0.000 Median :0.00000
## Mean : 0.022167 Mean : 0.04302 Mean : 5.889 Mean :0.06143
## 3rd Qu.: 0.016923 3rd Qu.: 0.05000 3rd Qu.: 0.000 3rd Qu.:0.00000
## Max. : 0.200000 Max. : 0.20000 Max. :361.764 Max. :1.00000
## Month OperatingSystems Browser Region
## Min. : 1.000 Min. :1.000 Min. : 1.000 Min. :1.000
## 1st Qu.: 6.000 1st Qu.:2.000 1st Qu.: 2.000 1st Qu.:1.000
## Median : 7.000 Median :2.000 Median : 2.000 Median :3.000
## Mean : 6.164 Mean :2.124 Mean : 2.357 Mean :3.147
## 3rd Qu.: 8.000 3rd Qu.:3.000 3rd Qu.: 2.000 3rd Qu.:4.000
## Max. :10.000 Max. :8.000 Max. :13.000 Max. :9.000
## TrafficType VisitorType Weekend Revenue
## Min. : 1.00 Min. :1.000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 2.00 1st Qu.:3.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 2.00 Median :3.000 Median :0.0000 Median :0.0000
## Mean : 4.07 Mean :2.718 Mean :0.2326 Mean :0.1547
## 3rd Qu.: 4.00 3rd Qu.:3.000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :20.00 Max. :3.000 Max. :1.0000 Max. :1.0000
# we are able to check for the mean, median, upper quantile, lower quantile, minimum value and maximum value of each column using summary statistics
# getting the mode of the relevant columns
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
mode_admin <- getmode(dfd$Administrative)
mode_admindur <- getmode(dfd$Administrative_Duration)
mode_info <- getmode(dfd$Informational)
mode_infodur <- getmode(dfd$Informational_Duration)
mode_prod <- getmode(dfd$ProductRelated)
mode_proddur <- getmode(dfd$ProductRelated_Duration)
mode_bounce <- getmode(dfd$BounceRates)
mode_exit <- getmode(dfd$ExitRates)
mode_page <- getmode(dfd$PageValues)
mode_day <- getmode(dfd$SpecialDay)
mode_os <- getmode(dfd$OperatingSystems)
mode_browser <- getmode(dfd$Browser)
mode_region <- getmode(dfd$Region)
mode_traffic <- getmode(dfd$TrafficType)
mode_visitor <- getmode(dfd$VisitorType)
mode_weekend <- getmode(dfd$Weekend)
mode_revenue <- getmode(dfd$Revenue)
mode_month <- getmode(dfd$Month)
print(mode_admin)
## [1] 0
print(mode_admindur)
## [1] 0
print(mode_info)
## [1] 0
print(mode_infodur)
## [1] 0
print(mode_prod)
## [1] 1
print(mode_proddur)
## [1] 0
print(mode_bounce)
## [1] 0
print(mode_exit)
## [1] 0.2
print(mode_page)
## [1] 0
print(mode_day)
## [1] 0
print(mode_os)
## [1] 2
print(mode_browser)
## [1] 2
print(mode_region)
## [1] 1
print(mode_traffic)
## [1] 2
print(mode_visitor)
## [1] 3
print(mode_weekend)
## [1] 0
print(mode_revenue)
## [1] 0
print(mode_month)
## [1] 7
# the mode represents the most repeated value per column respectively
# we can tell that the visitors mostly frequent the sites on other days other than the weekend and that most visitors are return visitors. We can also tell that most visitors are from region 1 and that most visits were in the month of May(month 7)
library(moments)
# getting the skewness of the relevant columns
sk_admin <- skewness(dfd$Administrative)
sk_admindur <- skewness(dfd$Administrative_Duration)
sk_info <- skewness(dfd$Informational)
sk_infodur <- skewness(dfd$Informational_Duration)
sk_prod <- skewness(dfd$ProductRelated)
sk_proddur <- skewness(dfd$ProductRelated_Duration)
sk_bounce <- skewness(dfd$BounceRates)
sk_exit <- skewness(dfd$ExitRates)
sk_page <- skewness(dfd$PageValues)
sk_day <- skewness(dfd$SpecialDay)
sk_os <- skewness(dfd$OperatingSystems)
sk_browser <- skewness(dfd$Browser)
sk_region <- skewness(dfd$Region)
sk_traffic <- skewness(dfd$TrafficType)
sk_visitor <- skewness(dfd$VisitorType)
sk_weekend <- skewness(dfd$Weekend)
sk_revenue <- skewness(dfd$Revenue)
sk_month <- skewness(dfd$Month)
print(sk_admin)
## [1] 1.956069
print(sk_admindur)
## [1] 5.609103
print(sk_info)
## [1] 4.026356
print(sk_infodur)
## [1] 7.561382
print(sk_prod)
## [1] 4.334366
print(sk_proddur)
## [1] 7.256408
print(sk_bounce)
## [1] 2.946824
print(sk_exit)
## [1] 2.149313
print(sk_page)
## [1] 6.382188
print(sk_day)
## [1] 3.302265
print(sk_os)
## [1] 2.066034
print(sk_browser)
## [1] 3.241955
print(sk_region)
## [1] 0.9834295
print(sk_traffic)
## [1] 1.962748
print(sk_visitor)
## [1] -2.064884
print(sk_weekend)
## [1] 1.265808
print(sk_revenue)
## [1] 1.909277
print(sk_month)
## [1] -0.8324332
# skewness of all the columns being positive indicates that their distributions have a longer right tail than left tail except for that of the month and visitor type columns whose left tails are longer given that they are skewed negatively
# getting the kurtosis of the columns
kt_admin <- kurtosis(dfd$Administrative)
kt_admindur <- kurtosis(dfd$Administrative_Duration)
kt_info <- kurtosis(dfd$Informational)
kt_infodur <- kurtosis(dfd$Informational_Duration)
kt_prod <- kurtosis(dfd$ProductRelated)
kt_proddur <- kurtosis(dfd$ProductRelated_Duration)
kt_bounce <- kurtosis(dfd$BounceRates)
kt_exit <- kurtosis(dfd$ExitRates)
kt_page <- kurtosis(dfd$PageValues)
kt_day <- kurtosis(dfd$SpecialDay)
kt_os <- kurtosis(dfd$OperatingSystems)
kt_browser <- kurtosis(dfd$Browser)
kt_region <- kurtosis(dfd$Region)
kt_traffic <- kurtosis(dfd$TrafficType)
kt_visitor <- kurtosis(dfd$VisitorType)
kt_weekend <- kurtosis(dfd$Weekend)
kt_revenue <- kurtosis(dfd$Revenue)
kt_month <- kurtosis(dfd$Month)
print(kt_admin)
## [1] 7.684324
print(kt_admindur)
## [1] 53.46679
print(kt_info)
## [1] 29.81645
print(kt_infodur)
## [1] 79.0414
print(kt_prod)
## [1] 34.12952
print(kt_proddur)
## [1] 139.9892
print(kt_bounce)
## [1] 10.73241
print(kt_exit)
## [1] 7.027053
print(kt_page)
## [1] 68.60859
print(kt_day)
## [1] 12.90915
print(kt_os)
## [1] 13.45212
print(kt_browser)
## [1] 15.74108
print(kt_region)
## [1] 2.850893
print(kt_traffic)
## [1] 6.477813
print(kt_visitor)
## [1] 5.293596
print(kt_weekend)
## [1] 2.60227
print(kt_revenue)
## [1] 4.645338
print(kt_month)
## [1] 2.631333
# the kurtosis levels of most of our columns are relatively high indicating the presence of outliers in the dataset
CHECKING THE DISTRIBUTION OF OUR NUMERICAL COLUMNS
# plotting histograms for our various columns
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between()
## ✖ dplyr::filter() masks mice::filter(), stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ purrr::transpose() masks data.table::transpose()
dfd %>%
gather(attributes, value, 1:18) %>%
ggplot(aes(x = value)) +
geom_histogram(fill = 'lightblue2', color = 'black') +
facet_wrap(~attributes, scales = 'free_x') +
labs(x="Values", y="Frequency") +
theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# all our columns lack a normal distribution
CONFIRMING THE UNNORMAL DISTRIBUTION USING QQ PLOT
qqnorm(dfd$Administrative_Duration)
qqline(dfd$Administrative_Duration)
# the column's data does not follow a normal distribution
qqnorm(dfd$Informational_Duration)
qqline(dfd$Informational_Duration)
# the column's data does not follow a normal distribution
qqnorm(dfd$ProductRelated_Duration)
qqline(dfd$ProductRelated_Duration)
# the column's data does not follow a normal distribution
qqnorm(dfd$BounceRates)
qqline(dfd$BounceRates)
# the column's data does not follow a normal distribution
qqnorm(dfd$ExitRates)
qqline(dfd$ExitRates)
# the column's data does not follow a normal distribution
qqnorm(dfd$Administrative)
qqline(dfd$Administrative)
# the column's data does not follow a normal distribution
qqnorm(dfd$ProductRelated)
qqline(dfd$ProductRelated)
# the column's data does not follow a normal distribution
qqnorm(dfd$Informational)
qqline(dfd$Informational)
# the column's data does not follow a normal distribution
qqnorm(dfd$PageValues)
qqline(dfd$PageValues)
# the column's data does not follow a normal distribution
qqnorm(dfd$SpecialDay)
qqline(dfd$SpecialDay)
# the column's data does not follow a normal distribution
b.)Bivariate analysis
# looking at the correlation between different columns
cor(dfd)
## Administrative Administrative_Duration Informational
## Administrative 1.000000000 0.601503934 0.377247122
## Administrative_Duration 0.601503934 1.000000000 0.303004615
## Informational 0.377247122 0.303004615 1.000000000
## Informational_Duration 0.256119766 0.238298187 0.619345877
## ProductRelated 0.431251225 0.288980875 0.374614892
## ProductRelated_Duration 0.373990295 0.355274137 0.387904897
## BounceRates -0.223674090 -0.144461247 -0.116063862
## ExitRates -0.316417094 -0.205974810 -0.163637714
## PageValues 0.098617187 0.067465212 0.048331971
## SpecialDay -0.095087342 -0.073406621 -0.048431383
## Month 0.048499407 0.029048084 0.019739333
## OperatingSystems -0.006198283 -0.007176932 -0.009293307
## Browser -0.025219653 -0.015407786 -0.038373742
## Region -0.005646346 -0.005627135 -0.029458867
## TrafficType -0.033333829 -0.014169320 -0.033946731
## VisitorType -0.025368971 -0.023773759 0.056114223
## Weekend 0.026649002 0.015030281 0.035316110
## Revenue 0.138413243 0.093392760 0.094771340
## Informational_Duration ProductRelated
## Administrative 0.256119766 0.431251225
## Administrative_Duration 0.238298187 0.288980875
## Informational 0.619345877 0.374614892
## Informational_Duration 1.000000000 0.280511401
## ProductRelated 0.280511401 1.000000000
## ProductRelated_Duration 0.347647178 0.860859155
## BounceRates -0.074220193 -0.204275356
## ExitRates -0.105382104 -0.292076735
## PageValues 0.030682479 0.055853149
## SpecialDay -0.030709639 -0.024346990
## Month 0.005963857 0.070187655
## OperatingSystems -0.009482363 0.004469575
## Browser -0.019442570 -0.013375306
## Region -0.027153049 -0.038460138
## TrafficType -0.024549001 -0.043104656
## VisitorType 0.044843174 0.127134796
## Weekend 0.023685578 0.016009877
## Revenue 0.070082424 0.157929809
## ProductRelated_Duration BounceRates ExitRates
## Administrative 0.373990295 -0.223674090 -0.316417094
## Administrative_Duration 0.355274137 -0.144461247 -0.205974810
## Informational 0.387904897 -0.116063862 -0.163637714
## Informational_Duration 0.347647178 -0.074220193 -0.105382104
## ProductRelated 0.860859155 -0.204275356 -0.292076735
## ProductRelated_Duration 1.000000000 -0.184273447 -0.251551834
## BounceRates -0.184273447 1.000000000 0.913506122
## ExitRates -0.251551834 0.913506122 1.000000000
## PageValues 0.052506095 -0.119353356 -0.174314785
## SpecialDay -0.036672867 0.072929982 0.102713169
## Month 0.061135992 -0.023794468 -0.039059592
## OperatingSystems 0.003123475 0.024013046 0.014749135
## Browser -0.007569700 -0.016352101 -0.004737218
## Region -0.033358820 -0.007053948 -0.009013226
## TrafficType -0.036284709 0.078619522 0.078714420
## VisitorType 0.119702708 0.135475772 0.178859364
## Weekend 0.007162565 -0.046895549 -0.062973784
## Revenue 0.151930302 -0.150618078 -0.206793044
## PageValues SpecialDay Month OperatingSystems
## Administrative 0.09861719 -0.095087342 0.048499407 -0.0061982825
## Administrative_Duration 0.06746521 -0.073406621 0.029048084 -0.0071769319
## Informational 0.04833197 -0.048431383 0.019739333 -0.0092933066
## Informational_Duration 0.03068248 -0.030709639 0.005963857 -0.0094823628
## ProductRelated 0.05585315 -0.024346990 0.070187655 0.0044695750
## ProductRelated_Duration 0.05250610 -0.036672867 0.061135992 0.0031234751
## BounceRates -0.11935336 0.072929982 -0.023794468 0.0240130460
## ExitRates -0.17431479 0.102713169 -0.039059592 0.0147491346
## PageValues 1.00000000 -0.063541272 0.021780268 0.0185079466
## SpecialDay -0.06354127 1.000000000 0.079341098 0.0126522347
## Month 0.02178027 0.079341098 1.000000000 -0.0295799600
## OperatingSystems 0.01850795 0.012652235 -0.029579960 1.0000000000
## Browser 0.04559192 0.003498747 -0.045913324 0.2230128882
## Region 0.01131530 -0.016097975 -0.032530328 0.0767754856
## TrafficType 0.01253169 0.052301443 0.041839131 0.1891536121
## VisitorType -0.11122783 0.085556612 0.026481310 0.0015042220
## Weekend 0.01200164 -0.016767155 0.029131513 0.0002842506
## Revenue 0.49256930 -0.082304598 0.080150468 -0.0146675596
## Browser Region TrafficType VisitorType
## Administrative -0.025219653 -0.0056463458 -0.033333829 -0.025368971
## Administrative_Duration -0.015407786 -0.0056271347 -0.014169320 -0.023773759
## Informational -0.038373742 -0.0294588670 -0.033946731 0.056114223
## Informational_Duration -0.019442570 -0.0271530493 -0.024549001 0.044843174
## ProductRelated -0.013375306 -0.0384601381 -0.043104656 0.127134796
## ProductRelated_Duration -0.007569700 -0.0333588204 -0.036284709 0.119702708
## BounceRates -0.016352101 -0.0070539481 0.078619522 0.135475772
## ExitRates -0.004737218 -0.0090132262 0.078714420 0.178859364
## PageValues 0.045591919 0.0113152995 0.012531693 -0.111227826
## SpecialDay 0.003498747 -0.0160979746 0.052301443 0.085556612
## Month -0.045913324 -0.0325303281 0.041839131 0.026481310
## OperatingSystems 0.223012888 0.0767754856 0.189153612 0.001504222
## Browser 1.000000000 0.0973928492 0.111938224 -0.021866988
## Region 0.097392849 1.0000000000 0.047520231 -0.036190794
## TrafficType 0.111938224 0.0475202313 1.000000000 -0.002839178
## VisitorType -0.021866988 -0.0361907939 -0.002839178 1.000000000
## Weekend -0.040260864 -0.0006906703 -0.002221229 -0.043679249
## Revenue 0.023984289 -0.0115950678 -0.005112971 -0.104725722
## Weekend Revenue
## Administrative 0.0266490019 0.138413243
## Administrative_Duration 0.0150302813 0.093392760
## Informational 0.0353161098 0.094771340
## Informational_Duration 0.0236855777 0.070082424
## ProductRelated 0.0160098771 0.157929809
## ProductRelated_Duration 0.0071625648 0.151930302
## BounceRates -0.0468955489 -0.150618078
## ExitRates -0.0629737841 -0.206793044
## PageValues 0.0120016392 0.492569295
## SpecialDay -0.0167671553 -0.082304598
## Month 0.0291315131 0.080150468
## OperatingSystems 0.0002842506 -0.014667560
## Browser -0.0402608638 0.023984289
## Region -0.0006906703 -0.011595068
## TrafficType -0.0022212292 -0.005112971
## VisitorType -0.0436792493 -0.104725722
## Weekend 1.0000000000 0.029295368
## Revenue 0.0292953680 1.000000000
# we are able to look at the relationship between various columns and conclude that the following columns have no relation with each otheri.e:
# pages visited is in reference to the administrative, productrelated and informational pages
# all pages visited have a weak positive correlation/no relationship with the months
# all pages visited have a weak negative correlation/no relationship with special day
# administrative page has a weak negative correlation/no relationship with visitor type
# informational and productrelated pages have a weak positive correlation/no relationship with visitor type
# all pages visited have a weak negative correlation/no relationship with region
# bounce rates have a weak negative correlation/no relationship with month
# bounce rates have a weak positive correlation/no relationship with special day
# page value has a weak positive correlation/no relationship with month
# page value has a weak negative correlation/no relationship with special day
# visitor type has a weak positive correlation/no relationship with special day
# visitor type has a weak positive correlation/no relationship with month
library(corrplot)
## corrplot 0.92 loaded
#getting the correlation matrix
x <- cor(dfd)
#visualizing correlogram
corrplot(x, method="color")
CREATING SCATTERPLOTS
plot(dfd$BounceRates, dfd$ExitRates, pch=16, col='steelblue',
main='bounce vs. exit',
xlab='bouncerates', ylab='exitrates')
# bounce rates exhibit some sort of linear relationship with exit rates
plot(dfd$ProductRelated, dfd$ProductRelated_Duration, pch=16, col='steelblue',
main='productrelated vs. duration',
xlab='productrelated', ylab='duration')
# productrelated exhibits some sort of linear relationship with productrelated duration
plot(dfd$Administrative, dfd$Administrative_Duration, pch=16, col='steelblue',
main='administrative vs. duration',
xlab='administrative', ylab='duration')
# administrative exhibits no relationship with administrative duration
plot(dfd$Informational, dfd$Informational_Duration, pch=16, col='steelblue',
main='informational vs. duration',
xlab='informational', ylab='duration')
# informational exhibits no relationship with informational duration
6.)IMPLEMENTING THE SOLUTION
a.)K-MEANS CLUSTERING
# removing the label variable revenue
library(dplyr)
dfe <- dfd %>%
select(-c(Revenue))
# rescaling our variables with the scale function
library(dplyr)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
dff <- dfe %>%
mutate(admin_scal = scale(Administrative),
admindur_scal = scale(Administrative_Duration),
info_scal = scale(Informational),
infodur_scal = scale(Informational_Duration),
product_scal = scale(ProductRelated),
productdur_scal = scale(ProductRelated_Duration),
bounce_scale = scale(BounceRates),
exit_scale = scale(ExitRates),
page_scale = scale(PageValues),
day_scale = scale(SpecialDay),
month_scale = scale(Month),
os_scale = scale(OperatingSystems),
browser_scale = scale(Browser),
region_scale = scale(Region),
traffic_scale = scale(TrafficType),
visitor_scale = scale(VisitorType),
weekend_scale = scale(Weekend)) %>%
select(-c(Administrative, Administrative_Duration, Informational, Informational_Duration, ProductRelated, ProductRelated_Duration, BounceRates, ExitRates, PageValues, SpecialDay, Month,OperatingSystems, Browser, Region, TrafficType, VisitorType, Weekend))
# grouping the data into 2 clusters and starting with nstart=25
set.seed(123)
# using a sample of 500 random values
rand_dff1 <- dff[sample(nrow(dff), size=500), ]
dfg <- kmeans(rand_dff1, centers = 2, nstart = 25)
print(dfg)
## K-means clustering with 2 clusters of sizes 437, 63
##
## Cluster means:
## admin_scal admindur_scal info_scal infodur_scal product_scal productdur_scal
## 1 -0.2440096 -0.2167333 -0.2223376 -0.1918554 -0.1324228 -0.1414847
## 2 1.4088590 0.9448086 2.3251038 1.4780934 0.8556309 0.8840704
## bounce_scale exit_scale page_scale day_scale month_scale os_scale
## 1 0.0339693 0.0485089 -0.02430666 0.05471521 -0.01801908 -0.07580895
## 2 -0.2987851 -0.4695186 0.37373377 -0.24497118 0.11162823 0.03810241
## browser_scale region_scale traffic_scale visitor_scale weekend_scale
## 1 0.07188864 -0.01657762 -0.05537766 -0.03945481 -0.04142029
## 2 -0.05081024 0.04438895 -0.08432631 0.13202077 0.12570366
##
## Clustering vector:
## 2463 2511 10419 8718 2986 1842 9334 3371 11638 4761 6746 9819 2757
## 1 1 1 1 1 1 1 2 1 1 2 2 1
## 5107 9145 9209 10205 2888 6170 2567 9642 9982 2980 1614 555 4469
## 1 1 2 1 1 1 1 1 2 1 1 1 1
## 9359 10784 10730 7789 9991 9097 1047 7067 3004 3207 7989 3995 8358
## 1 1 1 1 1 1 1 1 1 1 2 1 1
## 217 9506 8157 10821 6216 8780 1599 4237 3937 4089 2907 294 8469
## 1 1 1 1 1 1 1 1 2 1 1 1 1
## 41 8508 7391 6672 7284 11014 10987 2504 6742 12118 9375 8944 11473
## 1 1 2 2 1 2 1 1 2 1 1 1 1
## 8566 10034 6129 10274 4612 2117 6134 755 6553 5428 9198 10777 7127
## 1 2 1 1 1 1 2 1 1 1 2 1 1
## 10531 9640 12144 3358 3980 9326 3230 5603 10126 9693 4576 3783 7831
## 2 2 1 1 1 1 1 1 1 1 1 1 2
## 10106 5967 9301 7816 9267 11338 1386 10476 4706 2378 4044 10452 686
## 2 1 1 1 1 1 1 1 1 1 1 2 1
## 12049 6078 5027 6387 9039 7281 9175 4715 151 6810 9830 8174 6911
## 1 1 1 1 1 1 1 1 1 2 1 1 1
## 2208 10666 1029 8518 12048 7448 11029 7735 8011 1956 8172 5358 5884
## 1 2 1 1 1 1 1 1 1 1 1 1 1
## 4093 985 6183 986 4233 10695 9954 1584 4685 7864 10279 10436 9985
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 4776 6644 12199 10000 8536 7478 2507 11184 11284 195 11428 3124 6678
## 1 1 1 1 2 1 1 1 1 1 1 1 1
## 10417 4650 11067 2132 3464 9518 3949 10859 7757 11694 2758 3833 712
## 1 1 1 1 1 1 1 1 1 2 2 1 1
## 9644 5370 10020 3501 3069 10638 11253 8720 4055 9761 473 6098 1149
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 2037 2313 11015 10119 7741 11516 8650 11416 1078 5015 5658 6379 1313
## 1 2 1 1 1 1 1 1 1 1 1 1 1
## 185 7933 413 4723 10762 1333 4875 9753 564 8986 9607 3799 9562
## 2 1 1 1 1 1 1 1 1 1 1 1 1
## 4256 3581 11160 3129 6601 4713 8549 279 4366 6790 6491 3201 2266
## 1 1 1 1 1 1 2 1 1 1 1 2 1
## 618 1905 8529 10266 539 9148 3625 6868 3462 5618 6815 7005 6801
## 1 2 1 1 1 2 1 1 1 2 1 1 1
## 3008 9637 6804 2211 2286 5793 10818 7684 12230 5509 4213 9271 9433
## 1 1 1 1 1 1 2 1 1 1 1 1 1
## 10797 7826 1706 7879 9065 8081 8949 988 10687 12294 11061 11071 4807
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 5588 9426 9617 3809 7426 2225 1165 10264 11263 6083 7634 2213 1612
## 1 1 2 1 1 1 1 2 1 2 1 1 1
## 3853 5565 710 8042 1258 2470 10748 2112 6224 3035 2339 6909 1347
## 1 1 1 1 1 1 1 1 1 1 1 1 2
## 6623 7728 8983 10179 5001 9533 5407 5854 11849 4388 5346 1708 4701
## 1 1 1 1 1 1 1 1 1 1 1 1 2
## 1835 3111 9453 6309 12219 9249 8275 3468 9058 10517 7377 4172 5214
## 1 1 1 1 1 2 2 1 1 1 1 1 1
## 7385 7090 6518 10563 2163 5497 10848 7633 682 12251 7082 11360 1562
## 2 1 1 1 1 1 1 1 1 1 1 1 1
## 6541 3814 9181 4052 10153 8176 10833 5567 1362 2154 3959 6155 4721
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 11628 386 10643 8333 31 4190 3786 11973 7588 8370 11681 11728 5532
## 1 1 2 2 1 1 1 1 1 1 1 1 1
## 4620 5698 204 3247 2421 12113 7632 2432 5242 7567 11667 2363 259
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 5985 5409 7216 9547 11256 1325 4106 4266 3352 7516 4071 2450 1233
## 1 1 1 1 2 1 2 1 1 1 1 1 1
## 11732 10043 1673 3467 4357 2589 6470 5194 2330 1609 3339 6007 5382
## 1 1 1 1 1 1 1 1 1 1 2 1 2
## 1760 11977 5339 6840 3424 8859 5086 4548 9048 1948 5699 10862 1960
## 1 1 1 1 1 1 1 1 1 1 2 1 1
## 646 1060 3127 11754 1264 9848 10078 9520 4084 3082 8857 3494 10227
## 1 1 1 2 1 1 2 1 1 1 1 1 1
## 5732 11369 10703 1204 4374 8433 4120 6823 7727 8124 9919 2085 10926
## 1 1 1 1 1 1 2 1 1 1 2 1 1
## 5423 1743 3984 7880 5791 8570 549 5735 9460 2096 4489 11403 12323
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 9587 4766 5309 8138 8503 6333 1442 5502 2086 9300 319 2894 10054
## 1 1 1 1 1 2 1 1 2 1 1 1 1
## 7570 1666 8312 2760 7499 1557 6585 7367 3571 599 12287 6821 134
## 1 1 1 1 1 2 1 1 1 1 1 2 1
## 4520 10996 6150 3200 9071 8860 5920 8241 5596 3311 7508 193 10949
## 1 2 1 1 2 1 1 1 1 1 1 1 2
## 4399 7042 10430 3696 191 6590 6263 3699 7690 4913 10301 9638 11372
## 2 1 1 1 1 2 1 1 2 1 2 1 2
## 1316 1397 5655 12014 11170 5720
## 1 1 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 5500.241 1530.194
## (between_SS / total_SS = 11.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# plotting the clusters
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_cluster(dfg, data = rand_dff1)
# setting different values for the centers
dfh <- kmeans(rand_dff1, centers = 3, nstart = 25)
dfi <- kmeans(rand_dff1, centers = 4, nstart = 25)
dfj <- kmeans(rand_dff1, centers = 5, nstart = 25)
# plotting clusters with different values of k
library(cluster)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
p1 <- fviz_cluster(dfg, geom = "point", data = rand_dff1) + ggtitle(" K = 2")
p2 <- fviz_cluster(dfh, geom = "point", data = rand_dff1) + ggtitle(" K = 3")
p3 <- fviz_cluster(dfi, geom = "point", data = rand_dff1) + ggtitle(" K = 4")
p4 <- fviz_cluster(dfj, geom = "point", data = rand_dff1) + ggtitle(" K = 5")
grid.arrange(p1, p2, p3, p4, nrow = 2)
FINDING THE OPTIMAL VALUE FOR K a.)ELBOW METHOD
# Determining Optimal K value Using Elbow method
fviz_nbclust(x = rand_dff1,FUNcluster = kmeans, method = 'wss')
# K is optimal at 8
# Determining Optimal clusters (k) Using Average Silhouette Method
fviz_nbclust(x = rand_dff1,FUNcluster = kmeans, method = 'silhouette')
# k is optimal at 2
gap_stat <- clusGap(x = rand_dff1, FUN = kmeans, K.max = 15, nstart = 25, B = 50, iter.max=30)
# plot the result to determine the optimal number of clusters.
fviz_gap_stat(gap_stat)
# k is optimal at 14
WE USE 2 CLUSTERS FROM THE SILHOUETTE METHOD SINCE ANY FIGURE BELOW OR ABOVE THIS DOES NOT GIVE CLEAR RESULTS
# Compute k-means clustering with k = 2
finalx <- kmeans(rand_dff1, centers = 2, nstart = 25)
fviz_cluster(finalx, data = rand_dff1)
KMEANS DOESNOT DO A GOOD JOB OF CLUSTERING THE DATA USING THE OPTIMAL 2 CLUSTERS
b.) HIERARCHICAL CLUSTERING
# we are unable to plot a clear dendogram with a sample that contains more than 80 items so we create a random sample to pick from
rand_df <- dff[sample(nrow(dff), size=80), ]
d <- dist(rand_df, method = "euclidean")
m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")
#function to compute agglomerative coefficient
ac <- function(x) {
agnes(rand_df, method = x)$ac
}
#calculate agglomerative coefficient for each clustering linkage method
sapply(m, ac)
## average single complete ward
## 0.7582472 0.7390228 0.7983662 0.8605239
#Ward’s minimum variance method produces the highest agglomerative coefficient, thus we’ll use that as the method for our final hierarchical clustering:
# obtaining our hierarchical cluster using ward method
res.hc <- hclust(d, method = "ward" )
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
# Lastly, we plot the obtained dendrogram
plot(res.hc, cex = 0.6, hang = -1)
#calculate gap statistic for each number of clusters (up to 10 clusters)
gap_stat <- clusGap(rand_df, FUN = hcut, nstart = 25, K.max = 10, B = 50)
#produce plot of clusters vs. gap statistic
fviz_gap_stat(gap_stat)
# k is optimal at 10
#perform hierarchical clustering using Ward's method
final_clust <- hclust(d, method = "ward.D2" )
#cut the dendrogram into 4 clusters
groups <- cutree(final_clust, k=10)
plot(final_clust, cex = 0.6, hang = -1)
#append cluster labels to the sampled data to check the clusters for each row
final_data <- cbind(rand_df, cluster = groups)
#display first six rows of final data
head(final_data)
## admin_scal admindur_scal info_scal infodur_scal product_scal
## 3207 2.01077862 0.3682199 -0.3969483 -0.24522764 0.3868429
## 7989 -0.69785982 -0.4574304 4.3229401 3.46057905 -0.1076322
## 3995 -0.69785982 -0.4574304 -0.3969483 -0.24522764 -0.4447743
## 8358 -0.69785982 -0.4574304 0.3896998 0.01390520 -0.3323936
## 217 -0.09594017 0.2098760 -0.3969483 -0.24522764 -0.6245835
## 9506 -0.09594017 -0.1831562 1.1763479 -0.01804268 0.2969383
## productdur_scal bounce_scale exit_scale page_scale day_scale month_scale
## 3207 0.2066504 -0.38257332 -0.47412309 -0.1292097 -0.3088088 0.35271705
## 7989 -0.2420637 -0.04057688 -0.23589996 -0.3171650 -0.3088088 0.77462249
## 3995 -0.4895992 -0.28560483 -0.19946007 -0.3171650 -0.3088088 0.35271705
## 8358 -0.4589046 -0.22825786 0.14386872 0.6501050 -0.3088088 0.77462249
## 217 -0.6032346 -0.45764572 0.48719750 -0.3171650 -0.3088088 -0.06918839
## 9506 0.5954590 -0.02754348 -0.07786446 -0.3171650 -0.3088088 -1.75681013
## os_scale browser_scale region_scale traffic_scale visitor_scale
## 3207 -0.1360728 -0.2079435 -0.06136105 -0.01728781 0.4077699
## 7989 -0.1360728 -0.2079435 1.60420133 -0.51416134 0.4077699
## 3995 -0.1360728 0.9566912 1.18781073 0.47958571 0.4077699
## 8358 -0.1360728 1.5390086 -0.89414223 -0.76259811 0.4077699
## 217 -1.2333763 -0.2079435 1.60420133 -0.26572458 0.4077699
## 9506 -1.2333763 -0.7902608 -0.89414223 -0.51416134 0.4077699
## weekend_scale cluster
## 3207 1.8162862 1
## 7989 -0.5505294 2
## 3995 -0.5505294 3
## 8358 1.8162862 1
## 217 -0.5505294 3
## 9506 -0.5505294 4
7.) CHALLENGE THE SOLUTION
DENSITY-BASED SPATIAL CLUSTERING OF APPLICATIONS
library(dbscan)
library(factoextra)
library(fpc)
##
## Attaching package: 'fpc'
## The following object is masked from 'package:dbscan':
##
## dbscan
# Fitting DBScan clustering Model
# to training dataset
Dbscan_cl <- dbscan(dff, eps = 0.45, MinPts = 5)
Dbscan_cl
## dbscan Pts=12330 MinPts=5 eps=0.45
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
## border 11533 23 1 5 76 6 8 9 4 3 6 5 5 12 4 4 4 2 4 2 4 8 4
## seed 0 30 6 23 186 8 6 65 27 6 2 10 17 24 1 1 1 1 1 1 1 19 1
## total 11533 53 7 28 262 14 14 74 31 9 8 15 22 36 5 5 5 3 5 3 5 27 5
## 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## border 4 4 3 4 4 0 4 4 4 4 1 4 3 7 5 4 3 5 3 4 0 3 4 4
## seed 1 1 6 1 1 6 1 4 1 9 10 7 6 5 3 1 1 2 1 1 5 1 1 1
## total 5 5 9 5 5 6 5 8 5 13 11 11 9 12 8 5 4 7 4 5 5 4 5 5
# Plotting Cluster
plot(Dbscan_cl, dff, main = "DBScan")
#THE RESULTS OBTAINED ARE INCONCLUSIVE SINCE THE CLUSTERS CAN NOT BE DECIPHERED CLEARLY
8.)FOLLOW UP QUESTIONS a.)was the data sufficient? no additional information is required in order to reach
b.)did we have the right question given the data? yes since kira plastinina’s core aim was to find out the key characteristics of their customer groups