R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Details

Spending Behavior Big Data Group 4 Simon Chow

Section 1

Read the spending behavior data read.table() is the function used to open and read the data file

df = as.data.frame(read.table("Group 4.txt", sep="\t", header=T))

check the dimension and number of samples in the dataset

dim(df)
## [1] 1000    4

the dimension is 1000*4, which means there are 1000 samples each with 4 attributes

check the type of features

str(df)
## 'data.frame':    1000 obs. of  4 variables:
##  $ customer_id   : int  80 90 130 190 220 290 330 400 410 480 ...
##  $ Last.purchased: chr  "343" "758" "2970" "2592" ...
##  $ frequency     : int  4 4 1 2 1 2 3 1 4 3 ...
##  $ amount        : num  75 126 60 65 30 ...

there are total of 4 attributes which ‘customer_id’ is not useful in our study which would be removed. Besides, data type for ‘Last.purchased’ is in chr which has to be converted into data type int.

remove the first column (customer_id)

df = df[,-1]

the dimension of the dataset becomes 1000*3

convert the ‘Last.purchased’ from character to integer

df[,1] = as.integer(df[,1])
## Warning: NAs introduced by coercion

all three attributes are in numeric form

check the total number of missing values in the dataset

sum(is.na(df))
## [1] 32

there are total 32 missing values in the dataset, which woould be removed details explained in the report

data imputation by removing rows with missing values

df= na.omit(df)

Original Distributions Distributions of the features

-> Recency (Last.purchased) Histogram, Boxplot

hist(df$Last.purchased,
     col="blue",
     main="Histogram to Show Count of Last.purchased Class",
     xlab="Last.purchased",
     ylab="No. of samples",
     labels=TRUE)

boxplot(df$Last.purchased,
        main = "Boxplot for Descriptive Analysis of Last.purchased(Recency)",
        xlab = "Number of Days since Last Purchase",
        ylab = "Last.purchased",
        col = "orange",
        border = "brown",
        horizontal = TRUE,
        notch = TRUE)

-> Frequency (frequency) Histogram, Density Plot, Boxplot

hist(df$frequency,
     col="blue",
     main="Histogram to Show Count of frequency Class",
     xlab="frequency",
     ylab="No. of samples",
     labels=TRUE)

d1_f = density(df$frequency)
plot(d1_f, main="frequency")
polygon(d1_f, col="green", border="blue")

boxplot(df$frequency,
        main = "Boxplot for Descriptive Analysis of freqeuncy(Frequency)",
        xlab = "Number of Times Has Been Purchased",
        ylab = "Frequency",
        col = "orange",
        border = "brown",
        horizontal = TRUE,
        notch = TRUE)

-> Monetary (Amount) Histogram, Density Plot, Boxplot

hist(df$amount,
     col="blue",
     main="Histogram to Show Count of amount Class",
     xlab="Amount Class",
     ylab="No. of samples",
     labels=TRUE)

d1_a = density(df$amount)
plot(d1_a, main="amount")
polygon(d1_a, col="green", border="blue")

boxplot(df$amount,
        main = "Boxplot for Descriptive Analysis of amount(Monetary)",
        xlab = "Total Amount Has Been Spent",
        ylab = "Amount",
        col = "orange",
        border = "brown",
        horizontal = TRUE,
        notch = TRUE)

3D plot

library(scatterplot3d)
scatterplot3d(df[, c(1,2,3)], pch=20, angle=80, main='Original 3D plot',
              xlab='Last.purchased (Recency)', ylab='frequency (Frequency)',
              zlab='amount (Monetary)')

check if there are outliers in the three attributes, general plot showing all 3 attributes

boxplot(df)

there are outliers in ‘frequency’ and ‘amount’

checking which are those outliers

boxplot(df$Last.purchased)$out

## numeric(0)
boxplot(df$frequency)$out

##  [1] 10  9 10 10 10 18  9 12 15  9  9 11
boxplot(df$amount)$out

##  [1]  125.7500  600.0000  300.0000  237.5000  200.0000  600.0000  116.6667
##  [8]  475.0000  175.0000  400.0000  176.6667  125.0000  125.0000  237.5000
## [15]  527.0000  150.0000  252.4500  150.0000  130.0000  130.0000  145.0000
## [22]  150.0000  126.5000  125.0000  115.0000  166.6667  152.0000  311.2500
## [29]  152.2500  733.3333  150.0000  173.7500  133.3333  130.0000  158.3333
## [36]  225.0000  300.0000  200.0000  150.0000  130.0000  125.0000  125.0000
## [43]  116.0000  150.0000  200.0000  258.7500 2500.0000  606.6667  175.0000
## [50]  666.6667  143.7500  300.0000  150.0000  200.0000  140.0000 1000.0000
## [57]  150.0000  112.5000 3100.0000  150.0000  111.0000  226.6667  300.0000
## [64]  300.0000  950.0000  303.3333  137.5000  150.0000  308.3333  150.0000
## [71]  165.0000  108.3333  300.0000  116.6667  137.5000  164.2857  175.0000
## [78]  290.0000  200.0000  125.0000  276.0000 1807.0000  125.0000  150.0000
## [85]  116.6667  166.6667  125.0000  164.0000  160.0000  925.0000  150.0000
## [92]  160.0000  133.3333

-thre are zero outlier in ‘Last.purchased’ -there are 12 outliers in ‘frequency’ (10 9 10 10 10 18 9 12 15 9 9 11) -there are 93 outliers in ‘amount’ (125.7500 600.0000 300.0000 237.5000 200.0000 600.0000 116.6667 475.0000 175.0000 400.0000) (176.6667 125.0000 125.0000 237.5000 527.0000 150.0000 252.4500 150.0000 130.0000 130.0000) (145.0000 150.0000 126.5000 125.0000 115.0000 166.6667 152.0000 311.2500 152.2500 733.3333) (150.0000 173.7500 133.3333 130.0000 158.3333 225.0000 300.0000 200.0000 150.0000 130.0000) (125.0000 125.0000 116.0000 150.0000 200.0000 258.7500 2500.0000 606.6667 175.0000 666.6667) (143.7500 300.0000 150.0000 200.0000 140.0000 1000.0000 150.0000 112.5000 3100.0000 150.0000) (111.0000 226.6667 300.0000 300.0000 950.0000 303.3333 137.5000 150.0000 308.3333 150.0000) (165.0000 108.3333 300.0000 116.6667 137.5000 164.2857 175.0000 290.0000 200.0000 125.0000) (276.0000 1807.0000 125.0000 150.0000 116.6667 166.6667 125.0000 164.0000 160.0000 925.0000) (150.0000 160.0000 133.3333)

Summary packages: Hmisc, pastecs, summarytools

library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.1.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(pastecs)
## Warning: package 'pastecs' was built under R version 4.1.3
library(summarytools)
## Warning: package 'summarytools' was built under R version 4.1.3
## 
## Attaching package: 'summarytools'
## The following objects are masked from 'package:Hmisc':
## 
##     label, label<-
sum1 = describe(df)
sum2 = stat.desc(df)
sum3 = descr(df)
sum1
## df 
## 
##  3  Variables      968  Observations
## --------------------------------------------------------------------------------
## Last.purchased 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      968        0      529        1     1913     1496     34.0    217.4 
##      .25      .50      .75      .90      .95 
##    731.0   1836.5   3144.2   3731.0   3892.7 
## 
## lowest :    1    3    6    8    9, highest: 3988 3989 3999 4005 4014
## --------------------------------------------------------------------------------
## frequency 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      968        0       14    0.938    2.652    1.913        1        1 
##      .25      .50      .75      .90      .95 
##        1        2        4        5        6 
## 
## lowest :  1  2  3  4  5, highest: 10 11 12 15 18
##                                                                             
## Value          1     2     3     4     5     6     7     8     9    10    11
## Frequency    348   202   161   114    71    38    15     7     4     4     1
## Proportion 0.360 0.209 0.166 0.118 0.073 0.039 0.015 0.007 0.004 0.004 0.001
##                             
## Value         12    15    18
## Frequency      1     1     1
## Proportion 0.001 0.001 0.001
## --------------------------------------------------------------------------------
## amount 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      968        0      202    0.993    67.59    69.97    15.00    17.50 
##      .25      .50      .75      .90      .95 
##    28.33    38.67    60.00   106.08   156.20 
## 
## lowest :    7.25000    7.50000    8.00000   10.00000   11.66667
## highest:  950.00000 1000.00000 1807.00000 2500.00000 3100.00000
##                                                                             
## Value          0    50   100   150   200   250   300   400   500   550   600
## Frequency    212   550   129    37    10     5    11     1     1     1     3
## Proportion 0.219 0.568 0.133 0.038 0.010 0.005 0.011 0.001 0.001 0.001 0.003
##                                                           
## Value        650   750   900   950  1000  1800  2500  3100
## Frequency      1     1     1     1     1     1     1     1
## Proportion 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001
## 
## For the frequency table, variable is rounded to the nearest 50
## --------------------------------------------------------------------------------
sum2
##              Last.purchased    frequency       amount
## nbr.val        9.680000e+02 9.680000e+02   968.000000
## nbr.null       0.000000e+00 0.000000e+00     0.000000
## nbr.na         0.000000e+00 0.000000e+00     0.000000
## min            1.000000e+00 1.000000e+00     7.250000
## max            4.014000e+03 1.800000e+01  3100.000000
## range          4.013000e+03 1.700000e+01  3092.750000
## sum            1.852225e+06 2.567000e+03 65423.081299
## median         1.836500e+03 2.000000e+00    38.666667
## mean           1.913456e+03 2.651860e+00    67.585828
## SE.mean        4.175356e+01 6.102287e-02     5.161687
## CI.mean.0.95   8.193802e+01 1.197525e-01    10.129400
## var            1.687572e+06 3.604629e+00 25790.438984
## std.dev        1.299066e+03 1.898586e+00   160.594019
## coef.var       6.789109e-01 7.159452e-01     2.376149
sum3
## Descriptive Statistics  
## df  
## N: 968  
## 
##                      amount   frequency   Last.purchased
## ----------------- --------- ----------- ----------------
##              Mean     67.59        2.65          1913.46
##           Std.Dev    160.59        1.90          1299.07
##               Min      7.25        1.00             1.00
##                Q1     28.33        1.00           731.00
##            Median     38.67        2.00          1836.50
##                Q3     60.00        4.00          3144.50
##               Max   3100.00       18.00          4014.00
##               MAD     20.51        1.48          1769.48
##               IQR     31.67        3.00          2413.25
##                CV      2.38        0.72             0.68
##          Skewness     12.72        1.93             0.09
##       SE.Skewness      0.08        0.08             0.08
##          Kurtosis    200.96        7.37            -1.37
##           N.Valid    968.00      968.00           968.00
##         Pct.Valid    100.00      100.00           100.00

showing the information of the original data set without missing values

Transformation (log function) Transform “frequency” and “amount” feature to log(amount) to make data set more normal

df$logfrequency = log(df$frequency)
df$logamount = log(df$amount)

plot histogram to show the effect of log transformation for frequency and amount

hist(df$logfrequency,
     col="blue",
     main="Histogram to Show Count of frequency Class (after log transformation)",
     xlab="frequency",
     ylab="No. of samples",
     labels=TRUE)

hist(df$logamount,
     col="blue",
     main="Histogram to Show Count of amount Class (after log transformation)",
     xlab="Amount Class",
     ylab="No. of samples",
     labels=TRUE)

Winsorizing outliers Winsorizing or winsorization is the transformation of statistics by limiting extreme values in the statistical data to reduce the effect of possibly spurious outliers not using winsorizing to handle outliers as DBSCAN is robust to outliers

Center and Scale df

df <- as.data.frame(scale(df))

Distributions of the features after pre-processing (no NA, standardized)(Histogram, Box plot, Density Plot, 3D plot) ->Recency (Last.purchased) Histogram, Boxplot

hist(df$Last.purchased,
     col="blue",
     main="Histogram to Show Count of Last.purchased Class (Pre-processed)",
     xlab="Last.purchased (Re-scaled)",
     ylab="No. of samples",
     labels=TRUE)

boxplot(df$Last.purchased,
        main = "Boxplot for Descriptive Analysis of Last.purchased(Recency) (Pre-processed)",
        xlab = "Number of Days since Last Purchase (Re-scaled)",
        ylab = "Last.purchased",
        col = "orange",
        border = "brown",
        horizontal = TRUE,
        notch = TRUE)

->Frequency (frequency) Histogram, Density Plot, Boxplot

hist(df$logfrequency,
     col="blue",
     main="Histogram to Show Count of frequency Class (Pre-processed)",
     xlab="frequency (Re-scaled)",
     ylab="No. of samples",
     labels=TRUE)

d1_f = density(df$logfrequency)
plot(d1_f, main="frequency (Pre-processed)")
polygon(d1_f, col="green", border="blue")

boxplot(df$logfrequency,
        main = "Boxplot for Descriptive Analysis of freqeuncy(Frequency) (Pre-proccesed)",
        xlab = "Number of Times Has Been Purchased (Re-scaled)",
        ylab = "Frequency",
        col = "orange",
        border = "brown",
        horizontal = TRUE,
        notch = TRUE)

-> Monetary (Amount) Histogram, Density Plot, Boxplot

hist(df$logamount,
     col="blue",
     main="Histogram to Show Count of amount Class (Pre-processed)",
     xlab="Amount Class (Re-scaled)",
     ylab="No. of samples",
     labels=TRUE)

d1_a = density(df$logamount)
plot(d1_a, main="amount (Pre-processed)")
polygon(d1_a, col="green", border="blue")

boxplot(df$logamount,
        main = "Boxplot for Descriptive Analysis of amount(Monetary) (Pre-processed)",
        xlab = "Total Amount Has Been Spent (Re-scaled)",
        ylab = "Amount",
        col = "orange",
        border = "brown",
        horizontal = TRUE,
        notch = TRUE)

3D plot

scatterplot3d(df[, c(1,4,5)], pch=20, angle=70, main='Preprocessed 3D plot',
              xlab='Last.purchased (Recency)', ylab='logfrequency (Frequency)(logfrequency*)',
              zlab='logamount (Monetary)(logamount*)')

## Section 2

Using the model DBSCAN to identify customer segments Density-based spatial clustering of applications with noise (DBSCAN) is a data clustering algorithm proposed by Martin Ester, Hans-Peter Kriegel, Jörg Sander and Xiaowei Xu in 1996. (wiki) advantages of DBSCAN 1. Unlike to K-means, DBSCAN does not require the user to specify the number of clusters to be generated 2. DBSCAN can find any shape of clusters. The cluster doesn’t have to be circular. 3. DBSCAN can identify outliers Two important parameters are required for DBSCAN epsilon (“eps”) The parameter eps defines the radius of neighborhood around a point x. It’s called the \(\epsilon\)-neighborhood of x. minimum points (“MinPts”) The parameter MinPts is the minimum number of neighbors within “eps” radius.

only keeps useful columns (removing ‘frequency’, removing ‘amount’)

df = df[,-2]
df = df[,-2]
str(df)
## 'data.frame':    968 obs. of  3 variables:
##  $ Last.purchased: num  -1.209 -0.889 0.813 0.522 1.348 ...
##  $ logfrequency  : num  0.958 0.958 -1.1411 -0.0916 -1.1411 ...
##  $ logamount     : num  0.726 1.393 0.438 0.542 -0.456 ...

only ‘last.purchased’,‘logfreqeuncy’,‘logamount’ remain

check the correlation of all variables corrplot()

library(corrplot)
## corrplot 0.92 loaded
M = cor(df)
corrplot(M, method = 'number')

corrplot(M, method = 'color')

Create two new variables

all = df
a = all

Compute DBSCAN using fpc package Initial Hyperparameter setting For more than 2 dimensions: minPts=2dim (Sander et al., 1998) minPts = 23 = 6 Plot the k-distances with k=minPts (Ester et al., 1996) Find the ‘elbow’ in the graph–> The k-distance value is your Epsilon value. requiring ‘dbscan’ package

library(fpc)
## Warning: package 'fpc' was built under R version 4.1.3
set.seed(123)

dbscan::kNNdistplot(all, k = 6)
abline(h = 0.65, lty = 2)

the optimal eps value is around a distance of 0.8 according to elbow method with silhouette score, eps: 0.65 -> 0.45

db = fpc::dbscan(all, eps = 0.65, MinPts = 6)
db
## dbscan Pts=968 MinPts=6 eps=0.65
##         0   1   2
## border 22  22  10
## seed    0 584 330
## total  22 606 340
db = fpc::dbscan(all, eps = 0.45, MinPts = 6)
db
## dbscan Pts=968 MinPts=6 eps=0.45
##         0   1   2   3
## border 94  32  10  15
## seed    0 342 303 172
## total  94 374 313 187

From 2 clusters to 3 clusters

Silhouette score

library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:pastecs':
## 
##     extract
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::extract()   masks magrittr::extract(), pastecs::extract()
## x dplyr::filter()    masks stats::filter()
## x dplyr::first()     masks pastecs::first()
## x dplyr::lag()       masks stats::lag()
## x dplyr::last()      masks pastecs::last()
## x purrr::set_names() masks magrittr::set_names()
## x dplyr::src()       masks Hmisc::src()
## x dplyr::summarize() masks Hmisc::summarize()
## x tibble::view()     masks summarytools::view()
library(cluster)
db$cluster
##   [1] 1 1 2 3 2 3 1 2 1 1 2 3 3 3 2 3 1 1 2 2 2 2 2 3 0 1 0 3 1 3 3 1 2 2 0 3 2
##  [38] 2 2 0 2 3 1 0 1 2 1 0 3 0 1 2 2 3 3 1 0 2 1 2 1 3 2 3 2 2 1 3 2 1 1 3 3 3
##  [75] 1 2 1 1 1 2 1 1 2 2 1 0 3 2 2 3 1 2 2 2 1 3 2 3 3 2 1 1 1 1 1 1 3 2 1 2 2
## [112] 1 3 2 2 2 1 2 3 1 2 1 3 2 3 1 0 3 3 3 1 0 2 2 1 1 1 3 1 2 2 2 2 3 1 1 1 1
## [149] 2 2 1 3 2 2 2 0 1 2 1 2 1 1 1 1 2 1 1 1 1 3 2 3 1 1 2 1 3 1 0 0 2 1 1 2 2
## [186] 1 3 2 3 0 2 2 2 1 1 2 0 1 0 2 1 1 1 0 3 1 3 1 2 3 2 2 3 2 1 1 2 2 1 1 0 1
## [223] 2 3 3 2 1 2 2 1 1 1 1 2 2 3 1 3 2 1 1 2 1 1 2 2 1 1 1 3 3 2 2 1 1 3 2 2 2
## [260] 3 3 1 1 2 1 2 1 3 2 0 2 1 1 2 1 2 3 2 2 3 0 3 1 2 3 0 3 1 1 1 3 2 1 1 2 1
## [297] 2 1 2 2 3 3 3 1 1 1 3 3 2 1 1 1 1 1 3 0 1 2 2 2 1 2 1 0 0 1 1 0 1 2 2 1 1
## [334] 1 3 2 0 2 2 1 2 2 3 2 2 2 2 1 1 2 3 2 1 3 1 0 2 2 1 0 0 2 3 1 1 2 1 3 1 2
## [371] 1 1 2 0 0 2 1 1 1 2 0 2 2 3 3 1 3 3 1 1 1 1 2 3 1 0 2 0 3 1 1 3 1 2 1 1 1
## [408] 2 2 3 1 3 1 1 1 1 2 2 0 2 2 1 1 0 1 0 0 1 3 2 0 2 1 1 1 0 2 1 1 2 1 3 1 1
## [445] 1 3 1 0 2 1 2 1 1 3 1 3 1 2 2 1 2 3 3 2 2 1 1 1 1 2 2 2 0 3 1 3 1 2 2 1 1
## [482] 1 1 3 1 1 0 3 1 2 0 1 0 0 0 1 1 3 1 1 0 1 2 1 1 1 3 2 1 1 2 1 2 3 1 2 2 1
## [519] 1 1 2 3 1 1 1 1 0 1 0 1 1 1 1 1 3 3 1 3 2 1 1 1 1 2 2 3 1 1 0 0 1 1 2 1 3
## [556] 1 3 3 1 0 1 0 2 1 0 3 3 3 1 0 1 1 1 0 1 1 2 1 1 1 1 2 0 1 2 3 1 1 1 3 0 0
## [593] 1 1 1 2 1 1 1 2 2 2 1 1 3 3 3 0 1 1 3 2 2 1 3 1 3 2 1 2 3 2 1 1 3 3 2 2 3
## [630] 1 0 1 1 0 2 1 1 3 2 1 1 2 1 3 1 2 2 2 3 0 2 2 3 2 1 3 3 3 2 3 1 1 3 2 2 0
## [667] 1 2 0 2 3 1 2 3 3 2 1 1 2 2 0 2 1 3 2 1 1 0 2 1 2 2 1 1 1 3 1 3 3 3 2 2 2
## [704] 2 1 2 2 3 2 2 3 2 2 1 2 2 2 2 1 1 1 0 2 1 2 1 1 0 2 2 2 1 2 2 0 2 3 2 1 0
## [741] 1 2 1 2 1 3 2 2 1 0 1 2 3 2 1 2 2 1 2 0 1 0 3 2 1 1 1 1 2 3 1 3 2 2 3 1 2
## [778] 2 0 3 2 0 2 2 1 2 1 1 1 1 2 1 3 3 0 1 1 0 0 1 1 1 1 1 2 3 2 3 1 2 2 3 2 2
## [815] 2 1 1 1 3 1 0 3 2 2 2 3 3 0 2 3 1 0 3 3 2 2 3 1 2 1 2 3 1 1 2 3 2 1 2 3 2
## [852] 3 1 2 1 1 0 2 2 1 2 1 1 2 3 3 1 2 2 2 3 2 1 1 1 3 1 3 2 2 2 3 0 1 2 3 2 1
## [889] 3 1 1 1 0 0 1 1 1 2 3 2 2 1 1 2 2 2 2 1 2 2 3 2 1 1 2 3 1 1 2 2 0 2 0 2 3
## [926] 3 0 0 1 3 2 0 2 3 3 2 2 2 1 1 3 1 3 3 2 2 2 1 3 1 3 3 1 2 1 0 1 2 2 2 1 1
## [963] 2 1 1 3 1 2
all = all %>%
  mutate(
    dbscan.cluster = db$cluster
  )

sil.dbscan <- silhouette(all$dbscan.cluster, dist(select(all,Last.purchased,logfrequency,logamount)))
summary(sil.dbscan)
## Silhouette of 968 units in 4 clusters from silhouette.default(x = all$dbscan.cluster, dist = dist(select(all,  from     Last.purchased, logfrequency, logamount))) :
##  Cluster sizes, ids = (0, 1, 2, 3), and average silhouette widths:
##         94        374        313        187 
## -0.2371555  0.2497144  0.3449052  0.1647128 
## Individual silhouette widths:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.56714  0.07344  0.28052  0.21679  0.41846  0.52380
# the plot require an external window
# dev.new(width=5, height=4) (re)
s3 = plot(silhouette(db$cluster,dist(all[,1:3],"euclidean")))

The silhouette plot require an external window to show the complete plot (shown in report) with <<dev.new(width=5, height=4)>>

Plot DBSCAN results

creating an attribute in factor (datatype for coloring)

all$Species = factor(db$cluster)
str(all)
## 'data.frame':    968 obs. of  5 variables:
##  $ Last.purchased: num  -1.209 -0.889 0.813 0.522 1.348 ...
##  $ logfrequency  : num  0.958 0.958 -1.1411 -0.0916 -1.1411 ...
##  $ logamount     : num  0.726 1.393 0.438 0.542 -0.456 ...
##  $ dbscan.cluster: num  1 1 2 3 2 3 1 2 1 1 ...
##  $ Species       : Factor w/ 4 levels "0","1","2","3": 2 2 3 4 3 4 2 3 2 2 ...
colors <- c("black", "blue", "yellow", "red","orange","green","pink")
colors <- colors[as.numeric(all$Species)]

Colored 3D Scatter Plot

library(plotly)
## Warning: package 'plotly' was built under R version 4.1.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:Hmisc':
## 
##     subplot
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(dplyr)
str(df)
## 'data.frame':    968 obs. of  3 variables:
##  $ Last.purchased: num  -1.209 -0.889 0.813 0.522 1.348 ...
##  $ logfrequency  : num  0.958 0.958 -1.1411 -0.0916 -1.1411 ...
##  $ logamount     : num  0.726 1.393 0.438 0.542 -0.456 ...
p <- plot_ly(all, x=~Last.purchased, y=~logfrequency, 
             z=~logamount, color=~Species) %>%
  add_markers(size=1.5)
print(p)

Print DBSCAN and analysis on clustering result

print(db)
## dbscan Pts=968 MinPts=6 eps=0.45
##         0   1   2   3
## border 94  32  10  15
## seed    0 342 303 172
## total  94 374 313 187
db$cluster
##   [1] 1 1 2 3 2 3 1 2 1 1 2 3 3 3 2 3 1 1 2 2 2 2 2 3 0 1 0 3 1 3 3 1 2 2 0 3 2
##  [38] 2 2 0 2 3 1 0 1 2 1 0 3 0 1 2 2 3 3 1 0 2 1 2 1 3 2 3 2 2 1 3 2 1 1 3 3 3
##  [75] 1 2 1 1 1 2 1 1 2 2 1 0 3 2 2 3 1 2 2 2 1 3 2 3 3 2 1 1 1 1 1 1 3 2 1 2 2
## [112] 1 3 2 2 2 1 2 3 1 2 1 3 2 3 1 0 3 3 3 1 0 2 2 1 1 1 3 1 2 2 2 2 3 1 1 1 1
## [149] 2 2 1 3 2 2 2 0 1 2 1 2 1 1 1 1 2 1 1 1 1 3 2 3 1 1 2 1 3 1 0 0 2 1 1 2 2
## [186] 1 3 2 3 0 2 2 2 1 1 2 0 1 0 2 1 1 1 0 3 1 3 1 2 3 2 2 3 2 1 1 2 2 1 1 0 1
## [223] 2 3 3 2 1 2 2 1 1 1 1 2 2 3 1 3 2 1 1 2 1 1 2 2 1 1 1 3 3 2 2 1 1 3 2 2 2
## [260] 3 3 1 1 2 1 2 1 3 2 0 2 1 1 2 1 2 3 2 2 3 0 3 1 2 3 0 3 1 1 1 3 2 1 1 2 1
## [297] 2 1 2 2 3 3 3 1 1 1 3 3 2 1 1 1 1 1 3 0 1 2 2 2 1 2 1 0 0 1 1 0 1 2 2 1 1
## [334] 1 3 2 0 2 2 1 2 2 3 2 2 2 2 1 1 2 3 2 1 3 1 0 2 2 1 0 0 2 3 1 1 2 1 3 1 2
## [371] 1 1 2 0 0 2 1 1 1 2 0 2 2 3 3 1 3 3 1 1 1 1 2 3 1 0 2 0 3 1 1 3 1 2 1 1 1
## [408] 2 2 3 1 3 1 1 1 1 2 2 0 2 2 1 1 0 1 0 0 1 3 2 0 2 1 1 1 0 2 1 1 2 1 3 1 1
## [445] 1 3 1 0 2 1 2 1 1 3 1 3 1 2 2 1 2 3 3 2 2 1 1 1 1 2 2 2 0 3 1 3 1 2 2 1 1
## [482] 1 1 3 1 1 0 3 1 2 0 1 0 0 0 1 1 3 1 1 0 1 2 1 1 1 3 2 1 1 2 1 2 3 1 2 2 1
## [519] 1 1 2 3 1 1 1 1 0 1 0 1 1 1 1 1 3 3 1 3 2 1 1 1 1 2 2 3 1 1 0 0 1 1 2 1 3
## [556] 1 3 3 1 0 1 0 2 1 0 3 3 3 1 0 1 1 1 0 1 1 2 1 1 1 1 2 0 1 2 3 1 1 1 3 0 0
## [593] 1 1 1 2 1 1 1 2 2 2 1 1 3 3 3 0 1 1 3 2 2 1 3 1 3 2 1 2 3 2 1 1 3 3 2 2 3
## [630] 1 0 1 1 0 2 1 1 3 2 1 1 2 1 3 1 2 2 2 3 0 2 2 3 2 1 3 3 3 2 3 1 1 3 2 2 0
## [667] 1 2 0 2 3 1 2 3 3 2 1 1 2 2 0 2 1 3 2 1 1 0 2 1 2 2 1 1 1 3 1 3 3 3 2 2 2
## [704] 2 1 2 2 3 2 2 3 2 2 1 2 2 2 2 1 1 1 0 2 1 2 1 1 0 2 2 2 1 2 2 0 2 3 2 1 0
## [741] 1 2 1 2 1 3 2 2 1 0 1 2 3 2 1 2 2 1 2 0 1 0 3 2 1 1 1 1 2 3 1 3 2 2 3 1 2
## [778] 2 0 3 2 0 2 2 1 2 1 1 1 1 2 1 3 3 0 1 1 0 0 1 1 1 1 1 2 3 2 3 1 2 2 3 2 2
## [815] 2 1 1 1 3 1 0 3 2 2 2 3 3 0 2 3 1 0 3 3 2 2 3 1 2 1 2 3 1 1 2 3 2 1 2 3 2
## [852] 3 1 2 1 1 0 2 2 1 2 1 1 2 3 3 1 2 2 2 3 2 1 1 1 3 1 3 2 2 2 3 0 1 2 3 2 1
## [889] 3 1 1 1 0 0 1 1 1 2 3 2 2 1 1 2 2 2 2 1 2 2 3 2 1 1 2 3 1 1 2 2 0 2 0 2 3
## [926] 3 0 0 1 3 2 0 2 3 3 2 2 2 1 1 3 1 3 3 2 2 2 1 3 1 3 3 1 2 1 0 1 2 2 2 1 1
## [963] 2 1 1 3 1 2
all$dbscan.cluster
##   [1] 1 1 2 3 2 3 1 2 1 1 2 3 3 3 2 3 1 1 2 2 2 2 2 3 0 1 0 3 1 3 3 1 2 2 0 3 2
##  [38] 2 2 0 2 3 1 0 1 2 1 0 3 0 1 2 2 3 3 1 0 2 1 2 1 3 2 3 2 2 1 3 2 1 1 3 3 3
##  [75] 1 2 1 1 1 2 1 1 2 2 1 0 3 2 2 3 1 2 2 2 1 3 2 3 3 2 1 1 1 1 1 1 3 2 1 2 2
## [112] 1 3 2 2 2 1 2 3 1 2 1 3 2 3 1 0 3 3 3 1 0 2 2 1 1 1 3 1 2 2 2 2 3 1 1 1 1
## [149] 2 2 1 3 2 2 2 0 1 2 1 2 1 1 1 1 2 1 1 1 1 3 2 3 1 1 2 1 3 1 0 0 2 1 1 2 2
## [186] 1 3 2 3 0 2 2 2 1 1 2 0 1 0 2 1 1 1 0 3 1 3 1 2 3 2 2 3 2 1 1 2 2 1 1 0 1
## [223] 2 3 3 2 1 2 2 1 1 1 1 2 2 3 1 3 2 1 1 2 1 1 2 2 1 1 1 3 3 2 2 1 1 3 2 2 2
## [260] 3 3 1 1 2 1 2 1 3 2 0 2 1 1 2 1 2 3 2 2 3 0 3 1 2 3 0 3 1 1 1 3 2 1 1 2 1
## [297] 2 1 2 2 3 3 3 1 1 1 3 3 2 1 1 1 1 1 3 0 1 2 2 2 1 2 1 0 0 1 1 0 1 2 2 1 1
## [334] 1 3 2 0 2 2 1 2 2 3 2 2 2 2 1 1 2 3 2 1 3 1 0 2 2 1 0 0 2 3 1 1 2 1 3 1 2
## [371] 1 1 2 0 0 2 1 1 1 2 0 2 2 3 3 1 3 3 1 1 1 1 2 3 1 0 2 0 3 1 1 3 1 2 1 1 1
## [408] 2 2 3 1 3 1 1 1 1 2 2 0 2 2 1 1 0 1 0 0 1 3 2 0 2 1 1 1 0 2 1 1 2 1 3 1 1
## [445] 1 3 1 0 2 1 2 1 1 3 1 3 1 2 2 1 2 3 3 2 2 1 1 1 1 2 2 2 0 3 1 3 1 2 2 1 1
## [482] 1 1 3 1 1 0 3 1 2 0 1 0 0 0 1 1 3 1 1 0 1 2 1 1 1 3 2 1 1 2 1 2 3 1 2 2 1
## [519] 1 1 2 3 1 1 1 1 0 1 0 1 1 1 1 1 3 3 1 3 2 1 1 1 1 2 2 3 1 1 0 0 1 1 2 1 3
## [556] 1 3 3 1 0 1 0 2 1 0 3 3 3 1 0 1 1 1 0 1 1 2 1 1 1 1 2 0 1 2 3 1 1 1 3 0 0
## [593] 1 1 1 2 1 1 1 2 2 2 1 1 3 3 3 0 1 1 3 2 2 1 3 1 3 2 1 2 3 2 1 1 3 3 2 2 3
## [630] 1 0 1 1 0 2 1 1 3 2 1 1 2 1 3 1 2 2 2 3 0 2 2 3 2 1 3 3 3 2 3 1 1 3 2 2 0
## [667] 1 2 0 2 3 1 2 3 3 2 1 1 2 2 0 2 1 3 2 1 1 0 2 1 2 2 1 1 1 3 1 3 3 3 2 2 2
## [704] 2 1 2 2 3 2 2 3 2 2 1 2 2 2 2 1 1 1 0 2 1 2 1 1 0 2 2 2 1 2 2 0 2 3 2 1 0
## [741] 1 2 1 2 1 3 2 2 1 0 1 2 3 2 1 2 2 1 2 0 1 0 3 2 1 1 1 1 2 3 1 3 2 2 3 1 2
## [778] 2 0 3 2 0 2 2 1 2 1 1 1 1 2 1 3 3 0 1 1 0 0 1 1 1 1 1 2 3 2 3 1 2 2 3 2 2
## [815] 2 1 1 1 3 1 0 3 2 2 2 3 3 0 2 3 1 0 3 3 2 2 3 1 2 1 2 3 1 1 2 3 2 1 2 3 2
## [852] 3 1 2 1 1 0 2 2 1 2 1 1 2 3 3 1 2 2 2 3 2 1 1 1 3 1 3 2 2 2 3 0 1 2 3 2 1
## [889] 3 1 1 1 0 0 1 1 1 2 3 2 2 1 1 2 2 2 2 1 2 2 3 2 1 1 2 3 1 1 2 2 0 2 0 2 3
## [926] 3 0 0 1 3 2 0 2 3 3 2 2 2 1 1 3 1 3 3 2 2 2 1 3 1 3 3 1 2 1 0 1 2 2 2 1 1
## [963] 2 1 1 3 1 2
seq = c(1:968)

c0l = list(c())
c0f = list(c())
c0a = list(c())
c1l = list(c())
c1f = list(c())
c1a = list(c())
c2l = list(c())
c2f = list(c())
c2a = list(c())
c3l = list(c())
c3f = list(c())
c3a = list(c())

for (i in seq){
  if ( all$dbscan.cluster[i] == 0 ) {
    nl = list(all[i,1])
    nf = list(all[i,2])
    na = list(all[i,3])
    c0l = append(c0l, nl)
    c0f = append(c0f, nf)
    c0a = append(c0a, na)
  } else if ( all$dbscan.cluster[i] == 1) {
    nl = list(all[i,1])
    nf = list(all[i,2])
    na = list(all[i,3])
    c1l = append(c1l, nl)
    c1f = append(c1f, nf)
    c1a = append(c1a, na)
  } else if ( all$dbscan.cluster[i] == 2) {
    nl = list(all[i,1])
    nf = list(all[i,2])
    na = list(all[i,3])
    c2l = append(c2l, nl)
    c2f = append(c2f, nf)
    c2a = append(c2a, na)
  } else {
    nl = list(all[i,1])
    nf = list(all[i,2])
    na = list(all[i,3])
    c3l = append(c3l, nl)
    c3f = append(c3f, nf)
    c3a = append(c3a, na)
  }
}

maxlast0 = max(unlist(c0l))
minlast0 = min(unlist(c0l))
mealast0 = mean(unlist(c0l))
maxlast1 = max(unlist(c1l))
minlast1 = min(unlist(c1l))
mealast1 = mean(unlist(c1l))
maxlast2 = max(unlist(c2l))
minlast2 = min(unlist(c2l))
mealast2 = mean(unlist(c2l))
maxlast3 = max(unlist(c3l))
minlast3 = min(unlist(c3l))
mealast3 = mean(unlist(c3l))

maxfre0 = max(unlist(c0f))
minfre0 = min(unlist(c0f))
meafre0 = mean(unlist(c0f))
maxfre1 = max(unlist(c1f))
minfre1 = min(unlist(c1f))
meafre1 = mean(unlist(c1f))
maxfre2 = max(unlist(c2f))
minfre2 = min(unlist(c2f))
meafre2 = mean(unlist(c2f))
maxfre3 = max(unlist(c3f))
minfre3 = min(unlist(c3f))
meafre3 = mean(unlist(c3f))

maxa0 = max(unlist(c0a))
mina0 = min(unlist(c0a))
meaa0 = mean(unlist(c0a))
maxa1 = max(unlist(c1a))
mina1 = min(unlist(c1a))
meaa1 = mean(unlist(c1a))
maxa2 = max(unlist(c2a))
mina2 = min(unlist(c2a))
meaa2 = mean(unlist(c2a))
maxa3 = max(unlist(c3a))
mina3 = min(unlist(c3a))
meaa3 = mean(unlist(c3a))

Prediction Model (Supervised Learning)(Decision Tree)(Classification)

library(tidyverse)
library(caret)
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
## The following object is masked from 'package:survival':
## 
##     cluster
exp = all[,-5]

set.seed(123)

validation_index = createDataPartition(exp$dbscan.cluster, p = 0.8, list=FALSE)
training = exp[validation_index,]
testing = exp[-validation_index,]


library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.3
fit <- rpart(dbscan.cluster ~ Last.purchased + logfrequency + logamount, data = training, method = 'class')
rpart.plot(fit, extra = 104)

predict_unseen <-predict(fit, testing, type = 'class')

table_mat <- table(testing$dbscan.cluster, predict_unseen)
table_mat
##    predict_unseen
##      0  1  2  3
##   0  7  4  4  2
##   1  4 72  0  0
##   2  5  0 57  0
##   3  0  0  0 37
accuracy_Test <- sum(diag(table_mat)) / sum(table_mat)

print(paste('Accuracy for test', accuracy_Test))
## [1] "Accuracy for test 0.901041666666667"

The End Thank You!