This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
Spending Behavior Big Data Group 4 Simon Chow
Read the spending behavior data read.table() is the function used to open and read the data file
df = as.data.frame(read.table("Group 4.txt", sep="\t", header=T))
check the dimension and number of samples in the dataset
dim(df)
## [1] 1000 4
the dimension is 1000*4, which means there are 1000 samples each with 4 attributes
check the type of features
str(df)
## 'data.frame': 1000 obs. of 4 variables:
## $ customer_id : int 80 90 130 190 220 290 330 400 410 480 ...
## $ Last.purchased: chr "343" "758" "2970" "2592" ...
## $ frequency : int 4 4 1 2 1 2 3 1 4 3 ...
## $ amount : num 75 126 60 65 30 ...
there are total of 4 attributes which ‘customer_id’ is not useful in our study which would be removed. Besides, data type for ‘Last.purchased’ is in chr which has to be converted into data type int.
remove the first column (customer_id)
df = df[,-1]
the dimension of the dataset becomes 1000*3
convert the ‘Last.purchased’ from character to integer
df[,1] = as.integer(df[,1])
## Warning: NAs introduced by coercion
all three attributes are in numeric form
check the total number of missing values in the dataset
sum(is.na(df))
## [1] 32
there are total 32 missing values in the dataset, which woould be removed details explained in the report
data imputation by removing rows with missing values
df= na.omit(df)
Original Distributions Distributions of the features
-> Recency (Last.purchased) Histogram, Boxplot
hist(df$Last.purchased,
col="blue",
main="Histogram to Show Count of Last.purchased Class",
xlab="Last.purchased",
ylab="No. of samples",
labels=TRUE)
boxplot(df$Last.purchased,
main = "Boxplot for Descriptive Analysis of Last.purchased(Recency)",
xlab = "Number of Days since Last Purchase",
ylab = "Last.purchased",
col = "orange",
border = "brown",
horizontal = TRUE,
notch = TRUE)
-> Frequency (frequency) Histogram, Density Plot, Boxplot
hist(df$frequency,
col="blue",
main="Histogram to Show Count of frequency Class",
xlab="frequency",
ylab="No. of samples",
labels=TRUE)
d1_f = density(df$frequency)
plot(d1_f, main="frequency")
polygon(d1_f, col="green", border="blue")
boxplot(df$frequency,
main = "Boxplot for Descriptive Analysis of freqeuncy(Frequency)",
xlab = "Number of Times Has Been Purchased",
ylab = "Frequency",
col = "orange",
border = "brown",
horizontal = TRUE,
notch = TRUE)
-> Monetary (Amount) Histogram, Density Plot, Boxplot
hist(df$amount,
col="blue",
main="Histogram to Show Count of amount Class",
xlab="Amount Class",
ylab="No. of samples",
labels=TRUE)
d1_a = density(df$amount)
plot(d1_a, main="amount")
polygon(d1_a, col="green", border="blue")
boxplot(df$amount,
main = "Boxplot for Descriptive Analysis of amount(Monetary)",
xlab = "Total Amount Has Been Spent",
ylab = "Amount",
col = "orange",
border = "brown",
horizontal = TRUE,
notch = TRUE)
3D plot
library(scatterplot3d)
scatterplot3d(df[, c(1,2,3)], pch=20, angle=80, main='Original 3D plot',
xlab='Last.purchased (Recency)', ylab='frequency (Frequency)',
zlab='amount (Monetary)')
check if there are outliers in the three attributes, general plot showing all 3 attributes
boxplot(df)
there are outliers in ‘frequency’ and ‘amount’
checking which are those outliers
boxplot(df$Last.purchased)$out
## numeric(0)
boxplot(df$frequency)$out
## [1] 10 9 10 10 10 18 9 12 15 9 9 11
boxplot(df$amount)$out
## [1] 125.7500 600.0000 300.0000 237.5000 200.0000 600.0000 116.6667
## [8] 475.0000 175.0000 400.0000 176.6667 125.0000 125.0000 237.5000
## [15] 527.0000 150.0000 252.4500 150.0000 130.0000 130.0000 145.0000
## [22] 150.0000 126.5000 125.0000 115.0000 166.6667 152.0000 311.2500
## [29] 152.2500 733.3333 150.0000 173.7500 133.3333 130.0000 158.3333
## [36] 225.0000 300.0000 200.0000 150.0000 130.0000 125.0000 125.0000
## [43] 116.0000 150.0000 200.0000 258.7500 2500.0000 606.6667 175.0000
## [50] 666.6667 143.7500 300.0000 150.0000 200.0000 140.0000 1000.0000
## [57] 150.0000 112.5000 3100.0000 150.0000 111.0000 226.6667 300.0000
## [64] 300.0000 950.0000 303.3333 137.5000 150.0000 308.3333 150.0000
## [71] 165.0000 108.3333 300.0000 116.6667 137.5000 164.2857 175.0000
## [78] 290.0000 200.0000 125.0000 276.0000 1807.0000 125.0000 150.0000
## [85] 116.6667 166.6667 125.0000 164.0000 160.0000 925.0000 150.0000
## [92] 160.0000 133.3333
-thre are zero outlier in ‘Last.purchased’ -there are 12 outliers in ‘frequency’ (10 9 10 10 10 18 9 12 15 9 9 11) -there are 93 outliers in ‘amount’ (125.7500 600.0000 300.0000 237.5000 200.0000 600.0000 116.6667 475.0000 175.0000 400.0000) (176.6667 125.0000 125.0000 237.5000 527.0000 150.0000 252.4500 150.0000 130.0000 130.0000) (145.0000 150.0000 126.5000 125.0000 115.0000 166.6667 152.0000 311.2500 152.2500 733.3333) (150.0000 173.7500 133.3333 130.0000 158.3333 225.0000 300.0000 200.0000 150.0000 130.0000) (125.0000 125.0000 116.0000 150.0000 200.0000 258.7500 2500.0000 606.6667 175.0000 666.6667) (143.7500 300.0000 150.0000 200.0000 140.0000 1000.0000 150.0000 112.5000 3100.0000 150.0000) (111.0000 226.6667 300.0000 300.0000 950.0000 303.3333 137.5000 150.0000 308.3333 150.0000) (165.0000 108.3333 300.0000 116.6667 137.5000 164.2857 175.0000 290.0000 200.0000 125.0000) (276.0000 1807.0000 125.0000 150.0000 116.6667 166.6667 125.0000 164.0000 160.0000 925.0000) (150.0000 160.0000 133.3333)
Summary packages: Hmisc, pastecs, summarytools
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.1.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
library(pastecs)
## Warning: package 'pastecs' was built under R version 4.1.3
library(summarytools)
## Warning: package 'summarytools' was built under R version 4.1.3
##
## Attaching package: 'summarytools'
## The following objects are masked from 'package:Hmisc':
##
## label, label<-
sum1 = describe(df)
sum2 = stat.desc(df)
sum3 = descr(df)
sum1
## df
##
## 3 Variables 968 Observations
## --------------------------------------------------------------------------------
## Last.purchased
## n missing distinct Info Mean Gmd .05 .10
## 968 0 529 1 1913 1496 34.0 217.4
## .25 .50 .75 .90 .95
## 731.0 1836.5 3144.2 3731.0 3892.7
##
## lowest : 1 3 6 8 9, highest: 3988 3989 3999 4005 4014
## --------------------------------------------------------------------------------
## frequency
## n missing distinct Info Mean Gmd .05 .10
## 968 0 14 0.938 2.652 1.913 1 1
## .25 .50 .75 .90 .95
## 1 2 4 5 6
##
## lowest : 1 2 3 4 5, highest: 10 11 12 15 18
##
## Value 1 2 3 4 5 6 7 8 9 10 11
## Frequency 348 202 161 114 71 38 15 7 4 4 1
## Proportion 0.360 0.209 0.166 0.118 0.073 0.039 0.015 0.007 0.004 0.004 0.001
##
## Value 12 15 18
## Frequency 1 1 1
## Proportion 0.001 0.001 0.001
## --------------------------------------------------------------------------------
## amount
## n missing distinct Info Mean Gmd .05 .10
## 968 0 202 0.993 67.59 69.97 15.00 17.50
## .25 .50 .75 .90 .95
## 28.33 38.67 60.00 106.08 156.20
##
## lowest : 7.25000 7.50000 8.00000 10.00000 11.66667
## highest: 950.00000 1000.00000 1807.00000 2500.00000 3100.00000
##
## Value 0 50 100 150 200 250 300 400 500 550 600
## Frequency 212 550 129 37 10 5 11 1 1 1 3
## Proportion 0.219 0.568 0.133 0.038 0.010 0.005 0.011 0.001 0.001 0.001 0.003
##
## Value 650 750 900 950 1000 1800 2500 3100
## Frequency 1 1 1 1 1 1 1 1
## Proportion 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.001
##
## For the frequency table, variable is rounded to the nearest 50
## --------------------------------------------------------------------------------
sum2
## Last.purchased frequency amount
## nbr.val 9.680000e+02 9.680000e+02 968.000000
## nbr.null 0.000000e+00 0.000000e+00 0.000000
## nbr.na 0.000000e+00 0.000000e+00 0.000000
## min 1.000000e+00 1.000000e+00 7.250000
## max 4.014000e+03 1.800000e+01 3100.000000
## range 4.013000e+03 1.700000e+01 3092.750000
## sum 1.852225e+06 2.567000e+03 65423.081299
## median 1.836500e+03 2.000000e+00 38.666667
## mean 1.913456e+03 2.651860e+00 67.585828
## SE.mean 4.175356e+01 6.102287e-02 5.161687
## CI.mean.0.95 8.193802e+01 1.197525e-01 10.129400
## var 1.687572e+06 3.604629e+00 25790.438984
## std.dev 1.299066e+03 1.898586e+00 160.594019
## coef.var 6.789109e-01 7.159452e-01 2.376149
sum3
## Descriptive Statistics
## df
## N: 968
##
## amount frequency Last.purchased
## ----------------- --------- ----------- ----------------
## Mean 67.59 2.65 1913.46
## Std.Dev 160.59 1.90 1299.07
## Min 7.25 1.00 1.00
## Q1 28.33 1.00 731.00
## Median 38.67 2.00 1836.50
## Q3 60.00 4.00 3144.50
## Max 3100.00 18.00 4014.00
## MAD 20.51 1.48 1769.48
## IQR 31.67 3.00 2413.25
## CV 2.38 0.72 0.68
## Skewness 12.72 1.93 0.09
## SE.Skewness 0.08 0.08 0.08
## Kurtosis 200.96 7.37 -1.37
## N.Valid 968.00 968.00 968.00
## Pct.Valid 100.00 100.00 100.00
showing the information of the original data set without missing values
Transformation (log function) Transform “frequency” and “amount” feature to log(amount) to make data set more normal
df$logfrequency = log(df$frequency)
df$logamount = log(df$amount)
plot histogram to show the effect of log transformation for frequency and amount
hist(df$logfrequency,
col="blue",
main="Histogram to Show Count of frequency Class (after log transformation)",
xlab="frequency",
ylab="No. of samples",
labels=TRUE)
hist(df$logamount,
col="blue",
main="Histogram to Show Count of amount Class (after log transformation)",
xlab="Amount Class",
ylab="No. of samples",
labels=TRUE)
Winsorizing outliers Winsorizing or winsorization is the transformation of statistics by limiting extreme values in the statistical data to reduce the effect of possibly spurious outliers not using winsorizing to handle outliers as DBSCAN is robust to outliers
Center and Scale df
df <- as.data.frame(scale(df))
Distributions of the features after pre-processing (no NA, standardized)(Histogram, Box plot, Density Plot, 3D plot) ->Recency (Last.purchased) Histogram, Boxplot
hist(df$Last.purchased,
col="blue",
main="Histogram to Show Count of Last.purchased Class (Pre-processed)",
xlab="Last.purchased (Re-scaled)",
ylab="No. of samples",
labels=TRUE)
boxplot(df$Last.purchased,
main = "Boxplot for Descriptive Analysis of Last.purchased(Recency) (Pre-processed)",
xlab = "Number of Days since Last Purchase (Re-scaled)",
ylab = "Last.purchased",
col = "orange",
border = "brown",
horizontal = TRUE,
notch = TRUE)
->Frequency (frequency) Histogram, Density Plot, Boxplot
hist(df$logfrequency,
col="blue",
main="Histogram to Show Count of frequency Class (Pre-processed)",
xlab="frequency (Re-scaled)",
ylab="No. of samples",
labels=TRUE)
d1_f = density(df$logfrequency)
plot(d1_f, main="frequency (Pre-processed)")
polygon(d1_f, col="green", border="blue")
boxplot(df$logfrequency,
main = "Boxplot for Descriptive Analysis of freqeuncy(Frequency) (Pre-proccesed)",
xlab = "Number of Times Has Been Purchased (Re-scaled)",
ylab = "Frequency",
col = "orange",
border = "brown",
horizontal = TRUE,
notch = TRUE)
-> Monetary (Amount) Histogram, Density Plot, Boxplot
hist(df$logamount,
col="blue",
main="Histogram to Show Count of amount Class (Pre-processed)",
xlab="Amount Class (Re-scaled)",
ylab="No. of samples",
labels=TRUE)
d1_a = density(df$logamount)
plot(d1_a, main="amount (Pre-processed)")
polygon(d1_a, col="green", border="blue")
boxplot(df$logamount,
main = "Boxplot for Descriptive Analysis of amount(Monetary) (Pre-processed)",
xlab = "Total Amount Has Been Spent (Re-scaled)",
ylab = "Amount",
col = "orange",
border = "brown",
horizontal = TRUE,
notch = TRUE)
3D plot
scatterplot3d(df[, c(1,4,5)], pch=20, angle=70, main='Preprocessed 3D plot',
xlab='Last.purchased (Recency)', ylab='logfrequency (Frequency)(logfrequency*)',
zlab='logamount (Monetary)(logamount*)')
## Section 2
Using the model DBSCAN to identify customer segments Density-based spatial clustering of applications with noise (DBSCAN) is a data clustering algorithm proposed by Martin Ester, Hans-Peter Kriegel, Jörg Sander and Xiaowei Xu in 1996. (wiki) advantages of DBSCAN 1. Unlike to K-means, DBSCAN does not require the user to specify the number of clusters to be generated 2. DBSCAN can find any shape of clusters. The cluster doesn’t have to be circular. 3. DBSCAN can identify outliers Two important parameters are required for DBSCAN epsilon (“eps”) The parameter eps defines the radius of neighborhood around a point x. It’s called the \(\epsilon\)-neighborhood of x. minimum points (“MinPts”) The parameter MinPts is the minimum number of neighbors within “eps” radius.
only keeps useful columns (removing ‘frequency’, removing ‘amount’)
df = df[,-2]
df = df[,-2]
str(df)
## 'data.frame': 968 obs. of 3 variables:
## $ Last.purchased: num -1.209 -0.889 0.813 0.522 1.348 ...
## $ logfrequency : num 0.958 0.958 -1.1411 -0.0916 -1.1411 ...
## $ logamount : num 0.726 1.393 0.438 0.542 -0.456 ...
only ‘last.purchased’,‘logfreqeuncy’,‘logamount’ remain
check the correlation of all variables corrplot()
library(corrplot)
## corrplot 0.92 loaded
M = cor(df)
corrplot(M, method = 'number')
corrplot(M, method = 'color')
Create two new variables
all = df
a = all
Compute DBSCAN using fpc package Initial Hyperparameter setting For more than 2 dimensions: minPts=2dim (Sander et al., 1998) minPts = 23 = 6 Plot the k-distances with k=minPts (Ester et al., 1996) Find the ‘elbow’ in the graph–> The k-distance value is your Epsilon value. requiring ‘dbscan’ package
library(fpc)
## Warning: package 'fpc' was built under R version 4.1.3
set.seed(123)
dbscan::kNNdistplot(all, k = 6)
abline(h = 0.65, lty = 2)
the optimal eps value is around a distance of 0.8 according to elbow method with silhouette score, eps: 0.65 -> 0.45
db = fpc::dbscan(all, eps = 0.65, MinPts = 6)
db
## dbscan Pts=968 MinPts=6 eps=0.65
## 0 1 2
## border 22 22 10
## seed 0 584 330
## total 22 606 340
db = fpc::dbscan(all, eps = 0.45, MinPts = 6)
db
## dbscan Pts=968 MinPts=6 eps=0.45
## 0 1 2 3
## border 94 32 10 15
## seed 0 342 303 172
## total 94 374 313 187
From 2 clusters to 3 clusters
Silhouette score
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:pastecs':
##
## extract
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::extract() masks magrittr::extract(), pastecs::extract()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks pastecs::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks pastecs::last()
## x purrr::set_names() masks magrittr::set_names()
## x dplyr::src() masks Hmisc::src()
## x dplyr::summarize() masks Hmisc::summarize()
## x tibble::view() masks summarytools::view()
library(cluster)
db$cluster
## [1] 1 1 2 3 2 3 1 2 1 1 2 3 3 3 2 3 1 1 2 2 2 2 2 3 0 1 0 3 1 3 3 1 2 2 0 3 2
## [38] 2 2 0 2 3 1 0 1 2 1 0 3 0 1 2 2 3 3 1 0 2 1 2 1 3 2 3 2 2 1 3 2 1 1 3 3 3
## [75] 1 2 1 1 1 2 1 1 2 2 1 0 3 2 2 3 1 2 2 2 1 3 2 3 3 2 1 1 1 1 1 1 3 2 1 2 2
## [112] 1 3 2 2 2 1 2 3 1 2 1 3 2 3 1 0 3 3 3 1 0 2 2 1 1 1 3 1 2 2 2 2 3 1 1 1 1
## [149] 2 2 1 3 2 2 2 0 1 2 1 2 1 1 1 1 2 1 1 1 1 3 2 3 1 1 2 1 3 1 0 0 2 1 1 2 2
## [186] 1 3 2 3 0 2 2 2 1 1 2 0 1 0 2 1 1 1 0 3 1 3 1 2 3 2 2 3 2 1 1 2 2 1 1 0 1
## [223] 2 3 3 2 1 2 2 1 1 1 1 2 2 3 1 3 2 1 1 2 1 1 2 2 1 1 1 3 3 2 2 1 1 3 2 2 2
## [260] 3 3 1 1 2 1 2 1 3 2 0 2 1 1 2 1 2 3 2 2 3 0 3 1 2 3 0 3 1 1 1 3 2 1 1 2 1
## [297] 2 1 2 2 3 3 3 1 1 1 3 3 2 1 1 1 1 1 3 0 1 2 2 2 1 2 1 0 0 1 1 0 1 2 2 1 1
## [334] 1 3 2 0 2 2 1 2 2 3 2 2 2 2 1 1 2 3 2 1 3 1 0 2 2 1 0 0 2 3 1 1 2 1 3 1 2
## [371] 1 1 2 0 0 2 1 1 1 2 0 2 2 3 3 1 3 3 1 1 1 1 2 3 1 0 2 0 3 1 1 3 1 2 1 1 1
## [408] 2 2 3 1 3 1 1 1 1 2 2 0 2 2 1 1 0 1 0 0 1 3 2 0 2 1 1 1 0 2 1 1 2 1 3 1 1
## [445] 1 3 1 0 2 1 2 1 1 3 1 3 1 2 2 1 2 3 3 2 2 1 1 1 1 2 2 2 0 3 1 3 1 2 2 1 1
## [482] 1 1 3 1 1 0 3 1 2 0 1 0 0 0 1 1 3 1 1 0 1 2 1 1 1 3 2 1 1 2 1 2 3 1 2 2 1
## [519] 1 1 2 3 1 1 1 1 0 1 0 1 1 1 1 1 3 3 1 3 2 1 1 1 1 2 2 3 1 1 0 0 1 1 2 1 3
## [556] 1 3 3 1 0 1 0 2 1 0 3 3 3 1 0 1 1 1 0 1 1 2 1 1 1 1 2 0 1 2 3 1 1 1 3 0 0
## [593] 1 1 1 2 1 1 1 2 2 2 1 1 3 3 3 0 1 1 3 2 2 1 3 1 3 2 1 2 3 2 1 1 3 3 2 2 3
## [630] 1 0 1 1 0 2 1 1 3 2 1 1 2 1 3 1 2 2 2 3 0 2 2 3 2 1 3 3 3 2 3 1 1 3 2 2 0
## [667] 1 2 0 2 3 1 2 3 3 2 1 1 2 2 0 2 1 3 2 1 1 0 2 1 2 2 1 1 1 3 1 3 3 3 2 2 2
## [704] 2 1 2 2 3 2 2 3 2 2 1 2 2 2 2 1 1 1 0 2 1 2 1 1 0 2 2 2 1 2 2 0 2 3 2 1 0
## [741] 1 2 1 2 1 3 2 2 1 0 1 2 3 2 1 2 2 1 2 0 1 0 3 2 1 1 1 1 2 3 1 3 2 2 3 1 2
## [778] 2 0 3 2 0 2 2 1 2 1 1 1 1 2 1 3 3 0 1 1 0 0 1 1 1 1 1 2 3 2 3 1 2 2 3 2 2
## [815] 2 1 1 1 3 1 0 3 2 2 2 3 3 0 2 3 1 0 3 3 2 2 3 1 2 1 2 3 1 1 2 3 2 1 2 3 2
## [852] 3 1 2 1 1 0 2 2 1 2 1 1 2 3 3 1 2 2 2 3 2 1 1 1 3 1 3 2 2 2 3 0 1 2 3 2 1
## [889] 3 1 1 1 0 0 1 1 1 2 3 2 2 1 1 2 2 2 2 1 2 2 3 2 1 1 2 3 1 1 2 2 0 2 0 2 3
## [926] 3 0 0 1 3 2 0 2 3 3 2 2 2 1 1 3 1 3 3 2 2 2 1 3 1 3 3 1 2 1 0 1 2 2 2 1 1
## [963] 2 1 1 3 1 2
all = all %>%
mutate(
dbscan.cluster = db$cluster
)
sil.dbscan <- silhouette(all$dbscan.cluster, dist(select(all,Last.purchased,logfrequency,logamount)))
summary(sil.dbscan)
## Silhouette of 968 units in 4 clusters from silhouette.default(x = all$dbscan.cluster, dist = dist(select(all, from Last.purchased, logfrequency, logamount))) :
## Cluster sizes, ids = (0, 1, 2, 3), and average silhouette widths:
## 94 374 313 187
## -0.2371555 0.2497144 0.3449052 0.1647128
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.56714 0.07344 0.28052 0.21679 0.41846 0.52380
# the plot require an external window
# dev.new(width=5, height=4) (re)
s3 = plot(silhouette(db$cluster,dist(all[,1:3],"euclidean")))
The silhouette plot require an external window to show the complete plot (shown in report) with <<dev.new(width=5, height=4)>>
Plot DBSCAN results
creating an attribute in factor (datatype for coloring)
all$Species = factor(db$cluster)
str(all)
## 'data.frame': 968 obs. of 5 variables:
## $ Last.purchased: num -1.209 -0.889 0.813 0.522 1.348 ...
## $ logfrequency : num 0.958 0.958 -1.1411 -0.0916 -1.1411 ...
## $ logamount : num 0.726 1.393 0.438 0.542 -0.456 ...
## $ dbscan.cluster: num 1 1 2 3 2 3 1 2 1 1 ...
## $ Species : Factor w/ 4 levels "0","1","2","3": 2 2 3 4 3 4 2 3 2 2 ...
colors <- c("black", "blue", "yellow", "red","orange","green","pink")
colors <- colors[as.numeric(all$Species)]
Colored 3D Scatter Plot
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:Hmisc':
##
## subplot
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(dplyr)
str(df)
## 'data.frame': 968 obs. of 3 variables:
## $ Last.purchased: num -1.209 -0.889 0.813 0.522 1.348 ...
## $ logfrequency : num 0.958 0.958 -1.1411 -0.0916 -1.1411 ...
## $ logamount : num 0.726 1.393 0.438 0.542 -0.456 ...
p <- plot_ly(all, x=~Last.purchased, y=~logfrequency,
z=~logamount, color=~Species) %>%
add_markers(size=1.5)
print(p)
Print DBSCAN and analysis on clustering result
print(db)
## dbscan Pts=968 MinPts=6 eps=0.45
## 0 1 2 3
## border 94 32 10 15
## seed 0 342 303 172
## total 94 374 313 187
db$cluster
## [1] 1 1 2 3 2 3 1 2 1 1 2 3 3 3 2 3 1 1 2 2 2 2 2 3 0 1 0 3 1 3 3 1 2 2 0 3 2
## [38] 2 2 0 2 3 1 0 1 2 1 0 3 0 1 2 2 3 3 1 0 2 1 2 1 3 2 3 2 2 1 3 2 1 1 3 3 3
## [75] 1 2 1 1 1 2 1 1 2 2 1 0 3 2 2 3 1 2 2 2 1 3 2 3 3 2 1 1 1 1 1 1 3 2 1 2 2
## [112] 1 3 2 2 2 1 2 3 1 2 1 3 2 3 1 0 3 3 3 1 0 2 2 1 1 1 3 1 2 2 2 2 3 1 1 1 1
## [149] 2 2 1 3 2 2 2 0 1 2 1 2 1 1 1 1 2 1 1 1 1 3 2 3 1 1 2 1 3 1 0 0 2 1 1 2 2
## [186] 1 3 2 3 0 2 2 2 1 1 2 0 1 0 2 1 1 1 0 3 1 3 1 2 3 2 2 3 2 1 1 2 2 1 1 0 1
## [223] 2 3 3 2 1 2 2 1 1 1 1 2 2 3 1 3 2 1 1 2 1 1 2 2 1 1 1 3 3 2 2 1 1 3 2 2 2
## [260] 3 3 1 1 2 1 2 1 3 2 0 2 1 1 2 1 2 3 2 2 3 0 3 1 2 3 0 3 1 1 1 3 2 1 1 2 1
## [297] 2 1 2 2 3 3 3 1 1 1 3 3 2 1 1 1 1 1 3 0 1 2 2 2 1 2 1 0 0 1 1 0 1 2 2 1 1
## [334] 1 3 2 0 2 2 1 2 2 3 2 2 2 2 1 1 2 3 2 1 3 1 0 2 2 1 0 0 2 3 1 1 2 1 3 1 2
## [371] 1 1 2 0 0 2 1 1 1 2 0 2 2 3 3 1 3 3 1 1 1 1 2 3 1 0 2 0 3 1 1 3 1 2 1 1 1
## [408] 2 2 3 1 3 1 1 1 1 2 2 0 2 2 1 1 0 1 0 0 1 3 2 0 2 1 1 1 0 2 1 1 2 1 3 1 1
## [445] 1 3 1 0 2 1 2 1 1 3 1 3 1 2 2 1 2 3 3 2 2 1 1 1 1 2 2 2 0 3 1 3 1 2 2 1 1
## [482] 1 1 3 1 1 0 3 1 2 0 1 0 0 0 1 1 3 1 1 0 1 2 1 1 1 3 2 1 1 2 1 2 3 1 2 2 1
## [519] 1 1 2 3 1 1 1 1 0 1 0 1 1 1 1 1 3 3 1 3 2 1 1 1 1 2 2 3 1 1 0 0 1 1 2 1 3
## [556] 1 3 3 1 0 1 0 2 1 0 3 3 3 1 0 1 1 1 0 1 1 2 1 1 1 1 2 0 1 2 3 1 1 1 3 0 0
## [593] 1 1 1 2 1 1 1 2 2 2 1 1 3 3 3 0 1 1 3 2 2 1 3 1 3 2 1 2 3 2 1 1 3 3 2 2 3
## [630] 1 0 1 1 0 2 1 1 3 2 1 1 2 1 3 1 2 2 2 3 0 2 2 3 2 1 3 3 3 2 3 1 1 3 2 2 0
## [667] 1 2 0 2 3 1 2 3 3 2 1 1 2 2 0 2 1 3 2 1 1 0 2 1 2 2 1 1 1 3 1 3 3 3 2 2 2
## [704] 2 1 2 2 3 2 2 3 2 2 1 2 2 2 2 1 1 1 0 2 1 2 1 1 0 2 2 2 1 2 2 0 2 3 2 1 0
## [741] 1 2 1 2 1 3 2 2 1 0 1 2 3 2 1 2 2 1 2 0 1 0 3 2 1 1 1 1 2 3 1 3 2 2 3 1 2
## [778] 2 0 3 2 0 2 2 1 2 1 1 1 1 2 1 3 3 0 1 1 0 0 1 1 1 1 1 2 3 2 3 1 2 2 3 2 2
## [815] 2 1 1 1 3 1 0 3 2 2 2 3 3 0 2 3 1 0 3 3 2 2 3 1 2 1 2 3 1 1 2 3 2 1 2 3 2
## [852] 3 1 2 1 1 0 2 2 1 2 1 1 2 3 3 1 2 2 2 3 2 1 1 1 3 1 3 2 2 2 3 0 1 2 3 2 1
## [889] 3 1 1 1 0 0 1 1 1 2 3 2 2 1 1 2 2 2 2 1 2 2 3 2 1 1 2 3 1 1 2 2 0 2 0 2 3
## [926] 3 0 0 1 3 2 0 2 3 3 2 2 2 1 1 3 1 3 3 2 2 2 1 3 1 3 3 1 2 1 0 1 2 2 2 1 1
## [963] 2 1 1 3 1 2
all$dbscan.cluster
## [1] 1 1 2 3 2 3 1 2 1 1 2 3 3 3 2 3 1 1 2 2 2 2 2 3 0 1 0 3 1 3 3 1 2 2 0 3 2
## [38] 2 2 0 2 3 1 0 1 2 1 0 3 0 1 2 2 3 3 1 0 2 1 2 1 3 2 3 2 2 1 3 2 1 1 3 3 3
## [75] 1 2 1 1 1 2 1 1 2 2 1 0 3 2 2 3 1 2 2 2 1 3 2 3 3 2 1 1 1 1 1 1 3 2 1 2 2
## [112] 1 3 2 2 2 1 2 3 1 2 1 3 2 3 1 0 3 3 3 1 0 2 2 1 1 1 3 1 2 2 2 2 3 1 1 1 1
## [149] 2 2 1 3 2 2 2 0 1 2 1 2 1 1 1 1 2 1 1 1 1 3 2 3 1 1 2 1 3 1 0 0 2 1 1 2 2
## [186] 1 3 2 3 0 2 2 2 1 1 2 0 1 0 2 1 1 1 0 3 1 3 1 2 3 2 2 3 2 1 1 2 2 1 1 0 1
## [223] 2 3 3 2 1 2 2 1 1 1 1 2 2 3 1 3 2 1 1 2 1 1 2 2 1 1 1 3 3 2 2 1 1 3 2 2 2
## [260] 3 3 1 1 2 1 2 1 3 2 0 2 1 1 2 1 2 3 2 2 3 0 3 1 2 3 0 3 1 1 1 3 2 1 1 2 1
## [297] 2 1 2 2 3 3 3 1 1 1 3 3 2 1 1 1 1 1 3 0 1 2 2 2 1 2 1 0 0 1 1 0 1 2 2 1 1
## [334] 1 3 2 0 2 2 1 2 2 3 2 2 2 2 1 1 2 3 2 1 3 1 0 2 2 1 0 0 2 3 1 1 2 1 3 1 2
## [371] 1 1 2 0 0 2 1 1 1 2 0 2 2 3 3 1 3 3 1 1 1 1 2 3 1 0 2 0 3 1 1 3 1 2 1 1 1
## [408] 2 2 3 1 3 1 1 1 1 2 2 0 2 2 1 1 0 1 0 0 1 3 2 0 2 1 1 1 0 2 1 1 2 1 3 1 1
## [445] 1 3 1 0 2 1 2 1 1 3 1 3 1 2 2 1 2 3 3 2 2 1 1 1 1 2 2 2 0 3 1 3 1 2 2 1 1
## [482] 1 1 3 1 1 0 3 1 2 0 1 0 0 0 1 1 3 1 1 0 1 2 1 1 1 3 2 1 1 2 1 2 3 1 2 2 1
## [519] 1 1 2 3 1 1 1 1 0 1 0 1 1 1 1 1 3 3 1 3 2 1 1 1 1 2 2 3 1 1 0 0 1 1 2 1 3
## [556] 1 3 3 1 0 1 0 2 1 0 3 3 3 1 0 1 1 1 0 1 1 2 1 1 1 1 2 0 1 2 3 1 1 1 3 0 0
## [593] 1 1 1 2 1 1 1 2 2 2 1 1 3 3 3 0 1 1 3 2 2 1 3 1 3 2 1 2 3 2 1 1 3 3 2 2 3
## [630] 1 0 1 1 0 2 1 1 3 2 1 1 2 1 3 1 2 2 2 3 0 2 2 3 2 1 3 3 3 2 3 1 1 3 2 2 0
## [667] 1 2 0 2 3 1 2 3 3 2 1 1 2 2 0 2 1 3 2 1 1 0 2 1 2 2 1 1 1 3 1 3 3 3 2 2 2
## [704] 2 1 2 2 3 2 2 3 2 2 1 2 2 2 2 1 1 1 0 2 1 2 1 1 0 2 2 2 1 2 2 0 2 3 2 1 0
## [741] 1 2 1 2 1 3 2 2 1 0 1 2 3 2 1 2 2 1 2 0 1 0 3 2 1 1 1 1 2 3 1 3 2 2 3 1 2
## [778] 2 0 3 2 0 2 2 1 2 1 1 1 1 2 1 3 3 0 1 1 0 0 1 1 1 1 1 2 3 2 3 1 2 2 3 2 2
## [815] 2 1 1 1 3 1 0 3 2 2 2 3 3 0 2 3 1 0 3 3 2 2 3 1 2 1 2 3 1 1 2 3 2 1 2 3 2
## [852] 3 1 2 1 1 0 2 2 1 2 1 1 2 3 3 1 2 2 2 3 2 1 1 1 3 1 3 2 2 2 3 0 1 2 3 2 1
## [889] 3 1 1 1 0 0 1 1 1 2 3 2 2 1 1 2 2 2 2 1 2 2 3 2 1 1 2 3 1 1 2 2 0 2 0 2 3
## [926] 3 0 0 1 3 2 0 2 3 3 2 2 2 1 1 3 1 3 3 2 2 2 1 3 1 3 3 1 2 1 0 1 2 2 2 1 1
## [963] 2 1 1 3 1 2
seq = c(1:968)
c0l = list(c())
c0f = list(c())
c0a = list(c())
c1l = list(c())
c1f = list(c())
c1a = list(c())
c2l = list(c())
c2f = list(c())
c2a = list(c())
c3l = list(c())
c3f = list(c())
c3a = list(c())
for (i in seq){
if ( all$dbscan.cluster[i] == 0 ) {
nl = list(all[i,1])
nf = list(all[i,2])
na = list(all[i,3])
c0l = append(c0l, nl)
c0f = append(c0f, nf)
c0a = append(c0a, na)
} else if ( all$dbscan.cluster[i] == 1) {
nl = list(all[i,1])
nf = list(all[i,2])
na = list(all[i,3])
c1l = append(c1l, nl)
c1f = append(c1f, nf)
c1a = append(c1a, na)
} else if ( all$dbscan.cluster[i] == 2) {
nl = list(all[i,1])
nf = list(all[i,2])
na = list(all[i,3])
c2l = append(c2l, nl)
c2f = append(c2f, nf)
c2a = append(c2a, na)
} else {
nl = list(all[i,1])
nf = list(all[i,2])
na = list(all[i,3])
c3l = append(c3l, nl)
c3f = append(c3f, nf)
c3a = append(c3a, na)
}
}
maxlast0 = max(unlist(c0l))
minlast0 = min(unlist(c0l))
mealast0 = mean(unlist(c0l))
maxlast1 = max(unlist(c1l))
minlast1 = min(unlist(c1l))
mealast1 = mean(unlist(c1l))
maxlast2 = max(unlist(c2l))
minlast2 = min(unlist(c2l))
mealast2 = mean(unlist(c2l))
maxlast3 = max(unlist(c3l))
minlast3 = min(unlist(c3l))
mealast3 = mean(unlist(c3l))
maxfre0 = max(unlist(c0f))
minfre0 = min(unlist(c0f))
meafre0 = mean(unlist(c0f))
maxfre1 = max(unlist(c1f))
minfre1 = min(unlist(c1f))
meafre1 = mean(unlist(c1f))
maxfre2 = max(unlist(c2f))
minfre2 = min(unlist(c2f))
meafre2 = mean(unlist(c2f))
maxfre3 = max(unlist(c3f))
minfre3 = min(unlist(c3f))
meafre3 = mean(unlist(c3f))
maxa0 = max(unlist(c0a))
mina0 = min(unlist(c0a))
meaa0 = mean(unlist(c0a))
maxa1 = max(unlist(c1a))
mina1 = min(unlist(c1a))
meaa1 = mean(unlist(c1a))
maxa2 = max(unlist(c2a))
mina2 = min(unlist(c2a))
meaa2 = mean(unlist(c2a))
maxa3 = max(unlist(c3a))
mina3 = min(unlist(c3a))
meaa3 = mean(unlist(c3a))
Prediction Model (Supervised Learning)(Decision Tree)(Classification)
library(tidyverse)
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
## The following object is masked from 'package:survival':
##
## cluster
exp = all[,-5]
set.seed(123)
validation_index = createDataPartition(exp$dbscan.cluster, p = 0.8, list=FALSE)
training = exp[validation_index,]
testing = exp[-validation_index,]
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.3
fit <- rpart(dbscan.cluster ~ Last.purchased + logfrequency + logamount, data = training, method = 'class')
rpart.plot(fit, extra = 104)
predict_unseen <-predict(fit, testing, type = 'class')
table_mat <- table(testing$dbscan.cluster, predict_unseen)
table_mat
## predict_unseen
## 0 1 2 3
## 0 7 4 4 2
## 1 4 72 0 0
## 2 5 0 57 0
## 3 0 0 0 37
accuracy_Test <- sum(diag(table_mat)) / sum(table_mat)
print(paste('Accuracy for test', accuracy_Test))
## [1] "Accuracy for test 0.901041666666667"
The End Thank You!