# Load the campus web-analytics extract (the recorded run below shows
# 32 observations of 10 variables) and print its column structure.
data <- read.csv(
  file = "C:\\Users\\tariqm\\Documents\\R\\Datasets\\Capstone\\campus_web_data_6months.csv"
)
str(data)
## 'data.frame': 32 obs. of 10 variables:
## $ Campus : chr "CAMP001" "CAMP002" "CAMP003" "CAMP004" ...
## $ Average.Sessions : int 81 459 17 262 296 202 481 680 108 249 ...
## $ Average.Bounce.Rate : int 47 60 75 68 21 50 71 5 70 73 ...
## $ Total.Bounces : int 7203 52391 2621 32787 11454 19386 67942 7968 13468 35508 ...
## $ Average.Repeat.Users : int 5 37 1 16 29 20 36 67 9 16 ...
## $ Average.New.Users : int 64 397 15 211 243 171 394 578 94 213 ...
## $ Total.Session.Duration : int 386193 3883454 131926 1256348 4155633 1962315 4502922 3955837 620753 1152193 ...
## $ Average.Session.Duration : int 24 45 45 25 74 50 54 31 31 24 ...
## $ Average.Pages.per.Session: num 1.73 1.36 1.24 1.38 1.37 1.76 1.39 2.58 1.4 1.36 ...
## $ Average.Conversion.Rate : num 0.008 0.05 0.077 0.014 0.059 0.021 0.06 0.036 0 0.058 ...
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.0.5
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
# Correlation matrix of every column except the first (the `Campus`
# identifier), rounded to one decimal so the in-cell labels stay short.
corr <- round(cor(data[, -1]), digits = 1)

# Lower-triangle correlogram with the rounded coefficients printed in each
# square.
ggcorrplot(
  corr,
  title = "Correlogram",
  method = "square",
  type = "lower",
  colors = c("tomato2", "white", "springgreen3"),
  lab = TRUE,
  lab_size = 3,
  ggtheme = theme_bw
)

# Standardise every column except the first (the `Campus` identifier) to
# mean 0 / sd 1; `ndata` is reused by the later psych::principal() fit.
ndata <- scale(data[, -1])

# Principal component analysis on the standardised matrix.
# prcomp()'s formal argument is `scale.` (trailing dot); the earlier
# `scale = TRUE` only worked through partial argument matching. Because
# `ndata` is already standardised, rescaling here is effectively a no-op
# and is kept only to make the intent explicit.
pc <- prcomp(ndata, scale. = TRUE)
print(pc)
## Standard deviations (1, .., p=9):
## [1] 2.0804134 1.4401953 1.2750465 0.7373798 0.4458608 0.3762814 0.2495790
## [8] 0.1540635 0.0428968
##
## Rotation (n x k) = (9 x 9):
## PC1 PC2 PC3 PC4
## Average.Sessions 0.46697128 0.14797759 -0.04781459 0.04349462
## Average.Bounce.Rate -0.19511309 0.56130906 0.24563567 0.06478086
## Total.Bounces 0.25968851 0.52366293 0.17222734 0.01261436
## Average.Repeat.Users 0.45980679 0.05159668 -0.05754900 -0.02804140
## Average.New.Users 0.46442044 0.14669018 -0.05644001 0.08576043
## Total.Session.Duration 0.44528308 -0.13651311 0.14899441 -0.20826741
## Average.Session.Duration 0.16601352 -0.50232261 0.37220288 -0.36185943
## Average.Pages.per.Session 0.13857297 -0.20455908 -0.63471739 0.46514610
## Average.Conversion.Rate 0.06543676 -0.22764759 0.58104800 0.77131895
## PC5 PC6 PC7 PC8
## Average.Sessions -0.06713937 -0.04819242 0.106037257 0.35463106
## Average.Bounce.Rate 0.18010612 0.72371922 -0.060200707 0.14826604
## Total.Bounces 0.52938550 -0.47275016 -0.144382414 -0.31571030
## Average.Repeat.Users -0.42394685 0.20848708 -0.696172700 -0.25256346
## Average.New.Users -0.06539026 -0.03891268 0.229376210 0.56765919
## Total.Session.Duration 0.01883710 0.34504703 0.567316957 -0.52438958
## Average.Session.Duration 0.50359753 0.13055125 -0.315005895 0.28414065
## Average.Pages.per.Session 0.48810642 0.26011936 -0.077022419 -0.08878989
## Average.Conversion.Rate -0.08114851 -0.04087857 0.002474325 -0.05433006
## PC9
## Average.Sessions -7.823204e-01
## Average.Bounce.Rate -2.269168e-02
## Total.Bounces 6.524297e-02
## Average.Repeat.Users 1.008700e-01
## Average.New.Users 6.096022e-01
## Total.Session.Duration 3.559853e-02
## Average.Session.Duration -3.942972e-03
## Average.Pages.per.Session 7.375082e-05
## Average.Conversion.Rate -1.144091e-02
# Standard deviation and (cumulative) proportion of variance per component.
summary(pc)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.0804 1.4402 1.2750 0.73738 0.44586 0.37628 0.24958
## Proportion of Variance 0.4809 0.2305 0.1806 0.06041 0.02209 0.01573 0.00692
## Cumulative Proportion 0.4809 0.7114 0.8920 0.95242 0.97451 0.99024 0.99716
## PC8 PC9
## Standard deviation 0.15406 0.0429
## Proportion of Variance 0.00264 0.0002
## Cumulative Proportion 0.99980 1.0000
# Scree plot of the component variances, drawn as a line chart.
# plot() on a prcomp object simply forwards to screeplot(), so call it
# directly.
screeplot(pc, type = "lines")

library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Extract three components from the standardised data with
# psych::principal(), apply a varimax rotation and request component scores.
pca <- principal(
  ndata,
  nfactors = 3,
  rotate = "varimax",
  scores = TRUE
)

# Eigenvalues of all nine components.
pca[["values"]]
## [1] 4.328119966 2.074162540 1.625743666 0.543728911 0.198791866 0.141587686
## [7] 0.062289673 0.023735556 0.001840135
# Rotated loadings of each metric on the three retained components.
pca[["loadings"]]
##
## Loadings:
## RC1 RC2 RC3
## Average.Sessions 0.993
## Average.Bounce.Rate -0.167 -0.891 -0.308
## Total.Bounces 0.727 -0.592 -0.172
## Average.Repeat.Users 0.940 0.191
## Average.New.Users 0.988
## Total.Session.Duration 0.830 0.226 0.439
## Average.Session.Duration 0.119 0.326 0.865
## Average.Pages.per.Session 0.208 0.785 -0.407
## Average.Conversion.Rate -0.183 0.801
##
## RC1 RC2 RC3
## SS loadings 4.148 2.002 1.878
## Proportion Var 0.461 0.222 0.209
## Cumulative Var 0.461 0.683 0.892
# Rotated component scores (columns RC1-RC3); these are the input to the
# clustering steps that follow.
pdata <- pca[["scores"]]
# Seed the RNG once; the three diagnostics below run k-means repeatedly and
# consume random numbers in this order, so the statement order matters.
set.seed(123)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.0.5
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Diagnostic 1: total within-cluster sum of squares ("elbow" plot).
fviz_nbclust(x = pdata, FUNcluster = kmeans, method = "wss")

# Diagnostic 2: average silhouette width.
fviz_nbclust(x = pdata, FUNcluster = kmeans, method = "silhouette")

# Diagnostic 3: gap statistic.
fviz_nbclust(x = pdata, FUNcluster = kmeans, method = "gap_stat")

# Re-seed so the cluster assignment below is reproducible on its own.
set.seed(123)
# Two-cluster k-means on the rotated component scores.
# NOTE(review): this uses a single random start (kmeans() default); consider
# `nstart > 1` for a more stable solution, then refresh the recorded output.
cluster <- kmeans(x = pdata, centers = 2)
cluster
## K-means clustering with 2 clusters of sizes 12, 20
##
## Cluster means:
## RC1 RC2 RC3
## 1 1.1059982 -0.1880291 -0.13112329
## 2 -0.6635989 0.1128174 0.07867397
##
## Clustering vector:
## [1] 2 1 2 2 2 2 1 1 2 2 2 1 1 1 2 2 2 2 1 2 2 2 1 1 2 1 2 2 2 1 1 2
##
## Within cluster sum of squares by cluster:
## [1] 36.48854 32.01648
## (between_SS / total_SS = 26.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Cluster plot of the k-means result on the first two plotting axes, with
# very small point labels.
fviz_cluster(
  object = cluster,
  data = pdata,
  axes = c(1, 2),
  labelsize = 1
)

# Base-graphics biplot of the prcomp fit (observations plus variable arrows).
biplot(pc)

library(rgl)
## Warning: package 'rgl' was built under R version 4.0.5
# Interactive 3-D scatter of the rotated component scores, coloured by
# k-means cluster membership.
plot3d(pdata, type = "p", col = cluster$cluster)
# NOTE(review): rownames(data) are presumably the default row numbers; if
# campus IDs are wanted as labels, data$Campus may be intended - confirm.
text3d(pdata, texts = rownames(data), font = 2)
grid3d("x")
grid3d("y")
grid3d("z")

# Label each variable at the position given by its three rotated loadings.
loading_xyz <- pca$loadings[, 1:3, drop = FALSE]
text3d(loading_xyz, texts = rownames(pca$loadings), col = "black")

# Build the polyline origin -> loading 1 -> origin -> loading 2 -> ... so
# that one lines3d() call draws a spoke from the origin to every variable.
# The matrix is preallocated with zeros (odd rows stay at the origin) and
# the even rows are filled in one vectorised assignment, instead of growing
# the matrix with rbind() inside a loop.
n_vars <- nrow(loading_xyz)
coords <- matrix(
  0,
  nrow = 2 * n_vars,
  ncol = 3,
  dimnames = list(NULL, colnames(loading_xyz))
)
coords[2 * seq_len(n_vars), ] <- loading_xyz
lines3d(coords, col = "black", lwd = 4)
library(pca3d)
## Warning: package 'pca3d' was built under R version 4.0.5
# 2-D plot of the prcomp fit, grouped by k-means cluster, with 75% ellipses
# and a biplot overlay.
# NOTE(review): axes.color equals bg ("white") in both pca2d() calls, so the
# axes are presumably invisible - confirm that hiding them is intended.
pca2d(
  pc,
  group = cluster$cluster,
  biplot = TRUE,
  show.ellipses = TRUE,
  ellipse.ci = 0.75,
  show.plane = TRUE,
  legend = "bottomleft",
  bg = "white",
  axes.color = "white"
)

# Same 2-D plot, but drawn from the varimax-rotated component scores.
# NOTE(review): `pdata` holds scores from the rotated psych::principal() fit
# while the biplot arrows come from the unrotated prcomp rotation matrix -
# confirm this mix is intended (applies to the pca3d() call on `pdata` too).
pca2d(
  pdata,
  group = cluster$cluster,
  biplot = pc$rotation,
  show.ellipses = TRUE,
  ellipse.ci = 0.75,
  show.plane = TRUE,
  legend = "bottomleft",
  bg = "white",
  axes.color = "white"
)

# 3-D version of the prcomp plot on a black background.
pca3d(
  pc,
  group = cluster$cluster,
  biplot = TRUE,
  show.ellipses = TRUE,
  ellipse.ci = 0.75,
  show.plane = TRUE,
  legend = "bottom",
  bg = "black",
  axes.color = "white"
)
## [1] 0.11061592 0.06336506 0.07233034
# 3-D version of the rotated-score plot on a black background.
pca3d(
  pdata,
  group = cluster$cluster,
  biplot = pc$rotation,
  show.ellipses = TRUE,
  ellipse.ci = 0.75,
  show.plane = TRUE,
  legend = "bottom",
  bg = "black",
  axes.color = "white"
)
## [1] 0.04026762 0.06512394 0.04579045