Kira Plastinina is a Russian brand that is sold through a defunct chain of retail stores in Russia, Ukraine, Kazakhstan, Belarus, China, the Philippines, and Armenia. The brand’s Sales and Marketing team would like to understand their customers’ behavior from data that they have collected over the past year. More specifically, they would like to learn the characteristics of customer groups.
The dataset for this Independent Project can be found here: [http://bit.ly/EcommerceCustomersDataset]
# Load the online shoppers dataset from its local download location
data_path <- "C:/Users/user/Downloads/online_shoppers_intention.csv"
df <- read.csv(data_path)
# Preview the first rows of the loaded data
head(df)
## Administrative Administrative_Duration Informational Informational_Duration
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 -1 0 -1
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues
## 1 1 0.000000 0.20000000 0.2000000 0
## 2 2 64.000000 0.00000000 0.1000000 0
## 3 1 -1.000000 0.20000000 0.2000000 0
## 4 2 2.666667 0.05000000 0.1400000 0
## 5 10 627.500000 0.02000000 0.0500000 0
## 6 19 154.216667 0.01578947 0.0245614 0
## SpecialDay Month OperatingSystems Browser Region TrafficType
## 1 0 Feb 1 1 1 1
## 2 0 Feb 2 2 1 2
## 3 0 Feb 4 1 9 3
## 4 0 Feb 3 2 2 4
## 5 0 Feb 3 3 1 4
## 6 0 Feb 2 2 1 3
## VisitorType Weekend Revenue
## 1 Returning_Visitor FALSE FALSE
## 2 Returning_Visitor FALSE FALSE
## 3 Returning_Visitor FALSE FALSE
## 4 Returning_Visitor FALSE FALSE
## 5 Returning_Visitor TRUE FALSE
## 6 Returning_Visitor FALSE FALSE
# counting the missing values in each column
vapply(df, function(column) sum(is.na(column)), numeric(1))
## Administrative Administrative_Duration Informational
## 14 14 14
## Informational_Duration ProductRelated ProductRelated_Duration
## 14 14 14
## BounceRates ExitRates PageValues
## 14 14 0
## SpecialDay Month OperatingSystems
## 0 0 0
## Browser Region TrafficType
## 0 0 0
## VisitorType Weekend Revenue
## 0 0 0
# removing every row that contains at least one missing value
df <- na.omit(object = df)
# verifying that no column has missing values left
vapply(df, function(column) sum(is.na(column)), numeric(1))
## Administrative Administrative_Duration Informational
## 0 0 0
## Informational_Duration ProductRelated ProductRelated_Duration
## 0 0 0
## BounceRates ExitRates PageValues
## 0 0 0
## SpecialDay Month OperatingSystems
## 0 0 0
## Browser Region TrafficType
## 0 0 0
## VisitorType Weekend Revenue
## 0 0 0
# Collect every row that repeats an earlier row, then display them
duplicates <- df[which(duplicated(df)), ]
print(duplicates)
## Administrative Administrative_Duration Informational
## 159 0 0 0
## 179 0 0 0
## 419 0 0 0
## 457 0 0 0
## 484 0 0 0
## 513 0 0 0
## 555 0 0 0
## 590 0 0 0
## 660 0 0 0
## 775 0 0 0
## 873 0 0 0
## 890 0 0 0
## 923 0 0 0
## 948 0 0 0
## 975 0 0 0
## 1035 0 0 0
## 1120 0 0 0
## 1171 0 0 0
## 1177 0 0 0
## 1214 0 0 0
## 1215 0 0 0
## 1292 0 0 0
## 1326 0 0 0
## 1357 0 0 0
## 1367 0 0 0
## 1382 0 0 0
## 1391 0 0 0
## 1395 0 0 0
## 1437 0 0 0
## 1454 0 0 0
## 1516 0 0 0
## 1574 0 0 0
## 1609 0 0 0
## 1698 0 0 0
## 1776 0 0 0
## 1805 0 0 0
## 1840 0 0 0
## 1867 0 0 0
## 1926 0 0 0
## 1934 0 0 0
## 1950 0 0 0
## 2057 0 0 0
## 2058 0 0 0
## 2236 0 0 0
## 2622 0 0 0
## 2740 0 0 0
## 3232 0 0 0
## 3273 0 0 0
## 3282 0 0 0
## 3578 0 0 0
## 3651 0 0 0
## 3664 0 0 0
## 3722 0 0 0
## 3892 0 0 0
## 4164 0 0 0
## 4183 0 0 0
## 4232 0 0 0
## 4344 0 0 0
## 4375 0 0 0
## 4404 0 0 0
## 4427 0 0 0
## 4464 0 0 0
## 4490 0 0 0
## 4553 0 0 0
## 4818 0 0 0
## 4884 0 0 0
## 4914 0 0 0
## 5039 0 0 0
## 5044 0 0 0
## 5057 0 0 0
## 5119 0 0 0
## 5199 0 0 0
## 5200 0 0 0
## 5255 0 0 0
## 5277 0 0 0
## 5287 0 0 0
## 5356 0 0 0
## 5408 0 0 0
## 6930 0 0 0
## 7152 0 0 0
## 7636 0 0 0
## 8545 0 0 0
## 9307 0 0 0
## 9495 0 0 0
## 9552 0 0 0
## 9569 0 0 0
## 9582 0 0 0
## 9719 0 0 0
## 9770 0 0 0
## 9879 0 0 0
## 9908 0 0 0
## 10147 0 0 0
## 10223 0 0 0
## 10270 0 0 0
## 10573 0 0 0
## 10632 0 0 0
## 10752 0 0 0
## 10796 0 0 0
## 10842 0 0 0
## 10989 0 0 0
## 11044 0 0 0
## 11206 0 0 0
## 11405 0 0 0
## 11524 0 0 0
## 11582 0 0 0
## 11625 0 0 0
## 11659 0 0 0
## 11734 0 0 0
## 11748 0 0 0
## 11802 0 0 0
## 11814 0 0 0
## 11828 0 0 0
## 11935 0 0 0
## 11939 0 0 0
## 12160 0 0 0
## 12181 0 0 0
## 12186 0 0 0
## Informational_Duration ProductRelated ProductRelated_Duration BounceRates
## 159 0 1 0 0.2
## 179 0 1 0 0.2
## 419 0 1 0 0.2
## 457 0 1 0 0.2
## 484 0 1 0 0.2
## 513 0 1 0 0.2
## 555 0 1 0 0.2
## 590 0 1 0 0.2
## 660 0 2 0 0.2
## 775 0 1 0 0.2
## 873 0 1 0 0.2
## 890 0 1 0 0.2
## 923 0 1 0 0.2
## 948 0 1 0 0.2
## 975 0 1 0 0.2
## 1035 0 1 0 0.2
## 1120 0 1 0 0.2
## 1171 0 1 0 0.2
## 1177 0 1 0 0.2
## 1214 0 1 0 0.2
## 1215 0 1 0 0.2
## 1292 0 2 0 0.2
## 1326 0 1 0 0.2
## 1357 0 2 0 0.2
## 1367 0 1 0 0.2
## 1382 0 1 0 0.2
## 1391 0 1 0 0.2
## 1395 0 1 0 0.2
## 1437 0 1 0 0.2
## 1454 0 1 0 0.2
## 1516 0 1 0 0.2
## 1574 0 1 0 0.2
## 1609 0 1 0 0.2
## 1698 0 1 0 0.2
## 1776 0 1 0 0.2
## 1805 0 1 0 0.2
## 1840 0 1 0 0.2
## 1867 0 1 0 0.2
## 1926 0 1 0 0.2
## 1934 0 1 0 0.2
## 1950 0 1 0 0.2
## 2057 0 1 0 0.2
## 2058 0 1 0 0.2
## 2236 0 1 0 0.2
## 2622 0 1 0 0.2
## 2740 0 1 0 0.2
## 3232 0 1 0 0.2
## 3273 0 1 0 0.2
## 3282 0 1 0 0.2
## 3578 0 1 0 0.2
## 3651 0 1 0 0.2
## 3664 0 1 0 0.2
## 3722 0 1 0 0.2
## 3892 0 1 0 0.2
## 4164 0 1 0 0.2
## 4183 0 1 0 0.2
## 4232 0 1 0 0.2
## 4344 0 1 0 0.2
## 4375 0 1 0 0.2
## 4404 0 1 0 0.2
## 4427 0 1 0 0.2
## 4464 0 1 0 0.2
## 4490 0 1 0 0.2
## 4553 0 2 0 0.2
## 4818 0 1 0 0.2
## 4884 0 1 0 0.2
## 4914 0 1 0 0.2
## 5039 0 1 0 0.2
## 5044 0 1 0 0.2
## 5057 0 1 0 0.2
## 5119 0 1 0 0.2
## 5199 0 1 0 0.2
## 5200 0 2 0 0.2
## 5255 0 1 0 0.2
## 5277 0 1 0 0.2
## 5287 0 1 0 0.2
## 5356 0 1 0 0.2
## 5408 0 1 0 0.2
## 6930 0 1 0 0.2
## 7152 0 1 0 0.2
## 7636 0 1 0 0.2
## 8545 0 1 0 0.2
## 9307 0 1 0 0.2
## 9495 0 1 0 0.2
## 9552 0 1 0 0.2
## 9569 0 1 0 0.2
## 9582 0 1 0 0.2
## 9719 0 1 0 0.2
## 9770 0 1 0 0.2
## 9879 0 1 0 0.2
## 9908 0 1 0 0.2
## 10147 0 1 0 0.2
## 10223 0 2 0 0.2
## 10270 0 1 0 0.2
## 10573 0 1 0 0.2
## 10632 0 1 0 0.2
## 10752 0 1 0 0.2
## 10796 0 1 0 0.2
## 10842 0 1 0 0.2
## 10989 0 1 0 0.2
## 11044 0 1 0 0.2
## 11206 0 1 0 0.2
## 11405 0 1 0 0.2
## 11524 0 1 0 0.2
## 11582 0 1 0 0.2
## 11625 0 1 0 0.2
## 11659 0 1 0 0.2
## 11734 0 1 0 0.2
## 11748 0 1 0 0.2
## 11802 0 1 0 0.2
## 11814 0 1 0 0.2
## 11828 0 1 0 0.2
## 11935 0 1 0 0.2
## 11939 0 1 0 0.2
## 12160 0 1 0 0.2
## 12181 0 1 0 0.2
## 12186 0 1 0 0.2
## ExitRates PageValues SpecialDay Month OperatingSystems Browser Region
## 159 0.2 0 0.0 Feb 1 1 1
## 179 0.2 0 0.0 Feb 3 2 3
## 419 0.2 0 0.0 Mar 1 1 1
## 457 0.2 0 0.0 Mar 2 2 4
## 484 0.2 0 0.0 Mar 3 2 3
## 513 0.2 0 0.0 Mar 2 2 1
## 555 0.2 0 0.0 Mar 2 2 1
## 590 0.2 0 0.0 Mar 2 2 1
## 660 0.2 0 0.0 Mar 2 5 1
## 775 0.2 0 0.0 Mar 2 2 4
## 873 0.2 0 0.0 Mar 3 2 3
## 890 0.2 0 0.0 Mar 1 1 2
## 923 0.2 0 0.0 Mar 3 2 2
## 948 0.2 0 0.0 Mar 2 2 1
## 975 0.2 0 0.0 Mar 2 2 1
## 1035 0.2 0 0.0 Mar 2 2 1
## 1120 0.2 0 0.0 Mar 2 2 1
## 1171 0.2 0 0.0 Mar 3 2 1
## 1177 0.2 0 0.0 Mar 2 4 1
## 1214 0.2 0 0.0 Mar 3 2 3
## 1215 0.2 0 0.0 Mar 1 1 1
## 1292 0.2 0 0.0 Mar 2 2 1
## 1326 0.2 0 0.0 Mar 1 1 3
## 1357 0.2 0 0.0 Mar 1 1 1
## 1367 0.2 0 0.0 Mar 1 1 8
## 1382 0.2 0 0.0 Mar 1 1 4
## 1391 0.2 0 0.0 Mar 2 2 1
## 1395 0.2 0 0.0 Mar 2 2 1
## 1437 0.2 0 0.0 Mar 3 2 3
## 1454 0.2 0 0.0 Mar 2 2 1
## 1516 0.2 0 0.0 Mar 1 1 1
## 1574 0.2 0 0.0 Mar 2 2 1
## 1609 0.2 0 0.0 Mar 2 2 7
## 1698 0.2 0 0.0 Mar 2 2 2
## 1776 0.2 0 0.0 Mar 3 2 1
## 1805 0.2 0 0.0 Mar 1 1 8
## 1840 0.2 0 0.0 Mar 2 2 1
## 1867 0.2 0 0.0 Mar 1 1 1
## 1926 0.2 0 0.0 Mar 3 2 1
## 1934 0.2 0 0.0 Mar 2 2 1
## 1950 0.2 0 0.0 Mar 2 2 1
## 2057 0.2 0 0.0 Mar 3 2 3
## 2058 0.2 0 0.0 Mar 2 4 1
## 2236 0.2 0 0.0 May 1 1 4
## 2622 0.2 0 0.0 May 1 1 1
## 2740 0.2 0 0.0 May 2 2 1
## 3232 0.2 0 0.0 May 2 4 1
## 3273 0.2 0 0.0 May 1 1 3
## 3282 0.2 0 0.0 May 1 1 1
## 3578 0.2 0 0.0 May 2 2 1
## 3651 0.2 0 0.0 May 2 2 4
## 3664 0.2 0 0.0 May 1 1 1
## 3722 0.2 0 0.0 May 1 1 4
## 3892 0.2 0 0.0 May 2 2 7
## 4164 0.2 0 0.0 May 1 1 4
## 4183 0.2 0 0.0 May 1 1 1
## 4232 0.2 0 0.0 May 2 2 2
## 4344 0.2 0 0.0 May 3 2 1
## 4375 0.2 0 0.0 May 2 2 1
## 4404 0.2 0 0.0 May 2 2 1
## 4427 0.2 0 0.0 May 2 2 1
## 4464 0.2 0 0.0 May 1 1 1
## 4490 0.2 0 0.0 May 3 2 9
## 4553 0.2 0 0.0 May 2 2 2
## 4818 0.2 0 0.0 May 2 2 1
## 4884 0.2 0 0.0 May 2 2 1
## 4914 0.2 0 0.8 May 2 2 1
## 5039 0.2 0 0.0 May 3 2 3
## 5044 0.2 0 0.0 May 2 2 1
## 5057 0.2 0 0.0 May 2 2 6
## 5119 0.2 0 0.0 May 1 1 6
## 5199 0.2 0 0.0 May 2 2 1
## 5200 0.2 0 0.0 May 2 2 2
## 5255 0.2 0 0.6 May 2 2 1
## 5277 0.2 0 0.0 May 3 2 3
## 5287 0.2 0 0.0 May 1 1 3
## 5356 0.2 0 0.0 May 1 1 3
## 5408 0.2 0 0.0 May 2 4 1
## 6930 0.2 0 0.0 June 2 2 1
## 7152 0.2 0 0.0 June 2 2 1
## 7636 0.2 0 0.0 June 3 2 3
## 8545 0.2 0 0.0 Nov 3 2 3
## 9307 0.2 0 0.0 Dec 3 2 3
## 9495 0.2 0 0.0 Dec 2 2 1
## 9552 0.2 0 0.0 Nov 3 2 4
## 9569 0.2 0 0.0 Dec 2 2 8
## 9582 0.2 0 0.0 Nov 2 2 1
## 9719 0.2 0 0.0 Nov 3 2 7
## 9770 0.2 0 0.0 Dec 2 2 2
## 9879 0.2 0 0.0 Dec 2 2 6
## 9908 0.2 0 0.0 Dec 2 2 1
## 10147 0.2 0 0.0 Dec 8 13 9
## 10223 0.2 0 0.0 Nov 1 1 1
## 10270 0.2 0 0.0 Nov 1 1 3
## 10573 0.2 0 0.0 Nov 2 2 3
## 10632 0.2 0 0.0 Nov 2 2 1
## 10752 0.2 0 0.0 Dec 1 1 1
## 10796 0.2 0 0.0 Nov 1 1 4
## 10842 0.2 0 0.0 Nov 2 2 3
## 10989 0.2 0 0.0 Nov 2 4 3
## 11044 0.2 0 0.0 Dec 3 2 6
## 11206 0.2 0 0.0 Dec 8 13 9
## 11405 0.2 0 0.0 Nov 3 2 1
## 11524 0.2 0 0.0 Dec 2 2 1
## 11582 0.2 0 0.0 Dec 8 13 9
## 11625 0.2 0 0.0 Nov 3 2 1
## 11659 0.2 0 0.0 Dec 1 1 1
## 11734 0.2 0 0.0 Nov 2 2 1
## 11748 0.2 0 0.0 Nov 1 1 3
## 11802 0.2 0 0.0 Dec 1 1 4
## 11814 0.2 0 0.0 Dec 2 2 1
## 11828 0.2 0 0.0 Dec 2 2 1
## 11935 0.2 0 0.0 Dec 1 1 1
## 11939 0.2 0 0.0 Dec 1 1 4
## 12160 0.2 0 0.0 Dec 1 1 1
## 12181 0.2 0 0.0 Dec 1 13 9
## 12186 0.2 0 0.0 Dec 8 13 9
## TrafficType VisitorType Weekend Revenue
## 159 3 Returning_Visitor FALSE FALSE
## 179 3 Returning_Visitor FALSE FALSE
## 419 1 Returning_Visitor TRUE FALSE
## 457 1 Returning_Visitor FALSE FALSE
## 484 1 Returning_Visitor FALSE FALSE
## 513 1 Returning_Visitor FALSE FALSE
## 555 1 Returning_Visitor FALSE FALSE
## 590 1 Returning_Visitor FALSE FALSE
## 660 1 Returning_Visitor FALSE FALSE
## 775 1 Returning_Visitor FALSE FALSE
## 873 1 Returning_Visitor FALSE FALSE
## 890 1 Returning_Visitor FALSE FALSE
## 923 1 Returning_Visitor FALSE FALSE
## 948 1 Returning_Visitor FALSE FALSE
## 975 1 Returning_Visitor FALSE FALSE
## 1035 1 Returning_Visitor FALSE FALSE
## 1120 1 Returning_Visitor FALSE FALSE
## 1171 1 Returning_Visitor FALSE FALSE
## 1177 1 Returning_Visitor FALSE FALSE
## 1214 1 Returning_Visitor FALSE FALSE
## 1215 3 Returning_Visitor FALSE FALSE
## 1292 1 Returning_Visitor FALSE FALSE
## 1326 3 Returning_Visitor FALSE FALSE
## 1357 1 Returning_Visitor FALSE FALSE
## 1367 1 Returning_Visitor FALSE FALSE
## 1382 1 Returning_Visitor FALSE FALSE
## 1391 1 Returning_Visitor FALSE FALSE
## 1395 1 Returning_Visitor FALSE FALSE
## 1437 1 Returning_Visitor FALSE FALSE
## 1454 1 Returning_Visitor FALSE FALSE
## 1516 3 Returning_Visitor TRUE FALSE
## 1574 1 Returning_Visitor FALSE FALSE
## 1609 1 Returning_Visitor FALSE FALSE
## 1698 1 Returning_Visitor FALSE FALSE
## 1776 1 Returning_Visitor FALSE FALSE
## 1805 1 Returning_Visitor FALSE FALSE
## 1840 3 Returning_Visitor FALSE FALSE
## 1867 9 Returning_Visitor TRUE FALSE
## 1926 1 Returning_Visitor FALSE FALSE
## 1934 1 Returning_Visitor FALSE FALSE
## 1950 1 Returning_Visitor FALSE FALSE
## 2057 1 Returning_Visitor FALSE FALSE
## 2058 1 Returning_Visitor FALSE FALSE
## 2236 3 Returning_Visitor FALSE FALSE
## 2622 3 Returning_Visitor FALSE FALSE
## 2740 1 Returning_Visitor FALSE FALSE
## 3232 3 Returning_Visitor FALSE FALSE
## 3273 3 Returning_Visitor FALSE FALSE
## 3282 3 Returning_Visitor FALSE FALSE
## 3578 4 Returning_Visitor FALSE FALSE
## 3651 1 Returning_Visitor FALSE FALSE
## 3664 3 Returning_Visitor FALSE FALSE
## 3722 3 Returning_Visitor FALSE FALSE
## 3892 4 Returning_Visitor FALSE FALSE
## 4164 3 Returning_Visitor FALSE FALSE
## 4183 3 Returning_Visitor FALSE FALSE
## 4232 1 Returning_Visitor FALSE FALSE
## 4344 13 Returning_Visitor FALSE FALSE
## 4375 3 Returning_Visitor FALSE FALSE
## 4404 3 Returning_Visitor FALSE FALSE
## 4427 3 Returning_Visitor FALSE FALSE
## 4464 3 Returning_Visitor FALSE FALSE
## 4490 3 Returning_Visitor FALSE FALSE
## 4553 3 Returning_Visitor FALSE FALSE
## 4818 3 Returning_Visitor FALSE FALSE
## 4884 3 Returning_Visitor FALSE FALSE
## 4914 1 Returning_Visitor FALSE FALSE
## 5039 3 Returning_Visitor FALSE FALSE
## 5044 3 Returning_Visitor FALSE FALSE
## 5057 3 Returning_Visitor FALSE FALSE
## 5119 4 Returning_Visitor TRUE FALSE
## 5199 13 Returning_Visitor FALSE FALSE
## 5200 3 Returning_Visitor FALSE FALSE
## 5255 1 Returning_Visitor FALSE FALSE
## 5277 13 Returning_Visitor FALSE FALSE
## 5287 15 Returning_Visitor FALSE FALSE
## 5356 3 Returning_Visitor FALSE FALSE
## 5408 6 Returning_Visitor FALSE FALSE
## 6930 1 Returning_Visitor FALSE FALSE
## 7152 1 Returning_Visitor FALSE FALSE
## 7636 13 Returning_Visitor FALSE FALSE
## 8545 3 Returning_Visitor FALSE FALSE
## 9307 1 Returning_Visitor TRUE FALSE
## 9495 3 Returning_Visitor FALSE FALSE
## 9552 3 Returning_Visitor FALSE FALSE
## 9569 1 Returning_Visitor FALSE FALSE
## 9582 1 Returning_Visitor FALSE FALSE
## 9719 13 Returning_Visitor FALSE FALSE
## 9770 1 Returning_Visitor FALSE FALSE
## 9879 13 Returning_Visitor FALSE FALSE
## 9908 13 Returning_Visitor FALSE FALSE
## 10147 20 Other FALSE FALSE
## 10223 1 Returning_Visitor FALSE FALSE
## 10270 2 Returning_Visitor FALSE FALSE
## 10573 1 Returning_Visitor FALSE FALSE
## 10632 1 Returning_Visitor FALSE FALSE
## 10752 1 Returning_Visitor TRUE FALSE
## 10796 1 Returning_Visitor FALSE FALSE
## 10842 1 Returning_Visitor FALSE FALSE
## 10989 3 Returning_Visitor FALSE FALSE
## 11044 1 Returning_Visitor FALSE FALSE
## 11206 20 Other FALSE FALSE
## 11405 13 Returning_Visitor FALSE FALSE
## 11524 13 Returning_Visitor FALSE FALSE
## 11582 20 Other FALSE FALSE
## 11625 1 Returning_Visitor FALSE FALSE
## 11659 1 Returning_Visitor TRUE FALSE
## 11734 1 Returning_Visitor FALSE FALSE
## 11748 3 Returning_Visitor FALSE FALSE
## 11802 1 Returning_Visitor TRUE FALSE
## 11814 1 Returning_Visitor FALSE FALSE
## 11828 1 Returning_Visitor FALSE FALSE
## 11935 2 New_Visitor FALSE FALSE
## 11939 1 Returning_Visitor TRUE FALSE
## 12160 3 Returning_Visitor FALSE FALSE
## 12181 20 Returning_Visitor FALSE FALSE
## 12186 20 Other FALSE FALSE
I will drop the duplicate rows.
# removing duplicated rows (the first occurrence of each row is kept)
df <- unique(df)
### Dataset structure
str(object = df)
## 'data.frame': 12199 obs. of 18 variables:
## $ Administrative : int 0 0 0 0 0 0 0 1 0 0 ...
## $ Administrative_Duration: num 0 0 -1 0 0 0 -1 -1 0 0 ...
## $ Informational : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Informational_Duration : num 0 0 -1 0 0 0 -1 -1 0 0 ...
## $ ProductRelated : int 1 2 1 2 10 19 1 1 2 3 ...
## $ ProductRelated_Duration: num 0 64 -1 2.67 627.5 ...
## $ BounceRates : num 0.2 0 0.2 0.05 0.02 ...
## $ ExitRates : num 0.2 0.1 0.2 0.14 0.05 ...
## $ PageValues : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SpecialDay : num 0 0 0 0 0 0 0.4 0 0.8 0.4 ...
## $ Month : chr "Feb" "Feb" "Feb" "Feb" ...
## $ OperatingSystems : int 1 2 4 3 3 2 2 1 2 2 ...
## $ Browser : int 1 2 1 2 3 2 4 2 2 4 ...
## $ Region : int 1 1 9 2 1 1 3 1 2 1 ...
## $ TrafficType : int 1 2 3 4 4 3 3 5 3 2 ...
## $ VisitorType : chr "Returning_Visitor" "Returning_Visitor" "Returning_Visitor" "Returning_Visitor" ...
## $ Weekend : logi FALSE FALSE FALSE FALSE TRUE FALSE ...
## $ Revenue : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## - attr(*, "na.action")= 'omit' Named int [1:14] 1066 1133 1134 1135 1136 1137 1474 1475 1476 1477 ...
## ..- attr(*, "names")= chr [1:14] "1066" "1133" "1134" "1135" ...
# converting the categorical columns to factors in one pass
categorical_cols <- c("Month", "VisitorType", "Weekend", "Revenue")
df[categorical_cols] <- lapply(df[categorical_cols], factor)
### Dataset structure
str(object = df)
## 'data.frame': 12199 obs. of 18 variables:
## $ Administrative : int 0 0 0 0 0 0 0 1 0 0 ...
## $ Administrative_Duration: num 0 0 -1 0 0 0 -1 -1 0 0 ...
## $ Informational : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Informational_Duration : num 0 0 -1 0 0 0 -1 -1 0 0 ...
## $ ProductRelated : int 1 2 1 2 10 19 1 1 2 3 ...
## $ ProductRelated_Duration: num 0 64 -1 2.67 627.5 ...
## $ BounceRates : num 0.2 0 0.2 0.05 0.02 ...
## $ ExitRates : num 0.2 0.1 0.2 0.14 0.05 ...
## $ PageValues : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SpecialDay : num 0 0 0 0 0 0 0.4 0 0.8 0.4 ...
## $ Month : Factor w/ 10 levels "Aug","Dec","Feb",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ OperatingSystems : int 1 2 4 3 3 2 2 1 2 2 ...
## $ Browser : int 1 2 1 2 3 2 4 2 2 4 ...
## $ Region : int 1 1 9 2 1 1 3 1 2 1 ...
## $ TrafficType : int 1 2 3 4 4 3 3 5 3 2 ...
## $ VisitorType : Factor w/ 3 levels "New_Visitor",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Weekend : Factor w/ 2 levels "FALSE","TRUE": 1 1 1 1 2 1 1 2 1 1 ...
## $ Revenue : Factor w/ 2 levels "FALSE","TRUE": 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "na.action")= 'omit' Named int [1:14] 1066 1133 1134 1135 1136 1137 1474 1475 1476 1477 ...
## ..- attr(*, "names")= chr [1:14] "1066" "1133" "1134" "1135" ...
# Return the most frequent value of a vector (the statistical mode).
# Ties are resolved in favour of the value that appears first in `v`.
# NOTE: this definition masks base::mode() for the rest of the session.
mode <- function(v) {
  distinct_values <- unique(v)
  # position of each element of v within the distinct values
  positions <- match(v, distinct_values)
  # occurrences per distinct value, in first-seen order
  counts <- tabulate(positions)
  distinct_values[which.max(counts)]
}
# Per-column summary: quartiles and mean for numeric columns, level counts for factors
summary(df)
## Administrative Administrative_Duration Informational
## Min. : 0.00 Min. : -1.00 Min. : 0.0000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.0000
## Median : 1.00 Median : 9.00 Median : 0.0000
## Mean : 2.34 Mean : 81.68 Mean : 0.5088
## 3rd Qu.: 4.00 3rd Qu.: 94.75 3rd Qu.: 0.0000
## Max. :27.00 Max. :3398.75 Max. :24.0000
##
## Informational_Duration ProductRelated ProductRelated_Duration
## Min. : -1.00 Min. : 0.00 Min. : -1.0
## 1st Qu.: 0.00 1st Qu.: 8.00 1st Qu.: 193.6
## Median : 0.00 Median : 18.00 Median : 609.5
## Mean : 34.84 Mean : 32.06 Mean : 1207.5
## 3rd Qu.: 0.00 3rd Qu.: 38.00 3rd Qu.: 1477.6
## Max. :2549.38 Max. :705.00 Max. :63973.5
##
## BounceRates ExitRates PageValues SpecialDay
## Min. :0.00000 Min. :0.00000 Min. : 0.000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.01422 1st Qu.: 0.000 1st Qu.:0.00000
## Median :0.00293 Median :0.02500 Median : 0.000 Median :0.00000
## Mean :0.02045 Mean :0.04150 Mean : 5.952 Mean :0.06197
## 3rd Qu.:0.01667 3rd Qu.:0.04848 3rd Qu.: 0.000 3rd Qu.:0.00000
## Max. :0.20000 Max. :0.20000 Max. :361.764 Max. :1.00000
##
## Month OperatingSystems Browser Region
## May :3328 Min. :1.000 Min. : 1.000 Min. :1.000
## Nov :2983 1st Qu.:2.000 1st Qu.: 2.000 1st Qu.:1.000
## Mar :1853 Median :2.000 Median : 2.000 Median :3.000
## Dec :1706 Mean :2.124 Mean : 2.358 Mean :3.153
## Oct : 549 3rd Qu.:3.000 3rd Qu.: 2.000 3rd Qu.:4.000
## Sep : 448 Max. :8.000 Max. :13.000 Max. :9.000
## (Other):1332
## TrafficType VisitorType Weekend Revenue
## Min. : 1.000 New_Visitor : 1693 FALSE:9343 FALSE:10291
## 1st Qu.: 2.000 Other : 81 TRUE :2856 TRUE : 1908
## Median : 2.000 Returning_Visitor:10425
## Mean : 4.075
## 3rd Qu.: 4.000
## Max. :20.000
##
# Extended descriptive statistics per column (n, mean, sd, median, trimmed, mad, min, max, range, skew, kurtosis, se)
# NOTE(review): describe() is not defined in this file — presumably psych::describe; confirm the package is loaded
describe(df)
## vars n mean sd median trimmed mad min
## Administrative 1 12199 2.34 3.33 1.00 1.66 1.48 0
## Administrative_Duration 2 12199 81.68 177.53 9.00 42.87 13.34 -1
## Informational 3 12199 0.51 1.28 0.00 0.18 0.00 0
## Informational_Duration 4 12199 34.84 141.46 0.00 3.73 0.00 -1
## ProductRelated 5 12199 32.06 44.60 18.00 23.06 19.27 0
## ProductRelated_Duration 6 12199 1207.51 1919.93 609.54 832.36 745.12 -1
## BounceRates 7 12199 0.02 0.05 0.00 0.01 0.00 0
## ExitRates 8 12199 0.04 0.05 0.03 0.03 0.02 0
## PageValues 9 12199 5.95 18.66 0.00 1.33 0.00 0
## SpecialDay 10 12199 0.06 0.20 0.00 0.00 0.00 0
## Month* 11 12199 6.17 2.37 7.00 6.36 1.48 1
## OperatingSystems 12 12199 2.12 0.91 2.00 2.06 0.00 1
## Browser 13 12199 2.36 1.71 2.00 2.00 0.00 1
## Region 14 12199 3.15 2.40 3.00 2.79 2.97 1
## TrafficType 15 12199 4.07 4.02 2.00 3.22 1.48 1
## VisitorType* 16 12199 2.72 0.69 3.00 2.89 0.00 1
## Weekend* 17 12199 1.23 0.42 1.00 1.17 0.00 1
## Revenue* 18 12199 1.16 0.36 1.00 1.07 0.00 1
## max range skew kurtosis se
## Administrative 27.00 27.00 1.95 4.63 0.03
## Administrative_Duration 3398.75 3399.75 5.59 50.09 1.61
## Informational 24.00 24.00 4.01 26.64 0.01
## Informational_Duration 2549.38 2550.38 7.54 75.45 1.28
## ProductRelated 705.00 705.00 4.33 31.04 0.40
## ProductRelated_Duration 63973.52 63974.52 7.25 136.57 17.38
## BounceRates 0.20 0.20 3.15 9.25 0.00
## ExitRates 0.20 0.20 2.23 4.62 0.00
## PageValues 361.76 361.76 6.35 64.93 0.17
## SpecialDay 1.00 1.00 3.28 9.78 0.00
## Month* 10.00 9.00 -0.83 -0.37 0.02
## OperatingSystems 8.00 7.00 2.03 10.27 0.01
## Browser 13.00 12.00 3.22 12.53 0.02
## Region 9.00 8.00 0.98 -0.16 0.02
## TrafficType 20.00 19.00 1.96 3.47 0.04
## VisitorType* 3.00 2.00 -2.05 2.23 0.01
## Weekend* 2.00 1.00 1.26 -0.42 0.00
## Revenue* 2.00 1.00 1.89 1.58 0.00
# Most frequent value of selected columns, using the mode() helper defined in this file (it masks base::mode)
mode(df$Administrative)
## [1] 0
mode(df$Informational)
## [1] 0
mode(df$BounceRates)
## [1] 0
mode(df$ExitRates)
## [1] 0.2
mode(df$PageValues)
## [1] 0
mode(df$SpecialDay)
## [1] 0
mode(df$Month)
## [1] May
## Levels: Aug Dec Feb Jul June Mar May Nov Oct Sep
mode(df$OperatingSystems)
## [1] 2
mode(df$Browser)
## [1] 2
mode(df$Region)
## [1] 1
mode(df$TrafficType)
## [1] 2
# Univariate plots of the cleaned data
# NOTE(review): plot_histogram() and plot_bar() are not defined in this file — presumably DataExplorer; confirm the package is loaded
plot_histogram(df)
plot_bar(df)
All the data profiling statistics will be organized into the report below.
# Build the data profiling report for df; the log that follows shows it being rendered to report.html
create_report(df)
##
##
## processing file: report.rmd
##
|
| | 0%
|
|.. | 2%
## inline R code fragments
##
##
|
|... | 5%
## label: global_options (with options)
## List of 1
## $ include: logi FALSE
##
##
|
|..... | 7%
## ordinary text without R code
##
##
|
|....... | 10%
## label: introduce
##
|
|........ | 12%
## ordinary text without R code
##
##
|
|.......... | 14%
## label: plot_intro
##
|
|............ | 17%
## ordinary text without R code
##
##
|
|............. | 19%
## label: data_structure
##
|
|............... | 21%
## ordinary text without R code
##
##
|
|................. | 24%
## label: missing_profile
##
|
|.................. | 26%
## ordinary text without R code
##
##
|
|.................... | 29%
## label: univariate_distribution_header
##
|
|...................... | 31%
## ordinary text without R code
##
##
|
|....................... | 33%
## label: plot_histogram
##
|
|......................... | 36%
## ordinary text without R code
##
##
|
|........................... | 38%
## label: plot_density
##
|
|............................ | 40%
## ordinary text without R code
##
##
|
|.............................. | 43%
## label: plot_frequency_bar
##
|
|................................ | 45%
## ordinary text without R code
##
##
|
|................................. | 48%
## label: plot_response_bar
##
|
|................................... | 50%
## ordinary text without R code
##
##
|
|..................................... | 52%
## label: plot_with_bar
##
|
|...................................... | 55%
## ordinary text without R code
##
##
|
|........................................ | 57%
## label: plot_normal_qq
##
|
|.......................................... | 60%
## ordinary text without R code
##
##
|
|........................................... | 62%
## label: plot_response_qq
##
|
|............................................. | 64%
## ordinary text without R code
##
##
|
|............................................... | 67%
## label: plot_by_qq
##
|
|................................................ | 69%
## ordinary text without R code
##
##
|
|.................................................. | 71%
## label: correlation_analysis
##
|
|.................................................... | 74%
## ordinary text without R code
##
##
|
|..................................................... | 76%
## label: principal_component_analysis
##
|
|....................................................... | 79%
## ordinary text without R code
##
##
|
|......................................................... | 81%
## label: bivariate_distribution_header
##
|
|.......................................................... | 83%
## ordinary text without R code
##
##
|
|............................................................ | 86%
## label: plot_response_boxplot
##
|
|.............................................................. | 88%
## ordinary text without R code
##
##
|
|............................................................... | 90%
## label: plot_by_boxplot
##
|
|................................................................. | 93%
## ordinary text without R code
##
##
|
|................................................................... | 95%
## label: plot_response_scatterplot
##
|
|.................................................................... | 98%
## ordinary text without R code
##
##
|
|......................................................................| 100%
## label: plot_by_scatterplot
## output file: C:/Users/user/Documents/IP_W13_Part 2/report.knit.md
## "C:/Program Files/RStudio/bin/quarto/bin/pandoc" +RTS -K512m -RTS "C:/Users/user/Documents/IP_W13_Part 2/report.knit.md" --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc315c753d57ef.html --lua-filter "C:\Users\user\Documents\R\win-library\4.1\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\user\Documents\R\win-library\4.1\rmarkdown\rmarkdown\lua\latex-div.lua" --self-contained --variable bs3=TRUE --standalone --section-divs --table-of-contents --toc-depth 6 --template "C:\Users\user\Documents\R\win-library\4.1\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=yeti --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\user\AppData\Local\Temp\Rtmpol5I5k\rmarkdown-str315c6b4f50e7.html"
##
## Output created: report.html
The link for the report is here: “file:///C:/Users/user/Documents/IP_W13_Part%202/report.html#qq-plot”
Step 1: One hot encoding of the factor variables.
# One-hot encode the factor variables: build the encoder from every column
# of df, then apply it to df to obtain an all-numeric data frame
dmy <- dummyVars(" ~ .", data = df)
df2 <- data.frame(predict(object = dmy, newdata = df))
# Checking the data type of each attribute.
# vapply() guarantees a named character vector; sapply() would silently
# return a list if any column carried more than one class. Multi-class
# columns are reported as "class1/class2".
vapply(
  df2,
  function(column) paste(class(column), collapse = "/"),
  character(1)
)
## Administrative Administrative_Duration
## "numeric" "numeric"
## Informational Informational_Duration
## "numeric" "numeric"
## ProductRelated ProductRelated_Duration
## "numeric" "numeric"
## BounceRates ExitRates
## "numeric" "numeric"
## PageValues SpecialDay
## "numeric" "numeric"
## Month.Aug Month.Dec
## "numeric" "numeric"
## Month.Feb Month.Jul
## "numeric" "numeric"
## Month.June Month.Mar
## "numeric" "numeric"
## Month.May Month.Nov
## "numeric" "numeric"
## Month.Oct Month.Sep
## "numeric" "numeric"
## OperatingSystems Browser
## "numeric" "numeric"
## Region TrafficType
## "numeric" "numeric"
## VisitorType.New_Visitor VisitorType.Other
## "numeric" "numeric"
## VisitorType.Returning_Visitor Weekend.FALSE
## "numeric" "numeric"
## Weekend.TRUE Revenue.FALSE
## "numeric" "numeric"
## Revenue.TRUE
## "numeric"
Step 2: We are instructed to use Revenue as the class label; hence, we will remove it from the features and store it in another variable.
# Step 2
# We are instructed to use Revenue as the class label,
# hence we remove its dummy columns from the features and store the label
# in a separate variable.
# The Revenue dummy columns (Revenue.FALSE / Revenue.TRUE) are located by
# name rather than by position, so the split stays correct if the encoded
# column order or count ever changes.
revenue_cols <- grepl("^Revenue\\.", names(df2))
stopifnot(any(revenue_cols))
df2_copy <- df2[, !revenue_cols, drop = FALSE]
df.class <- df[["Revenue"]]
# Second copy of the same feature set, kept under its original name
df2_copy_copy <- df2[, !revenue_cols, drop = FALSE]
# Previewing the copy dataset with dummies
head(df2_copy)
## Administrative Administrative_Duration Informational Informational_Duration
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 -1 0 -1
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues
## 1 1 0.000000 0.20000000 0.2000000 0
## 2 2 64.000000 0.00000000 0.1000000 0
## 3 1 -1.000000 0.20000000 0.2000000 0
## 4 2 2.666667 0.05000000 0.1400000 0
## 5 10 627.500000 0.02000000 0.0500000 0
## 6 19 154.216667 0.01578947 0.0245614 0
## SpecialDay Month.Aug Month.Dec Month.Feb Month.Jul Month.June Month.Mar
## 1 0 0 0 1 0 0 0
## 2 0 0 0 1 0 0 0
## 3 0 0 0 1 0 0 0
## 4 0 0 0 1 0 0 0
## 5 0 0 0 1 0 0 0
## 6 0 0 0 1 0 0 0
## Month.May Month.Nov Month.Oct Month.Sep OperatingSystems Browser Region
## 1 0 0 0 0 1 1 1
## 2 0 0 0 0 2 2 1
## 3 0 0 0 0 4 1 9
## 4 0 0 0 0 3 2 2
## 5 0 0 0 0 3 3 1
## 6 0 0 0 0 2 2 1
## TrafficType VisitorType.New_Visitor VisitorType.Other
## 1 1 0 0
## 2 2 0 0
## 3 3 0 0
## 4 4 0 0
## 5 4 0 0
## 6 3 0 0
## VisitorType.Returning_Visitor Weekend.FALSE Weekend.TRUE
## 1 1 1 0
## 2 1 1 0
## 3 1 1 0
## 4 1 1 0
## 5 1 0 1
## 6 1 1 0
Step 3: Determining whether to Normalize or Scale the data
# Standardise every feature (centre and scale each column) so that no single
# attribute has more impact on the clustering algorithm than the others
df2_scaled <- scale(df2_copy, center = TRUE, scale = TRUE)
# Inspect the distribution of the standardised features
summary(object = df2_scaled)
## Administrative Administrative_Duration Informational
## Min. :-0.7025 Min. :-0.46574 Min. :-0.3988
## 1st Qu.:-0.7025 1st Qu.:-0.46011 1st Qu.:-0.3988
## Median :-0.4023 Median :-0.40941 Median :-0.3988
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.4984 3rd Qu.: 0.07361 3rd Qu.:-0.3988
## Max. : 7.4035 Max. :18.68474 Max. :18.4127
## Informational_Duration ProductRelated ProductRelated_Duration
## Min. :-0.2533 Min. :-0.7188 Min. :-0.6295
## 1st Qu.:-0.2463 1st Qu.:-0.5394 1st Qu.:-0.5281
## Median :-0.2463 Median :-0.3152 Median :-0.3115
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.:-0.2463 3rd Qu.: 0.1332 3rd Qu.: 0.1407
## Max. :17.7758 Max. :15.0881 Max. :32.6919
## BounceRates ExitRates PageValues SpecialDay
## Min. :-0.45034 Min. :-0.8973 Min. :-0.319 Min. :-0.3103
## 1st Qu.:-0.45034 1st Qu.:-0.5897 1st Qu.:-0.319 1st Qu.:-0.3103
## Median :-0.38580 Median :-0.3567 Median :-0.319 Median :-0.3103
## Mean : 0.00000 Mean : 0.0000 Mean : 0.000 Mean : 0.0000
## 3rd Qu.:-0.08326 3rd Qu.: 0.1511 3rd Qu.:-0.319 3rd Qu.:-0.3103
## Max. : 3.95470 Max. : 3.4273 Max. :19.070 Max. : 4.6969
## Month.Aug Month.Dec Month.Feb Month.Jul
## Min. :-0.1918 Min. :-0.4032 Min. :-0.1231 Min. :-0.1916
## 1st Qu.:-0.1918 1st Qu.:-0.4032 1st Qu.:-0.1231 1st Qu.:-0.1916
## Median :-0.1918 Median :-0.4032 Median :-0.1231 Median :-0.1916
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.:-0.1918 3rd Qu.:-0.4032 3rd Qu.:-0.1231 3rd Qu.:-0.1916
## Max. : 5.2126 Max. : 2.4799 Max. : 8.1254 Max. : 5.2188
## Month.June Month.Mar Month.May Month.Nov
## Min. :-0.1547 Min. :-0.4232 Min. :-0.6125 Min. :-0.5689
## 1st Qu.:-0.1547 1st Qu.:-0.4232 1st Qu.:-0.6125 1st Qu.:-0.5689
## Median :-0.1547 Median :-0.4232 Median :-0.6125 Median :-0.5689
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.:-0.1547 3rd Qu.:-0.4232 3rd Qu.: 1.6326 3rd Qu.:-0.5689
## Max. : 6.4653 Max. : 2.3628 Max. : 1.6326 Max. : 1.7576
## Month.Oct Month.Sep OperatingSystems Browser
## Min. :-0.2171 Min. :-0.1952 Min. :-1.2397 Min. :-0.7940
## 1st Qu.:-0.2171 1st Qu.:-0.1952 1st Qu.:-0.1371 1st Qu.:-0.2094
## Median :-0.2171 Median :-0.1952 Median :-0.1371 Median :-0.2094
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.:-0.2171 3rd Qu.:-0.1952 3rd Qu.: 0.9654 3rd Qu.:-0.2094
## Max. : 4.6064 Max. : 5.1213 Max. : 6.4782 Max. : 6.2212
## Region TrafficType VisitorType.New_Visitor
## Min. :-0.89629 Min. :-0.76562 Min. :-0.4014
## 1st Qu.:-0.89629 1st Qu.:-0.51661 1st Qu.:-0.4014
## Median :-0.06381 Median :-0.51661 Median :-0.4014
## Mean : 0.00000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.35244 3rd Qu.:-0.01858 3rd Qu.:-0.4014
## Max. : 2.43366 Max. : 3.96567 Max. : 2.4910
## VisitorType.Other VisitorType.Returning_Visitor Weekend.FALSE
## Min. :-0.08175 Min. :-2.4241 Min. :-1.8086
## 1st Qu.:-0.08175 1st Qu.: 0.4125 1st Qu.: 0.5529
## Median :-0.08175 Median : 0.4125 Median : 0.5529
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.:-0.08175 3rd Qu.: 0.4125 3rd Qu.: 0.5529
## Max. :12.23081 Max. : 0.4125 Max. : 0.5529
## Weekend.TRUE
## Min. :-0.5529
## 1st Qu.:-0.5529
## Median :-0.5529
## Mean : 0.0000
## 3rd Qu.:-0.5529
## Max. : 1.8086
It is evident that some attributes still have large values compared to others. Scaling (standardization) transforms each attribute to have a mean of 0 and unit standard deviation, but it does not bound the values. We will normalize the data and see if we get different results.
# Normalizing a copy of the original data (min-max rescaling to [0, 1]).
# Rescale one numeric vector to [0, 1]. A constant column has zero range, so
# it is mapped to 0 instead of dividing by zero (which would produce NaN).
rescale_min_max <- function(x) {
  bounds <- range(x)
  span <- bounds[2] - bounds[1]
  if (!is.na(span) && span == 0) {
    return(rep(0, length(x)))
  }
  (x - bounds[1]) / span
}
# Work column-by-column on the data frame itself; apply() would first coerce
# the whole data frame to a matrix. Assigning into `[]` keeps the row and
# column names of the source data.
df2_norm <- as.data.frame(df2_copy)
df2_norm[] <- lapply(df2_norm, rescale_min_max)
# summary of the normalized data.
summary(df2_norm)
## Administrative Administrative_Duration Informational
## Min. :0.00000 Min. :0.0000000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0002941 1st Qu.:0.0000
## Median :0.03704 Median :0.0029414 Median :0.0000
## Mean :0.08667 Mean :0.0243201 Mean :0.0212
## 3rd Qu.:0.14815 3rd Qu.:0.0281638 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.0000000 Max. :1.0000
## Informational_Duration ProductRelated ProductRelated_Duration
## Min. :0.0000000 Min. :0.00000 Min. :0.000000
## 1st Qu.:0.0003921 1st Qu.:0.01135 1st Qu.:0.003042
## Median :0.0003921 Median :0.02553 Median :0.009543
## Mean :0.0140518 Mean :0.04547 Mean :0.018891
## 3rd Qu.:0.0003921 3rd Qu.:0.05390 3rd Qu.:0.023112
## Max. :1.0000000 Max. :1.00000 Max. :1.000000
## BounceRates ExitRates PageValues SpecialDay
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.07111 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.01465 Median :0.12500 Median :0.00000 Median :0.00000
## Mean :0.10223 Mean :0.20748 Mean :0.01645 Mean :0.06197
## 3rd Qu.:0.08333 3rd Qu.:0.24242 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## Month.Aug Month.Dec Month.Feb Month.Jul
## Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.0000 Median :0.00000 Median :0.00000
## Mean :0.03549 Mean :0.1398 Mean :0.01492 Mean :0.03541
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.0000 Max. :1.00000 Max. :1.00000
## Month.June Month.Mar Month.May Month.Nov
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.02336 Mean :0.1519 Mean :0.2728 Mean :0.2445
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## Month.Oct Month.Sep OperatingSystems Browser
## Min. :0.000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.000 1st Qu.:0.00000 1st Qu.:0.1429 1st Qu.:0.08333
## Median :0.000 Median :0.00000 Median :0.1429 Median :0.08333
## Mean :0.045 Mean :0.03672 Mean :0.1606 Mean :0.11318
## 3rd Qu.:0.000 3rd Qu.:0.00000 3rd Qu.:0.2857 3rd Qu.:0.08333
## Max. :1.000 Max. :1.00000 Max. :1.0000 Max. :1.00000
## Region TrafficType VisitorType.New_Visitor VisitorType.Other
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.05263 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.2500 Median :0.05263 Median :0.0000 Median :0.00000
## Mean :0.2692 Mean :0.16182 Mean :0.1388 Mean :0.00664
## 3rd Qu.:0.3750 3rd Qu.:0.15789 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.00000
## VisitorType.Returning_Visitor Weekend.FALSE Weekend.TRUE
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.:1.0000 1st Qu.:0.0000
## Median :1.0000 Median :1.0000 Median :0.0000
## Mean :0.8546 Mean :0.7659 Mean :0.2341
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
Here, every attribute has a minimum of 0 and a maximum of 1, and most means are close to zero. We will use the NORMALIZED dataset for clustering.
# Searching for the optimal number of clusters with the elbow method:
# plot the within-cluster sum of squares ("wss") for a range of k values.
fviz_nbclust(x = df2_norm, FUNcluster = kmeans, method = "wss") +
  # linetype 2 draws a dashed vertical reference line at k = 4
  geom_vline(xintercept = 4, linetype = 2) +
  labs(subtitle = "Elbow method")
# Applying the K-Means clustering algorithm with K = 10 centroids.
# NOTE(review): the elbow plot above marks k = 4 with its reference line;
# confirm that 10 clusters is the intended choice.
# kmeans() picks its starting centroids at random, so the seed is fixed to
# make the clustering reproducible, and several random starts (nstart) are
# tried so that a single poor initialisation does not decide the partition.
set.seed(123)
result <- kmeans(df2_norm, centers = 10, nstart = 25)
# Previewing the number of records in each cluster
result$size
## [1] 463 505 2591 1852 1595 475 1025 645 1793 1255
# Cluster centres: one row per cluster, one column per attribute.
result[["centers"]]
## Administrative Administrative_Duration Informational Informational_Duration
## 1 0.070794336 0.020569578 0.0185385169 0.0112926541
## 2 0.000880088 0.000320175 0.0004125413 0.0003711355
## 3 0.091384708 0.025408114 0.0202142030 0.0140365410
## 4 0.098312135 0.027560732 0.0290901728 0.0199305407
## 5 0.094159991 0.027110401 0.0135579937 0.0075177819
## 6 0.077660819 0.021247941 0.0188596491 0.0137691894
## 7 0.091093044 0.025973660 0.0265853659 0.0203120192
## 8 0.048176859 0.011195465 0.0121447028 0.0075813206
## 9 0.105058768 0.029081948 0.0290481502 0.0183418266
## 10 0.083901431 0.024540485 0.0205843293 0.0124254526
## ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues
## 1 0.04176585 0.0175658519 0.06022419 0.1823545 0.011489607
## 2 0.00281160 0.0003643396 0.96790888 0.9798465 0.000000000
## 3 0.04502219 0.0185803318 0.05737498 0.1790229 0.012204454
## 4 0.05548228 0.0227181641 0.10334965 0.2002827 0.017327454
## 5 0.02580493 0.0100123697 0.02192726 0.1002636 0.030468410
## 6 0.03002613 0.0131179344 0.06331099 0.1669631 0.014282993
## 7 0.04812870 0.0215558689 0.04935376 0.1696726 0.019527581
## 8 0.04344164 0.0160424025 0.14172310 0.2805351 0.005358765
## 9 0.07378181 0.0314005950 0.05550721 0.1615382 0.018283271
## 10 0.03944054 0.0167075198 0.06680364 0.1859334 0.015981030
## SpecialDay Month.Aug Month.Dec Month.Feb Month.Jul Month.June
## 1 0.175809935 0.00000000 0.0000000 0.0000000000 0.00000000 0.00000000
## 2 0.080000000 0.02970297 0.1128713 0.0554455446 0.03960396 0.05346535
## 3 0.010806638 0.10575068 0.0000000 0.0482439213 0.10304902 0.07255886
## 4 0.005291577 0.03887689 0.1565875 0.0151187905 0.04913607 0.02213823
## 5 0.021065831 0.04514107 0.2094044 0.0006269592 0.03385580 0.01818182
## 6 0.000000000 0.00000000 0.0000000 0.0000000000 0.00000000 0.00000000
## 7 0.000000000 0.00000000 1.0000000 0.0000000000 0.00000000 0.00000000
## 8 0.761860465 0.00000000 0.0000000 0.0000000000 0.00000000 0.00000000
## 9 0.000000000 0.00000000 0.0000000 0.0000000000 0.00000000 0.00000000
## 10 0.056892430 0.00000000 0.0000000 0.0000000000 0.00000000 0.00000000
## Month.Mar Month.May Month.Nov Month.Oct Month.Sep OperatingSystems
## 1 0.00000000 1.0000000 0.0000000 0.00000000 0.00000000 0.1598272
## 2 0.14851485 0.3247525 0.2118812 0.00990099 0.01386139 0.1705799
## 3 0.44847549 0.0000000 0.0000000 0.12196063 0.09996140 0.1546011
## 4 0.00000000 0.2634989 0.3585313 0.05615551 0.03995680 0.1620642
## 5 0.08840125 0.1962382 0.2626959 0.07774295 0.06771160 0.1462606
## 6 1.00000000 0.0000000 0.0000000 0.00000000 0.00000000 0.1624060
## 7 0.00000000 0.0000000 0.0000000 0.00000000 0.00000000 0.1863415
## 8 0.00000000 1.0000000 0.0000000 0.00000000 0.00000000 0.1676633
## 9 0.00000000 0.0000000 1.0000000 0.00000000 0.00000000 0.1591108
## 10 0.00000000 1.0000000 0.0000000 0.00000000 0.00000000 0.1623221
## Browser Region TrafficType VisitorType.New_Visitor VisitorType.Other
## 1 0.11969042 0.7200324 0.1813118 0.00000000 0.0000000000
## 2 0.11881188 0.2910891 0.2143825 0.01386139 0.0198019802
## 3 0.11459539 0.2717580 0.1258608 0.00000000 0.0003859514
## 4 0.10421166 0.2688310 0.1691486 0.00000000 0.0037796976
## 5 0.10909091 0.2876959 0.1497773 1.00000000 0.0000000000
## 6 0.09666667 0.2323684 0.1329640 0.19157895 0.0000000000
## 7 0.14495935 0.3015854 0.1750963 0.00000000 0.0478048780
## 8 0.11925065 0.1856589 0.1986944 0.00000000 0.0000000000
## 9 0.10461982 0.2435165 0.1717791 0.00000000 0.0078081428
## 10 0.11341301 0.1325697 0.1791151 0.00000000 0.0000000000
## VisitorType.Returning_Visitor Weekend.FALSE Weekend.TRUE
## 1 1.0000000 1.0000000 0.0000000
## 2 0.9663366 1.0000000 0.0000000
## 3 0.9996140 1.0000000 0.0000000
## 4 0.9962203 0.0000000 1.0000000
## 5 0.0000000 0.7567398 0.2432602
## 6 0.8084211 0.0000000 1.0000000
## 7 0.9521951 1.0000000 0.0000000
## 8 1.0000000 0.7813953 0.2186047
## 9 0.9921919 1.0000000 0.0000000
## 10 1.0000000 1.0000000 0.0000000
# Plotting pairs of variables to see how their data points
# have been distributed across the clusters.
# Integer colour codes index into the graphics palette, which by default
# has only 8 entries; with more clusters than that, colours are recycled
# and different clusters end up looking identical. Build one distinct
# colour per cluster instead and look each point's colour up by its
# cluster id.
n_clusters <- nrow(result$centers)
cluster_cols <- rainbow(n_clusters)[result$cluster]
# Product Related vs Product Related Duration
# (columns are selected by name so the plot does not depend on column order)
plot(
  df2_norm[, c("ProductRelated", "ProductRelated_Duration")],
  col = cluster_cols
)
# Bounce Rates vs Exit Rates
plot(
  df2_norm[, c("BounceRates", "ExitRates")],
  col = cluster_cols
)
# Hierarchical clustering with hclust().
# Step 1: dist() builds the pairwise Euclidean distance matrix between
# observations; it is the dissimilarity input passed to hclust().
d <- dist(x = df2_norm, method = "euclidean")
# Step 2: cluster the observations using Ward's method ("ward.D2").
res.hc <- hclust(d = d, method = "ward.D2")
# Step 3: draw the resulting dendrogram.
plot(x = res.hc, cex = 0.6, hang = -1)