1. Problem Definition

Kira Plastinina is a Russian brand that was sold through a now-defunct chain of retail stores in Russia, Ukraine, Kazakhstan, Belarus, China, the Philippines, and Armenia. The brand’s Sales and Marketing team would like to understand their customers’ behavior from data that they have collected over the past year. More specifically, they would like to learn the characteristics of customer groups.

2. Steps Taken

3. Data Sourcing

The dataset for this independent project can be found here: http://bit.ly/EcommerceCustomersDataset

4. Installing and Loading Necessary Packages

5. Check the Data

# Read the online shoppers CSV export into a data frame.
df <- read.csv(
  file = "C:/Users/user/Downloads/online_shoppers_intention.csv"
)
# Preview the first six rows to check how the columns were parsed.
head(df, n = 6L)
##   Administrative Administrative_Duration Informational Informational_Duration
## 1              0                       0             0                      0
## 2              0                       0             0                      0
## 3              0                      -1             0                     -1
## 4              0                       0             0                      0
## 5              0                       0             0                      0
## 6              0                       0             0                      0
##   ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues
## 1              1                0.000000  0.20000000 0.2000000          0
## 2              2               64.000000  0.00000000 0.1000000          0
## 3              1               -1.000000  0.20000000 0.2000000          0
## 4              2                2.666667  0.05000000 0.1400000          0
## 5             10              627.500000  0.02000000 0.0500000          0
## 6             19              154.216667  0.01578947 0.0245614          0
##   SpecialDay Month OperatingSystems Browser Region TrafficType
## 1          0   Feb                1       1      1           1
## 2          0   Feb                2       2      1           2
## 3          0   Feb                4       1      9           3
## 4          0   Feb                3       2      2           4
## 5          0   Feb                3       3      1           4
## 6          0   Feb                2       2      1           3
##         VisitorType Weekend Revenue
## 1 Returning_Visitor   FALSE   FALSE
## 2 Returning_Visitor   FALSE   FALSE
## 3 Returning_Visitor   FALSE   FALSE
## 4 Returning_Visitor   FALSE   FALSE
## 5 Returning_Visitor    TRUE   FALSE
## 6 Returning_Visitor   FALSE   FALSE

6. Data Cleaning

6.1 Missing Values

# Count the NA entries in each column of df (one tally per column name).
vapply(df, function(column) sum(is.na(column)), numeric(1))
##          Administrative Administrative_Duration           Informational 
##                      14                      14                      14 
##  Informational_Duration          ProductRelated ProductRelated_Duration 
##                      14                      14                      14 
##             BounceRates               ExitRates              PageValues 
##                      14                      14                       0 
##              SpecialDay                   Month        OperatingSystems 
##                       0                       0                       0 
##                 Browser                  Region             TrafficType 
##                       0                       0                       0 
##             VisitorType                 Weekend                 Revenue 
##                       0                       0                       0
# Remove every row that contains at least one NA, keeping only complete rows.
df <- stats::na.omit(object = df)
# Re-count NA entries per column; every tally should now be zero.
vapply(df, function(column) sum(is.na(column)), numeric(1))
##          Administrative Administrative_Duration           Informational 
##                       0                       0                       0 
##  Informational_Duration          ProductRelated ProductRelated_Duration 
##                       0                       0                       0 
##             BounceRates               ExitRates              PageValues 
##                       0                       0                       0 
##              SpecialDay                   Month        OperatingSystems 
##                       0                       0                       0 
##                 Browser                  Region             TrafficType 
##                       0                       0                       0 
##             VisitorType                 Weekend                 Revenue 
##                       0                       0                       0

6.2 Checking for duplicates

# Collect the rows that exactly repeat an earlier row (the first occurrence
# of each repeated row is not included), then display them for inspection.
duplicates <- df[duplicated(df), , drop = FALSE]
print(duplicates)
##       Administrative Administrative_Duration Informational
## 159                0                       0             0
## 179                0                       0             0
## 419                0                       0             0
## 457                0                       0             0
## 484                0                       0             0
## 513                0                       0             0
## 555                0                       0             0
## 590                0                       0             0
## 660                0                       0             0
## 775                0                       0             0
## 873                0                       0             0
## 890                0                       0             0
## 923                0                       0             0
## 948                0                       0             0
## 975                0                       0             0
## 1035               0                       0             0
## 1120               0                       0             0
## 1171               0                       0             0
## 1177               0                       0             0
## 1214               0                       0             0
## 1215               0                       0             0
## 1292               0                       0             0
## 1326               0                       0             0
## 1357               0                       0             0
## 1367               0                       0             0
## 1382               0                       0             0
## 1391               0                       0             0
## 1395               0                       0             0
## 1437               0                       0             0
## 1454               0                       0             0
## 1516               0                       0             0
## 1574               0                       0             0
## 1609               0                       0             0
## 1698               0                       0             0
## 1776               0                       0             0
## 1805               0                       0             0
## 1840               0                       0             0
## 1867               0                       0             0
## 1926               0                       0             0
## 1934               0                       0             0
## 1950               0                       0             0
## 2057               0                       0             0
## 2058               0                       0             0
## 2236               0                       0             0
## 2622               0                       0             0
## 2740               0                       0             0
## 3232               0                       0             0
## 3273               0                       0             0
## 3282               0                       0             0
## 3578               0                       0             0
## 3651               0                       0             0
## 3664               0                       0             0
## 3722               0                       0             0
## 3892               0                       0             0
## 4164               0                       0             0
## 4183               0                       0             0
## 4232               0                       0             0
## 4344               0                       0             0
## 4375               0                       0             0
## 4404               0                       0             0
## 4427               0                       0             0
## 4464               0                       0             0
## 4490               0                       0             0
## 4553               0                       0             0
## 4818               0                       0             0
## 4884               0                       0             0
## 4914               0                       0             0
## 5039               0                       0             0
## 5044               0                       0             0
## 5057               0                       0             0
## 5119               0                       0             0
## 5199               0                       0             0
## 5200               0                       0             0
## 5255               0                       0             0
## 5277               0                       0             0
## 5287               0                       0             0
## 5356               0                       0             0
## 5408               0                       0             0
## 6930               0                       0             0
## 7152               0                       0             0
## 7636               0                       0             0
## 8545               0                       0             0
## 9307               0                       0             0
## 9495               0                       0             0
## 9552               0                       0             0
## 9569               0                       0             0
## 9582               0                       0             0
## 9719               0                       0             0
## 9770               0                       0             0
## 9879               0                       0             0
## 9908               0                       0             0
## 10147              0                       0             0
## 10223              0                       0             0
## 10270              0                       0             0
## 10573              0                       0             0
## 10632              0                       0             0
## 10752              0                       0             0
## 10796              0                       0             0
## 10842              0                       0             0
## 10989              0                       0             0
## 11044              0                       0             0
## 11206              0                       0             0
## 11405              0                       0             0
## 11524              0                       0             0
## 11582              0                       0             0
## 11625              0                       0             0
## 11659              0                       0             0
## 11734              0                       0             0
## 11748              0                       0             0
## 11802              0                       0             0
## 11814              0                       0             0
## 11828              0                       0             0
## 11935              0                       0             0
## 11939              0                       0             0
## 12160              0                       0             0
## 12181              0                       0             0
## 12186              0                       0             0
##       Informational_Duration ProductRelated ProductRelated_Duration BounceRates
## 159                        0              1                       0         0.2
## 179                        0              1                       0         0.2
## 419                        0              1                       0         0.2
## 457                        0              1                       0         0.2
## 484                        0              1                       0         0.2
## 513                        0              1                       0         0.2
## 555                        0              1                       0         0.2
## 590                        0              1                       0         0.2
## 660                        0              2                       0         0.2
## 775                        0              1                       0         0.2
## 873                        0              1                       0         0.2
## 890                        0              1                       0         0.2
## 923                        0              1                       0         0.2
## 948                        0              1                       0         0.2
## 975                        0              1                       0         0.2
## 1035                       0              1                       0         0.2
## 1120                       0              1                       0         0.2
## 1171                       0              1                       0         0.2
## 1177                       0              1                       0         0.2
## 1214                       0              1                       0         0.2
## 1215                       0              1                       0         0.2
## 1292                       0              2                       0         0.2
## 1326                       0              1                       0         0.2
## 1357                       0              2                       0         0.2
## 1367                       0              1                       0         0.2
## 1382                       0              1                       0         0.2
## 1391                       0              1                       0         0.2
## 1395                       0              1                       0         0.2
## 1437                       0              1                       0         0.2
## 1454                       0              1                       0         0.2
## 1516                       0              1                       0         0.2
## 1574                       0              1                       0         0.2
## 1609                       0              1                       0         0.2
## 1698                       0              1                       0         0.2
## 1776                       0              1                       0         0.2
## 1805                       0              1                       0         0.2
## 1840                       0              1                       0         0.2
## 1867                       0              1                       0         0.2
## 1926                       0              1                       0         0.2
## 1934                       0              1                       0         0.2
## 1950                       0              1                       0         0.2
## 2057                       0              1                       0         0.2
## 2058                       0              1                       0         0.2
## 2236                       0              1                       0         0.2
## 2622                       0              1                       0         0.2
## 2740                       0              1                       0         0.2
## 3232                       0              1                       0         0.2
## 3273                       0              1                       0         0.2
## 3282                       0              1                       0         0.2
## 3578                       0              1                       0         0.2
## 3651                       0              1                       0         0.2
## 3664                       0              1                       0         0.2
## 3722                       0              1                       0         0.2
## 3892                       0              1                       0         0.2
## 4164                       0              1                       0         0.2
## 4183                       0              1                       0         0.2
## 4232                       0              1                       0         0.2
## 4344                       0              1                       0         0.2
## 4375                       0              1                       0         0.2
## 4404                       0              1                       0         0.2
## 4427                       0              1                       0         0.2
## 4464                       0              1                       0         0.2
## 4490                       0              1                       0         0.2
## 4553                       0              2                       0         0.2
## 4818                       0              1                       0         0.2
## 4884                       0              1                       0         0.2
## 4914                       0              1                       0         0.2
## 5039                       0              1                       0         0.2
## 5044                       0              1                       0         0.2
## 5057                       0              1                       0         0.2
## 5119                       0              1                       0         0.2
## 5199                       0              1                       0         0.2
## 5200                       0              2                       0         0.2
## 5255                       0              1                       0         0.2
## 5277                       0              1                       0         0.2
## 5287                       0              1                       0         0.2
## 5356                       0              1                       0         0.2
## 5408                       0              1                       0         0.2
## 6930                       0              1                       0         0.2
## 7152                       0              1                       0         0.2
## 7636                       0              1                       0         0.2
## 8545                       0              1                       0         0.2
## 9307                       0              1                       0         0.2
## 9495                       0              1                       0         0.2
## 9552                       0              1                       0         0.2
## 9569                       0              1                       0         0.2
## 9582                       0              1                       0         0.2
## 9719                       0              1                       0         0.2
## 9770                       0              1                       0         0.2
## 9879                       0              1                       0         0.2
## 9908                       0              1                       0         0.2
## 10147                      0              1                       0         0.2
## 10223                      0              2                       0         0.2
## 10270                      0              1                       0         0.2
## 10573                      0              1                       0         0.2
## 10632                      0              1                       0         0.2
## 10752                      0              1                       0         0.2
## 10796                      0              1                       0         0.2
## 10842                      0              1                       0         0.2
## 10989                      0              1                       0         0.2
## 11044                      0              1                       0         0.2
## 11206                      0              1                       0         0.2
## 11405                      0              1                       0         0.2
## 11524                      0              1                       0         0.2
## 11582                      0              1                       0         0.2
## 11625                      0              1                       0         0.2
## 11659                      0              1                       0         0.2
## 11734                      0              1                       0         0.2
## 11748                      0              1                       0         0.2
## 11802                      0              1                       0         0.2
## 11814                      0              1                       0         0.2
## 11828                      0              1                       0         0.2
## 11935                      0              1                       0         0.2
## 11939                      0              1                       0         0.2
## 12160                      0              1                       0         0.2
## 12181                      0              1                       0         0.2
## 12186                      0              1                       0         0.2
##       ExitRates PageValues SpecialDay Month OperatingSystems Browser Region
## 159         0.2          0        0.0   Feb                1       1      1
## 179         0.2          0        0.0   Feb                3       2      3
## 419         0.2          0        0.0   Mar                1       1      1
## 457         0.2          0        0.0   Mar                2       2      4
## 484         0.2          0        0.0   Mar                3       2      3
## 513         0.2          0        0.0   Mar                2       2      1
## 555         0.2          0        0.0   Mar                2       2      1
## 590         0.2          0        0.0   Mar                2       2      1
## 660         0.2          0        0.0   Mar                2       5      1
## 775         0.2          0        0.0   Mar                2       2      4
## 873         0.2          0        0.0   Mar                3       2      3
## 890         0.2          0        0.0   Mar                1       1      2
## 923         0.2          0        0.0   Mar                3       2      2
## 948         0.2          0        0.0   Mar                2       2      1
## 975         0.2          0        0.0   Mar                2       2      1
## 1035        0.2          0        0.0   Mar                2       2      1
## 1120        0.2          0        0.0   Mar                2       2      1
## 1171        0.2          0        0.0   Mar                3       2      1
## 1177        0.2          0        0.0   Mar                2       4      1
## 1214        0.2          0        0.0   Mar                3       2      3
## 1215        0.2          0        0.0   Mar                1       1      1
## 1292        0.2          0        0.0   Mar                2       2      1
## 1326        0.2          0        0.0   Mar                1       1      3
## 1357        0.2          0        0.0   Mar                1       1      1
## 1367        0.2          0        0.0   Mar                1       1      8
## 1382        0.2          0        0.0   Mar                1       1      4
## 1391        0.2          0        0.0   Mar                2       2      1
## 1395        0.2          0        0.0   Mar                2       2      1
## 1437        0.2          0        0.0   Mar                3       2      3
## 1454        0.2          0        0.0   Mar                2       2      1
## 1516        0.2          0        0.0   Mar                1       1      1
## 1574        0.2          0        0.0   Mar                2       2      1
## 1609        0.2          0        0.0   Mar                2       2      7
## 1698        0.2          0        0.0   Mar                2       2      2
## 1776        0.2          0        0.0   Mar                3       2      1
## 1805        0.2          0        0.0   Mar                1       1      8
## 1840        0.2          0        0.0   Mar                2       2      1
## 1867        0.2          0        0.0   Mar                1       1      1
## 1926        0.2          0        0.0   Mar                3       2      1
## 1934        0.2          0        0.0   Mar                2       2      1
## 1950        0.2          0        0.0   Mar                2       2      1
## 2057        0.2          0        0.0   Mar                3       2      3
## 2058        0.2          0        0.0   Mar                2       4      1
## 2236        0.2          0        0.0   May                1       1      4
## 2622        0.2          0        0.0   May                1       1      1
## 2740        0.2          0        0.0   May                2       2      1
## 3232        0.2          0        0.0   May                2       4      1
## 3273        0.2          0        0.0   May                1       1      3
## 3282        0.2          0        0.0   May                1       1      1
## 3578        0.2          0        0.0   May                2       2      1
## 3651        0.2          0        0.0   May                2       2      4
## 3664        0.2          0        0.0   May                1       1      1
## 3722        0.2          0        0.0   May                1       1      4
## 3892        0.2          0        0.0   May                2       2      7
## 4164        0.2          0        0.0   May                1       1      4
## 4183        0.2          0        0.0   May                1       1      1
## 4232        0.2          0        0.0   May                2       2      2
## 4344        0.2          0        0.0   May                3       2      1
## 4375        0.2          0        0.0   May                2       2      1
## 4404        0.2          0        0.0   May                2       2      1
## 4427        0.2          0        0.0   May                2       2      1
## 4464        0.2          0        0.0   May                1       1      1
## 4490        0.2          0        0.0   May                3       2      9
## 4553        0.2          0        0.0   May                2       2      2
## 4818        0.2          0        0.0   May                2       2      1
## 4884        0.2          0        0.0   May                2       2      1
## 4914        0.2          0        0.8   May                2       2      1
## 5039        0.2          0        0.0   May                3       2      3
## 5044        0.2          0        0.0   May                2       2      1
## 5057        0.2          0        0.0   May                2       2      6
## 5119        0.2          0        0.0   May                1       1      6
## 5199        0.2          0        0.0   May                2       2      1
## 5200        0.2          0        0.0   May                2       2      2
## 5255        0.2          0        0.6   May                2       2      1
## 5277        0.2          0        0.0   May                3       2      3
## 5287        0.2          0        0.0   May                1       1      3
## 5356        0.2          0        0.0   May                1       1      3
## 5408        0.2          0        0.0   May                2       4      1
## 6930        0.2          0        0.0  June                2       2      1
## 7152        0.2          0        0.0  June                2       2      1
## 7636        0.2          0        0.0  June                3       2      3
## 8545        0.2          0        0.0   Nov                3       2      3
## 9307        0.2          0        0.0   Dec                3       2      3
## 9495        0.2          0        0.0   Dec                2       2      1
## 9552        0.2          0        0.0   Nov                3       2      4
## 9569        0.2          0        0.0   Dec                2       2      8
## 9582        0.2          0        0.0   Nov                2       2      1
## 9719        0.2          0        0.0   Nov                3       2      7
## 9770        0.2          0        0.0   Dec                2       2      2
## 9879        0.2          0        0.0   Dec                2       2      6
## 9908        0.2          0        0.0   Dec                2       2      1
## 10147       0.2          0        0.0   Dec                8      13      9
## 10223       0.2          0        0.0   Nov                1       1      1
## 10270       0.2          0        0.0   Nov                1       1      3
## 10573       0.2          0        0.0   Nov                2       2      3
## 10632       0.2          0        0.0   Nov                2       2      1
## 10752       0.2          0        0.0   Dec                1       1      1
## 10796       0.2          0        0.0   Nov                1       1      4
## 10842       0.2          0        0.0   Nov                2       2      3
## 10989       0.2          0        0.0   Nov                2       4      3
## 11044       0.2          0        0.0   Dec                3       2      6
## 11206       0.2          0        0.0   Dec                8      13      9
## 11405       0.2          0        0.0   Nov                3       2      1
## 11524       0.2          0        0.0   Dec                2       2      1
## 11582       0.2          0        0.0   Dec                8      13      9
## 11625       0.2          0        0.0   Nov                3       2      1
## 11659       0.2          0        0.0   Dec                1       1      1
## 11734       0.2          0        0.0   Nov                2       2      1
## 11748       0.2          0        0.0   Nov                1       1      3
## 11802       0.2          0        0.0   Dec                1       1      4
## 11814       0.2          0        0.0   Dec                2       2      1
## 11828       0.2          0        0.0   Dec                2       2      1
## 11935       0.2          0        0.0   Dec                1       1      1
## 11939       0.2          0        0.0   Dec                1       1      4
## 12160       0.2          0        0.0   Dec                1       1      1
## 12181       0.2          0        0.0   Dec                1      13      9
## 12186       0.2          0        0.0   Dec                8      13      9
##       TrafficType       VisitorType Weekend Revenue
## 159             3 Returning_Visitor   FALSE   FALSE
## 179             3 Returning_Visitor   FALSE   FALSE
## 419             1 Returning_Visitor    TRUE   FALSE
## 457             1 Returning_Visitor   FALSE   FALSE
## 484             1 Returning_Visitor   FALSE   FALSE
## 513             1 Returning_Visitor   FALSE   FALSE
## 555             1 Returning_Visitor   FALSE   FALSE
## 590             1 Returning_Visitor   FALSE   FALSE
## 660             1 Returning_Visitor   FALSE   FALSE
## 775             1 Returning_Visitor   FALSE   FALSE
## 873             1 Returning_Visitor   FALSE   FALSE
## 890             1 Returning_Visitor   FALSE   FALSE
## 923             1 Returning_Visitor   FALSE   FALSE
## 948             1 Returning_Visitor   FALSE   FALSE
## 975             1 Returning_Visitor   FALSE   FALSE
## 1035            1 Returning_Visitor   FALSE   FALSE
## 1120            1 Returning_Visitor   FALSE   FALSE
## 1171            1 Returning_Visitor   FALSE   FALSE
## 1177            1 Returning_Visitor   FALSE   FALSE
## 1214            1 Returning_Visitor   FALSE   FALSE
## 1215            3 Returning_Visitor   FALSE   FALSE
## 1292            1 Returning_Visitor   FALSE   FALSE
## 1326            3 Returning_Visitor   FALSE   FALSE
## 1357            1 Returning_Visitor   FALSE   FALSE
## 1367            1 Returning_Visitor   FALSE   FALSE
## 1382            1 Returning_Visitor   FALSE   FALSE
## 1391            1 Returning_Visitor   FALSE   FALSE
## 1395            1 Returning_Visitor   FALSE   FALSE
## 1437            1 Returning_Visitor   FALSE   FALSE
## 1454            1 Returning_Visitor   FALSE   FALSE
## 1516            3 Returning_Visitor    TRUE   FALSE
## 1574            1 Returning_Visitor   FALSE   FALSE
## 1609            1 Returning_Visitor   FALSE   FALSE
## 1698            1 Returning_Visitor   FALSE   FALSE
## 1776            1 Returning_Visitor   FALSE   FALSE
## 1805            1 Returning_Visitor   FALSE   FALSE
## 1840            3 Returning_Visitor   FALSE   FALSE
## 1867            9 Returning_Visitor    TRUE   FALSE
## 1926            1 Returning_Visitor   FALSE   FALSE
## 1934            1 Returning_Visitor   FALSE   FALSE
## 1950            1 Returning_Visitor   FALSE   FALSE
## 2057            1 Returning_Visitor   FALSE   FALSE
## 2058            1 Returning_Visitor   FALSE   FALSE
## 2236            3 Returning_Visitor   FALSE   FALSE
## 2622            3 Returning_Visitor   FALSE   FALSE
## 2740            1 Returning_Visitor   FALSE   FALSE
## 3232            3 Returning_Visitor   FALSE   FALSE
## 3273            3 Returning_Visitor   FALSE   FALSE
## 3282            3 Returning_Visitor   FALSE   FALSE
## 3578            4 Returning_Visitor   FALSE   FALSE
## 3651            1 Returning_Visitor   FALSE   FALSE
## 3664            3 Returning_Visitor   FALSE   FALSE
## 3722            3 Returning_Visitor   FALSE   FALSE
## 3892            4 Returning_Visitor   FALSE   FALSE
## 4164            3 Returning_Visitor   FALSE   FALSE
## 4183            3 Returning_Visitor   FALSE   FALSE
## 4232            1 Returning_Visitor   FALSE   FALSE
## 4344           13 Returning_Visitor   FALSE   FALSE
## 4375            3 Returning_Visitor   FALSE   FALSE
## 4404            3 Returning_Visitor   FALSE   FALSE
## 4427            3 Returning_Visitor   FALSE   FALSE
## 4464            3 Returning_Visitor   FALSE   FALSE
## 4490            3 Returning_Visitor   FALSE   FALSE
## 4553            3 Returning_Visitor   FALSE   FALSE
## 4818            3 Returning_Visitor   FALSE   FALSE
## 4884            3 Returning_Visitor   FALSE   FALSE
## 4914            1 Returning_Visitor   FALSE   FALSE
## 5039            3 Returning_Visitor   FALSE   FALSE
## 5044            3 Returning_Visitor   FALSE   FALSE
## 5057            3 Returning_Visitor   FALSE   FALSE
## 5119            4 Returning_Visitor    TRUE   FALSE
## 5199           13 Returning_Visitor   FALSE   FALSE
## 5200            3 Returning_Visitor   FALSE   FALSE
## 5255            1 Returning_Visitor   FALSE   FALSE
## 5277           13 Returning_Visitor   FALSE   FALSE
## 5287           15 Returning_Visitor   FALSE   FALSE
## 5356            3 Returning_Visitor   FALSE   FALSE
## 5408            6 Returning_Visitor   FALSE   FALSE
## 6930            1 Returning_Visitor   FALSE   FALSE
## 7152            1 Returning_Visitor   FALSE   FALSE
## 7636           13 Returning_Visitor   FALSE   FALSE
## 8545            3 Returning_Visitor   FALSE   FALSE
## 9307            1 Returning_Visitor    TRUE   FALSE
## 9495            3 Returning_Visitor   FALSE   FALSE
## 9552            3 Returning_Visitor   FALSE   FALSE
## 9569            1 Returning_Visitor   FALSE   FALSE
## 9582            1 Returning_Visitor   FALSE   FALSE
## 9719           13 Returning_Visitor   FALSE   FALSE
## 9770            1 Returning_Visitor   FALSE   FALSE
## 9879           13 Returning_Visitor   FALSE   FALSE
## 9908           13 Returning_Visitor   FALSE   FALSE
## 10147          20             Other   FALSE   FALSE
## 10223           1 Returning_Visitor   FALSE   FALSE
## 10270           2 Returning_Visitor   FALSE   FALSE
## 10573           1 Returning_Visitor   FALSE   FALSE
## 10632           1 Returning_Visitor   FALSE   FALSE
## 10752           1 Returning_Visitor    TRUE   FALSE
## 10796           1 Returning_Visitor   FALSE   FALSE
## 10842           1 Returning_Visitor   FALSE   FALSE
## 10989           3 Returning_Visitor   FALSE   FALSE
## 11044           1 Returning_Visitor   FALSE   FALSE
## 11206          20             Other   FALSE   FALSE
## 11405          13 Returning_Visitor   FALSE   FALSE
## 11524          13 Returning_Visitor   FALSE   FALSE
## 11582          20             Other   FALSE   FALSE
## 11625           1 Returning_Visitor   FALSE   FALSE
## 11659           1 Returning_Visitor    TRUE   FALSE
## 11734           1 Returning_Visitor   FALSE   FALSE
## 11748           3 Returning_Visitor   FALSE   FALSE
## 11802           1 Returning_Visitor    TRUE   FALSE
## 11814           1 Returning_Visitor   FALSE   FALSE
## 11828           1 Returning_Visitor   FALSE   FALSE
## 11935           2       New_Visitor   FALSE   FALSE
## 11939           1 Returning_Visitor    TRUE   FALSE
## 12160           3 Returning_Visitor   FALSE   FALSE
## 12181          20 Returning_Visitor   FALSE   FALSE
## 12186          20             Other   FALSE   FALSE

I will drop the duplicate rows.

# Remove duplicated rows, keeping the first occurrence of each
df <- unique(df)
### Structure of the de-duplicated dataset
str(df)
## 'data.frame':    12199 obs. of  18 variables:
##  $ Administrative         : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ Administrative_Duration: num  0 0 -1 0 0 0 -1 -1 0 0 ...
##  $ Informational          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Informational_Duration : num  0 0 -1 0 0 0 -1 -1 0 0 ...
##  $ ProductRelated         : int  1 2 1 2 10 19 1 1 2 3 ...
##  $ ProductRelated_Duration: num  0 64 -1 2.67 627.5 ...
##  $ BounceRates            : num  0.2 0 0.2 0.05 0.02 ...
##  $ ExitRates              : num  0.2 0.1 0.2 0.14 0.05 ...
##  $ PageValues             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SpecialDay             : num  0 0 0 0 0 0 0.4 0 0.8 0.4 ...
##  $ Month                  : chr  "Feb" "Feb" "Feb" "Feb" ...
##  $ OperatingSystems       : int  1 2 4 3 3 2 2 1 2 2 ...
##  $ Browser                : int  1 2 1 2 3 2 4 2 2 4 ...
##  $ Region                 : int  1 1 9 2 1 1 3 1 2 1 ...
##  $ TrafficType            : int  1 2 3 4 4 3 3 5 3 2 ...
##  $ VisitorType            : chr  "Returning_Visitor" "Returning_Visitor" "Returning_Visitor" "Returning_Visitor" ...
##  $ Weekend                : logi  FALSE FALSE FALSE FALSE TRUE FALSE ...
##  $ Revenue                : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  - attr(*, "na.action")= 'omit' Named int [1:14] 1066 1133 1134 1135 1136 1137 1474 1475 1476 1477 ...
##   ..- attr(*, "names")= chr [1:14] "1066" "1133" "1134" "1135" ...

6.3 Changing columns to factors

# Convert the categorical columns to factors in a single pass

df[c("Month", "VisitorType", "Weekend", "Revenue")] <- lapply(
  df[c("Month", "VisitorType", "Weekend", "Revenue")],
  factor
)
### Dataset structure after the conversion
str(df)
## 'data.frame':    12199 obs. of  18 variables:
##  $ Administrative         : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ Administrative_Duration: num  0 0 -1 0 0 0 -1 -1 0 0 ...
##  $ Informational          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Informational_Duration : num  0 0 -1 0 0 0 -1 -1 0 0 ...
##  $ ProductRelated         : int  1 2 1 2 10 19 1 1 2 3 ...
##  $ ProductRelated_Duration: num  0 64 -1 2.67 627.5 ...
##  $ BounceRates            : num  0.2 0 0.2 0.05 0.02 ...
##  $ ExitRates              : num  0.2 0.1 0.2 0.14 0.05 ...
##  $ PageValues             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SpecialDay             : num  0 0 0 0 0 0 0.4 0 0.8 0.4 ...
##  $ Month                  : Factor w/ 10 levels "Aug","Dec","Feb",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ OperatingSystems       : int  1 2 4 3 3 2 2 1 2 2 ...
##  $ Browser                : int  1 2 1 2 3 2 4 2 2 4 ...
##  $ Region                 : int  1 1 9 2 1 1 3 1 2 1 ...
##  $ TrafficType            : int  1 2 3 4 4 3 3 5 3 2 ...
##  $ VisitorType            : Factor w/ 3 levels "New_Visitor",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Weekend                : Factor w/ 2 levels "FALSE","TRUE": 1 1 1 1 2 1 1 2 1 1 ...
##  $ Revenue                : Factor w/ 2 levels "FALSE","TRUE": 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "na.action")= 'omit' Named int [1:14] 1066 1133 1134 1135 1136 1137 1474 1475 1476 1477 ...
##   ..- attr(*, "names")= chr [1:14] "1066" "1133" "1134" "1135" ...

7. Exploratory Data Analysis

A function to determine the mode

# Return the most frequent value of `v`.
# Ties are resolved in favour of the value that appears first in `v`.
# Note: this definition masks base::mode() for the rest of the script.
mode <- function(v) {
  # Distinct values, in order of first appearance
  distinct_vals <- unique(v)
  # Index of the distinct value each element corresponds to
  positions <- match(v, distinct_vals)
  # Occurrence count per distinct value
  freq <- tabulate(positions)
  # which.max() returns the first maximum, which gives the tie-break above
  distinct_vals[which.max(freq)]
}

Summary statistics of the columns

summary(df)
##  Administrative  Administrative_Duration Informational    
##  Min.   : 0.00   Min.   :  -1.00         Min.   : 0.0000  
##  1st Qu.: 0.00   1st Qu.:   0.00         1st Qu.: 0.0000  
##  Median : 1.00   Median :   9.00         Median : 0.0000  
##  Mean   : 2.34   Mean   :  81.68         Mean   : 0.5088  
##  3rd Qu.: 4.00   3rd Qu.:  94.75         3rd Qu.: 0.0000  
##  Max.   :27.00   Max.   :3398.75         Max.   :24.0000  
##                                                           
##  Informational_Duration ProductRelated   ProductRelated_Duration
##  Min.   :  -1.00        Min.   :  0.00   Min.   :   -1.0        
##  1st Qu.:   0.00        1st Qu.:  8.00   1st Qu.:  193.6        
##  Median :   0.00        Median : 18.00   Median :  609.5        
##  Mean   :  34.84        Mean   : 32.06   Mean   : 1207.5        
##  3rd Qu.:   0.00        3rd Qu.: 38.00   3rd Qu.: 1477.6        
##  Max.   :2549.38        Max.   :705.00   Max.   :63973.5        
##                                                                 
##   BounceRates        ExitRates         PageValues        SpecialDay     
##  Min.   :0.00000   Min.   :0.00000   Min.   :  0.000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.01422   1st Qu.:  0.000   1st Qu.:0.00000  
##  Median :0.00293   Median :0.02500   Median :  0.000   Median :0.00000  
##  Mean   :0.02045   Mean   :0.04150   Mean   :  5.952   Mean   :0.06197  
##  3rd Qu.:0.01667   3rd Qu.:0.04848   3rd Qu.:  0.000   3rd Qu.:0.00000  
##  Max.   :0.20000   Max.   :0.20000   Max.   :361.764   Max.   :1.00000  
##                                                                         
##      Month      OperatingSystems    Browser           Region     
##  May    :3328   Min.   :1.000    Min.   : 1.000   Min.   :1.000  
##  Nov    :2983   1st Qu.:2.000    1st Qu.: 2.000   1st Qu.:1.000  
##  Mar    :1853   Median :2.000    Median : 2.000   Median :3.000  
##  Dec    :1706   Mean   :2.124    Mean   : 2.358   Mean   :3.153  
##  Oct    : 549   3rd Qu.:3.000    3rd Qu.: 2.000   3rd Qu.:4.000  
##  Sep    : 448   Max.   :8.000    Max.   :13.000   Max.   :9.000  
##  (Other):1332                                                    
##   TrafficType                VisitorType     Weekend      Revenue     
##  Min.   : 1.000   New_Visitor      : 1693   FALSE:9343   FALSE:10291  
##  1st Qu.: 2.000   Other            :   81   TRUE :2856   TRUE : 1908  
##  Median : 2.000   Returning_Visitor:10425                             
##  Mean   : 4.075                                                       
##  3rd Qu.: 4.000                                                       
##  Max.   :20.000                                                       
## 

Description of Columns

describe(df)
##                         vars     n    mean      sd median trimmed    mad min
## Administrative             1 12199    2.34    3.33   1.00    1.66   1.48   0
## Administrative_Duration    2 12199   81.68  177.53   9.00   42.87  13.34  -1
## Informational              3 12199    0.51    1.28   0.00    0.18   0.00   0
## Informational_Duration     4 12199   34.84  141.46   0.00    3.73   0.00  -1
## ProductRelated             5 12199   32.06   44.60  18.00   23.06  19.27   0
## ProductRelated_Duration    6 12199 1207.51 1919.93 609.54  832.36 745.12  -1
## BounceRates                7 12199    0.02    0.05   0.00    0.01   0.00   0
## ExitRates                  8 12199    0.04    0.05   0.03    0.03   0.02   0
## PageValues                 9 12199    5.95   18.66   0.00    1.33   0.00   0
## SpecialDay                10 12199    0.06    0.20   0.00    0.00   0.00   0
## Month*                    11 12199    6.17    2.37   7.00    6.36   1.48   1
## OperatingSystems          12 12199    2.12    0.91   2.00    2.06   0.00   1
## Browser                   13 12199    2.36    1.71   2.00    2.00   0.00   1
## Region                    14 12199    3.15    2.40   3.00    2.79   2.97   1
## TrafficType               15 12199    4.07    4.02   2.00    3.22   1.48   1
## VisitorType*              16 12199    2.72    0.69   3.00    2.89   0.00   1
## Weekend*                  17 12199    1.23    0.42   1.00    1.17   0.00   1
## Revenue*                  18 12199    1.16    0.36   1.00    1.07   0.00   1
##                              max    range  skew kurtosis    se
## Administrative             27.00    27.00  1.95     4.63  0.03
## Administrative_Duration  3398.75  3399.75  5.59    50.09  1.61
## Informational              24.00    24.00  4.01    26.64  0.01
## Informational_Duration   2549.38  2550.38  7.54    75.45  1.28
## ProductRelated            705.00   705.00  4.33    31.04  0.40
## ProductRelated_Duration 63973.52 63974.52  7.25   136.57 17.38
## BounceRates                 0.20     0.20  3.15     9.25  0.00
## ExitRates                   0.20     0.20  2.23     4.62  0.00
## PageValues                361.76   361.76  6.35    64.93  0.17
## SpecialDay                  1.00     1.00  3.28     9.78  0.00
## Month*                     10.00     9.00 -0.83    -0.37  0.02
## OperatingSystems            8.00     7.00  2.03    10.27  0.01
## Browser                    13.00    12.00  3.22    12.53  0.02
## Region                      9.00     8.00  0.98    -0.16  0.02
## TrafficType                20.00    19.00  1.96     3.47  0.04
## VisitorType*                3.00     2.00 -2.05     2.23  0.01
## Weekend*                    2.00     1.00  1.26    -0.42  0.00
## Revenue*                    2.00     1.00  1.89     1.58  0.00

Univariate Analysis

Administrative Column

  • From the summary and description, we can gather the following about the administrative column:
    • Mean: 2.34
    • Median: 1
    • Skewness: 1.95
    • Kurtosis: 4.63
  • The mode is:
mode(df$Administrative)
## [1] 0

Informational Column

  • From the summary and description, we can gather the following about the Informational column:
    • Mean: 0.51
    • Median: 0
    • Skewness: 4.01
    • Kurtosis: 26.64
  • The mode is:
mode(df$Informational)
## [1] 0

ProductRelated Column

  • From the summary and description, we can gather the following about the ProductRelated column:
    • Mean: 32.06
    • Median: 18
    • Skewness: 4.33
    • Kurtosis: 31.04
  • The mode is:
mode(df$ProductRelated)
## [1] 1

BounceRates

  • From the summary and description, we can gather the following about the BounceRates column:
    • Mean: 0.02
    • Median: 0.00
    • Skewness: 3.15
    • Kurtosis: 9.25
  • The mode is:
mode(df$BounceRates)
## [1] 0

ExitRates

  • From the summary and description, we can gather the following about the ExitRates column:
    • Mean: 0.04
    • Median: 0.03
    • Skewness: 2.23
    • Kurtosis: 4.62
  • The mode is:
mode(df$ExitRates)
## [1] 0.2

PageValues

  • From the summary and description, we can gather the following about the PageValues column:
    • Mean: 5.95
    • Median: 0
    • Skewness: 6.35
    • Kurtosis: 64.93
  • The mode is:
mode(df$PageValues)
## [1] 0

SpecialDay

  • From the summary and description, we can gather the following about the SpecialDay column:
    • Mean: 0.06
    • Median: 0
    • Skewness: 3.28
    • Kurtosis: 9.78
  • The mode is:
mode(df$SpecialDay)
## [1] 0

Month

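  • From the summary and description, we can gather the following about the Month column (these figures are computed on the factor's integer level codes, which are ordered alphabetically rather than by calendar month, so they should be read with caution):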
    • Mean: 6.17
    • Median: 7
    • Skewness: -0.83
    • Kurtosis: -0.37
  • The mode is:
mode(df$Month)
## [1] May
## Levels: Aug Dec Feb Jul June Mar May Nov Oct Sep

OperatingSystems

  • From the summary and description, we can gather the following about the OperatingSystems column:
    • Mean: 2.12
    • Median: 2
    • Skewness: 2.03
    • Kurtosis: 10.27
  • The mode is:
mode(df$OperatingSystems)
## [1] 2

Browser

  • From the summary and description, we can gather the following about the Browser column:
    • Mean: 2.36
    • Median: 2
    • Skewness: 3.22
    • Kurtosis: 12.53
  • The mode is:
mode(df$Browser)
## [1] 2

Region

  • From the summary and description, we can gather the following about the Region column:
    • Mean: 3.15
    • Median: 3
    • Skewness: 0.98
    • Kurtosis: -0.16
  • The mode is:
mode(df$Region)
## [1] 1

TrafficType

  • From the summary and description, we can gather the following about the TrafficType column:
    • Mean: 4.07
    • Median: 2
    • Skewness: 1.96
    • Kurtosis: 3.47
  • The mode is:
mode(df$TrafficType)
## [1] 2

Distributions

plot_histogram(df)

plot_bar(df)

Bivariate Analysis

All the data profiling statistics will be organized into the report below

# Build the automated data-profiling report for df. Per the knit log printed
# below, this renders report.rmd and writes the result to report.html.
create_report(df)
## 
## 
## processing file: report.rmd
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |..                                                                    |   2%
##    inline R code fragments
## 
## 
  |                                                                            
  |...                                                                   |   5%
## label: global_options (with options) 
## List of 1
##  $ include: logi FALSE
## 
## 
  |                                                                            
  |.....                                                                 |   7%
##   ordinary text without R code
## 
## 
  |                                                                            
  |.......                                                               |  10%
## label: introduce
## 
  |                                                                            
  |........                                                              |  12%
##   ordinary text without R code
## 
## 
  |                                                                            
  |..........                                                            |  14%
## label: plot_intro
## 
  |                                                                            
  |............                                                          |  17%
##   ordinary text without R code
## 
## 
  |                                                                            
  |.............                                                         |  19%
## label: data_structure
## 
  |                                                                            
  |...............                                                       |  21%
##   ordinary text without R code
## 
## 
  |                                                                            
  |.................                                                     |  24%
## label: missing_profile
## 
  |                                                                            
  |..................                                                    |  26%
##   ordinary text without R code
## 
## 
  |                                                                            
  |....................                                                  |  29%
## label: univariate_distribution_header
## 
  |                                                                            
  |......................                                                |  31%
##   ordinary text without R code
## 
## 
  |                                                                            
  |.......................                                               |  33%
## label: plot_histogram
## 
  |                                                                            
  |.........................                                             |  36%
##   ordinary text without R code
## 
## 
  |                                                                            
  |...........................                                           |  38%
## label: plot_density
## 
  |                                                                            
  |............................                                          |  40%
##   ordinary text without R code
## 
## 
  |                                                                            
  |..............................                                        |  43%
## label: plot_frequency_bar
## 
  |                                                                            
  |................................                                      |  45%
##   ordinary text without R code
## 
## 
  |                                                                            
  |.................................                                     |  48%
## label: plot_response_bar
## 
  |                                                                            
  |...................................                                   |  50%
##   ordinary text without R code
## 
## 
  |                                                                            
  |.....................................                                 |  52%
## label: plot_with_bar
## 
  |                                                                            
  |......................................                                |  55%
##   ordinary text without R code
## 
## 
  |                                                                            
  |........................................                              |  57%
## label: plot_normal_qq
## 
  |                                                                            
  |..........................................                            |  60%
##   ordinary text without R code
## 
## 
  |                                                                            
  |...........................................                           |  62%
## label: plot_response_qq
## 
  |                                                                            
  |.............................................                         |  64%
##   ordinary text without R code
## 
## 
  |                                                                            
  |...............................................                       |  67%
## label: plot_by_qq
## 
  |                                                                            
  |................................................                      |  69%
##   ordinary text without R code
## 
## 
  |                                                                            
  |..................................................                    |  71%
## label: correlation_analysis
## 
  |                                                                            
  |....................................................                  |  74%
##   ordinary text without R code
## 
## 
  |                                                                            
  |.....................................................                 |  76%
## label: principal_component_analysis
## 
  |                                                                            
  |.......................................................               |  79%
##   ordinary text without R code
## 
## 
  |                                                                            
  |.........................................................             |  81%
## label: bivariate_distribution_header
## 
  |                                                                            
  |..........................................................            |  83%
##   ordinary text without R code
## 
## 
  |                                                                            
  |............................................................          |  86%
## label: plot_response_boxplot
## 
  |                                                                            
  |..............................................................        |  88%
##   ordinary text without R code
## 
## 
  |                                                                            
  |...............................................................       |  90%
## label: plot_by_boxplot
## 
  |                                                                            
  |.................................................................     |  93%
##   ordinary text without R code
## 
## 
  |                                                                            
  |...................................................................   |  95%
## label: plot_response_scatterplot
## 
  |                                                                            
  |....................................................................  |  98%
##   ordinary text without R code
## 
## 
  |                                                                            
  |......................................................................| 100%
## label: plot_by_scatterplot
## output file: C:/Users/user/Documents/IP_W13_Part 2/report.knit.md
## "C:/Program Files/RStudio/bin/quarto/bin/pandoc" +RTS -K512m -RTS "C:/Users/user/Documents/IP_W13_Part 2/report.knit.md" --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc315c753d57ef.html --lua-filter "C:\Users\user\Documents\R\win-library\4.1\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\user\Documents\R\win-library\4.1\rmarkdown\rmarkdown\lua\latex-div.lua" --self-contained --variable bs3=TRUE --standalone --section-divs --table-of-contents --toc-depth 6 --template "C:\Users\user\Documents\R\win-library\4.1\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=yeti --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\user\AppData\Local\Temp\Rtmpol5I5k\rmarkdown-str315c6b4f50e7.html"
## 
## Output created: report.html

The link for the report is here: file:///C:/Users/user/Documents/IP_W13_Part%202/report.html#qq-plot

8. Implementing the Solution

K-Means Clustering

Step 1: One hot encoding of the factor variables.

# One-hot encode the factor variables.
# dummyVars() builds the encoder from every column of df; predict() then
# produces one indicator column per factor level (e.g. Month.Aug, Month.Dec)
# alongside the numeric columns.

dmy <- dummyVars(" ~ .", data = df)

df2 <- data.frame(predict(dmy, newdata = df))
# Checking the data types of each attribute
# (vapply enforces exactly one class string per column, unlike sapply)
vapply(df2, class, character(1))
##                Administrative       Administrative_Duration 
##                     "numeric"                     "numeric" 
##                 Informational        Informational_Duration 
##                     "numeric"                     "numeric" 
##                ProductRelated       ProductRelated_Duration 
##                     "numeric"                     "numeric" 
##                   BounceRates                     ExitRates 
##                     "numeric"                     "numeric" 
##                    PageValues                    SpecialDay 
##                     "numeric"                     "numeric" 
##                     Month.Aug                     Month.Dec 
##                     "numeric"                     "numeric" 
##                     Month.Feb                     Month.Jul 
##                     "numeric"                     "numeric" 
##                    Month.June                     Month.Mar 
##                     "numeric"                     "numeric" 
##                     Month.May                     Month.Nov 
##                     "numeric"                     "numeric" 
##                     Month.Oct                     Month.Sep 
##                     "numeric"                     "numeric" 
##              OperatingSystems                       Browser 
##                     "numeric"                     "numeric" 
##                        Region                   TrafficType 
##                     "numeric"                     "numeric" 
##       VisitorType.New_Visitor             VisitorType.Other 
##                     "numeric"                     "numeric" 
## VisitorType.Returning_Visitor                 Weekend.FALSE 
##                     "numeric"                     "numeric" 
##                  Weekend.TRUE                 Revenue.FALSE 
##                     "numeric"                     "numeric" 
##                  Revenue.TRUE 
##                     "numeric"

Step 2: We are instructed to use Revenue as the class label, so we remove it from the feature set and store it in a separate variable.

# Step 2
# We are instructed to use Revenue as the class label,
# hence we remove it from the features and store it in another variable.

# Locate the Revenue indicator columns by name instead of by position
# (previously hard-coded as 30:31), so the right columns are dropped even if
# the encoded column layout changes.
is_revenue_col <- startsWith(names(df2), "Revenue.")
# Fail loudly if the encoded label columns are missing
stopifnot(any(is_revenue_col))

df2_copy <- df2[, !is_revenue_col, drop = FALSE]
df.class <- df[["Revenue"]]

# Second copy of the same feature set, kept under its original name
# NOTE(review): presumably referenced further down the script -- confirm
df2_copy_copy <- df2_copy
# Previewing the copy dataset with dummies
head(df2_copy)
##   Administrative Administrative_Duration Informational Informational_Duration
## 1              0                       0             0                      0
## 2              0                       0             0                      0
## 3              0                      -1             0                     -1
## 4              0                       0             0                      0
## 5              0                       0             0                      0
## 6              0                       0             0                      0
##   ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues
## 1              1                0.000000  0.20000000 0.2000000          0
## 2              2               64.000000  0.00000000 0.1000000          0
## 3              1               -1.000000  0.20000000 0.2000000          0
## 4              2                2.666667  0.05000000 0.1400000          0
## 5             10              627.500000  0.02000000 0.0500000          0
## 6             19              154.216667  0.01578947 0.0245614          0
##   SpecialDay Month.Aug Month.Dec Month.Feb Month.Jul Month.June Month.Mar
## 1          0         0         0         1         0          0         0
## 2          0         0         0         1         0          0         0
## 3          0         0         0         1         0          0         0
## 4          0         0         0         1         0          0         0
## 5          0         0         0         1         0          0         0
## 6          0         0         0         1         0          0         0
##   Month.May Month.Nov Month.Oct Month.Sep OperatingSystems Browser Region
## 1         0         0         0         0                1       1      1
## 2         0         0         0         0                2       2      1
## 3         0         0         0         0                4       1      9
## 4         0         0         0         0                3       2      2
## 5         0         0         0         0                3       3      1
## 6         0         0         0         0                2       2      1
##   TrafficType VisitorType.New_Visitor VisitorType.Other
## 1           1                       0                 0
## 2           2                       0                 0
## 3           3                       0                 0
## 4           4                       0                 0
## 5           4                       0                 0
## 6           3                       0                 0
##   VisitorType.Returning_Visitor Weekend.FALSE Weekend.TRUE
## 1                             1             1            0
## 2                             1             1            0
## 3                             1             1            0
## 4                             1             1            0
## 5                             1             0            1
## 6                             1             1            0

Step 3: Determining whether to Normalize or Scale the data

Scaling:

# Standardize every column so that no single attribute has more impact on
# the clustering algorithm than the others.
df2_scaled <- scale(df2_copy, center = TRUE, scale = TRUE)

# Inspect the per-column summary statistics of the scaled data.
summary(df2_scaled)
##  Administrative    Administrative_Duration Informational    
##  Min.   :-0.7025   Min.   :-0.46574        Min.   :-0.3988  
##  1st Qu.:-0.7025   1st Qu.:-0.46011        1st Qu.:-0.3988  
##  Median :-0.4023   Median :-0.40941        Median :-0.3988  
##  Mean   : 0.0000   Mean   : 0.00000        Mean   : 0.0000  
##  3rd Qu.: 0.4984   3rd Qu.: 0.07361        3rd Qu.:-0.3988  
##  Max.   : 7.4035   Max.   :18.68474        Max.   :18.4127  
##  Informational_Duration ProductRelated    ProductRelated_Duration
##  Min.   :-0.2533        Min.   :-0.7188   Min.   :-0.6295        
##  1st Qu.:-0.2463        1st Qu.:-0.5394   1st Qu.:-0.5281        
##  Median :-0.2463        Median :-0.3152   Median :-0.3115        
##  Mean   : 0.0000        Mean   : 0.0000   Mean   : 0.0000        
##  3rd Qu.:-0.2463        3rd Qu.: 0.1332   3rd Qu.: 0.1407        
##  Max.   :17.7758        Max.   :15.0881   Max.   :32.6919        
##   BounceRates         ExitRates         PageValues       SpecialDay     
##  Min.   :-0.45034   Min.   :-0.8973   Min.   :-0.319   Min.   :-0.3103  
##  1st Qu.:-0.45034   1st Qu.:-0.5897   1st Qu.:-0.319   1st Qu.:-0.3103  
##  Median :-0.38580   Median :-0.3567   Median :-0.319   Median :-0.3103  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.000   Mean   : 0.0000  
##  3rd Qu.:-0.08326   3rd Qu.: 0.1511   3rd Qu.:-0.319   3rd Qu.:-0.3103  
##  Max.   : 3.95470   Max.   : 3.4273   Max.   :19.070   Max.   : 4.6969  
##    Month.Aug         Month.Dec         Month.Feb         Month.Jul      
##  Min.   :-0.1918   Min.   :-0.4032   Min.   :-0.1231   Min.   :-0.1916  
##  1st Qu.:-0.1918   1st Qu.:-0.4032   1st Qu.:-0.1231   1st Qu.:-0.1916  
##  Median :-0.1918   Median :-0.4032   Median :-0.1231   Median :-0.1916  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.:-0.1918   3rd Qu.:-0.4032   3rd Qu.:-0.1231   3rd Qu.:-0.1916  
##  Max.   : 5.2126   Max.   : 2.4799   Max.   : 8.1254   Max.   : 5.2188  
##    Month.June        Month.Mar         Month.May         Month.Nov      
##  Min.   :-0.1547   Min.   :-0.4232   Min.   :-0.6125   Min.   :-0.5689  
##  1st Qu.:-0.1547   1st Qu.:-0.4232   1st Qu.:-0.6125   1st Qu.:-0.5689  
##  Median :-0.1547   Median :-0.4232   Median :-0.6125   Median :-0.5689  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.:-0.1547   3rd Qu.:-0.4232   3rd Qu.: 1.6326   3rd Qu.:-0.5689  
##  Max.   : 6.4653   Max.   : 2.3628   Max.   : 1.6326   Max.   : 1.7576  
##    Month.Oct         Month.Sep       OperatingSystems     Browser       
##  Min.   :-0.2171   Min.   :-0.1952   Min.   :-1.2397   Min.   :-0.7940  
##  1st Qu.:-0.2171   1st Qu.:-0.1952   1st Qu.:-0.1371   1st Qu.:-0.2094  
##  Median :-0.2171   Median :-0.1952   Median :-0.1371   Median :-0.2094  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.:-0.2171   3rd Qu.:-0.1952   3rd Qu.: 0.9654   3rd Qu.:-0.2094  
##  Max.   : 4.6064   Max.   : 5.1213   Max.   : 6.4782   Max.   : 6.2212  
##      Region          TrafficType       VisitorType.New_Visitor
##  Min.   :-0.89629   Min.   :-0.76562   Min.   :-0.4014        
##  1st Qu.:-0.89629   1st Qu.:-0.51661   1st Qu.:-0.4014        
##  Median :-0.06381   Median :-0.51661   Median :-0.4014        
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.0000        
##  3rd Qu.: 0.35244   3rd Qu.:-0.01858   3rd Qu.:-0.4014        
##  Max.   : 2.43366   Max.   : 3.96567   Max.   : 2.4910        
##  VisitorType.Other  VisitorType.Returning_Visitor Weekend.FALSE    
##  Min.   :-0.08175   Min.   :-2.4241               Min.   :-1.8086  
##  1st Qu.:-0.08175   1st Qu.: 0.4125               1st Qu.: 0.5529  
##  Median :-0.08175   Median : 0.4125               Median : 0.5529  
##  Mean   : 0.00000   Mean   : 0.0000               Mean   : 0.0000  
##  3rd Qu.:-0.08175   3rd Qu.: 0.4125               3rd Qu.: 0.5529  
##  Max.   :12.23081   Max.   : 0.4125               Max.   : 0.5529  
##   Weekend.TRUE    
##  Min.   :-0.5529  
##  1st Qu.:-0.5529  
##  Median :-0.5529  
##  Mean   : 0.0000  
##  3rd Qu.:-0.5529  
##  Max.   : 1.8086

It is evident that some attributes still have large values compared to others. Scaling transforms the data so that each attribute has a mean of 0. We will normalize the data and see if we get different results.

Normalizing:

# Min-max normalization of a copy of the original data: each column is
# rescaled as (value - column minimum) / (column maximum - column minimum).
df2_norm <- as.data.frame(apply(df2_copy, 2, function(col) {
  bounds <- range(col)
  (col - bounds[1]) / (bounds[2] - bounds[1])
}))

# Summary statistics of the normalized data.
summary(df2_norm)
##  Administrative    Administrative_Duration Informational   
##  Min.   :0.00000   Min.   :0.0000000       Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.0002941       1st Qu.:0.0000  
##  Median :0.03704   Median :0.0029414       Median :0.0000  
##  Mean   :0.08667   Mean   :0.0243201       Mean   :0.0212  
##  3rd Qu.:0.14815   3rd Qu.:0.0281638       3rd Qu.:0.0000  
##  Max.   :1.00000   Max.   :1.0000000       Max.   :1.0000  
##  Informational_Duration ProductRelated    ProductRelated_Duration
##  Min.   :0.0000000      Min.   :0.00000   Min.   :0.000000       
##  1st Qu.:0.0003921      1st Qu.:0.01135   1st Qu.:0.003042       
##  Median :0.0003921      Median :0.02553   Median :0.009543       
##  Mean   :0.0140518      Mean   :0.04547   Mean   :0.018891       
##  3rd Qu.:0.0003921      3rd Qu.:0.05390   3rd Qu.:0.023112       
##  Max.   :1.0000000      Max.   :1.00000   Max.   :1.000000       
##   BounceRates        ExitRates         PageValues        SpecialDay     
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.07111   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.01465   Median :0.12500   Median :0.00000   Median :0.00000  
##  Mean   :0.10223   Mean   :0.20748   Mean   :0.01645   Mean   :0.06197  
##  3rd Qu.:0.08333   3rd Qu.:0.24242   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##    Month.Aug         Month.Dec        Month.Feb         Month.Jul      
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.0000   Median :0.00000   Median :0.00000  
##  Mean   :0.03549   Mean   :0.1398   Mean   :0.01492   Mean   :0.03541  
##  3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.00000   Max.   :1.00000  
##    Month.June        Month.Mar        Month.May        Month.Nov     
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.02336   Mean   :0.1519   Mean   :0.2728   Mean   :0.2445  
##  3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##    Month.Oct       Month.Sep       OperatingSystems    Browser       
##  Min.   :0.000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.000   1st Qu.:0.00000   1st Qu.:0.1429   1st Qu.:0.08333  
##  Median :0.000   Median :0.00000   Median :0.1429   Median :0.08333  
##  Mean   :0.045   Mean   :0.03672   Mean   :0.1606   Mean   :0.11318  
##  3rd Qu.:0.000   3rd Qu.:0.00000   3rd Qu.:0.2857   3rd Qu.:0.08333  
##  Max.   :1.000   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000  
##      Region        TrafficType      VisitorType.New_Visitor VisitorType.Other
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.0000          Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.05263   1st Qu.:0.0000          1st Qu.:0.00000  
##  Median :0.2500   Median :0.05263   Median :0.0000          Median :0.00000  
##  Mean   :0.2692   Mean   :0.16182   Mean   :0.1388          Mean   :0.00664  
##  3rd Qu.:0.3750   3rd Qu.:0.15789   3rd Qu.:0.0000          3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.0000          Max.   :1.00000  
##  VisitorType.Returning_Visitor Weekend.FALSE     Weekend.TRUE   
##  Min.   :0.0000                Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.0000                1st Qu.:1.0000   1st Qu.:0.0000  
##  Median :1.0000                Median :1.0000   Median :0.0000  
##  Mean   :0.8546                Mean   :0.7659   Mean   :0.2341  
##  3rd Qu.:1.0000                3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000                Max.   :1.0000   Max.   :1.0000

Here, every attribute has a minimum value of 0 and a maximum value of 1, with means lying between those bounds (most of them close to zero). We will use the NORMALIZED dataset for clustering.

# Searching for the optimal number of clusters with the elbow method:
# plot the "wss" criterion for k-means on the normalized data and mark
# k = 4 with a vertical reference line.
fviz_nbclust(x = df2_norm, FUNcluster = kmeans, method = "wss") +
  geom_vline(xintercept = 4, linetype = 2) +
  labs(subtitle = "Elbow method")

# Applying the K-Means clustering algorithm to the normalized data.
# The model is fitted with 10 centroids (K = 10).
# NOTE(review): the elbow plot above marks K = 4, and this comment used to
# say K = 3; confirm which K is actually intended.
# NOTE(review): kmeans() starts from randomly chosen centers, so cluster
# numbering and sizes can change between runs unless a seed is set.
result <- kmeans(df2_norm, centers = 10)

# Previewing the number of records in each cluster
result$size
##  [1]  463  505 2591 1852 1595  475 1025  645 1793 1255
# Cluster centers: one row per cluster, one column per attribute
result[["centers"]]
##    Administrative Administrative_Duration Informational Informational_Duration
## 1     0.070794336             0.020569578  0.0185385169           0.0112926541
## 2     0.000880088             0.000320175  0.0004125413           0.0003711355
## 3     0.091384708             0.025408114  0.0202142030           0.0140365410
## 4     0.098312135             0.027560732  0.0290901728           0.0199305407
## 5     0.094159991             0.027110401  0.0135579937           0.0075177819
## 6     0.077660819             0.021247941  0.0188596491           0.0137691894
## 7     0.091093044             0.025973660  0.0265853659           0.0203120192
## 8     0.048176859             0.011195465  0.0121447028           0.0075813206
## 9     0.105058768             0.029081948  0.0290481502           0.0183418266
## 10    0.083901431             0.024540485  0.0205843293           0.0124254526
##    ProductRelated ProductRelated_Duration BounceRates ExitRates  PageValues
## 1      0.04176585            0.0175658519  0.06022419 0.1823545 0.011489607
## 2      0.00281160            0.0003643396  0.96790888 0.9798465 0.000000000
## 3      0.04502219            0.0185803318  0.05737498 0.1790229 0.012204454
## 4      0.05548228            0.0227181641  0.10334965 0.2002827 0.017327454
## 5      0.02580493            0.0100123697  0.02192726 0.1002636 0.030468410
## 6      0.03002613            0.0131179344  0.06331099 0.1669631 0.014282993
## 7      0.04812870            0.0215558689  0.04935376 0.1696726 0.019527581
## 8      0.04344164            0.0160424025  0.14172310 0.2805351 0.005358765
## 9      0.07378181            0.0314005950  0.05550721 0.1615382 0.018283271
## 10     0.03944054            0.0167075198  0.06680364 0.1859334 0.015981030
##     SpecialDay  Month.Aug Month.Dec    Month.Feb  Month.Jul Month.June
## 1  0.175809935 0.00000000 0.0000000 0.0000000000 0.00000000 0.00000000
## 2  0.080000000 0.02970297 0.1128713 0.0554455446 0.03960396 0.05346535
## 3  0.010806638 0.10575068 0.0000000 0.0482439213 0.10304902 0.07255886
## 4  0.005291577 0.03887689 0.1565875 0.0151187905 0.04913607 0.02213823
## 5  0.021065831 0.04514107 0.2094044 0.0006269592 0.03385580 0.01818182
## 6  0.000000000 0.00000000 0.0000000 0.0000000000 0.00000000 0.00000000
## 7  0.000000000 0.00000000 1.0000000 0.0000000000 0.00000000 0.00000000
## 8  0.761860465 0.00000000 0.0000000 0.0000000000 0.00000000 0.00000000
## 9  0.000000000 0.00000000 0.0000000 0.0000000000 0.00000000 0.00000000
## 10 0.056892430 0.00000000 0.0000000 0.0000000000 0.00000000 0.00000000
##     Month.Mar Month.May Month.Nov  Month.Oct  Month.Sep OperatingSystems
## 1  0.00000000 1.0000000 0.0000000 0.00000000 0.00000000        0.1598272
## 2  0.14851485 0.3247525 0.2118812 0.00990099 0.01386139        0.1705799
## 3  0.44847549 0.0000000 0.0000000 0.12196063 0.09996140        0.1546011
## 4  0.00000000 0.2634989 0.3585313 0.05615551 0.03995680        0.1620642
## 5  0.08840125 0.1962382 0.2626959 0.07774295 0.06771160        0.1462606
## 6  1.00000000 0.0000000 0.0000000 0.00000000 0.00000000        0.1624060
## 7  0.00000000 0.0000000 0.0000000 0.00000000 0.00000000        0.1863415
## 8  0.00000000 1.0000000 0.0000000 0.00000000 0.00000000        0.1676633
## 9  0.00000000 0.0000000 1.0000000 0.00000000 0.00000000        0.1591108
## 10 0.00000000 1.0000000 0.0000000 0.00000000 0.00000000        0.1623221
##       Browser    Region TrafficType VisitorType.New_Visitor VisitorType.Other
## 1  0.11969042 0.7200324   0.1813118              0.00000000      0.0000000000
## 2  0.11881188 0.2910891   0.2143825              0.01386139      0.0198019802
## 3  0.11459539 0.2717580   0.1258608              0.00000000      0.0003859514
## 4  0.10421166 0.2688310   0.1691486              0.00000000      0.0037796976
## 5  0.10909091 0.2876959   0.1497773              1.00000000      0.0000000000
## 6  0.09666667 0.2323684   0.1329640              0.19157895      0.0000000000
## 7  0.14495935 0.3015854   0.1750963              0.00000000      0.0478048780
## 8  0.11925065 0.1856589   0.1986944              0.00000000      0.0000000000
## 9  0.10461982 0.2435165   0.1717791              0.00000000      0.0078081428
## 10 0.11341301 0.1325697   0.1791151              0.00000000      0.0000000000
##    VisitorType.Returning_Visitor Weekend.FALSE Weekend.TRUE
## 1                      1.0000000     1.0000000    0.0000000
## 2                      0.9663366     1.0000000    0.0000000
## 3                      0.9996140     1.0000000    0.0000000
## 4                      0.9962203     0.0000000    1.0000000
## 5                      0.0000000     0.7567398    0.2432602
## 6                      0.8084211     0.0000000    1.0000000
## 7                      0.9521951     1.0000000    0.0000000
## 8                      1.0000000     0.7813953    0.2186047
## 9                      0.9921919     1.0000000    0.0000000
## 10                     1.0000000     1.0000000    0.0000000
# Plotting pairs of variables to see how their data points are distributed
# across the clusters.
# The default palette() holds only 8 colours, so with 10 clusters the
# indices 9 and 10 would be recycled onto the colours of clusters 1 and 2.
# Build one distinct colour per cluster instead.
cluster_cols <- rainbow(nrow(result$centers))[result$cluster]

# Product Related vs Product Related Duration (columns 5 and 6)
plot(
  df2_norm[, c("ProductRelated", "ProductRelated_Duration")],
  col = cluster_cols
)

# Bounce Rates vs Exit Rates (columns 7 and 8)
plot(df2_norm[, c("BounceRates", "ExitRates")], col = cluster_cols)

9. Challenging the solution

Hierarchical clustering

# Hierarchical clustering is carried out with hclust(), whose first argument
# is a dissimilarity structure. That structure is computed here with dist(),
# using the Euclidean distance between observations.
d <- dist(x = df2_norm, method = "euclidean")

# Apply hierarchical clustering using Ward's method ("ward.D2").
res.hc <- hclust(d = d, method = "ward.D2")

# Finally, plot the resulting dendrogram.
plot(x = res.hc, hang = -1, cex = 0.6)