**PART 1 & 2**

1. Defining the Question

(a) Specifying the Question

2. Loading Libraries and Dataset

# Loading libraries

library(data.table)
library (plyr)
library(ggplot2)
library(ggcorrplot)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ✔ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange()   masks plyr::arrange()
## ✖ dplyr::between()   masks data.table::between()
## ✖ purrr::compact()   masks plyr::compact()
## ✖ dplyr::count()     masks plyr::count()
## ✖ dplyr::failwith()  masks plyr::failwith()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::first()     masks data.table::first()
## ✖ dplyr::id()        masks plyr::id()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ dplyr::last()      masks data.table::last()
## ✖ dplyr::mutate()    masks plyr::mutate()
## ✖ dplyr::rename()    masks plyr::rename()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
## ✖ purrr::transpose() masks data.table::transpose()
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(modelr)
library(broom)
## 
## Attaching package: 'broom'
## The following object is masked from 'package:modelr':
## 
##     bootstrap
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(rpart)
library(dplyr)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(devtools)
## Loading required package: usethis
library(ggbiplot)
## Loading required package: scales
## 
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
## 
##     alpha, rescale
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
## Loading required package: grid
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(wskm)
## Loading required package: latticeExtra
## 
## Attaching package: 'latticeExtra'
## The following object is masked from 'package:ggplot2':
## 
##     layer
## Loading required package: fpc
library(clustvarsel)
## Loading required package: mclust
## Package 'mclust' version 5.4.10
## Type 'citation("mclust")' for citing this R package in publications.
## 
## Attaching package: 'mclust'
## The following object is masked from 'package:psych':
## 
##     sim
## The following object is masked from 'package:purrr':
## 
##     map
## Package 'clustvarsel' version 2.3.4
## Type 'citation("clustvarsel")' for citing this R package in publications.
library(mclust)
library(corrplot)
## corrplot 0.92 loaded
# Loading Dataset

url<-"http://bit.ly/CarreFourDataset"
carrefour<-read.csv(url)

3. Checking the Data

#view(carrefour)
# Previewing top of the data

head(carrefour)
##    Invoice.ID Branch Customer.type Gender           Product.line Unit.price
## 1 750-67-8428      A        Member Female      Health and beauty      74.69
## 2 226-31-3081      C        Normal Female Electronic accessories      15.28
## 3 631-41-3108      A        Normal   Male     Home and lifestyle      46.33
## 4 123-19-1176      A        Member   Male      Health and beauty      58.22
## 5 373-73-7910      A        Normal   Male      Sports and travel      86.31
## 6 699-14-3026      C        Normal   Male Electronic accessories      85.39
##   Quantity     Tax      Date  Time     Payment   cogs gross.margin.percentage
## 1        7 26.1415  1/5/2019 13:08     Ewallet 522.83                4.761905
## 2        5  3.8200  3/8/2019 10:29        Cash  76.40                4.761905
## 3        7 16.2155  3/3/2019 13:23 Credit card 324.31                4.761905
## 4        8 23.2880 1/27/2019 20:33     Ewallet 465.76                4.761905
## 5        7 30.2085  2/8/2019 10:37     Ewallet 604.17                4.761905
## 6        7 29.8865 3/25/2019 18:30     Ewallet 597.73                4.761905
##   gross.income Rating    Total
## 1      26.1415    9.1 548.9715
## 2       3.8200    9.6  80.2200
## 3      16.2155    7.4 340.5255
## 4      23.2880    8.4 489.0480
## 5      30.2085    5.3 634.3785
## 6      29.8865    4.1 627.6165
# Previewing top of the data

tail(carrefour)
##       Invoice.ID Branch Customer.type Gender           Product.line Unit.price
## 995  652-49-6720      C        Member Female Electronic accessories      60.95
## 996  233-67-5758      C        Normal   Male      Health and beauty      40.35
## 997  303-96-2227      B        Normal Female     Home and lifestyle      97.38
## 998  727-02-1313      A        Member   Male     Food and beverages      31.84
## 999  347-56-2442      A        Normal   Male     Home and lifestyle      65.82
## 1000 849-09-3807      A        Member Female    Fashion accessories      88.34
##      Quantity     Tax      Date  Time Payment   cogs gross.margin.percentage
## 995         1  3.0475 2/18/2019 11:40 Ewallet  60.95                4.761905
## 996         1  2.0175 1/29/2019 13:46 Ewallet  40.35                4.761905
## 997        10 48.6900  3/2/2019 17:16 Ewallet 973.80                4.761905
## 998         1  1.5920  2/9/2019 13:22    Cash  31.84                4.761905
## 999         1  3.2910 2/22/2019 15:33    Cash  65.82                4.761905
## 1000        7 30.9190 2/18/2019 13:28    Cash 618.38                4.761905
##      gross.income Rating     Total
## 995        3.0475    5.9   63.9975
## 996        2.0175    6.2   42.3675
## 997       48.6900    4.4 1022.4900
## 998        1.5920    7.7   33.4320
## 999        3.2910    4.1   69.1110
## 1000      30.9190    6.6  649.2990
# Previewing shape

cat("The dataset has", nrow(carrefour), "rows", "and", ncol(carrefour), "columns")
## The dataset has 1000 rows and 16 columns
# Checking Data types

str(carrefour)
## 'data.frame':    1000 obs. of  16 variables:
##  $ Invoice.ID             : chr  "750-67-8428" "226-31-3081" "631-41-3108" "123-19-1176" ...
##  $ Branch                 : chr  "A" "C" "A" "A" ...
##  $ Customer.type          : chr  "Member" "Normal" "Normal" "Member" ...
##  $ Gender                 : chr  "Female" "Female" "Male" "Male" ...
##  $ Product.line           : chr  "Health and beauty" "Electronic accessories" "Home and lifestyle" "Health and beauty" ...
##  $ Unit.price             : num  74.7 15.3 46.3 58.2 86.3 ...
##  $ Quantity               : int  7 5 7 8 7 7 6 10 2 3 ...
##  $ Tax                    : num  26.14 3.82 16.22 23.29 30.21 ...
##  $ Date                   : chr  "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
##  $ Time                   : chr  "13:08" "10:29" "13:23" "20:33" ...
##  $ Payment                : chr  "Ewallet" "Cash" "Credit card" "Ewallet" ...
##  $ cogs                   : num  522.8 76.4 324.3 465.8 604.2 ...
##  $ gross.margin.percentage: num  4.76 4.76 4.76 4.76 4.76 ...
##  $ gross.income           : num  26.14 3.82 16.22 23.29 30.21 ...
##  $ Rating                 : num  9.1 9.6 7.4 8.4 5.3 4.1 5.8 8 7.2 5.9 ...
##  $ Total                  : num  549 80.2 340.5 489 634.4 ...

4. Data Cleaning

# Checking for number of missing values

length(which(is.na(carrefour)))
## [1] 0
# Checking for duplicates

sum(duplicated(carrefour))
## [1] 0
# Tidying column names

colnames(carrefour)
##  [1] "Invoice.ID"              "Branch"                 
##  [3] "Customer.type"           "Gender"                 
##  [5] "Product.line"            "Unit.price"             
##  [7] "Quantity"                "Tax"                    
##  [9] "Date"                    "Time"                   
## [11] "Payment"                 "cogs"                   
## [13] "gross.margin.percentage" "gross.income"           
## [15] "Rating"                  "Total"
colnames(carrefour) = tolower(colnames(carrefour))
colnames(carrefour)
##  [1] "invoice.id"              "branch"                 
##  [3] "customer.type"           "gender"                 
##  [5] "product.line"            "unit.price"             
##  [7] "quantity"                "tax"                    
##  [9] "date"                    "time"                   
## [11] "payment"                 "cogs"                   
## [13] "gross.margin.percentage" "gross.income"           
## [15] "rating"                  "total"
# Checking for outliers
# Selecting numerical columns

cols_num  <- unlist(lapply(carrefour, is.numeric))
carr_num <- carrefour[ ,cols_num]
boxplot(carr_num)

# Finding total number of outliers

sum(carrefour$tax < quantile(carrefour$tax, p = 0.25)- 1.5 * IQR(carrefour$tax)) + sum(carrefour$tax > quantile(carrefour$tax, p = 0.75) + 1.5 * IQR(carrefour$tax))
## [1] 9
sum(carrefour$cogs < quantile(carrefour$cogs, p = 0.25)- 1.5 * IQR(carrefour$cogs)) + sum(carrefour$cogs > quantile(carrefour$cogs, p = 0.75) + 1.5 * IQR(carrefour$cogs))
## [1] 9
sum(carrefour$gross.income < quantile(carrefour$gross.income, p = 0.25)- 1.5 * IQR(carrefour$gross.income)) + sum(carrefour$gross.income > quantile(carrefour$gross.income, p = 0.75) + 1.5 * IQR(carrefour$gross.income))
## [1] 9
sum(carrefour$total < quantile(carrefour$total, p = 0.25)- 1.5 * IQR(carrefour$total)) + sum(carrefour$total > quantile(carrefour$total, p = 0.75) + 1.5 * IQR(carrefour$total))
## [1] 9
# Listing the outliers

boxplot.stats(carrefour$tax)$out
## [1] 47.790 49.490 49.650 47.720 48.605 49.260 48.750 48.685 48.690
boxplot.stats(carrefour$cogs)$out
## [1] 955.8 989.8 993.0 954.4 972.1 985.2 975.0 973.7 973.8
boxplot.stats(carrefour$gross.income)$out
## [1] 47.790 49.490 49.650 47.720 48.605 49.260 48.750 48.685 48.690
boxplot.stats(carrefour$total)$out
## [1] 1003.590 1039.290 1042.650 1002.120 1020.705 1034.460 1023.750 1022.385
## [9] 1022.490

5. EDA

# Statistical summary of the numerical columns

describe(carr_num)
##                         vars    n   mean     sd median trimmed    mad   min
## unit.price                 1 1000  55.67  26.49  55.23   55.62  33.37 10.08
## quantity                   2 1000   5.51   2.92   5.00    5.51   2.97  1.00
## tax                        3 1000  15.38  11.71  12.09   14.00  11.13  0.51
## cogs                       4 1000 307.59 234.18 241.76  279.91 222.65 10.17
## gross.margin.percentage    5 1000   4.76   0.00   4.76    4.76   0.00  4.76
## gross.income               6 1000  15.38  11.71  12.09   14.00  11.13  0.51
## rating                     7 1000   6.97   1.72   7.00    6.97   2.22  4.00
## total                      8 1000 322.97 245.89 253.85  293.91 233.78 10.68
##                             max   range skew kurtosis   se
## unit.price                99.96   89.88 0.01    -1.22 0.84
## quantity                  10.00    9.00 0.01    -1.22 0.09
## tax                       49.65   49.14 0.89    -0.09 0.37
## cogs                     993.00  982.83 0.89    -0.09 7.41
## gross.margin.percentage    4.76    0.00  NaN      NaN 0.00
## gross.income              49.65   49.14 0.89    -0.09 0.37
## rating                    10.00    6.00 0.01    -1.16 0.05
## total                   1042.65 1031.97 0.89    -0.09 7.78

(i) Univariate Analysis

# Visualization using barplots

par(mfrow = c(2,2), mar = c(4,3,2,2))
barplot(table(carrefour$invoice.id),main = "Invoice ID")
barplot(table(carrefour$branch),main = "Branch")
barplot(table(carrefour$customer.type),main = "Customer Type")
barplot(table(carrefour$gender),main = "Gender")

barplot(table(carrefour$product.line),main = "Product Line")
barplot(table(carrefour$date),main = "Date")
barplot(table(carrefour$time),main = "Time")
barplot(table(carrefour$payment),main = "Payment")

# Visualization using Histograms

par(mfrow = c(2,2), mar = c(5,4,2,2))
hist(carrefour$unit.price,xlab ='Unit Price', main ='Unit Price Histogram')
hist(carrefour$quantity,xlab ='quantity', main ='Quantity Histogram')
hist(carrefour$tax,xlab ='tax', main ='Tax Histogram')
hist(carrefour$cogs,xlab ='cogs', main ='Cogs Histogram')

hist(carrefour$gross.margin.percentage,xlab ='gross.margin.percentage', main ='Gross Margin % Histogram')
hist(carrefour$gross.income ,xlab ='gross.income', main ='Gross Income Histogram')
hist(carrefour$rating,xlab ='rating', main ='Rating Histogram')
hist(carrefour$total,xlab ='total', main ='Total Histogram')

(ii) Bivariate Analysis

# Visualization of relationship between product line and gender

ggplot(carrefour, aes(x=product.line)) + geom_bar(aes(fill = gender),position = "dodge") + theme(axis.text.x = element_text(angle =90, size = 10))

# Visualization of relationship between product line and customer type

ggplot(carrefour, aes(x=product.line)) + geom_bar(aes(fill = customer.type), position = 'dodge') + theme(axis.text.x = element_text(angle =90, size = 10))

# Visualization of relationship between payment and gender

ggplot(carrefour, aes(x=payment)) + geom_bar(aes(fill = gender), position = "dodge") 

# Visualization of relationship between unit price and rating

ggplot(carrefour, aes(x=unit.price, y=rating)) + geom_point()

# Visualization of relationship between quantity and rating

ggplot(carrefour, aes(x=quantity, y=rating)) + geom_point()

# Visualization of relationship between tax and rating

ggplot(carrefour, aes(x=tax, y=rating)) + geom_point()

# Plotting the correlations

ggcorrplot(cor(carr_num), type = "lower", outline.col = "black",
 lab=TRUE,
 ggtheme = ggplot2::theme_gray,
 colors = c("#6D9EC1", "white", "#E46726"))
## Warning in cor(carr_num): the standard deviation is zero

6. Implementing the Solution

Part 1: Dimensionality Reduction

# Performing PCA
# Gross margin percentage has no variance;can't be scaled

carr.pca<-prcomp(carrefour[,c(6,7,8,12,14,15,16)], center = T, scale. = T)
summary(carr.pca)
## Importance of components:
##                           PC1    PC2    PC3     PC4       PC5       PC6
## Standard deviation     2.2185 1.0002 0.9939 0.30001 2.981e-16 1.493e-16
## Proportion of Variance 0.7031 0.1429 0.1411 0.01286 0.000e+00 0.000e+00
## Cumulative Proportion  0.7031 0.8460 0.9871 1.00000 1.000e+00 1.000e+00
##                              PC7
## Standard deviation     9.831e-17
## Proportion of Variance 0.000e+00
## Cumulative Proportion  1.000e+00
# Looking into the PCA object 

str(carr.pca)
## List of 5
##  $ sdev    : num [1:7] 2.22 1.00 9.94e-01 3.00e-01 2.98e-16 ...
##  $ rotation: num [1:7, 1:7] -0.292 -0.325 -0.45 -0.45 -0.45 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:7] "unit.price" "quantity" "tax" "cogs" ...
##   .. ..$ : chr [1:7] "PC1" "PC2" "PC3" "PC4" ...
##  $ center  : Named num [1:7] 55.67 5.51 15.38 307.59 15.38 ...
##   ..- attr(*, "names")= chr [1:7] "unit.price" "quantity" "tax" "cogs" ...
##  $ scale   : Named num [1:7] 26.49 2.92 11.71 234.18 11.71 ...
##   ..- attr(*, "names")= chr [1:7] "unit.price" "quantity" "tax" "cogs" ...
##  $ x       : num [1:1000, 1:7] -2.005 2.306 -0.186 -1.504 -2.8 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:7] "PC1" "PC2" "PC3" "PC4" ...
##  - attr(*, "class")= chr "prcomp"
# Plotting PCA

ggbiplot(carr.pca)

# Closer look at the variables

fviz_pca_var(carr.pca, col.var = "blue")

Part 2: Feature Selection

     ***(i) Filter Method***
# Numeric variables

carr<- carrefour[,c(6,7,8,12,14,15,16)]
head(carr)
##   unit.price quantity     tax   cogs gross.income rating    total
## 1      74.69        7 26.1415 522.83      26.1415    9.1 548.9715
## 2      15.28        5  3.8200  76.40       3.8200    9.6  80.2200
## 3      46.33        7 16.2155 324.31      16.2155    7.4 340.5255
## 4      58.22        8 23.2880 465.76      23.2880    8.4 489.0480
## 5      86.31        7 30.2085 604.17      30.2085    5.3 634.3785
## 6      85.39        7 29.8865 597.73      29.8865    4.1 627.6165
# Computing correlation matrix

correlationMatrix <- cor(carr)

# Finding attributes that are highly correlated

highlyCorrelated <- findCorrelation(correlationMatrix, cutoff=0.75)
head(highlyCorrelated)
## [1] 4 7 3
# Removing the highly correlated features

 carr_new<-carr[,-c(3,4,7)]
head(carr_new)
##   unit.price quantity gross.income rating
## 1      74.69        7      26.1415    9.1
## 2      15.28        5       3.8200    9.6
## 3      46.33        7      16.2155    7.4
## 4      58.22        8      23.2880    8.4
## 5      86.31        7      30.2085    5.3
## 6      85.39        7      29.8865    4.1
# Correlation of selected features

correlationmatrix <- cor(carr_new)
# Graphical comparison (before and after feature selection)

par(mfrow = c(1, 2))
corrplot(correlationMatrix, order = "hclust")
corrplot(correlationmatrix, order = "hclust")

# Sequential forward greedy search (default)

out = clustvarsel(carr)
out
## ------------------------------------------------------ 
## Variable selection for Gaussian model-based clustering
## Stepwise (forward/backward) greedy search
## ------------------------------------------------------ 
## 
##  Variable proposed Type of step   BICclust Model G     BICdiff Decision
##           quantity          Add  -4308.761     E 9    687.4466 Accepted
##              total          Add -16747.736   VEV 9    739.7086 Accepted
##         unit.price          Add -19457.808   VEV 9   5188.5285 Accepted
##           quantity       Remove -19856.444   VEV 9   3714.2317 Rejected
##             rating          Add -22474.724   VEV 8    916.7736 Accepted
##             rating       Remove -19457.808   VEV 9    916.7736 Rejected
##               cogs          Add -39878.535   VVI 9 -69715.0674 Rejected
##             rating       Remove -19457.808   VEV 9    916.7736 Rejected
## 
## Selected subset: quantity, total, unit.price, rating
# Computing cluster model

Subset1 = carr[,out$subset]
model = Mclust(Subset1)
summary(model)
## ---------------------------------------------------- 
## Gaussian finite mixture model fitted by EM algorithm 
## ---------------------------------------------------- 
## 
## Mclust EVV (ellipsoidal, equal volume) model with 7 components: 
## 
##  log-likelihood    n df       BIC       ICL
##       -12465.74 1000 98 -25608.44 -25697.64
## 
## Clustering table:
##   1   2   3   4   5   6   7 
## 120 187 127 144 135 154 133

7. Conclusion

8. Recommendations