Dimensionality Reduction & Feature Selection

Libraries Needed

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.3
library(Rtsne)
## Warning: package 'Rtsne' was built under R version 4.1.3
library(e1071)
## Warning: package 'e1071' was built under R version 4.1.3
library(CatEncoders)
## Warning: package 'CatEncoders' was built under R version 4.1.3
## 
## Attaching package: 'CatEncoders'
## The following object is masked from 'package:base':
## 
##     transform
library(lattice)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.1.3
## corrplot 0.92 loaded
library(caret)
## Warning: package 'caret' was built under R version 4.1.3

Data Reading

df <- read.csv("http://bit.ly/CarreFourDataset")
head(df)
##    Invoice.ID Branch Customer.type Gender           Product.line Unit.price
## 1 750-67-8428      A        Member Female      Health and beauty      74.69
## 2 226-31-3081      C        Normal Female Electronic accessories      15.28
## 3 631-41-3108      A        Normal   Male     Home and lifestyle      46.33
## 4 123-19-1176      A        Member   Male      Health and beauty      58.22
## 5 373-73-7910      A        Normal   Male      Sports and travel      86.31
## 6 699-14-3026      C        Normal   Male Electronic accessories      85.39
##   Quantity     Tax      Date  Time     Payment   cogs gross.margin.percentage
## 1        7 26.1415  1/5/2019 13:08     Ewallet 522.83                4.761905
## 2        5  3.8200  3/8/2019 10:29        Cash  76.40                4.761905
## 3        7 16.2155  3/3/2019 13:23 Credit card 324.31                4.761905
## 4        8 23.2880 1/27/2019 20:33     Ewallet 465.76                4.761905
## 5        7 30.2085  2/8/2019 10:37     Ewallet 604.17                4.761905
## 6        7 29.8865 3/25/2019 18:30     Ewallet 597.73                4.761905
##   gross.income Rating    Total
## 1      26.1415    9.1 548.9715
## 2       3.8200    9.6  80.2200
## 3      16.2155    7.4 340.5255
## 4      23.2880    8.4 489.0480
## 5      30.2085    5.3 634.3785
## 6      29.8865    4.1 627.6165

Structure of the data

str(df)
## 'data.frame':    1000 obs. of  16 variables:
##  $ Invoice.ID             : chr  "750-67-8428" "226-31-3081" "631-41-3108" "123-19-1176" ...
##  $ Branch                 : chr  "A" "C" "A" "A" ...
##  $ Customer.type          : chr  "Member" "Normal" "Normal" "Member" ...
##  $ Gender                 : chr  "Female" "Female" "Male" "Male" ...
##  $ Product.line           : chr  "Health and beauty" "Electronic accessories" "Home and lifestyle" "Health and beauty" ...
##  $ Unit.price             : num  74.7 15.3 46.3 58.2 86.3 ...
##  $ Quantity               : int  7 5 7 8 7 7 6 10 2 3 ...
##  $ Tax                    : num  26.14 3.82 16.22 23.29 30.21 ...
##  $ Date                   : chr  "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
##  $ Time                   : chr  "13:08" "10:29" "13:23" "20:33" ...
##  $ Payment                : chr  "Ewallet" "Cash" "Credit card" "Ewallet" ...
##  $ cogs                   : num  522.8 76.4 324.3 465.8 604.2 ...
##  $ gross.margin.percentage: num  4.76 4.76 4.76 4.76 4.76 ...
##  $ gross.income           : num  26.14 3.82 16.22 23.29 30.21 ...
##  $ Rating                 : num  9.1 9.6 7.4 8.4 5.3 4.1 5.8 8 7.2 5.9 ...
##  $ Total                  : num  549 80.2 340.5 489 634.4 ...

Data Cleaning

# checking for duplicates
df[duplicated(df), ]
##  [1] Invoice.ID              Branch                  Customer.type          
##  [4] Gender                  Product.line            Unit.price             
##  [7] Quantity                Tax                     Date                   
## [10] Time                    Payment                 cogs                   
## [13] gross.margin.percentage gross.income            Rating                 
## [16] Total                  
## <0 rows> (or 0-length row.names)
# checking for missing values
colSums(is.na(df))
##              Invoice.ID                  Branch           Customer.type 
##                       0                       0                       0 
##                  Gender            Product.line              Unit.price 
##                       0                       0                       0 
##                Quantity                     Tax                    Date 
##                       0                       0                       0 
##                    Time                 Payment                    cogs 
##                       0                       0                       0 
## gross.margin.percentage            gross.income                  Rating 
##                       0                       0                       0 
##                   Total 
##                       0
# Accuracy
df$Invoice.ID <- NULL
df$Branch <- as.factor(df$Branch)
df$Customer.type <- as.factor(df$Customer.type)
df$Gender <- as.factor(df$Gender)
df$Product.line <- as.factor(df$Product.line)
df$Payment <- as.factor(df$Payment)
df$Date <- as.Date(df$Date, format = "%m/%d/%Y")
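
A quick sanity check (a sketch; output not shown) that the date conversion produced no parsing failures and covers the expected period:

# number of dates that failed to parse (should be 0) and the overall date range
sum(is.na(df$Date))
range(df$Date)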

Exploratory Data Analysis

Univariate Analysis

# helper function returning the most frequent value (statistical mode) of a vector
mode <- function(x){
  uniqx <- unique(x)
  uniqx[which.max(tabulate(match(x, uniqx)))]
}
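
A quick illustration on a toy vector (hypothetical values, not taken from the dataset):

# the helper returns the most frequent value
mode(c(2, 7, 7, 3, 2, 7))  # 7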

Branch

# Visualization
ggplot(df, aes(Branch)) +
  geom_bar(stat="count") +
  labs(title="Branches Distribution") +
  theme_classic()

The branch distribution is roughly equal.

Gender

# Visualization
ggplot(df, aes(Gender)) +
  geom_bar(stat="count")

The gender distribution in the dataset is balanced.

Customer type

# visualization
ggplot(df, aes(Customer.type)) +
  geom_bar()

The two customer types are evenly distributed.

Product Line

# visualization
ggplot(df, aes(Product.line)) +
  geom_bar()

Fashion Accessories and Food and Beverages tie for the most-bought categories, but the distribution does not suggest a general imbalance.

Unit Price

# Mean
uprice.mean <- mean(df$Unit.price)
uprice.mean
## [1] 55.67213
# Mode
uprice.mode <- mode(df$Unit.price)
uprice.mode
## [1] 83.77
# Median
uprice.median <- median(df$Unit.price)
uprice.median
## [1] 55.23
# Standard Deviation
uprice.sd <- sd(df$Unit.price)
uprice.sd
## [1] 26.49463
# Kurtosis
uprice.kurt <- kurtosis(df$Unit.price)
uprice.kurt
## [1] -1.222062
# Skewness
uprice.skew <- skewness(df$Unit.price)
uprice.skew
## [1] 0.00705623
# Range
uprice.range <- range(df$Unit.price)
uprice.range
## [1] 10.08 99.96
# Visualization
ggplot(df, aes(Unit.price)) +
  geom_boxplot(outlier.colour = "red")
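
The same descriptive statistics are repeated for each numeric variable below; as an aside, they could also be computed in one pass (a sketch using the e1071 helpers already loaded):

# mean, median, sd, skewness and kurtosis for every numeric column at once
num.cols <- names(df)[sapply(df, is.numeric)]
stats <- sapply(df[num.cols], function(x)
  c(mean = mean(x), median = median(x), sd = sd(x),
    skewness = skewness(x), kurtosis = kurtosis(x)))
# the constant gross.margin.percentage column yields NaN for skewness and kurtosis
round(t(stats), 3)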

Quantity

# mean
quantity.mean <- mean(df$Quantity)
quantity.mean
## [1] 5.51
# Mode
quantity.mode <- mode(df$Quantity)
quantity.mode
## [1] 10
# Median
quantity.median <- median(df$Quantity)
quantity.median
## [1] 5
# Standard Deviation
quantity.sd <- sd(df$Quantity)
quantity.sd
## [1] 2.923431
# Range
quantity.range <- range(df$Quantity)
quantity.range
## [1]  1 10
# Kurtosis
quantity.kurt <- kurtosis(df$Quantity)
quantity.kurt
## [1] -1.219039
# Skewness
quantity.skew <- skewness(df$Quantity)
quantity.skew
## [1] 0.01290225
# Quantiles
quantity.quants <- quantile(df$Quantity)
quantity.quants
##   0%  25%  50%  75% 100% 
##    1    3    5    8   10
# Visualization
ggplot(df, aes(Quantity)) +
  geom_boxplot(outlier.colour = "red")

Tax

# mean
tax.mean <- mean(df$Tax)
tax.mean
## [1] 15.37937
# mode
tax.mode <- mode(df$Tax)
tax.mode
## [1] 39.48
# Median
tax.median <- median(df$Tax)
tax.median
## [1] 12.088
# Standard Deviation
tax.sd <- sd(df$Tax)
tax.sd
## [1] 11.70883
# Kurtosis
tax.kurt <- kurtosis(df$Tax)
tax.kurt
## [1] -0.09329206
# Skewness
tax.skew <- skewness(df$Tax)
tax.skew
## [1] 0.8898939
# Range
tax.range <- range(df$Tax)
tax.range
## [1]  0.5085 49.6500
# Quantiles
tax.quantiles <- quantile(df$Tax)
tax.quantiles
##        0%       25%       50%       75%      100% 
##  0.508500  5.924875 12.088000 22.445250 49.650000
# Visual
ggplot(df, aes(Tax)) +
  geom_boxplot(outlier.colour = "red")

Date

# mode
date.mode <- mode(df$Date)
date.mode
## [1] "2020-02-07"
# median
date.median <- median(df$Date)
date.median
## [1] "2020-02-13"
# standard deviation
date.sd <- sd(df$Date)
date.sd
## [1] 25.51686

Payment

# Mode
payment.mode <- mode(df$Payment)
payment.mode
## [1] Ewallet
## Levels: Cash Credit card Ewallet
# visual
ggplot(df, aes(Payment)) +
  geom_bar(stat="count")

There is a fair distribution across the payment methods; however, fewer customers tend to pay by credit card in these stores.

COGS

# mean
cogs.mean <- mean(df$cogs)
cogs.mean
## [1] 307.5874
# mode
cogs.mode <- mode(df$cogs)
cogs.mode
## [1] 789.6
# median
cogs.median <- median(df$cogs)
cogs.median
## [1] 241.76
# standard deviation
cogs.sd <- sd(df$cogs)
cogs.sd
## [1] 234.1765
# range
cogs.range <- range(df$cogs)
cogs.range
## [1]  10.17 993.00
# kurtosis
cogs.kurt <- kurtosis(df$cogs)
cogs.kurt
## [1] -0.09329206
# skewness
cogs.skew <- skewness(df$cogs)
cogs.skew
## [1] 0.8898939
# quantiles
cogs.quantiles <- quantile(df$cogs)
cogs.quantiles
##       0%      25%      50%      75%     100% 
##  10.1700 118.4975 241.7600 448.9050 993.0000
# visual
ggplot(df, aes(cogs)) +
  geom_boxplot(outlier.colour = "red")

Gross Income

gi <- df$gross.income
# mean
gi.mean <- mean(gi)
gi.mean
## [1] 15.37937
# mode
gi.mode <- mode(gi)
gi.mode
## [1] 39.48
# median
gi.median <- median(gi)
gi.median
## [1] 12.088
# range 
gi.range <- range(gi)
gi.range
## [1]  0.5085 49.6500
# standard deviation
gi.sd <- sd(gi)
gi.sd
## [1] 11.70883
# kurtosis
gi.kurt <- kurtosis(gi)
gi.kurt
## [1] -0.09329206
# skewness
gi.skew <- skewness(gi)
gi.skew
## [1] 0.8898939
# visual
ggplot(df, aes(gross.income)) +
  geom_boxplot(outlier.colour = "red")

Rating

# mean
rate.mean <- mean(df$Rating)
rate.mean
## [1] 6.9727
# mode
rate.mode <- mode(df$Rating)
rate.mode
## [1] 6
# median
rate.median <- median(df$Rating)
rate.median
## [1] 7
# standard deviation
rate.sd <- sd(df$Rating)
rate.sd
## [1] 1.71858
# range
rate.range <- range(df$Rating)
rate.range
## [1]  4 10
# quantiles
rate.quantiles <- quantile(df$Rating)
rate.quantiles
##   0%  25%  50%  75% 100% 
##  4.0  5.5  7.0  8.5 10.0
# kurtosis
rate.kurt <- kurtosis(df$Rating)
rate.kurt
## [1] -1.155525
# skewness
rate.skew <- skewness(df$Rating)
rate.skew
## [1] 0.008982638
# visual
ggplot(df, aes(Rating)) +
  geom_boxplot(outlier.colour = "red")

Total

# mean
total.mean <- mean(df$Total)
total.mean
## [1] 322.9667
# median
total.median <- median(df$Total)
total.median
## [1] 253.848
# mode
total.mode <- mode(df$Total)
total.mode
## [1] 829.08
# standard deviation
total.sd <- sd(df$Total)
total.sd
## [1] 245.8853
# range
total.range <- range(df$Total)
total.range
## [1]   10.6785 1042.6500
# kurtosis
total.kurt <- kurtosis(df$Total)
total.kurt
## [1] -0.09329206
# skewness
total.skew <- skewness(df$Total)
total.skew
## [1] 0.8898939
# quantiles
total.quantiles <- quantile(df$Total)
total.quantiles
##        0%       25%       50%       75%      100% 
##   10.6785  124.4224  253.8480  471.3502 1042.6500
# visual
ggplot(df, aes(Total)) +
  geom_boxplot(outlier.colour = "red" )

Bivariate Analysis

ggplot(df, aes(x=Product.line, y=Total)) +
  geom_point()

Fashion Accessories reach the highest total prices, while Health and Beauty products have relatively lower totals.

ggplot(df ,aes(Gender, Total)) +
  geom_point()

Total price is distributed roughly equally between the genders.

ggplot(df, aes(Payment, Total)) +
  geom_point()

Total prices at checkout are distributed nearly identically across the payment methods, with some of the more expensive purchases paid for by credit card.

ggplot(df, aes(gross.income, Total)) +
  geom_point()

As expected, there is a perfect positive linear relationship between the checkout total and the gross income earned on the sale, since both are derived from the cost of goods sold.
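
The identical summary statistics for Tax and gross.income above suggest the gross income recorded per sale is simply the 5% tax on the cost of goods sold; a quick check of these identities (expected to return TRUE/1 if they hold):

# gross income per transaction equals the tax amount
all.equal(df$gross.income, df$Tax)
# the checkout total is cost of goods sold plus tax
all.equal(df$Total, df$cogs + df$Tax)
# hence the relationship with Total is exactly linear
cor(df$gross.income, df$Total)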

ggplot(df, aes(Customer.type , Total)) +
  geom_point()

Members and non-members have a nearly equal distribution of expenditure, with members showing no visible gaps in prices.

ggplot(df, aes(Tax, Total)) +
  geom_point()

There is a direct linear relationship between tax and total price: as expected, the higher the tax charged on a purchase, the higher its total cost.

ggplot(df, aes(Unit.price, Total)) +
  geom_point()

There are several distinct positive linear relationships between Unit Price and Total: the higher the unit price, the higher the total. The separate lines most likely correspond to the discrete Quantity values (1 through 10), since the total is the unit price multiplied by the quantity, plus tax; the sketch below colours the points by quantity to make this visible.
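
A sketch (using the ggplot2 package already loaded) that colours the same scatter by Quantity:

# each discrete quantity traces its own line of Total = Unit.price * Quantity + tax
ggplot(df, aes(Unit.price, Total, colour = factor(Quantity))) +
  geom_point() +
  labs(colour = "Quantity")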

cor(df[,unlist(lapply(df, is.numeric))])
## Warning in cor(df[, unlist(lapply(df, is.numeric))]): the standard deviation is
## zero
##                           Unit.price    Quantity        Tax       cogs
## Unit.price               1.000000000  0.01077756  0.6339621  0.6339621
## Quantity                 0.010777564  1.00000000  0.7055102  0.7055102
## Tax                      0.633962089  0.70551019  1.0000000  1.0000000
## cogs                     0.633962089  0.70551019  1.0000000  1.0000000
## gross.margin.percentage           NA          NA         NA         NA
## gross.income             0.633962089  0.70551019  1.0000000  1.0000000
## Rating                  -0.008777507 -0.01581490 -0.0364417 -0.0364417
## Total                    0.633962089  0.70551019  1.0000000  1.0000000
##                         gross.margin.percentage gross.income       Rating
## Unit.price                                   NA    0.6339621 -0.008777507
## Quantity                                     NA    0.7055102 -0.015814905
## Tax                                          NA    1.0000000 -0.036441705
## cogs                                         NA    1.0000000 -0.036441705
## gross.margin.percentage                       1           NA           NA
## gross.income                                 NA    1.0000000 -0.036441705
## Rating                                       NA   -0.0364417  1.000000000
## Total                                        NA    1.0000000 -0.036441705
##                              Total
## Unit.price               0.6339621
## Quantity                 0.7055102
## Tax                      1.0000000
## cogs                     1.0000000
## gross.margin.percentage         NA
## gross.income             1.0000000
## Rating                  -0.0364417
## Total                    1.0000000
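
The warning about a zero standard deviation comes from gross.margin.percentage, which is constant across all 1000 rows, so its correlations are undefined (the NA row and column above); a quick check:

# a constant column has zero variance, hence the NA correlations
var(df$gross.margin.percentage)
unique(df$gross.margin.percentage)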

The following variables show strong correlation:

  • Unit Price to Tax and COGS
  • Quantity to Tax and COGS
  • Gross Income to Unit Price, Quantity, Tax, COGS, Total
# dropping Date, Time, gross.margin.percentage and Total (columns 8, 9, 12, 15)
copy <- df[, -c(8, 9, 12, 15)]
# keeping Total as the label
label <- df[, 15]

Application of the t-SNE Algorithm

# label-encoding the categorical columns so that Rtsne receives an all-numeric matrix
branch <- LabelEncoder.fit(copy$Branch)
copy$Branch <- transform(branch, factor(copy$Branch))
gender <- LabelEncoder.fit(copy$Gender)
copy$Gender <- transform(gender, factor(copy$Gender))
customer <- LabelEncoder.fit(copy$Customer.type)
copy$Customer.type <- transform(customer, factor(copy$Customer.type))
product <- LabelEncoder.fit(copy$Product.line)
copy$Product.line <- transform(product, factor(copy$Product.line))
pay <- LabelEncoder.fit(copy$Payment)
copy$Payment <- transform(pay, factor(copy$Payment))
model <- Rtsne(copy, dims=2, perplexity=30, verbose= TRUE, max_iter=1000)
## Performing PCA
## Read the 1000 x 11 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.24 seconds (sparsity = 0.102676)!
## Learning embedding...
## Iteration 50: error is 61.242416 (50 iterations in 0.33 seconds)
## Iteration 100: error is 53.551234 (50 iterations in 0.28 seconds)
## Iteration 150: error is 52.083077 (50 iterations in 0.26 seconds)
## Iteration 200: error is 51.412922 (50 iterations in 0.28 seconds)
## Iteration 250: error is 51.034899 (50 iterations in 0.34 seconds)
## Iteration 300: error is 0.583195 (50 iterations in 0.26 seconds)
## Iteration 350: error is 0.421205 (50 iterations in 0.29 seconds)
## Iteration 400: error is 0.380630 (50 iterations in 0.24 seconds)
## Iteration 450: error is 0.368461 (50 iterations in 0.31 seconds)
## Iteration 500: error is 0.360154 (50 iterations in 0.24 seconds)
## Iteration 550: error is 0.352034 (50 iterations in 0.28 seconds)
## Iteration 600: error is 0.347254 (50 iterations in 0.24 seconds)
## Iteration 650: error is 0.343180 (50 iterations in 0.29 seconds)
## Iteration 700: error is 0.341481 (50 iterations in 0.32 seconds)
## Iteration 750: error is 0.337247 (50 iterations in 0.21 seconds)
## Iteration 800: error is 0.334652 (50 iterations in 0.26 seconds)
## Iteration 850: error is 0.333340 (50 iterations in 0.23 seconds)
## Iteration 900: error is 0.332257 (50 iterations in 0.27 seconds)
## Iteration 950: error is 0.329499 (50 iterations in 0.21 seconds)
## Iteration 1000: error is 0.328039 (50 iterations in 0.22 seconds)
## Fitting performed in 5.38 seconds.
summary(model)
##                     Length Class  Mode   
## N                      1   -none- numeric
## Y                   2000   -none- numeric
## costs               1000   -none- numeric
## itercosts             20   -none- numeric
## origD                  1   -none- numeric
## perplexity             1   -none- numeric
## theta                  1   -none- numeric
## max_iter               1   -none- numeric
## stop_lying_iter        1   -none- numeric
## mom_switch_iter        1   -none- numeric
## momentum               1   -none- numeric
## final_momentum         1   -none- numeric
## eta                    1   -none- numeric
## exaggeration_factor    1   -none- numeric
head(model$Y)
##            [,1]       [,2]
## [1,] -16.101889   9.497742
## [2,]  -5.609068 -23.404850
## [3,]  12.610804  10.182126
## [4,]  -7.801824   6.380460
## [5,] -18.066283  18.270284
## [6,] -18.249653  17.671074
plot(model$Y, t='p', main="Output of t-SNE")
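
To get a richer view than the base plot, the embedding can be coloured by one of the original categorical variables; a sketch (assuming the model and df objects above) using Product.line:

# put the 2-D embedding into a data frame alongside the original product line
tsne.df <- data.frame(X = model$Y[, 1], Y = model$Y[, 2],
                      Product.line = df$Product.line)
ggplot(tsne.df, aes(X, Y, colour = Product.line)) +
  geom_point(alpha = 0.6) +
  labs(title = "t-SNE embedding coloured by Product line")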

Feature Selection

corrMat <- cor(copy)
# highly correlated features
high <- findCorrelation(corrMat, cutoff = .75)
# names of highly correlated features
names(copy[, high])
## [1] "Tax"  "cogs"
# data set without highly correlated variables
c2 <- copy[-high]
par(mfrow = c(1, 2))
# plotting
corrplot(corrMat, order = "hclust")
corrplot(cor(c2), order = "hclust")
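
An equivalent filter can be expressed through caret's preProcess interface; a sketch using the same 0.75 cutoff on the label-encoded copy data set:

# "corr" filters out predictors whose pairwise correlation exceeds the cutoff
corr.filter <- preProcess(copy, method = "corr", cutoff = 0.75)
filtered <- predict(corr.filter, copy)
names(filtered)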

# adding the Rating column back to the copy data set (it is already present after the earlier subsetting)
copy$Rating <- df$Rating
# ranking how the remaining variables score in relation to the Rating variable
# using information gain (these calls require the FSelector package)
# scores <- information.gain(Rating ~ ., copy)
# subset <- cutoff.k(scores, 5)
# the five selected features
# as.data.frame(subset)
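
For reference, a runnable version of the ranking (a sketch, assuming the FSelector package, which provides information.gain() and cutoff.k(), is installed):

library(FSelector)
# score every predictor in copy by its information gain with respect to Rating
scores <- information.gain(Rating ~ ., copy)
# keep the names of the five highest-scoring features
top.features <- cutoff.k(scores, 5)
top.features

Naming the result top.features avoids masking the base subset() function.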

Conclusions

Using feature ranking with information gain as the comparison metric, the variables Branch, Customer Type, Gender, Product Line, and Unit Price would be the best candidates for modelling a regressor of Rating.
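
As a follow-up, the sketch below (illustrative only, assuming the cleaned df and the five ranked variables above) shows how such a regressor could be fitted and cross-validated with caret:

# a minimal cross-validated linear model on the five ranked features (sketch only)
set.seed(123)
rating.fit <- train(
  Rating ~ Branch + Customer.type + Gender + Product.line + Unit.price,
  data = df,
  method = "lm",
  trControl = trainControl(method = "cv", number = 5)
)
rating.fit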