Dimensionality Reduction & Feature Selection

Libraries Needed

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.3
library(Rtsne)
## Warning: package 'Rtsne' was built under R version 4.1.3
library(e1071)
## Warning: package 'e1071' was built under R version 4.1.3
library(CatEncoders)
## Warning: package 'CatEncoders' was built under R version 4.1.3
## 
## Attaching package: 'CatEncoders'
## The following object is masked from 'package:base':
## 
##     transform
library(lattice)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.1.3
## corrplot 0.92 loaded
library(caret)
## Warning: package 'caret' was built under R version 4.1.3

Data Reading

df <- read.csv("http://bit.ly/CarreFourDataset")
head(df)
##    Invoice.ID Branch Customer.type Gender           Product.line Unit.price
## 1 750-67-8428      A        Member Female      Health and beauty      74.69
## 2 226-31-3081      C        Normal Female Electronic accessories      15.28
## 3 631-41-3108      A        Normal   Male     Home and lifestyle      46.33
## 4 123-19-1176      A        Member   Male      Health and beauty      58.22
## 5 373-73-7910      A        Normal   Male      Sports and travel      86.31
## 6 699-14-3026      C        Normal   Male Electronic accessories      85.39
##   Quantity     Tax      Date  Time     Payment   cogs gross.margin.percentage
## 1        7 26.1415  1/5/2019 13:08     Ewallet 522.83                4.761905
## 2        5  3.8200  3/8/2019 10:29        Cash  76.40                4.761905
## 3        7 16.2155  3/3/2019 13:23 Credit card 324.31                4.761905
## 4        8 23.2880 1/27/2019 20:33     Ewallet 465.76                4.761905
## 5        7 30.2085  2/8/2019 10:37     Ewallet 604.17                4.761905
## 6        7 29.8865 3/25/2019 18:30     Ewallet 597.73                4.761905
##   gross.income Rating    Total
## 1      26.1415    9.1 548.9715
## 2       3.8200    9.6  80.2200
## 3      16.2155    7.4 340.5255
## 4      23.2880    8.4 489.0480
## 5      30.2085    5.3 634.3785
## 6      29.8865    4.1 627.6165

Structure of the data

str(df)
## 'data.frame':    1000 obs. of  16 variables:
##  $ Invoice.ID             : chr  "750-67-8428" "226-31-3081" "631-41-3108" "123-19-1176" ...
##  $ Branch                 : chr  "A" "C" "A" "A" ...
##  $ Customer.type          : chr  "Member" "Normal" "Normal" "Member" ...
##  $ Gender                 : chr  "Female" "Female" "Male" "Male" ...
##  $ Product.line           : chr  "Health and beauty" "Electronic accessories" "Home and lifestyle" "Health and beauty" ...
##  $ Unit.price             : num  74.7 15.3 46.3 58.2 86.3 ...
##  $ Quantity               : int  7 5 7 8 7 7 6 10 2 3 ...
##  $ Tax                    : num  26.14 3.82 16.22 23.29 30.21 ...
##  $ Date                   : chr  "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
##  $ Time                   : chr  "13:08" "10:29" "13:23" "20:33" ...
##  $ Payment                : chr  "Ewallet" "Cash" "Credit card" "Ewallet" ...
##  $ cogs                   : num  522.8 76.4 324.3 465.8 604.2 ...
##  $ gross.margin.percentage: num  4.76 4.76 4.76 4.76 4.76 ...
##  $ gross.income           : num  26.14 3.82 16.22 23.29 30.21 ...
##  $ Rating                 : num  9.1 9.6 7.4 8.4 5.3 4.1 5.8 8 7.2 5.9 ...
##  $ Total                  : num  549 80.2 340.5 489 634.4 ...

Data Cleaning

# checking for duplicates
df[duplicated(df), ]
##  [1] Invoice.ID              Branch                  Customer.type          
##  [4] Gender                  Product.line            Unit.price             
##  [7] Quantity                Tax                     Date                   
## [10] Time                    Payment                 cogs                   
## [13] gross.margin.percentage gross.income            Rating                 
## [16] Total                  
## <0 rows> (or 0-length row.names)
# checking for missing values
colSums(is.na(df))
##              Invoice.ID                  Branch           Customer.type 
##                       0                       0                       0 
##                  Gender            Product.line              Unit.price 
##                       0                       0                       0 
##                Quantity                     Tax                    Date 
##                       0                       0                       0 
##                    Time                 Payment                    cogs 
##                       0                       0                       0 
## gross.margin.percentage            gross.income                  Rating 
##                       0                       0                       0 
##                   Total 
##                       0
# Accuracy
df$Invoice.ID <- NULL
df$Branch <- as.factor(df$Branch)
df$Customer.type <- as.factor(df$Customer.type)
df$Gender <- as.factor(df$Gender)
df$Product.line <- as.factor(df$Product.line)
df$Payment <- as.factor(df$Payment)
df$Date <- as.Date(df$Date, format = "%m/%d/%Y")
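
A quick sanity check (a sketch; output not shown) that the date conversion produced no parsing failures and covers the expected period:

# number of dates that failed to parse (should be 0) and the overall date range
sum(is.na(df$Date))
range(df$Date)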

Exploratory Data Analysis

Univariate Analysis

# helper function returning the most frequent value (statistical mode) of a vector
mode <- function(x){
  uniqx <- unique(x)
  uniqx[which.max(tabulate(match(x, uniqx)))]
}
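
A quick illustration on a toy vector (hypothetical values, not taken from the dataset):

# the helper returns the most frequent value
mode(c(2, 7, 7, 3, 2, 7))  # 7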

Branch

# Visualization
ggplot(df, aes(Branch)) +
  geom_bar(stat="count") +
  labs(title="Branches Distribution") +
  theme_classic()

The branch distribution is roughly equal.

Gender

# Visualization
ggplot(df, aes(Gender)) +
  geom_bar(stat="count")

The gender distribution in the dataset is balanced.

Customer type

# visualization
ggplot(df, aes(Customer.type)) +
  geom_bar()

The two customer types are evenly distributed.

Product Line

# visualization
ggplot(df, aes(Product.line)) +
  geom_bar()

Fashion Accessories and Food and Beverages tie for the most-bought categories, but the distribution does not suggest a general imbalance.

Unit Price

# Mean
uprice.mean <- mean(df$Unit.price)
uprice.mean
## [1] 55.67213
# Mode
uprice.mode <- mode(df$Unit.price)
uprice.mode
## [1] 83.77
# Median
uprice.median <- median(df$Unit.price)
uprice.median
## [1] 55.23
# Standard Deviation
uprice.sd <- sd(df$Unit.price)
uprice.sd
## [1] 26.49463
# Kurtosis
uprice.kurt <- kurtosis(df$Unit.price)
uprice.kurt
## [1] -1.222062
# Skewness
uprice.skew <- skewness(df$Unit.price)
uprice.skew
## [1] 0.00705623
# Range
uprice.range <- range(df$Unit.price)
uprice.range
## [1] 10.08 99.96
# Visualization
ggplot(df, aes(Unit.price)) +
  geom_boxplot(outlier.colour = "red")
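
The same descriptive statistics are repeated for each numeric variable below; as an aside, they could also be computed in one pass (a sketch using the e1071 helpers already loaded):

# mean, median, sd, skewness and kurtosis for every numeric column at once
num.cols <- names(df)[sapply(df, is.numeric)]
stats <- sapply(df[num.cols], function(x)
  c(mean = mean(x), median = median(x), sd = sd(x),
    skewness = skewness(x), kurtosis = kurtosis(x)))
# the constant gross.margin.percentage column yields NaN for skewness and kurtosis
round(t(stats), 3)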

Quantity

# mean
quantity.mean <- mean(df$Quantity)
quantity.mean
## [1] 5.51
# Mode
quantity.mode <- mode(df$Quantity)
quantity.mode
## [1] 10
# Median
quantity.median <- median(df$Quantity)
quantity.median
## [1] 5
# Standard Deviation
quantity.sd <- sd(df$Quantity)
quantity.sd
## [1] 2.923431
# Range
quantity.range <- range(df$Quantity)
quantity.range
## [1]  1 10
# Kurtosis
quantity.kurt <- kurtosis(df$Quantity)
quantity.kurt
## [1] -1.219039
# Skewness
quantity.skew <- skewness(df$Quantity)
quantity.skew
## [1] 0.01290225
# Quantiles
quantity.quants <- quantile(df$Quantity)
quantity.quants
##   0%  25%  50%  75% 100% 
##    1    3    5    8   10
# Visualization
ggplot(df, aes(Quantity)) +
  geom_boxplot(outlier.colour = "red")

Tax

# mean
tax.mean <- mean(df$Tax)
tax.mean
## [1] 15.37937
# mode
tax.mode <- mode(df$Tax)
tax.mode
## [1] 39.48
# Median
tax.median <- median(df$Tax)
tax.median
## [1] 12.088
# Standard Deviation
tax.sd <- sd(df$Tax)
tax.sd
## [1] 11.70883
# Kurtosis
tax.kurt <- kurtosis(df$Tax)
tax.kurt
## [1] -0.09329206
# Skewness
tax.skew <- skewness(df$Tax)
tax.skew
## [1] 0.8898939
# Range
tax.range <- range(df$Tax)
tax.range
## [1]  0.5085 49.6500
# Quantiles
tax.quantiles <- quantile(df$Tax)
tax.quantiles
##        0%       25%       50%       75%      100% 
##  0.508500  5.924875 12.088000 22.445250 49.650000
# Visual
ggplot(df, aes(Tax)) +
  geom_boxplot(outlier.colour = "red")

Date

# mode
date.mode <- mode(df$Date)
date.mode
## [1] "2020-02-07"
# median
date.median <- median(df$Date)
date.median
## [1] "2020-02-13"
# standard deviation
date.sd <- sd(df$Date)
date.sd
## [1] 25.51686

Payment

# Mode
payment.mode <- mode(df$Payment)
payment.mode
## [1] Ewallet
## Levels: Cash Credit card Ewallet
# visual
ggplot(df, aes(Payment)) +
  geom_bar(stat="count")

There is a fair distribution across the payment methods; however, fewer customers tend to pay by credit card in these stores.

COGS

# mean
cogs.mean <- mean(df$cogs)
cogs.mean
## [1] 307.5874
# mode
cogs.mode <- mode(df$cogs)
cogs.mode
## [1] 789.6
# median
cogs.median <- median(df$cogs)
cogs.median
## [1] 241.76
# standard deviation
cogs.sd <- sd(df$cogs)
cogs.sd
## [1] 234.1765
# range
cogs.range <- range(df$cogs)
cogs.range
## [1]  10.17 993.00
# kurtosis
cogs.kurt <- kurtosis(df$cogs)
cogs.kurt
## [1] -0.09329206
# skewness
cogs.skew <- skewness(df$cogs)
cogs.skew
## [1] 0.8898939
# quantiles
cogs.quantiles <- quantile(df$cogs)
cogs.quantiles
##       0%      25%      50%      75%     100% 
##  10.1700 118.4975 241.7600 448.9050 993.0000
# visual
ggplot(df, aes(cogs)) +
  geom_boxplot(outlier.colour = "red")

Gross Income

gi <- df$gross.income
# mean
gi.mean <- mean(gi)
gi.mean
## [1] 15.37937
# mode
gi.mode <- mode(gi)
gi.mode
## [1] 39.48
# median
gi.median <- median(gi)
gi.median
## [1] 12.088
# range 
gi.range <- range(gi)
gi.range
## [1]  0.5085 49.6500
# standard deviation
gi.sd <- sd(gi)
gi.sd
## [1] 11.70883
# kurtosis
gi.kurt <- kurtosis(gi)
gi.kurt
## [1] -0.09329206
# skewness
gi.skew <- skewness(gi)
gi.skew
## [1] 0.8898939
# visual
ggplot(df, aes(gross.income)) +
  geom_boxplot(outlier.colour = "red")

Rating

# mean
rate.mean <- mean(df$Rating)
rate.mean
## [1] 6.9727
# mode
rate.mode <- mode(df$Rating)
rate.mode
## [1] 6
# median
rate.median <- median(df$Rating)
rate.median
## [1] 7
# standard deviation
rate.sd <- sd(df$Rating)
rate.sd
## [1] 1.71858
# range
rate.range <- range(df$Rating)
rate.range
## [1]  4 10
# quantiles
rate.quantiles <- quantile(df$Rating)
rate.quantiles
##   0%  25%  50%  75% 100% 
##  4.0  5.5  7.0  8.5 10.0
# kurtosis
rate.kurt <- kurtosis(df$Rating)
rate.kurt
## [1] -1.155525
# skewness
rate.skew <- skewness(df$Rating)
rate.skew
## [1] 0.008982638
# visual
ggplot(df, aes(Rating)) +
  geom_boxplot(outlier.colour = "red")

Total

# mean
total.mean <- mean(df$Total)
total.mean
## [1] 322.9667
# median
total.median <- median(df$Total)
total.median
## [1] 253.848
# mode
total.mode <- mode(df$Total)
total.mode
## [1] 829.08
# standard deviation
total.sd <- sd(df$Total)
total.sd
## [1] 245.8853
# range
total.range <- range(df$Total)
total.range
## [1]   10.6785 1042.6500
# kurtosis
total.kurt <- kurtosis(df$Total)
total.kurt
## [1] -0.09329206
# skewness
total.skew <- skewness(df$Total)
total.skew
## [1] 0.8898939
# quantiles
total.quantiles <- quantile(df$Total)
total.quantiles
##        0%       25%       50%       75%      100% 
##   10.6785  124.4224  253.8480  471.3502 1042.6500
# visual
ggplot(df, aes(Total)) +
  geom_boxplot(outlier.colour = "red" )

Bivariate Analysis

ggplot(df, aes(x=Product.line, y=Total)) +
  geom_point()

Fashion Accessories reach the highest total prices, while Health and Beauty products have relatively lower totals.

ggplot(df ,aes(Gender, Total)) +
  geom_point()

Total price is distributed roughly equally between the genders.

ggplot(df, aes(Payment, Total)) +
  geom_point()

Total prices at checkout are distributed nearly identically across the payment methods, with some of the more expensive purchases paid for by credit card.

ggplot(df, aes(gross.income, Total)) +
  geom_point()

As expected, there is a perfect positive linear relationship between the checkout total and the gross income earned on the sale, since both are derived from the cost of goods sold.
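
The identical summary statistics for Tax and gross.income above suggest the gross income recorded per sale is simply the 5% tax on the cost of goods sold; a quick check of these identities (expected to return TRUE/1 if they hold):

# gross income per transaction equals the tax amount
all.equal(df$gross.income, df$Tax)
# the checkout total is cost of goods sold plus tax
all.equal(df$Total, df$cogs + df$Tax)
# hence the relationship with Total is exactly linear
cor(df$gross.income, df$Total)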

ggplot(df, aes(Customer.type , Total)) +
  geom_point()

Members and non-members have a nearly equal distribution of expenditure, with members showing no visible gaps in prices.

ggplot(df, aes(Tax, Total)) +
  geom_point()

There is a direct linear relationship between tax and total price: as expected, the higher the tax charged on a purchase, the higher its total cost.

ggplot(df, aes(Unit.price, Total)) +
  geom_point()

There are several distinct positive linear relationships between Unit Price and Total: the higher the unit price, the higher the total. The separate lines most likely correspond to the discrete Quantity values (1 through 10), since the total is the unit price multiplied by the quantity, plus tax; the sketch below colours the points by quantity to make this visible.
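
A sketch (using the ggplot2 package already loaded) that colours the same scatter by Quantity:

# each discrete quantity traces its own line of Total = Unit.price * Quantity + tax
ggplot(df, aes(Unit.price, Total, colour = factor(Quantity))) +
  geom_point() +
  labs(colour = "Quantity")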

cor(df[,unlist(lapply(df, is.numeric))])
## Warning in cor(df[, unlist(lapply(df, is.numeric))]): the standard deviation is
## zero
##                           Unit.price    Quantity        Tax       cogs
## Unit.price               1.000000000  0.01077756  0.6339621  0.6339621
## Quantity                 0.010777564  1.00000000  0.7055102  0.7055102
## Tax                      0.633962089  0.70551019  1.0000000  1.0000000
## cogs                     0.633962089  0.70551019  1.0000000  1.0000000
## gross.margin.percentage           NA          NA         NA         NA
## gross.income             0.633962089  0.70551019  1.0000000  1.0000000
## Rating                  -0.008777507 -0.01581490 -0.0364417 -0.0364417
## Total                    0.633962089  0.70551019  1.0000000  1.0000000
##                         gross.margin.percentage gross.income       Rating
## Unit.price                                   NA    0.6339621 -0.008777507
## Quantity                                     NA    0.7055102 -0.015814905
## Tax                                          NA    1.0000000 -0.036441705
## cogs                                         NA    1.0000000 -0.036441705
## gross.margin.percentage                       1           NA           NA
## gross.income                                 NA    1.0000000 -0.036441705
## Rating                                       NA   -0.0364417  1.000000000
## Total                                        NA    1.0000000 -0.036441705
##                              Total
## Unit.price               0.6339621
## Quantity                 0.7055102
## Tax                      1.0000000
## cogs                     1.0000000
## gross.margin.percentage         NA
## gross.income             1.0000000
## Rating                  -0.0364417
## Total                    1.0000000
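
The warning about a zero standard deviation comes from gross.margin.percentage, which is constant across all 1000 rows, so its correlations are undefined (the NA row and column above); a quick check:

# a constant column has zero variance, hence the NA correlations
var(df$gross.margin.percentage)
unique(df$gross.margin.percentage)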

The following variables show strong correlation:

  • Unit Price to Tax and COGS
  • Quantity to Tax and COGS
  • Gross Income to Unit Price, Quantity, Tax, COGS, Total
# dropping Date, Time, gross.margin.percentage and Total (columns 8, 9, 12, 15)
copy <- df[, -c(8, 9, 12, 15)]
# keeping Total as the label
label <- df[, 15]

Application of the t-SNE Algorithm

# label-encoding the categorical columns so that Rtsne receives an all-numeric matrix
branch <- LabelEncoder.fit(copy$Branch)
copy$Branch <- transform(branch, factor(copy$Branch))
gender <- LabelEncoder.fit(copy$Gender)
copy$Gender <- transform(gender, factor(copy$Gender))
customer <- LabelEncoder.fit(copy$Customer.type)
copy$Customer.type <- transform(customer, factor(copy$Customer.type))
product <- LabelEncoder.fit(copy$Product.line)
copy$Product.line <- transform(product, factor(copy$Product.line))
pay <- LabelEncoder.fit(copy$Payment)
copy$Payment <- transform(pay, factor(copy$Payment))
model <- Rtsne(copy, dims=2, perplexity=30, verbose= TRUE, max_iter=1000)
## Performing PCA
## Read the 1000 x 11 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.24 seconds (sparsity = 0.102676)!
## Learning embedding...
## Iteration 50: error is 61.242416 (50 iterations in 0.33 seconds)
## Iteration 100: error is 53.551234 (50 iterations in 0.28 seconds)
## Iteration 150: error is 52.083077 (50 iterations in 0.26 seconds)
## Iteration 200: error is 51.412922 (50 iterations in 0.28 seconds)
## Iteration 250: error is 51.034899 (50 iterations in 0.34 seconds)
## Iteration 300: error is 0.583195 (50 iterations in 0.26 seconds)
## Iteration 350: error is 0.421205 (50 iterations in 0.29 seconds)
## Iteration 400: error is 0.380630 (50 iterations in 0.24 seconds)
## Iteration 450: error is 0.368461 (50 iterations in 0.31 seconds)
## Iteration 500: error is 0.360154 (50 iterations in 0.24 seconds)
## Iteration 550: error is 0.352034 (50 iterations in 0.28 seconds)
## Iteration 600: error is 0.347254 (50 iterations in 0.24 seconds)
## Iteration 650: error is 0.343180 (50 iterations in 0.29 seconds)
## Iteration 700: error is 0.341481 (50 iterations in 0.32 seconds)
## Iteration 750: error is 0.337247 (50 iterations in 0.21 seconds)
## Iteration 800: error is 0.334652 (50 iterations in 0.26 seconds)
## Iteration 850: error is 0.333340 (50 iterations in 0.23 seconds)
## Iteration 900: error is 0.332257 (50 iterations in 0.27 seconds)
## Iteration 950: error is 0.329499 (50 iterations in 0.21 seconds)
## Iteration 1000: error is 0.328039 (50 iterations in 0.22 seconds)
## Fitting performed in 5.38 seconds.
summary(model)
##                     Length Class  Mode   
## N                      1   -none- numeric
## Y                   2000   -none- numeric
## costs               1000   -none- numeric
## itercosts             20   -none- numeric
## origD                  1   -none- numeric
## perplexity             1   -none- numeric
## theta                  1   -none- numeric
## max_iter               1   -none- numeric
## stop_lying_iter        1   -none- numeric
## mom_switch_iter        1   -none- numeric
## momentum               1   -none- numeric
## final_momentum         1   -none- numeric
## eta                    1   -none- numeric
## exaggeration_factor    1   -none- numeric
head(model$Y)
##            [,1]       [,2]
## [1,] -16.101889   9.497742
## [2,]  -5.609068 -23.404850
## [3,]  12.610804  10.182126
## [4,]  -7.801824   6.380460
## [5,] -18.066283  18.270284
## [6,] -18.249653  17.671074
plot(model$Y, t='p', main="Output of t-SNE")
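
To get a richer view than the base plot, the embedding can be coloured by one of the original categorical variables; a sketch (assuming the model and df objects above) using Product.line:

# put the 2-D embedding into a data frame alongside the original product line
tsne.df <- data.frame(X = model$Y[, 1], Y = model$Y[, 2],
                      Product.line = df$Product.line)
ggplot(tsne.df, aes(X, Y, colour = Product.line)) +
  geom_point(alpha = 0.6) +
  labs(title = "t-SNE embedding coloured by Product line")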

Feature Selection

corrMat <- cor(copy)
# highly correlated features
high <- findCorrelation(corrMat, cutoff = .75)
# names of highly correlated features
names(copy[, high])
## [1] "Tax"  "cogs"
# data set without highly correlated variables
c2 <- copy[-high]
par(mfrow = c(1, 2))
# plotting
corrplot(corrMat, order = "hclust")
corrplot(cor(c2), order = "hclust")
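
An equivalent filter can be expressed through caret's preProcess interface; a sketch using the same 0.75 cutoff on the label-encoded copy data set:

# "corr" filters out predictors whose pairwise correlation exceeds the cutoff
corr.filter <- preProcess(copy, method = "corr", cutoff = 0.75)
filtered <- predict(corr.filter, copy)
names(filtered)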

# adding the Rating column back to the copy data set (it is already present after the earlier subsetting)
copy$Rating <- df$Rating
# ranking how the remaining variables score in relation to the Rating variable
# using information gain (these calls require the FSelector package)
# scores <- information.gain(Rating ~ ., copy)
# subset <- cutoff.k(scores, 5)
# the five selected features
# as.data.frame(subset)
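
For reference, a runnable version of the ranking (a sketch, assuming the FSelector package, which provides information.gain() and cutoff.k(), is installed):

library(FSelector)
# score every predictor in copy by its information gain with respect to Rating
scores <- information.gain(Rating ~ ., copy)
# keep the names of the five highest-scoring features
top.features <- cutoff.k(scores, 5)
top.features

Naming the result top.features avoids masking the base subset() function.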

Conclusions

Using feature ranking with information gain as the comparison metric, the variables Branch, Customer Type, Gender, Product Line, and Unit Price would be the best candidates for modelling a regressor of Rating.
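
As a follow-up, the sketch below (illustrative only, assuming the cleaned df and the five ranked variables above) shows how such a regressor could be fitted and cross-validated with caret:

# a minimal cross-validated linear model on the five ranked features (sketch only)
set.seed(123)
rating.fit <- train(
  Rating ~ Branch + Customer.type + Gender + Product.line + Unit.price,
  data = df,
  method = "lm",
  trControl = trainControl(method = "cv", number = 5)
)
rating.fit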