library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.3
library(Rtsne)
## Warning: package 'Rtsne' was built under R version 4.1.3
library(e1071)
## Warning: package 'e1071' was built under R version 4.1.3
library(CatEncoders)
## Warning: package 'CatEncoders' was built under R version 4.1.3
##
## Attaching package: 'CatEncoders'
## The following object is masked from 'package:base':
##
## transform
library(lattice)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.1.3
## corrplot 0.92 loaded
library(caret)
## Warning: package 'caret' was built under R version 4.1.3
df <- read.csv("http://bit.ly/CarreFourDataset")
head(df)
## Invoice.ID Branch Customer.type Gender Product.line Unit.price
## 1 750-67-8428 A Member Female Health and beauty 74.69
## 2 226-31-3081 C Normal Female Electronic accessories 15.28
## 3 631-41-3108 A Normal Male Home and lifestyle 46.33
## 4 123-19-1176 A Member Male Health and beauty 58.22
## 5 373-73-7910 A Normal Male Sports and travel 86.31
## 6 699-14-3026 C Normal Male Electronic accessories 85.39
## Quantity Tax Date Time Payment cogs gross.margin.percentage
## 1 7 26.1415 1/5/2019 13:08 Ewallet 522.83 4.761905
## 2 5 3.8200 3/8/2019 10:29 Cash 76.40 4.761905
## 3 7 16.2155 3/3/2019 13:23 Credit card 324.31 4.761905
## 4 8 23.2880 1/27/2019 20:33 Ewallet 465.76 4.761905
## 5 7 30.2085 2/8/2019 10:37 Ewallet 604.17 4.761905
## 6 7 29.8865 3/25/2019 18:30 Ewallet 597.73 4.761905
## gross.income Rating Total
## 1 26.1415 9.1 548.9715
## 2 3.8200 9.6 80.2200
## 3 16.2155 7.4 340.5255
## 4 23.2880 8.4 489.0480
## 5 30.2085 5.3 634.3785
## 6 29.8865 4.1 627.6165
str(df)
## 'data.frame': 1000 obs. of 16 variables:
## $ Invoice.ID : chr "750-67-8428" "226-31-3081" "631-41-3108" "123-19-1176" ...
## $ Branch : chr "A" "C" "A" "A" ...
## $ Customer.type : chr "Member" "Normal" "Normal" "Member" ...
## $ Gender : chr "Female" "Female" "Male" "Male" ...
## $ Product.line : chr "Health and beauty" "Electronic accessories" "Home and lifestyle" "Health and beauty" ...
## $ Unit.price : num 74.7 15.3 46.3 58.2 86.3 ...
## $ Quantity : int 7 5 7 8 7 7 6 10 2 3 ...
## $ Tax : num 26.14 3.82 16.22 23.29 30.21 ...
## $ Date : chr "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
## $ Time : chr "13:08" "10:29" "13:23" "20:33" ...
## $ Payment : chr "Ewallet" "Cash" "Credit card" "Ewallet" ...
## $ cogs : num 522.8 76.4 324.3 465.8 604.2 ...
## $ gross.margin.percentage: num 4.76 4.76 4.76 4.76 4.76 ...
## $ gross.income : num 26.14 3.82 16.22 23.29 30.21 ...
## $ Rating : num 9.1 9.6 7.4 8.4 5.3 4.1 5.8 8 7.2 5.9 ...
## $ Total : num 549 80.2 340.5 489 634.4 ...
# checking for duplicates
df[duplicated(df), ]
## [1] Invoice.ID Branch Customer.type
## [4] Gender Product.line Unit.price
## [7] Quantity Tax Date
## [10] Time Payment cogs
## [13] gross.margin.percentage gross.income Rating
## [16] Total
## <0 rows> (or 0-length row.names)
# checking for missing values
colSums(is.na(df))
## Invoice.ID Branch Customer.type
## 0 0 0
## Gender Product.line Unit.price
## 0 0 0
## Quantity Tax Date
## 0 0 0
## Time Payment cogs
## 0 0 0
## gross.margin.percentage gross.income Rating
## 0 0 0
## Total
## 0
# Accuracy
df$Invoice.ID <- NULL
df$Branch <- as.factor(df$Branch)
df$Customer.type <- as.factor(df$Customer.type)
df$Gender <- as.factor(df$Gender)
df$Product.line <- as.factor(df$Product.line)
df$Payment <- as.factor(df$Payment)
df$Date <- as.Date(df$Date, format = "%m/%d/%y")
# creating a mode function
mode <- function(x){
uniqx <- unique(x)
uniqx[which.max(tabulate(match(x, uniqx)))]
}
# Visualization
ggplot(df, aes(Branch)) +
geom_bar(stat="count") +
labs(title="Branches Distribution") +
theme_classic()
Branch distribution is roughly equal
# Visualization
ggplot(df, aes(Gender)) +
geom_bar(stat="count")
The gender distribution in the dataset is balanced.
# visualization
ggplot(df, aes(Customer.type)) +
geom_bar()
The customer types were balanced in terms of distribution.
# visualization
ggplot(df, aes(Product.line)) +
geom_bar()
Fashion Accessories and, Food and Beverage tie for the most bought
categories but the distribution does not suggest an imbalance in
general.
# Mean
uprice.mean <- mean(df$Unit.price)
uprice.mean
## [1] 55.67213
# Mode
uprice.mode <- mode(df$Unit.price)
uprice.mode
## [1] 83.77
# Median
uprice.median <- median(df$Unit.price)
uprice.median
## [1] 55.23
# Standard Deviation
uprice.sd <- sd(df$Unit.price)
uprice.sd
## [1] 26.49463
# Kurtosis
uprice.kurt <- kurtosis(df$Unit.price)
uprice.kurt
## [1] -1.222062
# SKewness
uprice.skew <- skewness(df$Unit.price)
uprice.skew
## [1] 0.00705623
# Range
uprice.range <- range(df$Unit.price)
uprice.range
## [1] 10.08 99.96
# Visualization
ggplot(df, aes(Unit.price)) +
geom_boxplot(outlier.colour = "red")
# mean
quantity.mean <- mean(df$Quantity)
quantity.mean
## [1] 5.51
# Mode
quantity.mode <- mode(df$Quantity)
quantity.mode
## [1] 10
# Median
quantity.median <- median(df$Quantity)
quantity.median
## [1] 5
# Standard Deviation
quantity.sd <- sd(df$Quantity)
quantity.sd
## [1] 2.923431
# Range
quantity.range <- range(df$Quantity)
quantity.range
## [1] 1 10
# Kurtosis
quantity.kurt <- kurtosis(df$Quantity)
quantity.kurt
## [1] -1.219039
# Skewness
quantity.skew <- skewness(df$Quantity)
quantity.skew
## [1] 0.01290225
# Quantiles
quantity.quants <- quantile(df$Quantity)
quantity.quants
## 0% 25% 50% 75% 100%
## 1 3 5 8 10
# Visualization
ggplot(df, aes(Quantity)) +
geom_boxplot(outlier.colour = "red")
# mean
tax.mean <- mean(df$Tax)
tax.mean
## [1] 15.37937
# mode
tax.mode <- mode(df$Tax)
tax.mode
## [1] 39.48
# Median
tax.median <- median(df$Tax)
tax.median
## [1] 12.088
# Standard Deviation
tax.sd <- sd(df$Tax)
tax.sd
## [1] 11.70883
# Kurtosis
tax.kurt <- kurtosis(df$Tax)
tax.kurt
## [1] -0.09329206
# Skewness
tax.skew <- skewness(df$Tax)
tax.skew
## [1] 0.8898939
# Range
tax.range <- range(df$Tax)
tax.range
## [1] 0.5085 49.6500
# Quantiles
tax.quantiles <- quantile(df$Tax)
tax.quantiles
## 0% 25% 50% 75% 100%
## 0.508500 5.924875 12.088000 22.445250 49.650000
# Visual
ggplot(df, aes(Tax)) +
geom_boxplot(outlier.colour = "red")
# mode
date.mode <- mode(df$Date)
date.mode
## [1] "2020-02-07"
# median
date.median <- median(df$Date)
date.median
## [1] "2020-02-13"
# standard deviation
date.sd <- sd(df$Date)
date.sd
## [1] 25.51686
# Mode
payment.mode <- mode(df$Payment)
payment.mode
## [1] Ewallet
## Levels: Cash Credit card Ewallet
# visual
ggplot(df, aes(Payment)) +
geom_bar(stat="count")
There is a fair distribution in the payment variable. However, fewer people tend to pay by Credit Card in these stores
# mean
cogs.mean <- mean(df$cogs)
cogs.mean
## [1] 307.5874
# mode
cogs.mode <- mode(df$cogs)
cogs.mode
## [1] 789.6
# median
cogs.median <- median(df$cogs)
cogs.median
## [1] 241.76
# standard deviation
cogs.sd <- sd(df$cogs)
cogs.sd
## [1] 234.1765
# range
cogs.range <- range(df$cogs)
cogs.range
## [1] 10.17 993.00
# kurtosis
cogs.kurt <- kurtosis(df$cogs)
cogs.kurt
## [1] -0.09329206
# skewness
cogs.skew <- skewness(df$cogs)
cogs.skew
## [1] 0.8898939
# quantiles
cogs.quantiles <- quantile(df$cogs)
cogs.quantiles
## 0% 25% 50% 75% 100%
## 10.1700 118.4975 241.7600 448.9050 993.0000
# visual
ggplot(df, aes(cogs)) +
geom_boxplot(outlier.colour = "red")
gi <- df$gross.income
# mean
gi.mean <- mean(gi)
gi.mean
## [1] 15.37937
# mode
gi.mode <- mode(gi)
gi.mode
## [1] 39.48
# median
gi.median <- median(gi)
gi.median
## [1] 12.088
# range
gi.range <- range(gi)
gi.range
## [1] 0.5085 49.6500
# standard deviation
gi.sd <- sd(gi)
gi.sd
## [1] 11.70883
# kurtosis
gi.kurt <- kurtosis(gi)
gi.kurt
## [1] -0.09329206
# skewness
gi.skew <- skewness(gi)
gi.skew
## [1] 0.8898939
# visual
ggplot(df, aes(gross.income)) +
geom_boxplot(outlier.colour = "red")
# mean
rate.mean <- mean(df$Rating)
rate.mean
## [1] 6.9727
# mode
rate.mode <- mode(df$Rating)
rate.mode
## [1] 6
# median
rate.median <- median(df$Rating)
rate.median
## [1] 7
# standard deviation
rate.sd <- sd(df$Rating)
rate.sd
## [1] 1.71858
# range
rate.range <- range(df$Rating)
rate.range
## [1] 4 10
# quantiles
rate.quantiles <- quantile(df$Rating)
rate.quantiles
## 0% 25% 50% 75% 100%
## 4.0 5.5 7.0 8.5 10.0
# kurtosis
rate.kurt <- kurtosis(df$Rating)
rate.kurt
## [1] -1.155525
# skewness
rate.skew <- skewness(df$Rating)
rate.skew
## [1] 0.008982638
# visual
ggplot(df, aes(Rating)) +
geom_boxplot(outlier.colour = "red")
# mean
total.mean <- mean(df$Total)
total.mean
## [1] 322.9667
# median
total.median <- median(df$Total)
total.median
## [1] 253.848
# mode
total.mode <- mode(df$Total)
total.mode
## [1] 829.08
# standard deviation
total.sd <- sd(df$Total)
total.sd
## [1] 245.8853
# range
total.range <- range(df$Total)
total.range
## [1] 10.6785 1042.6500
# kurtosis
total.kurt <- kurtosis(df$Total)
total.kurt
## [1] -0.09329206
# skewness
total.skew <- skewness(df$Total)
total.skew
## [1] 0.8898939
# quantiles
total.quantiles <- quantile(df$Total)
total.quantiles
## 0% 25% 50% 75% 100%
## 10.6785 124.4224 253.8480 471.3502 1042.6500
# visual
ggplot(df, aes(Total)) +
geom_boxplot(outlier.colour = "red" )
ggplot(df, aes(x=Product.line, y=Total)) +
geom_point()
Fashion Accessories have the highest Total prices while health and beauty products have a relatively lower price.
ggplot(df ,aes(Gender, Total)) +
geom_point()
Total Price is equally distributed in terms of gender
ggplot(df, aes(Payment, Total)) +
geom_point()
The payment methods are nearly identical for the total prices of items at checkouts with some more expensive ones being attributed with Credit card payments.
ggplot(df, aes(gross.income, Total)) +
geom_point()
As expected, there is a perfect positive linear relationship with how much the total is at checkout with the consumers gross income.
ggplot(df, aes(Customer.type , Total)) +
geom_point()
Members and non members have a nearly equal distribution in expenditure with Members having no visible breaks in prices.
ggplot(df, aes(Tax, Total)) +
geom_point()
There is a direct linear relationship between tax and total price. As expected, the higher the tax on items, the more they cost.
ggplot(df, aes(Unit.price, Total)) +
geom_point()
There are several positive linear relationships with the Unit Price variable: the higher it is the higher the total price is. More data would be needed to explain the different lines considering they represent outside factors that influence the relationship. A good example would be the type of products being of different types.
cor(df[,unlist(lapply(df, is.numeric))])
## Warning in cor(df[, unlist(lapply(df, is.numeric))]): the standard deviation is
## zero
## Unit.price Quantity Tax cogs
## Unit.price 1.000000000 0.01077756 0.6339621 0.6339621
## Quantity 0.010777564 1.00000000 0.7055102 0.7055102
## Tax 0.633962089 0.70551019 1.0000000 1.0000000
## cogs 0.633962089 0.70551019 1.0000000 1.0000000
## gross.margin.percentage NA NA NA NA
## gross.income 0.633962089 0.70551019 1.0000000 1.0000000
## Rating -0.008777507 -0.01581490 -0.0364417 -0.0364417
## Total 0.633962089 0.70551019 1.0000000 1.0000000
## gross.margin.percentage gross.income Rating
## Unit.price NA 0.6339621 -0.008777507
## Quantity NA 0.7055102 -0.015814905
## Tax NA 1.0000000 -0.036441705
## cogs NA 1.0000000 -0.036441705
## gross.margin.percentage 1 NA NA
## gross.income NA 1.0000000 -0.036441705
## Rating NA -0.0364417 1.000000000
## Total NA 1.0000000 -0.036441705
## Total
## Unit.price 0.6339621
## Quantity 0.7055102
## Tax 1.0000000
## cogs 1.0000000
## gross.margin.percentage NA
## gross.income 1.0000000
## Rating -0.0364417
## Total 1.0000000
The following variables show strong correlation:
copy <- df[, -c(8, 9, 12, 15)]
label <- df[, 15]
branch <- LabelEncoder.fit(copy$Branch)
copy$Branch <- transform(branch, factor(df$Branch))
gender <- LabelEncoder.fit(copy$Gender)
copy$Gender <- transform(gender, factor(df$Gender))
customer <- LabelEncoder.fit(copy$Customer.type)
copy$Customer.type <- transform(customer, factor(copy$Customer.type))
product <- LabelEncoder.fit(copy$Product.line)
copy$Product.line <- transform(product, factor(copy$Product.line))
pay <- LabelEncoder.fit(copy$Payment)
copy$Payment <- transform(pay, factor(copy$Payment))
model <- Rtsne(copy, dims=2, perplexity=30, verbose= TRUE, max_iter=1000)
## Performing PCA
## Read the 1000 x 11 data matrix successfully!
## OpenMP is working. 1 threads.
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.24 seconds (sparsity = 0.102676)!
## Learning embedding...
## Iteration 50: error is 61.242416 (50 iterations in 0.33 seconds)
## Iteration 100: error is 53.551234 (50 iterations in 0.28 seconds)
## Iteration 150: error is 52.083077 (50 iterations in 0.26 seconds)
## Iteration 200: error is 51.412922 (50 iterations in 0.28 seconds)
## Iteration 250: error is 51.034899 (50 iterations in 0.34 seconds)
## Iteration 300: error is 0.583195 (50 iterations in 0.26 seconds)
## Iteration 350: error is 0.421205 (50 iterations in 0.29 seconds)
## Iteration 400: error is 0.380630 (50 iterations in 0.24 seconds)
## Iteration 450: error is 0.368461 (50 iterations in 0.31 seconds)
## Iteration 500: error is 0.360154 (50 iterations in 0.24 seconds)
## Iteration 550: error is 0.352034 (50 iterations in 0.28 seconds)
## Iteration 600: error is 0.347254 (50 iterations in 0.24 seconds)
## Iteration 650: error is 0.343180 (50 iterations in 0.29 seconds)
## Iteration 700: error is 0.341481 (50 iterations in 0.32 seconds)
## Iteration 750: error is 0.337247 (50 iterations in 0.21 seconds)
## Iteration 800: error is 0.334652 (50 iterations in 0.26 seconds)
## Iteration 850: error is 0.333340 (50 iterations in 0.23 seconds)
## Iteration 900: error is 0.332257 (50 iterations in 0.27 seconds)
## Iteration 950: error is 0.329499 (50 iterations in 0.21 seconds)
## Iteration 1000: error is 0.328039 (50 iterations in 0.22 seconds)
## Fitting performed in 5.38 seconds.
summary(model)
## Length Class Mode
## N 1 -none- numeric
## Y 2000 -none- numeric
## costs 1000 -none- numeric
## itercosts 20 -none- numeric
## origD 1 -none- numeric
## perplexity 1 -none- numeric
## theta 1 -none- numeric
## max_iter 1 -none- numeric
## stop_lying_iter 1 -none- numeric
## mom_switch_iter 1 -none- numeric
## momentum 1 -none- numeric
## final_momentum 1 -none- numeric
## eta 1 -none- numeric
## exaggeration_factor 1 -none- numeric
head(model$Y)
## [,1] [,2]
## [1,] -16.101889 9.497742
## [2,] -5.609068 -23.404850
## [3,] 12.610804 10.182126
## [4,] -7.801824 6.380460
## [5,] -18.066283 18.270284
## [6,] -18.249653 17.671074
plot(model$Y, t='p', main="Output of TSNE")
corrMat <- cor(copy)
# highly correlated features
high <- findCorrelation(corrMat, cutoff = .75)
# names of highly correlated features
names(copy[, high])
## [1] "Tax" "cogs"
# data set without highly correlated variables
c2 <- copy[-high]
par(mfrow = c(1, 2))
# plotting
corrplot(corrMat, order = "hclust")
corrplot(cor(c2), order = "hclust")
# adding Ratings column to copy data set
copy$Rating <- df$Rating
# how different variable score in relation to rating variable
# scores <- information.gain(Rating~., copy)
# subset <- cutoff.k(scores, 5)
# Needed features (5)
as.data.frame(subset)
## function (...)
## data.frame(value = x(...))
## <bytecode: 0x00000000422c5950>
## <environment: 0x00000000422c54b8>
Using a Feature Ranking method with Information Gain of all variables being used as a metric of comparison, these variables(Branch,Customer Type,Gender,Product Line,Unit Price) would be the best to use for modeling a regressor with respect to Rating.