PART 1a: DIMENSIONALITY REDUCTION USING PRINCIPAL COMPONENT ANALYSIS(PCA)
# loading the supermarket sales dataset using the fread function
library(data.table)
# read the sales CSV into a data.table; the file location is held in its own
# variable so it is easy to find and change
sales_csv <- "C:\\Users\\Gakungi\\OneDrive\\Desktop\\R\\Datasets\\Supermarket_Dataset_1 - Sales Data.csv"
df <- fread(sales_csv)
# preview the first 6 rows of the dataset
head(df)
## Invoice ID Branch Customer type Gender Product line Unit price
## 1: 750-67-8428 A Member Female Health and beauty 74.69
## 2: 226-31-3081 C Normal Female Electronic accessories 15.28
## 3: 631-41-3108 A Normal Male Home and lifestyle 46.33
## 4: 123-19-1176 A Member Male Health and beauty 58.22
## 5: 373-73-7910 A Normal Male Sports and travel 86.31
## 6: 699-14-3026 C Normal Male Electronic accessories 85.39
## Quantity Tax Date Time Payment cogs gross margin percentage
## 1: 7 26.1415 1/5/2019 13:08 Ewallet 522.83 4.761905
## 2: 5 3.8200 3/8/2019 10:29 Cash 76.40 4.761905
## 3: 7 16.2155 3/3/2019 13:23 Credit card 324.31 4.761905
## 4: 8 23.2880 1/27/2019 20:33 Ewallet 465.76 4.761905
## 5: 7 30.2085 2/8/2019 10:37 Ewallet 604.17 4.761905
## 6: 7 29.8865 3/25/2019 18:30 Ewallet 597.73 4.761905
## gross income Rating Total
## 1: 26.1415 9.1 548.9715
## 2: 3.8200 9.6 80.2200
## 3: 16.2155 7.4 340.5255
## 4: 23.2880 8.4 489.0480
## 5: 30.2085 5.3 634.3785
## 6: 29.8865 4.1 627.6165
# preview the last 6 rows of the dataset
tail(df)
## Invoice ID Branch Customer type Gender Product line Unit price
## 1: 652-49-6720 C Member Female Electronic accessories 60.95
## 2: 233-67-5758 C Normal Male Health and beauty 40.35
## 3: 303-96-2227 B Normal Female Home and lifestyle 97.38
## 4: 727-02-1313 A Member Male Food and beverages 31.84
## 5: 347-56-2442 A Normal Male Home and lifestyle 65.82
## 6: 849-09-3807 A Member Female Fashion accessories 88.34
## Quantity Tax Date Time Payment cogs gross margin percentage
## 1: 1 3.0475 2/18/2019 11:40 Ewallet 60.95 4.761905
## 2: 1 2.0175 1/29/2019 13:46 Ewallet 40.35 4.761905
## 3: 10 48.6900 3/2/2019 17:16 Ewallet 973.80 4.761905
## 4: 1 1.5920 2/9/2019 13:22 Cash 31.84 4.761905
## 5: 1 3.2910 2/22/2019 15:33 Cash 65.82 4.761905
## 6: 7 30.9190 2/18/2019 13:28 Cash 618.38 4.761905
## gross income Rating Total
## 1: 3.0475 5.9 63.9975
## 2: 2.0175 6.2 42.3675
## 3: 48.6900 4.4 1022.4900
## 4: 1.5920 7.7 33.4320
## 5: 3.2910 4.1 69.1110
## 6: 30.9190 6.6 649.2990
# checking the shape of our dataset
dim(df)
## [1] 1000 16
# we have 1000 rows and 16 columns
# flag every row that repeats an earlier row, then display the flagged rows
is_repeat <- duplicated(df)
dup <- df[is_repeat, ]
print(dup)
## Empty data.table (0 rows and 16 cols): Invoice ID,Branch,Customer type,Gender,Product line,Unit price...
# there are no duplicates in our data
# count the missing values in each column, one column at a time
vapply(df, function(column) sum(is.na(column)), numeric(1))
## Invoice ID Branch Customer type
## 0 0 0
## Gender Product line Unit price
## 0 0 0
## Quantity Tax Date
## 0 0 0
## Time Payment cogs
## 0 0 0
## gross margin percentage gross income Rating
## 0 0 0
## Total
## 0
# we have no missing values in our columns
# checking the data types of our 16 columns
str(df)
## Classes 'data.table' and 'data.frame': 1000 obs. of 16 variables:
## $ Invoice ID : chr "750-67-8428" "226-31-3081" "631-41-3108" "123-19-1176" ...
## $ Branch : chr "A" "C" "A" "A" ...
## $ Customer type : chr "Member" "Normal" "Normal" "Member" ...
## $ Gender : chr "Female" "Female" "Male" "Male" ...
## $ Product line : chr "Health and beauty" "Electronic accessories" "Home and lifestyle" "Health and beauty" ...
## $ Unit price : num 74.7 15.3 46.3 58.2 86.3 ...
## $ Quantity : int 7 5 7 8 7 7 6 10 2 3 ...
## $ Tax : num 26.14 3.82 16.22 23.29 30.21 ...
## $ Date : chr "1/5/2019" "3/8/2019" "3/3/2019" "1/27/2019" ...
## $ Time : chr "13:08" "10:29" "13:23" "20:33" ...
## $ Payment : chr "Ewallet" "Cash" "Credit card" "Ewallet" ...
## $ cogs : num 522.8 76.4 324.3 465.8 604.2 ...
## $ gross margin percentage: num 4.76 4.76 4.76 4.76 4.76 ...
## $ gross income : num 26.14 3.82 16.22 23.29 30.21 ...
## $ Rating : num 9.1 9.6 7.4 8.4 5.3 4.1 5.8 8 7.2 5.9 ...
## $ Total : num 549 80.2 340.5 489 634.4 ...
## - attr(*, ".internal.selfref")=<externalptr>
# our columns have the appropriate data types attached to them
# list the distinct values of each categorical column, in column order
for (cat_col in c("Branch", "Customer type", "Gender", "Product line", "Payment")) {
  print(unique(df[[cat_col]]))
}
## [1] "A" "C" "B"
## [1] "Member" "Normal"
## [1] "Female" "Male"
## [1] "Health and beauty" "Electronic accessories" "Home and lifestyle"
## [4] "Sports and travel" "Food and beverages" "Fashion accessories"
## [1] "Ewallet" "Cash" "Credit card"
# changing column names of our dataset to enable ease of use and previewing the dataset
# The columns are matched on their current names instead of their positions,
# so a change in the CSV column order cannot attach a name to the wrong
# column; setnames() stops with an error if an expected name is absent.
# setnames() renames df in place (no copy is made).
setnames(
  df,
  old = c("Invoice ID", "Customer type", "Product line", "Unit price",
          "gross margin percentage", "gross income"),
  new = c("invoice_id", "customer_type", "product_line", "unit_price",
          "gross_margin%", "gross_income")
)
head(df)
## invoice_id Branch customer_type Gender product_line unit_price
## 1: 750-67-8428 A Member Female Health and beauty 74.69
## 2: 226-31-3081 C Normal Female Electronic accessories 15.28
## 3: 631-41-3108 A Normal Male Home and lifestyle 46.33
## 4: 123-19-1176 A Member Male Health and beauty 58.22
## 5: 373-73-7910 A Normal Male Sports and travel 86.31
## 6: 699-14-3026 C Normal Male Electronic accessories 85.39
## Quantity Tax Date Time Payment cogs gross_margin%
## 1: 7 26.1415 1/5/2019 13:08 Ewallet 522.83 4.761905
## 2: 5 3.8200 3/8/2019 10:29 Cash 76.40 4.761905
## 3: 7 16.2155 3/3/2019 13:23 Credit card 324.31 4.761905
## 4: 8 23.2880 1/27/2019 20:33 Ewallet 465.76 4.761905
## 5: 7 30.2085 2/8/2019 10:37 Ewallet 604.17 4.761905
## 6: 7 29.8865 3/25/2019 18:30 Ewallet 597.73 4.761905
## gross_income Rating Total
## 1: 26.1415 9.1 548.9715
## 2: 3.8200 9.6 80.2200
## 3: 16.2155 7.4 340.5255
## 4: 23.2880 8.4 489.0480
## 5: 30.2085 5.3 634.3785
## 6: 29.8865 4.1 627.6165
# drop the invoice identifier column
dfb <- within(df, rm("invoice_id"))
# label-encode the five character columns for PCA: each distinct string is
# replaced by the numeric code of its factor level
dfb$Branch <- as.numeric(as.factor(dfb$Branch))
dfb$customer_type <- as.numeric(as.factor(dfb$customer_type))
dfb$Gender <- as.numeric(as.factor(dfb$Gender))
dfb$product_line <- as.numeric(as.factor(dfb$product_line))
dfb$Payment <- as.numeric(as.factor(dfb$Payment))
head(dfb)
## Branch customer_type Gender product_line unit_price Quantity Tax
## 1: 1 1 1 4 74.69 7 26.1415
## 2: 3 2 1 1 15.28 5 3.8200
## 3: 1 2 2 5 46.33 7 16.2155
## 4: 1 1 2 4 58.22 8 23.2880
## 5: 1 2 2 6 86.31 7 30.2085
## 6: 3 2 2 1 85.39 7 29.8865
## Date Time Payment cogs gross_margin% gross_income Rating Total
## 1: 1/5/2019 13:08 3 522.83 4.761905 26.1415 9.1 548.9715
## 2: 3/8/2019 10:29 1 76.40 4.761905 3.8200 9.6 80.2200
## 3: 3/3/2019 13:23 2 324.31 4.761905 16.2155 7.4 340.5255
## 4: 1/27/2019 20:33 3 465.76 4.761905 23.2880 8.4 489.0480
## 5: 2/8/2019 10:37 3 604.17 4.761905 30.2085 5.3 634.3785
## 6: 3/25/2019 18:30 3 597.73 4.761905 29.8865 4.1 627.6165
# list the distinct codes of each encoded column to verify the encoding
for (enc_col in c("Branch", "customer_type", "Gender", "product_line", "Payment")) {
  print(unique(dfb[[enc_col]]))
}
## [1] 1 3 2
## [1] 1 2
## [1] 1 2
## [1] 4 1 5 6 3 2
## [1] 3 1 2
# resulting code book (codes are listed in order of first appearance):
# branches: A=1, C=3, B=2
# customer type: member=1, normal=2
# gender: female=1, male=2
# productline: health and beauty=4, electronic accessories=1, home and lifestyle=5, sports and travel=6, food and beverages=3, fashion accessories=2
# payment: ewallet=3, cash=1, credit card=2
# checking the summary statistics of our dataset
summary(dfb)
## Branch customer_type Gender product_line
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000
## Median :2.000 Median :1.000 Median :1.000 Median :3.000
## Mean :1.988 Mean :1.499 Mean :1.499 Mean :3.452
## 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:5.000
## Max. :3.000 Max. :2.000 Max. :2.000 Max. :6.000
## unit_price Quantity Tax Date
## Min. :10.08 Min. : 1.00 Min. : 0.5085 Length:1000
## 1st Qu.:32.88 1st Qu.: 3.00 1st Qu.: 5.9249 Class :character
## Median :55.23 Median : 5.00 Median :12.0880 Mode :character
## Mean :55.67 Mean : 5.51 Mean :15.3794
## 3rd Qu.:77.94 3rd Qu.: 8.00 3rd Qu.:22.4453
## Max. :99.96 Max. :10.00 Max. :49.6500
## Time Payment cogs gross_margin%
## Length:1000 Min. :1.000 Min. : 10.17 Min. :4.762
## Class :character 1st Qu.:1.000 1st Qu.:118.50 1st Qu.:4.762
## Mode :character Median :2.000 Median :241.76 Median :4.762
## Mean :2.001 Mean :307.59 Mean :4.762
## 3rd Qu.:3.000 3rd Qu.:448.90 3rd Qu.:4.762
## Max. :3.000 Max. :993.00 Max. :4.762
## gross_income Rating Total
## Min. : 0.5085 Min. : 4.000 Min. : 10.68
## 1st Qu.: 5.9249 1st Qu.: 5.500 1st Qu.: 124.42
## Median :12.0880 Median : 7.000 Median : 253.85
## Mean :15.3794 Mean : 6.973 Mean : 322.97
## 3rd Qu.:22.4453 3rd Qu.: 8.500 3rd Qu.: 471.35
## Max. :49.6500 Max. :10.000 Max. :1042.65
# we are able to check for the mean, median, upper quantile, lower quantile, minimum value and maximum value of each column using summary statistics
DIMENSIONALITY REDUCTION USING PRINCIPAL COMPONENT ANALYSIS
# loading the relevant package
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ purrr::transpose() masks data.table::transpose()
# remove the raw date and time strings and the gross margin column (a single
# constant value in the summary above), then confirm the remaining shape
cols_to_drop <- c("Date", "Time", "gross_margin%")
dfc <- dfb[, setdiff(names(dfb), cols_to_drop), with = FALSE]
dim(dfc)
## [1] 1000 12
# plotting boxplot to check for outliers
library(lattice)
# column name -> x-axis label, listed in plotting order
# (unit_price is not plotted here)
box_labels <- c(
  Branch = "Branch",
  Payment = "payment",
  product_line = "product line",
  Quantity = "quantity",
  Rating = "rating",
  Tax = "tax",
  Total = "total",
  cogs = "cogs",
  customer_type = "customer type",
  gross_income = "gross income",
  Gender = "gender"
)
# one boxplot per column
for (box_col in names(box_labels)) {
  boxplot(dfc[[box_col]], xlab = box_labels[[box_col]])
}
# outliers exist in the tax, total, cogs and gross income column which will be retained for the analysis since we assume they are correct entries.
# checking the distribution of the data columns
# The table is reshaped to long form (one row per column/value pair) so that
# facet_wrap() can draw one histogram panel per original column.
# pivot_longer() is used because gather() is superseded in tidyr.
dfc %>%
  pivot_longer(
    cols = everything(),
    names_to = "attributes",
    values_to = "value"
  ) %>%
  ggplot(aes(x = value)) +
  geom_histogram(fill = 'lightblue2', color = 'black') +
  # free x scales: the columns are on very different ranges
  facet_wrap(~attributes, scales = 'free_x') +
  labs(x="Values", y="Frequency") +
  theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# according to the histograms displayed, none of the twelve plotted columns follows a normal distribution (the gross margin % column was dropped above and is not plotted)
# hence we rescale our data with min-max normalization rather than standardization
CONFIRMING THE NON-NORMAL DISTRIBUTION USING QQ PLOTS
# Draw a normal Q-Q plot with its reference line for every column of dfc,
# in the order listed below.
# Readings recorded from the plots:
#   - Branch, Payment, product_line, Quantity, Tax, Total: the data does not
#     follow a normal distribution.
#   - Rating: departs from a normal distribution where rating is below 5 and
#     above 9.
#   - unit_price: departs from a normal distribution below a unit price of 20
#     and above 90.
#   - gross_income, cogs, customer_type, Gender: recorded as following a
#     normal distribution.
#     NOTE(review): in the previewed rows gross_income equals Tax and cogs is
#     an exact multiple of Tax, while customer_type and Gender hold only two
#     codes, so these four readings conflict with the Tax reading and should
#     be re-checked against the plots.
qq_columns <- c(
  "Branch", "gross_income", "cogs", "customer_type", "Gender", "Payment",
  "product_line", "Quantity", "Rating", "Tax", "Total", "unit_price"
)
for (qq_col in qq_columns) {
  qq_values <- dfc[[qq_col]]
  qqnorm(qq_values)
  qqline(qq_values)
}
SCALING OUR DATA
#define Min-Max normalization function
# Linearly rescales a numeric vector so its minimum maps to 0 and its
# maximum maps to 1.
#   x     : numeric vector to rescale.
#   na.rm : if TRUE, missing values are ignored when finding the minimum and
#           maximum (they stay NA in the result). The default FALSE keeps the
#           earlier behaviour, where any NA makes the whole result NA.
# Returns a vector of the same length as x.
# Edge cases:
#   - an empty vector is returned as an empty numeric vector;
#   - a vector whose values are all equal has a range of zero, so it is
#     mapped to 0 instead of the NaN that 0 / 0 would produce.
min_max_norm <- function(x, na.rm = FALSE) {
  if (length(x) == 0) {
    return(numeric(0))
  }
  bounds <- range(x, na.rm = na.rm)
  span <- bounds[2] - bounds[1]
  # isTRUE() also covers the case where span is NA
  if (isTRUE(span == 0)) {
    return(x - bounds[1])
  }
  (x - bounds[1]) / span
}
#rescale every column of dfc with min_max_norm, then reassemble the
#rescaled columns into a data frame
scaled_columns <- lapply(dfc, min_max_norm)
dfd <- as.data.frame(scaled_columns)
#view last six rows of normalized dataset
tail(dfd, n = 6L)
## Branch customer_type Gender product_line unit_price Quantity Tax
## 995 1.0 0 0 0.0 0.5659769 0.0000000 0.05166712
## 996 1.0 1 1 0.6 0.3367824 0.0000000 0.03070724
## 997 0.5 1 0 0.8 0.9712951 1.0000000 0.98046458
## 998 0.0 0 1 0.4 0.2421006 0.0000000 0.02204857
## 999 0.0 1 1 0.8 0.6201602 0.0000000 0.05662220
## 1000 0.0 0 0 0.2 0.8707165 0.6666667 0.61883540
## Payment cogs gross_income Rating Total
## 995 1 0.05166712 0.05166712 0.31666667 0.05166712
## 996 1 0.03070724 0.03070724 0.36666667 0.03070724
## 997 1 0.98046458 0.98046458 0.06666667 0.98046458
## 998 0 0.02204857 0.02204857 0.61666667 0.02204857
## 999 0 0.05662220 0.05662220 0.01666667 0.05662220
## 1000 0 0.61883540 0.61883540 0.43333333 0.61883540
DIMENSIONALITY REDUCTION USING PCA
#remove the five label-encoded categorical columns so that only the
#remaining numeric measures are passed to PCA
encoded_cols <- c("Branch", "customer_type", "Gender", "product_line", "Payment")
dfe <- dfd[, !(names(dfd) %in% encoded_cols), drop = FALSE]
# checking the dimensions of our data to confirm the removal of the variables
dim(dfe)
## [1] 1000 7
# performing principal component analysis on the rescaled columns, with the
# prcomp defaults written out: columns are centred but not rescaled to unit
# variance
dfpca <- prcomp(dfe, center = TRUE, scale. = FALSE)
# loadings of each variable on each principal component
dfpca[["rotation"]]
## PC1 PC2 PC3 PC4 PC5
## unit_price -0.3256243 0.73300331 -0.009994873 -0.59713910 -8.831671e-16
## Quantity -0.4333457 -0.67699031 -0.027481272 -0.59425621 -8.335815e-16
## Tax -0.4200361 0.03268678 -0.003722882 0.26923486 8.553317e-01
## cogs -0.4200361 0.03268678 -0.003722882 0.26923486 -2.489485e-01
## gross_income -0.4200361 0.03268678 -0.003722882 0.26923486 -1.969322e-01
## Rating 0.0214282 0.01079644 -0.999544618 0.01829826 8.650521e-17
## Total -0.4200361 0.03268678 -0.003722882 0.26923486 -4.094509e-01
## PC6 PC7
## unit_price 1.340372e-16 -4.272721e-17
## Quantity 7.461853e-17 2.148701e-17
## Tax -1.285587e-01 4.336345e-02
## cogs 5.091873e-01 6.547922e-01
## gross_income 3.805455e-01 -7.525974e-01
## Rating 8.268177e-18 -6.242761e-17
## Total -7.611741e-01 5.444170e-02
# pc2 has high factor loading for unit_price, pc5 has high factor loading for tax, pc6 and pc7 have high factors for cogs
# visually representing the principal components
biplot(dfpca, scale = 0)
#standard deviation of each principal component, as stored by prcomp
std_dev <- dfpca[["sdev"]]
#variance of each component is its squared standard deviation
pr_var <- std_dev^2
#check variance of our 7 components
print(pr_var)
## [1] 3.180646e-01 9.387228e-02 8.195725e-02 7.639353e-03 1.632528e-32
## [6] 2.430723e-33 8.213067e-34
# proportion of the total variance explained by each component
total_var <- sum(pr_var)
prop_varex <- pr_var / total_var
print(prop_varex)
## [1] 6.341842e-01 1.871705e-01 1.634133e-01 1.523199e-02 3.255072e-32
## [6] 4.846581e-33 1.637591e-33
# this shows that pc1 explains 63.4%, pc2 explains 18.7%, pc3 explains 16.3%, pc4 explains 1.5% variance
# SINCE PC2 IS THE ONLY VARIABLE THAT HAD HIGH FACTOR LOADINGS ON THE UNIT PRICE VARIABLE WITH PC1,PC3,PC4 LACKING HIGH FACTOR LOADINGS ON ANY OF THE VARIABLES WE CAN ADVISE THE MARKETING TEAM THAT UNIT PRICE OF THE VARIOUS PRODUCTS SHOULD BE PRIORITISED IN TRYING TO RAISE THE TOTAL SALES VALUE
# plotting a scree plot: proportion of variance explained against the
# component number, drawn as points joined by lines
plot(
  x = seq_along(prop_varex),
  y = prop_varex,
  type = "b",
  xlab = "Principal Component",
  ylab = "Proportion of Variance Explained"
)
# the scree-plot shows that we only need 3 principal components from our initial 7 components which explain variance of about 98%
PART 1bi: FEATURE SELECTION USING CORRELATION COEFFICIENT
# previewing the first 6 rows of our dataset
head(dfc)
## Branch customer_type Gender product_line unit_price Quantity Tax Payment
## 1: 1 1 1 4 74.69 7 26.1415 3
## 2: 3 2 1 1 15.28 5 3.8200 1
## 3: 1 2 2 5 46.33 7 16.2155 2
## 4: 1 1 2 4 58.22 8 23.2880 3
## 5: 1 2 2 6 86.31 7 30.2085 3
## 6: 3 2 2 1 85.39 7 29.8865 3
## cogs gross_income Rating Total
## 1: 522.83 26.1415 9.1 548.9715
## 2: 76.40 3.8200 9.6 80.2200
## 3: 324.31 16.2155 7.4 340.5255
## 4: 465.76 23.2880 8.4 489.0480
## 5: 604.17 30.2085 5.3 634.3785
## 6: 597.73 29.8865 4.1 627.6165
# previewing the last 6 rows of our dataset
tail(dfc)
## Branch customer_type Gender product_line unit_price Quantity Tax Payment
## 1: 3 1 1 1 60.95 1 3.0475 3
## 2: 3 2 2 4 40.35 1 2.0175 3
## 3: 2 2 1 5 97.38 10 48.6900 3
## 4: 1 1 2 3 31.84 1 1.5920 1
## 5: 1 2 2 5 65.82 1 3.2910 1
## 6: 1 1 1 2 88.34 7 30.9190 1
## cogs gross_income Rating Total
## 1: 60.95 3.0475 5.9 63.9975
## 2: 40.35 2.0175 6.2 42.3675
## 3: 973.80 48.6900 4.4 1022.4900
## 4: 31.84 1.5920 7.7 33.4320
## 5: 65.82 3.2910 4.1 69.1110
## 6: 618.38 30.9190 6.6 649.2990
library(data.table)
# Installing (only when missing) and loading the caret and corrplot packages
# ---
# requireNamespace() checks whether a package is available without attaching
# it; warnings and messages from install.packages() are no longer suppressed,
# so a failed installation is visible. library() then attaches each package
# and stops with an error if it still cannot be loaded.
for (pkg in c("caret", "corrplot")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(caret)
library(corrplot)
# Correlation between every pair of columns in dfc
# ---
#
correlationMatrix <- cor(dfc)
# Column positions that caret flags for removal at a correlation cutoff of 0.75
# ---
#
highcor <- findCorrelation(correlationMatrix, cutoff = 0.75)
# Positions of the highly correlated attributes
# ---
#
print(highcor)
## [1] 9 12 7
# Translate the flagged positions into column names
names(dfc)[highcor]
## [1] "cogs" "Total" "Tax"
# the highly correlated columns are cogs, Total and Tax
# We can remove the variables with a higher correlation
# and compare the results graphically as shown below
# ---
#
# Removing Redundant Features
# ---
#
dfj <- within(dfc, rm("Total", "cogs", "Tax"))
# Performing our graphical comparison
# ---
# The two correlation plots are drawn side by side (before and after the
# removal). The previous graphics settings are saved and restored afterwards
# so that later plots are not forced into the two-panel layout.
old_par <- par(mfrow = c(1, 2))
corrplot(correlationMatrix, order = "hclust")
corrplot(cor(dfj), order = "hclust")
par(old_par)
# the 2nd plot shows that there is no high correlation between the remaining columns hence we have 9 optimal columns after feature selection
1Bii.) FEATURE SELECTION USING FEATURE RANKING
# loading the relevant libraries
library(devtools)
## Loading required package: usethis
library(FSelector)
head(dfc)
## Branch customer_type Gender product_line unit_price Quantity Tax Payment
## 1: 1 1 1 4 74.69 7 26.1415 3
## 2: 3 2 1 1 15.28 5 3.8200 1
## 3: 1 2 2 5 46.33 7 16.2155 2
## 4: 1 1 2 4 58.22 8 23.2880 3
## 5: 1 2 2 6 86.31 7 30.2085 3
## 6: 3 2 2 1 85.39 7 29.8865 3
## cogs gross_income Rating Total
## 1: 522.83 26.1415 9.1 548.9715
## 2: 76.40 3.8200 9.6 80.2200
## 3: 324.31 16.2155 7.4 340.5255
## 4: 465.76 23.2880 8.4 489.0480
## 5: 604.17 30.2085 5.3 634.3785
## 6: 597.73 29.8865 4.1 627.6165
# score every other column of dfc by its information gain with respect to Total
dfp <- information.gain(formula = Total ~ ., data = dfc)
# keep the names of the 5 highest-scoring variables and display them
dfq <- cutoff.k(attrs = dfp, k = 5)
as.data.frame(dfq)
## dfq
## 1 Tax
## 2 cogs
## 3 gross_income
## 4 Quantity
## 5 unit_price
WE CAN TELL FROM OUR ANALYSIS THAT IF THE MARKETING TEAM IS TO IMPLEMENT A STRATEGY THAT LEADS TO AN OVERALL INCREASE IN TOTAL SALES INCLUSIVE OF TAX, THEY HAVE TO FOCUS ON INCREASING THE UNIT PRICE OF ITEMS, INCREASING THEIR QUANTITY AND DECREASING THEIR cogs (COST OF GOODS SOLD). ALL OF THESE WILL LEAD TO A SPIKE IN THEIR GROSS INCOME. THEY SHOULD ALSO HAVE AN EXPERIENCED, QUALIFIED ACCOUNTANT WHO CAN HELP THEM FIND WAYS OF DECREASING THEIR ANNUAL TAX RETURNS TO FACILITATE THIS.