Sales management has gained importance to meet increasing competition and the need for improved methods of distribution to reduce cost and to increase profits. Sales management today is the most important function in a commercial and business enterprise.
Do ETL: Extract-Transform-Load some Amazon dataset and find for me Sales-trend -> month-wise, year-wise, yearly_month-wise
Find key metrics and factors and show the meaningful relationships between attributes. Do your own research and come up with your findings.
options(repos = c(CRAN = "https://cloud.r-project.org/"))
install.packages("remotes")
## Installing package into 'C:/Users/Saptorshi Mondal/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'remotes' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Saptorshi Mondal\AppData\Local\Temp\Rtmpauk5xT\downloaded_packages
remotes::install_cran(c("fansi", "utf8", "cli", "glue", "lifecycle", "magrittr", "pillar", "rlang", "tibble", "tidyselect", "vctrs"))
## Skipping install of 'fansi' from a cran remote, the SHA1 (1.0.6) has not changed since last install.
## Use `force = TRUE` to force installation
## Skipping install of 'utf8' from a cran remote, the SHA1 (1.2.4) has not changed since last install.
## Use `force = TRUE` to force installation
## Skipping install of 'cli' from a cran remote, the SHA1 (3.6.2) has not changed since last install.
## Use `force = TRUE` to force installation
## Skipping install of 'glue' from a cran remote, the SHA1 (1.7.0) has not changed since last install.
## Use `force = TRUE` to force installation
## Skipping install of 'lifecycle' from a cran remote, the SHA1 (1.0.4) has not changed since last install.
## Use `force = TRUE` to force installation
## Skipping install of 'magrittr' from a cran remote, the SHA1 (2.0.3) has not changed since last install.
## Use `force = TRUE` to force installation
## Skipping install of 'pillar' from a cran remote, the SHA1 (1.9.0) has not changed since last install.
## Use `force = TRUE` to force installation
## Skipping install of 'rlang' from a cran remote, the SHA1 (1.1.4) has not changed since last install.
## Use `force = TRUE` to force installation
## Skipping install of 'tibble' from a cran remote, the SHA1 (3.2.1) has not changed since last install.
## Use `force = TRUE` to force installation
## Skipping install of 'tidyselect' from a cran remote, the SHA1 (1.2.1) has not changed since last install.
## Use `force = TRUE` to force installation
## Skipping install of 'vctrs' from a cran remote, the SHA1 (0.6.5) has not changed since last install.
## Use `force = TRUE` to force installation
library(fansi)
library(utf8)
library(cli)
library(glue)
library(lifecycle)
library(magrittr)
library(pillar)
##
## Attaching package: 'pillar'
## The following object is masked from 'package:cli':
##
## style_bold
library(rlang)
##
## Attaching package: 'rlang'
## The following object is masked from 'package:magrittr':
##
## set_names
library(tibble)
library(tidyselect)
library(vctrs)
##
## Attaching package: 'vctrs'
## The following object is masked from 'package:tibble':
##
## data_frame
# Load necessary libraries for EDA
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:vctrs':
##
## data_frame
## The following object is masked from 'package:pillar':
##
## dim_desc
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
##
## extract
library(readr)
library(skimr)
library(DataExplorer)
# Load necessary libraries for Data Visualization
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(cowplot)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Load necessary libraries for Machine Learning
install.packages("future.apply")
## Installing package into 'C:/Users/Saptorshi Mondal/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'future.apply' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Saptorshi Mondal\AppData\Local\Temp\Rtmpauk5xT\downloaded_packages
install.packages("caret")
## Installing package into 'C:/Users/Saptorshi Mondal/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'caret' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Saptorshi Mondal\AppData\Local\Temp\Rtmpauk5xT\downloaded_packages
library(caret)
## Loading required package: lattice
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(e1071)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-8
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:plotly':
##
## slice
## The following object is masked from 'package:dplyr':
##
## slice
setwd("C:/Users/Saptorshi Mondal/Downloads")
data <- read_csv("Amazon Sales data.csv")
## Rows: 100 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Region, Country, Item Type, Sales Channel, Order Priority, Order Da...
## dbl (7): Order ID, Units Sold, Unit Price, Unit Cost, Total Revenue, Total C...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(data)
## # A tibble: 6 × 14
## Region Country `Item Type` `Sales Channel` `Order Priority` `Order Date`
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Australia a… Tuvalu Baby Food Offline H 5/28/2010
## 2 Central Ame… Grenada Cereal Online C 8/22/2012
## 3 Europe Russia Office Sup… Offline L 5/2/2014
## 4 Sub-Saharan… Sao To… Fruits Online C 6/20/2014
## 5 Sub-Saharan… Rwanda Office Sup… Offline L 2/1/2013
## 6 Australia a… Solomo… Baby Food Online C 2/4/2015
## # ℹ 8 more variables: `Order ID` <dbl>, `Ship Date` <chr>, `Units Sold` <dbl>,
## # `Unit Price` <dbl>, `Unit Cost` <dbl>, `Total Revenue` <dbl>,
## # `Total Cost` <dbl>, `Total Profit` <dbl>
shape <- dim(data)
num_rows <- shape[1]
num_columns <- shape[2]
print(paste("Number of rows:", num_rows))
## [1] "Number of rows: 100"
print(paste("Number of columns:", num_columns))
## [1] "Number of columns: 14"
There are 100 rows and 14 columns.
#null values in data
colSums(is.na(data))
## Region Country Item Type Sales Channel Order Priority
## 0 0 0 0 0
## Order Date Order ID Ship Date Units Sold Unit Price
## 0 0 0 0 0
## Unit Cost Total Revenue Total Cost Total Profit
## 0 0 0 0
There are no null values in the data.
#data set info
str(data)
## spc_tbl_ [100 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Region : chr [1:100] "Australia and Oceania" "Central America and the Caribbean" "Europe" "Sub-Saharan Africa" ...
## $ Country : chr [1:100] "Tuvalu" "Grenada" "Russia" "Sao Tome and Principe" ...
## $ Item Type : chr [1:100] "Baby Food" "Cereal" "Office Supplies" "Fruits" ...
## $ Sales Channel : chr [1:100] "Offline" "Online" "Offline" "Online" ...
## $ Order Priority: chr [1:100] "H" "C" "L" "C" ...
## $ Order Date : chr [1:100] "5/28/2010" "8/22/2012" "5/2/2014" "6/20/2014" ...
## $ Order ID : num [1:100] 6.69e+08 9.64e+08 3.41e+08 5.14e+08 1.15e+08 ...
## $ Ship Date : chr [1:100] "6/27/2010" "9/15/2012" "5/8/2014" "7/5/2014" ...
## $ Units Sold : num [1:100] 9925 2804 1779 8102 5062 ...
## $ Unit Price : num [1:100] 255.28 205.7 651.21 9.33 651.21 ...
## $ Unit Cost : num [1:100] 159.42 117.11 524.96 6.92 524.96 ...
## $ Total Revenue : num [1:100] 2533654 576783 1158503 75592 3296425 ...
## $ Total Cost : num [1:100] 1582244 328376 933904 56066 2657348 ...
## $ Total Profit : num [1:100] 951411 248406 224599 19526 639078 ...
## - attr(*, "spec")=
## .. cols(
## .. Region = col_character(),
## .. Country = col_character(),
## .. `Item Type` = col_character(),
## .. `Sales Channel` = col_character(),
## .. `Order Priority` = col_character(),
## .. `Order Date` = col_character(),
## .. `Order ID` = col_double(),
## .. `Ship Date` = col_character(),
## .. `Units Sold` = col_double(),
## .. `Unit Price` = col_double(),
## .. `Unit Cost` = col_double(),
## .. `Total Revenue` = col_double(),
## .. `Total Cost` = col_double(),
## .. `Total Profit` = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
There are 7 numerical and 7 categorical columns.
#value counts
install.packages("purrr")
## Installing package into 'C:/Users/Saptorshi Mondal/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'purrr' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'purrr'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\Saptorshi
## Mondal\AppData\Local\R\win-library\4.4\00LOCK\purrr\libs\x64\purrr.dll to
## C:\Users\Saptorshi
## Mondal\AppData\Local\R\win-library\4.4\purrr\libs\x64\purrr.dll: Permission
## denied
## Warning: restored 'purrr'
##
## The downloaded binary packages are in
## C:\Users\Saptorshi Mondal\AppData\Local\Temp\Rtmpauk5xT\downloaded_packages
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:caret':
##
## lift
## The following objects are masked from 'package:rlang':
##
## %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
## flatten_raw, invoke, splice
## The following object is masked from 'package:magrittr':
##
## set_names
categorical_columns <- names(data)[sapply(data, is.factor) | sapply(data, is.character)]
# Calculate unique values and percentages for each categorical column
results <- lapply(categorical_columns, function(column) {
value_counts <- table(data[[column]])
value_percentages <- prop.table(value_counts) * 100
unique_values_data <- data.frame(Value = names(value_counts),
Count = as.numeric(value_counts),
Percentage =round(value_percentages,2))
return(unique_values_data)
})
# Print the results for each categorical column
for (i in seq_along(results)) {
cat("Column:", categorical_columns[i], "\n")
print(results[[i]])
cat("\n")
}
## Column: Region
## Value Count Percentage.Var1
## 1 Asia 11 Asia
## 2 Australia and Oceania 11 Australia and Oceania
## 3 Central America and the Caribbean 7 Central America and the Caribbean
## 4 Europe 22 Europe
## 5 Middle East and North Africa 10 Middle East and North Africa
## 6 North America 3 North America
## 7 Sub-Saharan Africa 36 Sub-Saharan Africa
## Percentage.Freq
## 1 11
## 2 11
## 3 7
## 4 22
## 5 10
## 6 3
## 7 36
##
## Column: Country
## Value Count Percentage.Var1
## 1 Albania 1 Albania
## 2 Angola 1 Angola
## 3 Australia 3 Australia
## 4 Austria 1 Austria
## 5 Azerbaijan 2 Azerbaijan
## 6 Bangladesh 1 Bangladesh
## 7 Belize 1 Belize
## 8 Brunei 1 Brunei
## 9 Bulgaria 2 Bulgaria
## 10 Burkina Faso 1 Burkina Faso
## 11 Cameroon 2 Cameroon
## 12 Cape Verde 1 Cape Verde
## 13 Comoros 1 Comoros
## 14 Costa Rica 1 Costa Rica
## 15 Cote d'Ivoire 1 Cote d'Ivoire
## 16 Democratic Republic of the Congo 1 Democratic Republic of the Congo
## 17 Djibouti 3 Djibouti
## 18 East Timor 1 East Timor
## 19 Federated States of Micronesia 1 Federated States of Micronesia
## 20 Fiji 1 Fiji
## 21 France 1 France
## 22 Gabon 1 Gabon
## 23 Grenada 1 Grenada
## 24 Haiti 1 Haiti
## 25 Honduras 2 Honduras
## 26 Iceland 1 Iceland
## 27 Iran 1 Iran
## 28 Kenya 1 Kenya
## 29 Kiribati 1 Kiribati
## 30 Kuwait 1 Kuwait
## 31 Kyrgyzstan 1 Kyrgyzstan
## 32 Laos 1 Laos
## 33 Lebanon 1 Lebanon
## 34 Lesotho 1 Lesotho
## 35 Libya 2 Libya
## 36 Lithuania 1 Lithuania
## 37 Macedonia 1 Macedonia
## 38 Madagascar 1 Madagascar
## 39 Malaysia 1 Malaysia
## 40 Mali 2 Mali
## 41 Mauritania 1 Mauritania
## 42 Mexico 3 Mexico
## 43 Moldova 1 Moldova
## 44 Monaco 1 Monaco
## 45 Mongolia 1 Mongolia
## 46 Mozambique 1 Mozambique
## 47 Myanmar 2 Myanmar
## 48 New Zealand 1 New Zealand
## 49 Nicaragua 1 Nicaragua
## 50 Niger 1 Niger
## 51 Norway 2 Norway
## 52 Pakistan 1 Pakistan
## 53 Portugal 1 Portugal
## 54 Republic of the Congo 1 Republic of the Congo
## 55 Romania 1 Romania
## 56 Russia 1 Russia
## 57 Rwanda 2 Rwanda
## 58 Samoa 1 Samoa
## 59 San Marino 1 San Marino
## 60 Sao Tome and Principe 3 Sao Tome and Principe
## 61 Saudi Arabia 1 Saudi Arabia
## 62 Senegal 1 Senegal
## 63 Sierra Leone 3 Sierra Leone
## 64 Slovakia 1 Slovakia
## 65 Slovenia 1 Slovenia
## 66 Solomon Islands 1 Solomon Islands
## 67 South Sudan 1 South Sudan
## 68 Spain 1 Spain
## 69 Sri Lanka 1 Sri Lanka
## 70 Switzerland 2 Switzerland
## 71 Syria 1 Syria
## 72 The Gambia 4 The Gambia
## 73 Turkmenistan 2 Turkmenistan
## 74 Tuvalu 1 Tuvalu
## 75 United Kingdom 1 United Kingdom
## 76 Zambia 1 Zambia
## Percentage.Freq
## 1 1
## 2 1
## 3 3
## 4 1
## 5 2
## 6 1
## 7 1
## 8 1
## 9 2
## 10 1
## 11 2
## 12 1
## 13 1
## 14 1
## 15 1
## 16 1
## 17 3
## 18 1
## 19 1
## 20 1
## 21 1
## 22 1
## 23 1
## 24 1
## 25 2
## 26 1
## 27 1
## 28 1
## 29 1
## 30 1
## 31 1
## 32 1
## 33 1
## 34 1
## 35 2
## 36 1
## 37 1
## 38 1
## 39 1
## 40 2
## 41 1
## 42 3
## 43 1
## 44 1
## 45 1
## 46 1
## 47 2
## 48 1
## 49 1
## 50 1
## 51 2
## 52 1
## 53 1
## 54 1
## 55 1
## 56 1
## 57 2
## 58 1
## 59 1
## 60 3
## 61 1
## 62 1
## 63 3
## 64 1
## 65 1
## 66 1
## 67 1
## 68 1
## 69 1
## 70 2
## 71 1
## 72 4
## 73 2
## 74 1
## 75 1
## 76 1
##
## Column: Item Type
## Value Count Percentage.Var1 Percentage.Freq
## 1 Baby Food 7 Baby Food 7
## 2 Beverages 8 Beverages 8
## 3 Cereal 7 Cereal 7
## 4 Clothes 13 Clothes 13
## 5 Cosmetics 13 Cosmetics 13
## 6 Fruits 10 Fruits 10
## 7 Household 9 Household 9
## 8 Meat 2 Meat 2
## 9 Office Supplies 12 Office Supplies 12
## 10 Personal Care 10 Personal Care 10
## 11 Snacks 3 Snacks 3
## 12 Vegetables 6 Vegetables 6
##
## Column: Sales Channel
## Value Count Percentage.Var1 Percentage.Freq
## 1 Offline 50 Offline 50
## 2 Online 50 Online 50
##
## Column: Order Priority
## Value Count Percentage.Var1 Percentage.Freq
## 1 C 22 C 22
## 2 H 30 H 30
## 3 L 27 L 27
## 4 M 21 M 21
##
## Column: Order Date
## Value Count Percentage.Var1 Percentage.Freq
## 1 1/11/2012 1 1/11/2012 1
## 2 1/13/2017 1 1/13/2017 1
## 3 1/14/2017 1 1/14/2017 1
## 4 1/16/2011 1 1/16/2011 1
## 5 1/16/2015 1 1/16/2015 1
## 6 1/4/2011 1 1/4/2011 1
## 7 1/5/2012 1 1/5/2012 1
## 8 10/11/2013 1 10/11/2013 1
## 9 10/13/2013 1 10/13/2013 1
## 10 10/13/2014 1 10/13/2014 1
## 11 10/14/2014 1 10/14/2014 1
## 12 10/21/2012 1 10/21/2012 1
## 13 10/23/2016 1 10/23/2016 1
## 14 10/24/2010 1 10/24/2010 1
## 15 10/27/2015 1 10/27/2015 1
## 16 10/28/2014 1 10/28/2014 1
## 17 10/30/2010 1 10/30/2010 1
## 18 10/6/2012 1 10/6/2012 1
## 19 11/11/2011 1 11/11/2011 1
## 20 11/14/2015 1 11/14/2015 1
## 21 11/15/2016 1 11/15/2016 1
## 22 11/19/2016 1 11/19/2016 1
## 23 11/22/2011 1 11/22/2011 1
## 24 11/26/2010 1 11/26/2010 1
## 25 11/26/2011 1 11/26/2011 1
## 26 11/6/2014 1 11/6/2014 1
## 27 11/7/2011 1 11/7/2011 1
## 28 12/23/2010 1 12/23/2010 1
## 29 12/29/2013 1 12/29/2013 1
## 30 12/30/2010 1 12/30/2010 1
## 31 12/31/2016 1 12/31/2016 1
## 32 12/6/2016 1 12/6/2016 1
## 33 2/1/2013 1 2/1/2013 1
## 34 2/10/2012 1 2/10/2012 1
## 35 2/16/2012 1 2/16/2012 1
## 36 2/17/2012 1 2/17/2012 1
## 37 2/19/2014 1 2/19/2014 1
## 38 2/2/2010 1 2/2/2010 1
## 39 2/23/2015 1 2/23/2015 1
## 40 2/25/2017 1 2/25/2017 1
## 41 2/3/2014 1 2/3/2014 1
## 42 2/4/2015 1 2/4/2015 1
## 43 2/6/2010 1 2/6/2010 1
## 44 2/8/2011 1 2/8/2011 1
## 45 2/8/2017 1 2/8/2017 1
## 46 3/11/2017 1 3/11/2017 1
## 47 3/18/2012 1 3/18/2012 1
## 48 3/25/2013 1 3/25/2013 1
## 49 3/29/2016 1 3/29/2016 1
## 50 4/1/2012 1 4/1/2012 1
## 51 4/1/2015 1 4/1/2015 1
## 52 4/18/2014 1 4/18/2014 1
## 53 4/23/2011 1 4/23/2011 1
## 54 4/23/2012 1 4/23/2012 1
## 55 4/23/2013 1 4/23/2013 1
## 56 4/25/2015 1 4/25/2015 1
## 57 4/30/2012 1 4/30/2012 1
## 58 4/7/2014 1 4/7/2014 1
## 59 5/14/2014 1 5/14/2014 1
## 60 5/2/2014 1 5/2/2014 1
## 61 5/20/2017 1 5/20/2017 1
## 62 5/22/2017 1 5/22/2017 1
## 63 5/26/2011 1 5/26/2011 1
## 64 5/26/2012 1 5/26/2012 1
## 65 5/28/2010 1 5/28/2010 1
## 66 5/29/2012 1 5/29/2012 1
## 67 5/7/2010 1 5/7/2010 1
## 68 5/7/2016 1 5/7/2016 1
## 69 5/8/2017 1 5/8/2017 1
## 70 6/1/2016 1 6/1/2016 1
## 71 6/13/2012 1 6/13/2012 1
## 72 6/20/2014 1 6/20/2014 1
## 73 6/24/2011 1 6/24/2011 1
## 74 6/26/2013 1 6/26/2013 1
## 75 6/30/2010 1 6/30/2010 1
## 76 6/30/2016 1 6/30/2016 1
## 77 6/7/2012 1 6/7/2012 1
## 78 6/8/2012 1 6/8/2012 1
## 79 6/9/2013 1 6/9/2013 1
## 80 7/14/2015 1 7/14/2015 1
## 81 7/17/2012 1 7/17/2012 1
## 82 7/18/2014 1 7/18/2014 1
## 83 7/20/2013 1 7/20/2013 1
## 84 7/25/2016 1 7/25/2016 1
## 85 7/26/2011 1 7/26/2011 1
## 86 7/30/2015 1 7/30/2015 1
## 87 7/31/2012 1 7/31/2012 1
## 88 7/31/2015 1 7/31/2015 1
## 89 7/5/2013 1 7/5/2013 1
## 90 7/7/2014 1 7/7/2014 1
## 91 7/8/2012 1 7/8/2012 1
## 92 8/14/2015 1 8/14/2015 1
## 93 8/18/2013 1 8/18/2013 1
## 94 8/2/2014 1 8/2/2014 1
## 95 8/22/2012 1 8/22/2012 1
## 96 9/15/2011 1 9/15/2011 1
## 97 9/17/2012 1 9/17/2012 1
## 98 9/17/2013 1 9/17/2013 1
## 99 9/18/2012 1 9/18/2012 1
## 100 9/8/2014 1 9/8/2014 1
##
## Column: Ship Date
## Value Count Percentage.Var1 Percentage.Freq
## 1 1/13/2012 1 1/13/2012 1
## 2 1/20/2011 1 1/20/2011 1
## 3 1/21/2011 1 1/21/2011 1
## 4 1/23/2017 1 1/23/2017 1
## 5 1/28/2014 1 1/28/2014 1
## 6 1/31/2011 1 1/31/2011 1
## 7 1/5/2011 1 1/5/2011 1
## 8 1/7/2012 1 1/7/2012 1
## 9 10/20/2012 1 10/20/2012 1
## 10 10/23/2011 1 10/23/2011 1
## 11 10/24/2013 1 10/24/2013 1
## 12 10/4/2014 1 10/4/2014 1
## 13 10/8/2012 1 10/8/2012 1
## 14 11/10/2012 1 11/10/2012 1
## 15 11/10/2014 1 11/10/2014 1
## 16 11/14/2014 1 11/14/2014 1
## 17 11/15/2011 1 11/15/2011 1
## 18 11/15/2014 1 11/15/2014 1
## 19 11/16/2013 1 11/16/2013 1
## 20 11/17/2010 2 11/17/2010 2
## 21 11/18/2015 1 11/18/2015 1
## 22 11/25/2013 1 11/25/2013 1
## 23 11/25/2015 1 11/25/2015 1
## 24 11/25/2016 1 11/25/2016 1
## 25 11/30/2012 1 11/30/2012 1
## 26 12/12/2014 1 12/12/2014 1
## 27 12/14/2016 1 12/14/2016 1
## 28 12/18/2016 1 12/18/2016 1
## 29 12/25/2010 1 12/25/2010 1
## 30 12/28/2011 1 12/28/2011 1
## 31 12/3/2011 1 12/3/2011 1
## 32 12/31/2016 1 12/31/2016 1
## 33 12/8/2016 1 12/8/2016 1
## 34 2/13/2017 1 2/13/2017 1
## 35 2/14/2012 1 2/14/2012 1
## 36 2/15/2012 1 2/15/2012 1
## 37 2/21/2015 1 2/21/2015 1
## 38 2/23/2014 1 2/23/2014 1
## 39 2/25/2010 1 2/25/2010 1
## 40 2/25/2017 1 2/25/2017 1
## 41 2/28/2012 1 2/28/2012 1
## 42 2/6/2013 1 2/6/2013 1
## 43 3/1/2015 1 3/1/2015 1
## 44 3/1/2017 1 3/1/2017 1
## 45 3/18/2010 1 3/18/2010 1
## 46 3/2/2015 1 3/2/2015 1
## 47 3/20/2012 1 3/20/2012 1
## 48 3/20/2014 1 3/20/2014 1
## 49 3/21/2011 1 3/21/2011 1
## 50 3/28/2013 1 3/28/2013 1
## 51 3/28/2017 1 3/28/2017 1
## 52 4/18/2015 1 4/18/2015 1
## 53 4/19/2014 1 4/19/2014 1
## 54 4/27/2011 1 4/27/2011 1
## 55 4/29/2016 1 4/29/2016 1
## 56 4/7/2012 1 4/7/2012 1
## 57 5/10/2010 1 5/10/2010 1
## 58 5/10/2016 1 5/10/2016 1
## 59 5/18/2012 1 5/18/2012 1
## 60 5/20/2013 1 5/20/2013 1
## 61 5/21/2017 1 5/21/2017 1
## 62 5/28/2015 1 5/28/2015 1
## 63 5/30/2014 1 5/30/2014 1
## 64 5/8/2012 1 5/8/2012 1
## 65 5/8/2014 1 5/8/2014 1
## 66 6/17/2017 1 6/17/2017 1
## 67 6/2/2012 1 6/2/2012 1
## 68 6/27/2010 1 6/27/2010 1
## 69 6/27/2012 1 6/27/2012 1
## 70 6/28/2014 1 6/28/2014 1
## 71 6/29/2016 1 6/29/2016 1
## 72 6/3/2012 1 6/3/2012 1
## 73 6/5/2017 1 6/5/2017 1
## 74 6/8/2012 1 6/8/2012 1
## 75 6/9/2012 1 6/9/2012 1
## 76 7/1/2013 1 7/1/2013 1
## 77 7/11/2014 1 7/11/2014 1
## 78 7/12/2011 1 7/12/2011 1
## 79 7/15/2011 1 7/15/2011 1
## 80 7/2/2013 1 7/2/2013 1
## 81 7/24/2012 1 7/24/2012 1
## 82 7/26/2016 1 7/26/2016 1
## 83 7/27/2012 1 7/27/2012 1
## 84 7/30/2014 1 7/30/2014 1
## 85 7/5/2014 1 7/5/2014 1
## 86 7/9/2012 1 7/9/2012 1
## 87 8/1/2010 1 8/1/2010 1
## 88 8/16/2013 1 8/16/2013 1
## 89 8/19/2014 1 8/19/2014 1
## 90 8/25/2015 1 8/25/2015 1
## 91 8/7/2013 1 8/7/2013 1
## 92 8/8/2015 1 8/8/2015 1
## 93 9/11/2012 1 9/11/2012 1
## 94 9/15/2012 1 9/15/2012 1
## 95 9/18/2013 1 9/18/2013 1
## 96 9/3/2011 1 9/3/2011 1
## 97 9/3/2015 1 9/3/2015 1
## 98 9/30/2015 1 9/30/2015 1
## 99 9/7/2016 1 9/7/2016 1
#summary
summary(data)
## Region Country Item Type Sales Channel
## Length:100 Length:100 Length:100 Length:100
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Order Priority Order Date Order ID Ship Date
## Length:100 Length:100 Min. :114606559 Length:100
## Class :character Class :character 1st Qu.:338922488 Class :character
## Mode :character Mode :character Median :557708561 Mode :character
## Mean :555020412
## 3rd Qu.:790755081
## Max. :994022214
## Units Sold Unit Price Unit Cost Total Revenue
## Min. : 124 Min. : 9.33 Min. : 6.92 Min. : 4870
## 1st Qu.:2836 1st Qu.: 81.73 1st Qu.: 35.84 1st Qu.: 268721
## Median :5382 Median :179.88 Median :107.28 Median : 752314
## Mean :5129 Mean :276.76 Mean :191.05 Mean :1373488
## 3rd Qu.:7369 3rd Qu.:437.20 3rd Qu.:263.33 3rd Qu.:2212045
## Max. :9925 Max. :668.27 Max. :524.96 Max. :5997055
## Total Cost Total Profit
## Min. : 3612 Min. : 1258
## 1st Qu.: 168868 1st Qu.: 121444
## Median : 363566 Median : 290768
## Mean : 931806 Mean : 441682
## 3rd Qu.:1613870 3rd Qu.: 635829
## Max. :4509794 Max. :1719922
#bar graphs analysis
ggplot(data, aes_string(x = "`Item Type`")) +
geom_bar() +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(title = "Bar Graph of Item Type", x = "Item Type", y = "Count") +
scale_y_continuous(expand = expansion(mult = c(0, 0.1)))
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot(data, aes_string(x = "`Sales Channel`")) +
geom_bar() +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(title = "Bar Graph of Sales Channel", x = "Sales Channel", y = "Count") +
scale_y_continuous(expand = expansion(mult = c(0, 0.1)))
ggplot(data, aes_string(x = "`Order Priority`")) +
geom_bar() +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(title = "Bar Graph of Order Priority", x = "Order Priority", y = "Count") +
scale_y_continuous(expand = expansion(mult = c(0, 0.1)))
Most of the orders, belong to the ‘Clothing’ and ‘Cosmetics’ catgeory. Many of the orders are of ‘high’ priority. The amount of ‘offline’ and ‘online’ orders are equal.
ggplot(data, aes(x = `Item Type`, y = `Total Profit` )) +
geom_boxplot() +
labs(x = "`Item Type`", y = "`Total Profit`", title = "Box Plot of Sales by Item Type") +
theme_minimal()
ggplot(data, aes(x = `Order Priority`, y = `Total Profit` )) +
geom_boxplot() +
labs(x = "`Order Priority`", y = "`Total Profit`", title = "Box Plot of Sales by Order Priroity") +
theme_minimal()
ggplot(data, aes(x = `Sales Channel`, y = `Total Profit` )) +
geom_boxplot() +
labs(x = "`Sales Channel`", y = "`Total Profit`", title = "Box Plot of Sales by Sales channel") +
theme_minimal()
install.packages("hexbin")
## Installing package into 'C:/Users/Saptorshi Mondal/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'hexbin' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Saptorshi Mondal\AppData\Local\Temp\Rtmpauk5xT\downloaded_packages
library(hexbin)
ggplot(data, aes(x = `Units Sold`, y = `Total Profit` )) +
geom_hex() +
labs(x = "`Units Sold`", y = "`Total Profit`", title = "Hexbin Plot of Sales by Units Sold") +
theme_minimal()
Cosmetics are the most profitable products. Most of the items have ‘high’ order priority. The ‘offline’ orders were highly profitable compared to ‘online’ based orders. Over 10000 single units, received the best profit.
cat_cols <- c("Region", "Country","`Item Type`","`Sales Channel`","`Order Priority`","`Order Date`","`Ship Date`")
# Function to perform label encoding
label_encode <- function(df, cat_cols) {
for (col in cat_cols) {
if (col %in% colnames(df)) {
df[[col]] <- as.numeric(factor(df[[col]]))
} else {
stop(paste("Column", col, "not found in the dataframe"))
}
}
return(df)
}
# Apply label encoding
new_df <- label_encode(data, categorical_columns)
print(head(new_df))
## # A tibble: 6 × 14
## Region Country `Item Type` `Sales Channel` `Order Priority` `Order Date`
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 74 1 1 2 65
## 2 3 23 3 2 1 95
## 3 4 56 9 1 3 60
## 4 7 60 6 2 1 72
## 5 7 57 9 1 3 33
## 6 2 66 1 2 1 42
## # ℹ 8 more variables: `Order ID` <dbl>, `Ship Date` <dbl>, `Units Sold` <dbl>,
## # `Unit Price` <dbl>, `Unit Cost` <dbl>, `Total Revenue` <dbl>,
## # `Total Cost` <dbl>, `Total Profit` <dbl>
colnames(new_df) <- make.names(colnames(new_df))
# Function to create a boxplot for each numerical column in the dataframe
create_boxplot <- function(column_name) {
ggplot(new_df, aes_string(x = "1", y = column_name)) +
geom_boxplot(outlier.colour = "red", outlier.shape = 16,
outlier.size = 2, notch = FALSE) +
labs(title = paste("Boxplot for", column_name), x = column_name, y = "Values")
}
# Identify numerical columns
numerical_cols <- sapply(new_df, is.numeric)
# Create boxplots for each numerical column
boxplots <- lapply(names(new_df)[numerical_cols], create_boxplot)
# Print each boxplot
for (plot in boxplots) {
print(plot)
}
# Function to floor and cap outliers in a dataframe
floor_cap_outliers <- function(df) {
for (col in names(df)) {
if (is.numeric(df[[col]])) {
Q1 <- quantile(df[[col]], 0.25, na.rm = TRUE)
Q3 <- quantile(df[[col]], 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
df[[col]] <- ifelse(df[[col]] < lower_bound, lower_bound, df[[col]])
df[[col]] <- ifelse(df[[col]] > upper_bound, upper_bound, df[[col]])
}
}
return(df)
}
# Floor and cap outliers in the sales dataframe
new_df <- floor_cap_outliers(new_df)
# Print the modified dataframe
print(head(new_df))
## # A tibble: 6 × 14
## Region Country Item.Type Sales.Channel Order.Priority Order.Date Order.ID
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 74 1 1 2 65 669165933
## 2 3 23 3 2 1 95 963881480
## 3 4 56 9 1 3 60 341417157
## 4 7 60 6 2 1 72 514321792
## 5 7 57 9 1 3 33 115456712
## 6 2 66 1 2 1 42 547995746
## # ℹ 7 more variables: Ship.Date <dbl>, Units.Sold <dbl>, Unit.Price <dbl>,
## # Unit.Cost <dbl>, Total.Revenue <dbl>, Total.Cost <dbl>, Total.Profit <dbl>
cor_matrix <- cor(new_df[sapply(new_df, is.numeric)])
print(cor_matrix)
## Region Country Item.Type Sales.Channel
## Region 1.00000000 0.0903812850 0.08690327 -0.084148907
## Country 0.09038129 1.0000000000 0.01651612 -0.143683342
## Item.Type 0.08690327 0.0165161236 1.00000000 0.047308727
## Sales.Channel -0.08414891 -0.1436833421 0.04730873 1.000000000
## Order.Priority 0.08815965 0.0585824524 0.23625804 0.161422314
## Order.Date 0.05814104 -0.1823098254 -0.08396653 0.041571298
## Order.ID 0.11698280 0.0833316091 -0.02238890 0.004589715
## Ship.Date 0.05929032 -0.2850124080 -0.16051768 0.116822499
## Units.Sold -0.05838976 -0.0766098886 -0.23797564 -0.146352585
## Unit.Price -0.05433658 0.0694213235 0.20658133 -0.144870564
## Unit.Cost -0.04161618 0.0812824523 0.26925347 -0.137639339
## Total.Revenue -0.13013230 0.0230168493 0.05625127 -0.137065488
## Total.Cost -0.12068836 0.0275433120 0.12993992 -0.131283116
## Total.Profit -0.13483535 -0.0007261285 -0.13055185 -0.135148715
## Order.Priority Order.Date Order.ID Ship.Date Units.Sold
## Region 0.08815965 0.05814104 0.116982801 0.05929032 -0.05838976
## Country 0.05858245 -0.18230983 0.083331609 -0.28501241 -0.07660989
## Item.Type 0.23625804 -0.08396653 -0.022388900 -0.16051768 -0.23797564
## Sales.Channel 0.16142231 0.04157130 0.004589715 0.11682250 -0.14635258
## Order.Priority 1.00000000 0.05542770 -0.081379601 0.07642075 -0.07328794
## Order.Date 0.05542770 1.00000000 0.211468012 0.70259981 0.04349417
## Order.ID -0.08137960 0.21146801 1.000000000 0.27977777 -0.22290682
## Ship.Date 0.07642075 0.70259981 0.279777770 1.00000000 0.02094666
## Units.Sold -0.07328794 0.04349417 -0.222906818 0.02094666 1.00000000
## Unit.Price 0.17922767 -0.31834658 -0.190941212 -0.21516034 -0.07048559
## Unit.Cost 0.19492283 -0.32472142 -0.213200577 -0.22634324 -0.09223245
## Total.Revenue 0.13589890 -0.19854240 -0.316786994 -0.15209563 0.44782719
## Total.Cost 0.16191450 -0.22124275 -0.330534062 -0.16583751 0.37282111
## Total.Profit 0.06284622 -0.13726709 -0.235266808 -0.11337649 0.56210781
## Unit.Price Unit.Cost Total.Revenue Total.Cost Total.Profit
## Region -0.05433658 -0.04161618 -0.13013230 -0.12068836 -0.1348353541
## Country 0.06942132 0.08128245 0.02301685 0.02754331 -0.0007261285
## Item.Type 0.20658133 0.26925347 0.05625127 0.12993992 -0.1305518463
## Sales.Channel -0.14487056 -0.13763934 -0.13706549 -0.13128312 -0.1351487154
## Order.Priority 0.17922767 0.19492283 0.13589890 0.16191450 0.0628462153
## Order.Date -0.31834658 -0.32472142 -0.19854240 -0.22124275 -0.1372670930
## Order.ID -0.19094121 -0.21320058 -0.31678699 -0.33053406 -0.2352668083
## Ship.Date -0.21516034 -0.22634324 -0.15209563 -0.16583751 -0.1133764856
## Units.Sold -0.07048559 -0.09223245 0.44782719 0.37282111 0.5621078132
## Unit.Price 1.00000000 0.98726981 0.75765257 0.79944011 0.5714697976
## Unit.Cost 0.98726981 1.00000000 0.71948636 0.78484573 0.4822356853
## Total.Revenue 0.75765257 0.71948636 1.00000000 0.98378547 0.9073819089
## Total.Cost 0.79944011 0.78484573 0.98378547 1.00000000 0.8199024016
## Total.Profit 0.57146980 0.48223569 0.90738191 0.81990240 1.0000000000
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
melted_cor_matrix <- melt(cor_matrix)
# Plot the heatmap
ggplot(data = melted_cor_matrix, aes(x = Var1, y = Var2, fill = value)) +
geom_tile() +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1, 1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1)) +
coord_fixed() +
ggtitle("Heatmap of Correlation Matrix")
The heatmap represents the correlation matrix visually.The ‘red’ represents positive correlation while ‘blue’ represents negative correlation. It is seen that the ‘Total Revenue’,‘Total cost’, ‘Units Sold’,‘Unit cost’ are all positively correlated with ‘Total Profit’ while the ‘Item type’, ‘Sales Channel’ and ‘Region’ are negatively correlated.
# Set seed for reproducibility
set.seed(123)
train_index <- createDataPartition(new_df$Total.Profit, p = 0.7, list = FALSE)
train_data <- new_df[train_index, ]
test_data <- new_df[-train_index, ]
x_train <- as.matrix(train_data[ , !colnames(train_data) %in% "Total.Profit"])
y_train <- train_data$Total.Profit
x_test <- as.matrix(test_data[ , !colnames(test_data) %in% "Total.Profit"])
y_test <- test_data$Total.Profit
# Fit the linear model(base model)
linear_model <- lm(Total.Profit ~ ., data = train_data)
# View the model summary
summary(linear_model)
##
## Call:
## lm(formula = Total.Profit ~ ., data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -199424 -11943 4223 13508 108036
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.147e+04 4.160e+04 -0.756 0.45242
## Region 8.758e+02 2.525e+03 0.347 0.72992
## Country -1.331e+02 2.595e+02 -0.513 0.60985
## Item.Type -1.102e+03 2.074e+03 -0.531 0.59721
## Sales.Channel -7.736e+03 1.131e+04 -0.684 0.49660
## Order.Priority 8.602e+03 5.774e+03 1.490 0.14174
## Order.Date -1.563e+02 2.955e+02 -0.529 0.59879
## Order.ID 1.027e-05 2.607e-05 0.394 0.69518
## Ship.Date -8.004e+01 3.025e+02 -0.265 0.79227
## Units.Sold 7.572e+00 3.681e+00 2.057 0.04416 *
## Unit.Price 1.090e+03 3.319e+02 3.285 0.00173 **
## Unit.Cost -1.313e+03 4.124e+02 -3.183 0.00234 **
## Total.Revenue 7.639e-01 5.405e-02 14.132 < 2e-16 ***
## Total.Cost -7.190e-01 7.105e-02 -10.120 1.99e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 43560 on 58 degrees of freedom
## Multiple R-squared: 0.991, Adjusted R-squared: 0.9889
## F-statistic: 489.1 on 13 and 58 DF, p-value: < 2.2e-16
linear_predictions <- predict(linear_model, test_data)
test_mse <- mean((test_data$Total.Profit - linear_predictions)^2)
train_mse <- mean((train_data$Total.Profit - linear_predictions)^2)
## Warning in train_data$Total.Profit - linear_predictions: longer object length
## is not a multiple of shorter object length
print(paste("Test MSE:", test_mse))
## [1] "Test MSE: 2155336761.48605"
print(paste("Train MSE:", train_mse))
## [1] "Train MSE: 346528612916.511"
if (train_mse > test_mse) {
print("The model is potentially overfitting.")
} else if (train_mse < test_mse) {
print("The model is potentially underfitting.")
} else {
print("The model is performing similarly on both training and validation sets.")
}
## [1] "The model is potentially overfitting."
#Ridge regression
install.packages("glmnet")
## Warning: package 'glmnet' is in use and will not be installed
library(glmnet)
install.packages("Metrics")
## Installing package into 'C:/Users/Saptorshi Mondal/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'Metrics' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Saptorshi Mondal\AppData\Local\Temp\Rtmpauk5xT\downloaded_packages
library(Metrics)
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
## The following object is masked from 'package:rlang':
##
## ll
ridge_model <- glmnet(x_train, y_train, alpha = 0)
train_predictions <- predict(ridge_model, newx = x_train)
test_predictions <- predict(ridge_model, newx = x_test)
train_mse <- mse(y_train, train_predictions)
test_mse <- mse(y_test, test_predictions)
print(paste("Train MSE:", train_mse))
## [1] "Train MSE: 102097581433.859"
print(paste("Test MSE:", test_mse))
## [1] "Test MSE: 108512464344.428"
if (train_mse > test_mse) {
print("The model is potentially overfitting.")
} else if (train_mse < test_mse) {
print("The model is potentially underfitting.")
} else {
print("The model is performing similarly on both training and validation sets.")
}
## [1] "The model is potentially underfitting."
#Lasso Model
lasso_model <- glmnet(x_train, y_train, alpha = 1)
train_predictions <- predict(lasso_model, newx = x_train)
test_predictions <- predict(lasso_model, newx = x_test)
train_mse <- mse(y_train, train_predictions)
test_mse <- mse(y_test, test_predictions)
print(paste("Train MSE:", train_mse))
## [1] "Train MSE: 24035100686.7128"
print(paste("Test MSE:", test_mse))
## [1] "Test MSE: 25535951816.1696"
if (train_mse > test_mse) {
print("The model is potentially overfitting.")
} else if (train_mse < test_mse) {
print("The model is potentially underfitting.")
} else {
print("The model is performing similarly on both training and validation sets.")
}
## [1] "The model is potentially underfitting."
# Random Forest Regression
forest_model <- randomForest(Total.Profit ~ ., data = train_data)
train_predictions <- predict(forest_model,x_train)
test_predictions <- predict(forest_model,x_test)
train_mse <- mse(y_train, train_predictions)
test_mse <- mse(y_test, test_predictions)
print(paste("Train MSE:", train_mse))
## [1] "Train MSE: 3403475849.00583"
print(paste("Test MSE:", test_mse))
## [1] "Test MSE: 10984416925.4822"
if (train_mse > test_mse) {
print("The model is potentially overfitting.")
} else if (train_mse < test_mse) {
print("The model is potentially underfitting.")
} else {
print("The model is performing similarly on both training and validation sets.")
}
## [1] "The model is potentially underfitting."
A good fit model is achieved with ‘Ridge Regression’ algorithm where, the train and test Mean Square error values are almost similar as compared to other models.
install.packages("e1071")
## Warning: package 'e1071' is in use and will not be installed
library(e1071)
predictors <- new_df[, !colnames(new_df) %in% "Total.Profit"]
target <- new_df$Total.Profit
set.seed(123)
control <- rfeControl(functions = rfFuncs, method = "cv", number = 10)
results <- rfe(predictors, target, sizes = c(1:5, 10, 15, 20), rfeControl = control)
print(results)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold)
##
## Resampling performance over subset size:
##
## Variables RMSE Rsquared MAE RMSESD RsquaredSD MAESD Selected
## 1 197763 0.8186 137436 46498 0.09748 35580
## 2 122459 0.9270 92817 35254 0.05355 30199
## 3 123633 0.9232 94676 31674 0.05035 25102
## 4 104786 0.9424 78959 32783 0.04581 25070
## 5 96373 0.9497 73756 32284 0.03960 22424 *
## 10 104684 0.9461 81357 34658 0.04626 25873
## 13 113343 0.9380 88230 30381 0.04909 22861
##
## The top 5 variables (out of 5):
## Total.Revenue, Units.Sold, Total.Cost, Unit.Price, Item.Type
The optimum features are found in Recursive Factor elimination method. They are: Total.Revenue, Units.Sold, Total.Cost, Unit.Price, Item.Type respectively.
library(cluster)
install.packages("factoextra")
## Installing package into 'C:/Users/Saptorshi Mondal/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'factoextra' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Saptorshi Mondal\AppData\Local\Temp\Rtmpauk5xT\downloaded_packages
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
selected_features <- new_df [, c("Total.Revenue", "Units.Sold", "Total.Cost", "Unit.Price", "Item.Type")]
scaled_features <- scale(selected_features)
elbow_plot <- fviz_nbclust(scaled_features, kmeans, method = "wss")
print(elbow_plot)
Using the Elbow Plot, it is determined that, the no. of clusters should be 2.
k <- 2 # Number of clusters (adjust as needed)
kmeans_model <- kmeans(scaled_features, centers = k, nstart = 20) # Run k-means clustering
cluster_labels <- kmeans_model$cluster
silhouette_score <- silhouette(cluster_labels, dist(scaled_features))
print(silhouette_score)
## cluster neighbor sil_width
## [1,] 2 1 0.103925232
## [2,] 1 2 0.483373125
## [3,] 2 1 0.069683338
## [4,] 1 2 0.502509631
## [5,] 2 1 0.571266618
## [6,] 1 2 0.389183429
## [7,] 2 1 0.529256390
## [8,] 1 2 0.203194168
## [9,] 1 2 0.449395967
## [10,] 1 2 0.321971591
## [11,] 1 2 0.369532726
## [12,] 1 2 0.557198973
## [13,] 1 2 0.434119488
## [14,] 2 1 0.523466463
## [15,] 1 2 0.473168091
## [16,] 1 2 0.527069922
## [17,] 2 1 0.462838514
## [18,] 1 2 0.523942586
## [19,] 2 1 0.485061761
## [20,] 2 1 0.440068915
## [21,] 1 2 0.073572234
## [22,] 1 2 0.406225198
## [23,] 1 2 0.415756417
## [24,] 1 2 0.550133818
## [25,] 1 2 0.470886200
## [26,] 1 2 0.285923603
## [27,] 1 2 0.572820784
## [28,] 1 2 0.567595894
## [29,] 1 2 0.527166231
## [30,] 2 1 0.201291461
## [31,] 2 1 0.512690805
## [32,] 1 2 0.470497342
## [33,] 2 1 0.323956692
## [34,] 2 1 0.533475534
## [35,] 1 2 0.267744706
## [36,] 1 2 0.438855110
## [37,] 1 2 0.573844295
## [38,] 2 1 0.291975214
## [39,] 2 1 0.556712459
## [40,] 2 1 0.491881642
## [41,] 1 2 0.478808963
## [42,] 2 1 0.481701671
## [43,] 1 2 0.483334760
## [44,] 1 2 0.370174847
## [45,] 1 2 0.533291660
## [46,] 1 2 0.468370232
## [47,] 2 1 0.511740749
## [48,] 1 2 0.435186453
## [49,] 1 2 0.457567801
## [50,] 1 2 0.007535791
## [51,] 1 2 0.540002259
## [52,] 1 2 0.436417184
## [53,] 1 2 0.461516016
## [54,] 2 1 0.419582804
## [55,] 1 2 0.519912490
## [56,] 1 2 0.555617230
## [57,] 1 2 0.332879646
## [58,] 1 2 0.175304381
## [59,] 1 2 0.165126821
## [60,] 2 1 0.480697705
## [61,] 1 2 0.462350990
## [62,] 1 2 0.312339132
## [63,] 2 1 0.577555547
## [64,] 1 2 0.527922343
## [65,] 1 2 0.293653275
## [66,] 2 1 0.101754677
## [67,] 1 2 0.339348057
## [68,] 1 2 0.543120594
## [69,] 2 1 0.520222311
## [70,] 1 2 0.481245506
## [71,] 2 1 0.569435799
## [72,] 1 2 0.509925063
## [73,] 1 2 0.519045839
## [74,] 1 2 0.135626810
## [75,] 2 1 0.491814086
## [76,] 2 1 0.566387002
## [77,] 1 2 0.397653214
## [78,] 1 2 0.369883546
## [79,] 2 1 0.005165212
## [80,] 2 1 0.498378649
## [81,] 2 1 0.558051417
## [82,] 1 2 0.455383923
## [83,] 2 1 0.420265461
## [84,] 1 2 0.399025970
## [85,] 1 2 0.285815223
## [86,] 1 2 0.438397334
## [87,] 1 2 0.422320166
## [88,] 1 2 0.254500564
## [89,] 1 2 0.505067991
## [90,] 1 2 0.529360246
## [91,] 1 2 0.068152545
## [92,] 1 2 0.397174018
## [93,] 2 1 0.127262790
## [94,] 2 1 0.504625118
## [95,] 1 2 0.449547485
## [96,] 1 2 0.505169835
## [97,] 1 2 0.559227962
## [98,] 1 2 0.381593236
## [99,] 1 2 0.457266891
## [100,] 2 1 0.593483399
## attr(,"Ordered")
## [1] FALSE
## attr(,"call")
## silhouette.default(x = cluster_labels, dist = dist(scaled_features))
## attr(,"class")
## [1] "silhouette"
clustered_data <- cbind(new_df, Cluster = cluster_labels) # Add cluster labels to the original data
ggplot(clustered_data, aes(x = Total.Profit, y = Units.Sold, color = factor(Cluster))) +
geom_point() +
labs(title = "K-Means Clustering of Sales Data", x = "Total Profit", y = "Units Sold") +
theme_minimal()
We are plotting ‘Units sold’ vs ‘Total Profit’, where the approximate no. of clusters equals to 2, Cluster 1 represents products with high units sold but low profit. Cluster 2 represents products with low units sold but high profit.