Objective

Sales management has gained importance to meet increasing competition and the need for improved methods of distribution to reduce cost and to increase profits. Sales management today is the most important function in a commercial and business enterprise.

Do ETL: Extract-Transform-Load some Amazon dataset and find for me Sales-trend -> month-wise, year-wise, yearly_month-wise

Find key metrics and factors and show the meaningful relationships between attributes. Do your own research and come up with your findings.

Importing the Libraries

options(repos = c(CRAN = "https://cloud.r-project.org/"))
install.packages("remotes")

## Installing package into 'C:/Users/Saptorshi Mondal/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

## package 'remotes' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Saptorshi Mondal\AppData\Local\Temp\Rtmpauk5xT\downloaded_packages

remotes::install_cran(c("fansi", "utf8", "cli", "glue", "lifecycle", "magrittr", "pillar", "rlang", "tibble", "tidyselect", "vctrs"))

## Skipping install of 'fansi' from a cran remote, the SHA1 (1.0.6) has not changed since last install.
##   Use `force = TRUE` to force installation

## Skipping install of 'utf8' from a cran remote, the SHA1 (1.2.4) has not changed since last install.
##   Use `force = TRUE` to force installation

## Skipping install of 'cli' from a cran remote, the SHA1 (3.6.2) has not changed since last install.
##   Use `force = TRUE` to force installation

## Skipping install of 'glue' from a cran remote, the SHA1 (1.7.0) has not changed since last install.
##   Use `force = TRUE` to force installation

## Skipping install of 'lifecycle' from a cran remote, the SHA1 (1.0.4) has not changed since last install.
##   Use `force = TRUE` to force installation

## Skipping install of 'magrittr' from a cran remote, the SHA1 (2.0.3) has not changed since last install.
##   Use `force = TRUE` to force installation

## Skipping install of 'pillar' from a cran remote, the SHA1 (1.9.0) has not changed since last install.
##   Use `force = TRUE` to force installation

## Skipping install of 'rlang' from a cran remote, the SHA1 (1.1.4) has not changed since last install.
##   Use `force = TRUE` to force installation

## Skipping install of 'tibble' from a cran remote, the SHA1 (3.2.1) has not changed since last install.
##   Use `force = TRUE` to force installation

## Skipping install of 'tidyselect' from a cran remote, the SHA1 (1.2.1) has not changed since last install.
##   Use `force = TRUE` to force installation

## Skipping install of 'vctrs' from a cran remote, the SHA1 (0.6.5) has not changed since last install.
##   Use `force = TRUE` to force installation

library(fansi)
library(utf8)
library(cli)
library(glue)
library(lifecycle)
library(magrittr)
library(pillar)

## 
## Attaching package: 'pillar'

## The following object is masked from 'package:cli':
## 
##     style_bold

library(rlang)

## 
## Attaching package: 'rlang'

## The following object is masked from 'package:magrittr':
## 
##     set_names

library(tibble)
library(tidyselect)
library(vctrs)

## 
## Attaching package: 'vctrs'

## The following object is masked from 'package:tibble':
## 
##     data_frame

# Load necessary libraries for EDA
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:vctrs':
## 
##     data_frame

## The following object is masked from 'package:pillar':
## 
##     dim_desc

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:magrittr':
## 
##     extract

library(readr)
library(skimr)
library(DataExplorer)

# Load necessary libraries for Data Visualization
library(ggplot2)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(cowplot)
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

# Load necessary libraries for Machine Learning
install.packages("future.apply")

## Installing package into 'C:/Users/Saptorshi Mondal/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

## package 'future.apply' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Saptorshi Mondal\AppData\Local\Temp\Rtmpauk5xT\downloaded_packages

install.packages("caret")

## Installing package into 'C:/Users/Saptorshi Mondal/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

## package 'caret' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Saptorshi Mondal\AppData\Local\Temp\Rtmpauk5xT\downloaded_packages

library(caret)

## Loading required package: lattice

library(randomForest)

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:gridExtra':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

## The following object is masked from 'package:dplyr':
## 
##     combine

library(e1071)
library(glmnet)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loaded glmnet 4.1-8

library(xgboost)

## 
## Attaching package: 'xgboost'

## The following object is masked from 'package:plotly':
## 
##     slice

## The following object is masked from 'package:dplyr':
## 
##     slice

Importing the Dataset

setwd("C:/Users/Saptorshi Mondal/Downloads")
data <- read_csv("Amazon Sales data.csv")

## Rows: 100 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Region, Country, Item Type, Sales Channel, Order Priority, Order Da...
## dbl (7): Order ID, Units Sold, Unit Price, Unit Cost, Total Revenue, Total C...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

head(data)

## # A tibble: 6 × 14
##   Region       Country `Item Type` `Sales Channel` `Order Priority` `Order Date`
##   <chr>        <chr>   <chr>       <chr>           <chr>            <chr>       
## 1 Australia a… Tuvalu  Baby Food   Offline         H                5/28/2010   
## 2 Central Ame… Grenada Cereal      Online          C                8/22/2012   
## 3 Europe       Russia  Office Sup… Offline         L                5/2/2014    
## 4 Sub-Saharan… Sao To… Fruits      Online          C                6/20/2014   
## 5 Sub-Saharan… Rwanda  Office Sup… Offline         L                2/1/2013    
## 6 Australia a… Solomo… Baby Food   Online          C                2/4/2015    
## # ℹ 8 more variables: `Order ID` <dbl>, `Ship Date` <chr>, `Units Sold` <dbl>,
## #   `Unit Price` <dbl>, `Unit Cost` <dbl>, `Total Revenue` <dbl>,
## #   `Total Cost` <dbl>, `Total Profit` <dbl>

shape <- dim(data)
num_rows <- shape[1]
num_columns <- shape[2]
print(paste("Number of rows:", num_rows))

## [1] "Number of rows: 100"

print(paste("Number of columns:", num_columns))

## [1] "Number of columns: 14"

There are 100 rows and 14 columns.

Data Preprocessing

#null values in data

colSums(is.na(data))

##         Region        Country      Item Type  Sales Channel Order Priority 
##              0              0              0              0              0 
##     Order Date       Order ID      Ship Date     Units Sold     Unit Price 
##              0              0              0              0              0 
##      Unit Cost  Total Revenue     Total Cost   Total Profit 
##              0              0              0              0

There are no null values in the data.

#data set info
str(data)

## spc_tbl_ [100 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Region        : chr [1:100] "Australia and Oceania" "Central America and the Caribbean" "Europe" "Sub-Saharan Africa" ...
##  $ Country       : chr [1:100] "Tuvalu" "Grenada" "Russia" "Sao Tome and Principe" ...
##  $ Item Type     : chr [1:100] "Baby Food" "Cereal" "Office Supplies" "Fruits" ...
##  $ Sales Channel : chr [1:100] "Offline" "Online" "Offline" "Online" ...
##  $ Order Priority: chr [1:100] "H" "C" "L" "C" ...
##  $ Order Date    : chr [1:100] "5/28/2010" "8/22/2012" "5/2/2014" "6/20/2014" ...
##  $ Order ID      : num [1:100] 6.69e+08 9.64e+08 3.41e+08 5.14e+08 1.15e+08 ...
##  $ Ship Date     : chr [1:100] "6/27/2010" "9/15/2012" "5/8/2014" "7/5/2014" ...
##  $ Units Sold    : num [1:100] 9925 2804 1779 8102 5062 ...
##  $ Unit Price    : num [1:100] 255.28 205.7 651.21 9.33 651.21 ...
##  $ Unit Cost     : num [1:100] 159.42 117.11 524.96 6.92 524.96 ...
##  $ Total Revenue : num [1:100] 2533654 576783 1158503 75592 3296425 ...
##  $ Total Cost    : num [1:100] 1582244 328376 933904 56066 2657348 ...
##  $ Total Profit  : num [1:100] 951411 248406 224599 19526 639078 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Region = col_character(),
##   ..   Country = col_character(),
##   ..   `Item Type` = col_character(),
##   ..   `Sales Channel` = col_character(),
##   ..   `Order Priority` = col_character(),
##   ..   `Order Date` = col_character(),
##   ..   `Order ID` = col_double(),
##   ..   `Ship Date` = col_character(),
##   ..   `Units Sold` = col_double(),
##   ..   `Unit Price` = col_double(),
##   ..   `Unit Cost` = col_double(),
##   ..   `Total Revenue` = col_double(),
##   ..   `Total Cost` = col_double(),
##   ..   `Total Profit` = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

There are 7 numerical and 7 categorical columns.

Exploratory Data Analysis

#value counts 
install.packages("purrr")

## Installing package into 'C:/Users/Saptorshi Mondal/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

## package 'purrr' successfully unpacked and MD5 sums checked

## Warning: cannot remove prior installation of package 'purrr'

## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\Saptorshi
## Mondal\AppData\Local\R\win-library\4.4\00LOCK\purrr\libs\x64\purrr.dll to
## C:\Users\Saptorshi
## Mondal\AppData\Local\R\win-library\4.4\purrr\libs\x64\purrr.dll: Permission
## denied

## Warning: restored 'purrr'

## 
## The downloaded binary packages are in
##  C:\Users\Saptorshi Mondal\AppData\Local\Temp\Rtmpauk5xT\downloaded_packages

library(purrr)

## 
## Attaching package: 'purrr'

## The following object is masked from 'package:caret':
## 
##     lift

## The following objects are masked from 'package:rlang':
## 
##     %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
##     flatten_raw, invoke, splice

## The following object is masked from 'package:magrittr':
## 
##     set_names

categorical_columns <- names(data)[sapply(data, is.factor) | sapply(data, is.character)]

# Calculate unique values and percentages for each categorical column
results <- lapply(categorical_columns, function(column) {
  value_counts <- table(data[[column]])
  value_percentages <- prop.table(value_counts) * 100

  unique_values_data <- data.frame(Value = names(value_counts),
                                   Count = as.numeric(value_counts),
                                   Percentage =round(value_percentages,2))

  return(unique_values_data)
})

# Print the results for each categorical column
for (i in seq_along(results)) {
  cat("Column:", categorical_columns[i], "\n")
  print(results[[i]])
  cat("\n")
}

## Column: Region 
##                               Value Count                   Percentage.Var1
## 1                              Asia    11                              Asia
## 2             Australia and Oceania    11             Australia and Oceania
## 3 Central America and the Caribbean     7 Central America and the Caribbean
## 4                            Europe    22                            Europe
## 5      Middle East and North Africa    10      Middle East and North Africa
## 6                     North America     3                     North America
## 7                Sub-Saharan Africa    36                Sub-Saharan Africa
##   Percentage.Freq
## 1              11
## 2              11
## 3               7
## 4              22
## 5              10
## 6               3
## 7              36
## 
## Column: Country 
##                               Value Count                  Percentage.Var1
## 1                           Albania     1                          Albania
## 2                            Angola     1                           Angola
## 3                         Australia     3                        Australia
## 4                           Austria     1                          Austria
## 5                        Azerbaijan     2                       Azerbaijan
## 6                        Bangladesh     1                       Bangladesh
## 7                            Belize     1                           Belize
## 8                            Brunei     1                           Brunei
## 9                          Bulgaria     2                         Bulgaria
## 10                     Burkina Faso     1                     Burkina Faso
## 11                         Cameroon     2                         Cameroon
## 12                       Cape Verde     1                       Cape Verde
## 13                          Comoros     1                          Comoros
## 14                       Costa Rica     1                       Costa Rica
## 15                    Cote d'Ivoire     1                    Cote d'Ivoire
## 16 Democratic Republic of the Congo     1 Democratic Republic of the Congo
## 17                         Djibouti     3                         Djibouti
## 18                       East Timor     1                       East Timor
## 19   Federated States of Micronesia     1   Federated States of Micronesia
## 20                             Fiji     1                             Fiji
## 21                           France     1                           France
## 22                            Gabon     1                            Gabon
## 23                          Grenada     1                          Grenada
## 24                            Haiti     1                            Haiti
## 25                         Honduras     2                         Honduras
## 26                          Iceland     1                          Iceland
## 27                             Iran     1                             Iran
## 28                            Kenya     1                            Kenya
## 29                         Kiribati     1                         Kiribati
## 30                           Kuwait     1                           Kuwait
## 31                       Kyrgyzstan     1                       Kyrgyzstan
## 32                             Laos     1                             Laos
## 33                          Lebanon     1                          Lebanon
## 34                          Lesotho     1                          Lesotho
## 35                            Libya     2                            Libya
## 36                        Lithuania     1                        Lithuania
## 37                        Macedonia     1                        Macedonia
## 38                       Madagascar     1                       Madagascar
## 39                         Malaysia     1                         Malaysia
## 40                             Mali     2                             Mali
## 41                       Mauritania     1                       Mauritania
## 42                           Mexico     3                           Mexico
## 43                          Moldova     1                          Moldova
## 44                           Monaco     1                           Monaco
## 45                         Mongolia     1                         Mongolia
## 46                       Mozambique     1                       Mozambique
## 47                          Myanmar     2                          Myanmar
## 48                      New Zealand     1                      New Zealand
## 49                        Nicaragua     1                        Nicaragua
## 50                            Niger     1                            Niger
## 51                           Norway     2                           Norway
## 52                         Pakistan     1                         Pakistan
## 53                         Portugal     1                         Portugal
## 54            Republic of the Congo     1            Republic of the Congo
## 55                          Romania     1                          Romania
## 56                           Russia     1                           Russia
## 57                           Rwanda     2                           Rwanda
## 58                            Samoa     1                            Samoa
## 59                       San Marino     1                       San Marino
## 60            Sao Tome and Principe     3            Sao Tome and Principe
## 61                     Saudi Arabia     1                     Saudi Arabia
## 62                          Senegal     1                          Senegal
## 63                     Sierra Leone     3                     Sierra Leone
## 64                         Slovakia     1                         Slovakia
## 65                         Slovenia     1                         Slovenia
## 66                  Solomon Islands     1                  Solomon Islands
## 67                      South Sudan     1                      South Sudan
## 68                            Spain     1                            Spain
## 69                        Sri Lanka     1                        Sri Lanka
## 70                      Switzerland     2                      Switzerland
## 71                            Syria     1                            Syria
## 72                       The Gambia     4                       The Gambia
## 73                     Turkmenistan     2                     Turkmenistan
## 74                           Tuvalu     1                           Tuvalu
## 75                   United Kingdom     1                   United Kingdom
## 76                           Zambia     1                           Zambia
##    Percentage.Freq
## 1                1
## 2                1
## 3                3
## 4                1
## 5                2
## 6                1
## 7                1
## 8                1
## 9                2
## 10               1
## 11               2
## 12               1
## 13               1
## 14               1
## 15               1
## 16               1
## 17               3
## 18               1
## 19               1
## 20               1
## 21               1
## 22               1
## 23               1
## 24               1
## 25               2
## 26               1
## 27               1
## 28               1
## 29               1
## 30               1
## 31               1
## 32               1
## 33               1
## 34               1
## 35               2
## 36               1
## 37               1
## 38               1
## 39               1
## 40               2
## 41               1
## 42               3
## 43               1
## 44               1
## 45               1
## 46               1
## 47               2
## 48               1
## 49               1
## 50               1
## 51               2
## 52               1
## 53               1
## 54               1
## 55               1
## 56               1
## 57               2
## 58               1
## 59               1
## 60               3
## 61               1
## 62               1
## 63               3
## 64               1
## 65               1
## 66               1
## 67               1
## 68               1
## 69               1
## 70               2
## 71               1
## 72               4
## 73               2
## 74               1
## 75               1
## 76               1
## 
## Column: Item Type 
##              Value Count Percentage.Var1 Percentage.Freq
## 1        Baby Food     7       Baby Food               7
## 2        Beverages     8       Beverages               8
## 3           Cereal     7          Cereal               7
## 4          Clothes    13         Clothes              13
## 5        Cosmetics    13       Cosmetics              13
## 6           Fruits    10          Fruits              10
## 7        Household     9       Household               9
## 8             Meat     2            Meat               2
## 9  Office Supplies    12 Office Supplies              12
## 10   Personal Care    10   Personal Care              10
## 11          Snacks     3          Snacks               3
## 12      Vegetables     6      Vegetables               6
## 
## Column: Sales Channel 
##     Value Count Percentage.Var1 Percentage.Freq
## 1 Offline    50         Offline              50
## 2  Online    50          Online              50
## 
## Column: Order Priority 
##   Value Count Percentage.Var1 Percentage.Freq
## 1     C    22               C              22
## 2     H    30               H              30
## 3     L    27               L              27
## 4     M    21               M              21
## 
## Column: Order Date 
##          Value Count Percentage.Var1 Percentage.Freq
## 1    1/11/2012     1       1/11/2012               1
## 2    1/13/2017     1       1/13/2017               1
## 3    1/14/2017     1       1/14/2017               1
## 4    1/16/2011     1       1/16/2011               1
## 5    1/16/2015     1       1/16/2015               1
## 6     1/4/2011     1        1/4/2011               1
## 7     1/5/2012     1        1/5/2012               1
## 8   10/11/2013     1      10/11/2013               1
## 9   10/13/2013     1      10/13/2013               1
## 10  10/13/2014     1      10/13/2014               1
## 11  10/14/2014     1      10/14/2014               1
## 12  10/21/2012     1      10/21/2012               1
## 13  10/23/2016     1      10/23/2016               1
## 14  10/24/2010     1      10/24/2010               1
## 15  10/27/2015     1      10/27/2015               1
## 16  10/28/2014     1      10/28/2014               1
## 17  10/30/2010     1      10/30/2010               1
## 18   10/6/2012     1       10/6/2012               1
## 19  11/11/2011     1      11/11/2011               1
## 20  11/14/2015     1      11/14/2015               1
## 21  11/15/2016     1      11/15/2016               1
## 22  11/19/2016     1      11/19/2016               1
## 23  11/22/2011     1      11/22/2011               1
## 24  11/26/2010     1      11/26/2010               1
## 25  11/26/2011     1      11/26/2011               1
## 26   11/6/2014     1       11/6/2014               1
## 27   11/7/2011     1       11/7/2011               1
## 28  12/23/2010     1      12/23/2010               1
## 29  12/29/2013     1      12/29/2013               1
## 30  12/30/2010     1      12/30/2010               1
## 31  12/31/2016     1      12/31/2016               1
## 32   12/6/2016     1       12/6/2016               1
## 33    2/1/2013     1        2/1/2013               1
## 34   2/10/2012     1       2/10/2012               1
## 35   2/16/2012     1       2/16/2012               1
## 36   2/17/2012     1       2/17/2012               1
## 37   2/19/2014     1       2/19/2014               1
## 38    2/2/2010     1        2/2/2010               1
## 39   2/23/2015     1       2/23/2015               1
## 40   2/25/2017     1       2/25/2017               1
## 41    2/3/2014     1        2/3/2014               1
## 42    2/4/2015     1        2/4/2015               1
## 43    2/6/2010     1        2/6/2010               1
## 44    2/8/2011     1        2/8/2011               1
## 45    2/8/2017     1        2/8/2017               1
## 46   3/11/2017     1       3/11/2017               1
## 47   3/18/2012     1       3/18/2012               1
## 48   3/25/2013     1       3/25/2013               1
## 49   3/29/2016     1       3/29/2016               1
## 50    4/1/2012     1        4/1/2012               1
## 51    4/1/2015     1        4/1/2015               1
## 52   4/18/2014     1       4/18/2014               1
## 53   4/23/2011     1       4/23/2011               1
## 54   4/23/2012     1       4/23/2012               1
## 55   4/23/2013     1       4/23/2013               1
## 56   4/25/2015     1       4/25/2015               1
## 57   4/30/2012     1       4/30/2012               1
## 58    4/7/2014     1        4/7/2014               1
## 59   5/14/2014     1       5/14/2014               1
## 60    5/2/2014     1        5/2/2014               1
## 61   5/20/2017     1       5/20/2017               1
## 62   5/22/2017     1       5/22/2017               1
## 63   5/26/2011     1       5/26/2011               1
## 64   5/26/2012     1       5/26/2012               1
## 65   5/28/2010     1       5/28/2010               1
## 66   5/29/2012     1       5/29/2012               1
## 67    5/7/2010     1        5/7/2010               1
## 68    5/7/2016     1        5/7/2016               1
## 69    5/8/2017     1        5/8/2017               1
## 70    6/1/2016     1        6/1/2016               1
## 71   6/13/2012     1       6/13/2012               1
## 72   6/20/2014     1       6/20/2014               1
## 73   6/24/2011     1       6/24/2011               1
## 74   6/26/2013     1       6/26/2013               1
## 75   6/30/2010     1       6/30/2010               1
## 76   6/30/2016     1       6/30/2016               1
## 77    6/7/2012     1        6/7/2012               1
## 78    6/8/2012     1        6/8/2012               1
## 79    6/9/2013     1        6/9/2013               1
## 80   7/14/2015     1       7/14/2015               1
## 81   7/17/2012     1       7/17/2012               1
## 82   7/18/2014     1       7/18/2014               1
## 83   7/20/2013     1       7/20/2013               1
## 84   7/25/2016     1       7/25/2016               1
## 85   7/26/2011     1       7/26/2011               1
## 86   7/30/2015     1       7/30/2015               1
## 87   7/31/2012     1       7/31/2012               1
## 88   7/31/2015     1       7/31/2015               1
## 89    7/5/2013     1        7/5/2013               1
## 90    7/7/2014     1        7/7/2014               1
## 91    7/8/2012     1        7/8/2012               1
## 92   8/14/2015     1       8/14/2015               1
## 93   8/18/2013     1       8/18/2013               1
## 94    8/2/2014     1        8/2/2014               1
## 95   8/22/2012     1       8/22/2012               1
## 96   9/15/2011     1       9/15/2011               1
## 97   9/17/2012     1       9/17/2012               1
## 98   9/17/2013     1       9/17/2013               1
## 99   9/18/2012     1       9/18/2012               1
## 100   9/8/2014     1        9/8/2014               1
## 
## Column: Ship Date 
##         Value Count Percentage.Var1 Percentage.Freq
## 1   1/13/2012     1       1/13/2012               1
## 2   1/20/2011     1       1/20/2011               1
## 3   1/21/2011     1       1/21/2011               1
## 4   1/23/2017     1       1/23/2017               1
## 5   1/28/2014     1       1/28/2014               1
## 6   1/31/2011     1       1/31/2011               1
## 7    1/5/2011     1        1/5/2011               1
## 8    1/7/2012     1        1/7/2012               1
## 9  10/20/2012     1      10/20/2012               1
## 10 10/23/2011     1      10/23/2011               1
## 11 10/24/2013     1      10/24/2013               1
## 12  10/4/2014     1       10/4/2014               1
## 13  10/8/2012     1       10/8/2012               1
## 14 11/10/2012     1      11/10/2012               1
## 15 11/10/2014     1      11/10/2014               1
## 16 11/14/2014     1      11/14/2014               1
## 17 11/15/2011     1      11/15/2011               1
## 18 11/15/2014     1      11/15/2014               1
## 19 11/16/2013     1      11/16/2013               1
## 20 11/17/2010     2      11/17/2010               2
## 21 11/18/2015     1      11/18/2015               1
## 22 11/25/2013     1      11/25/2013               1
## 23 11/25/2015     1      11/25/2015               1
## 24 11/25/2016     1      11/25/2016               1
## 25 11/30/2012     1      11/30/2012               1
## 26 12/12/2014     1      12/12/2014               1
## 27 12/14/2016     1      12/14/2016               1
## 28 12/18/2016     1      12/18/2016               1
## 29 12/25/2010     1      12/25/2010               1
## 30 12/28/2011     1      12/28/2011               1
## 31  12/3/2011     1       12/3/2011               1
## 32 12/31/2016     1      12/31/2016               1
## 33  12/8/2016     1       12/8/2016               1
## 34  2/13/2017     1       2/13/2017               1
## 35  2/14/2012     1       2/14/2012               1
## 36  2/15/2012     1       2/15/2012               1
## 37  2/21/2015     1       2/21/2015               1
## 38  2/23/2014     1       2/23/2014               1
## 39  2/25/2010     1       2/25/2010               1
## 40  2/25/2017     1       2/25/2017               1
## 41  2/28/2012     1       2/28/2012               1
## 42   2/6/2013     1        2/6/2013               1
## 43   3/1/2015     1        3/1/2015               1
## 44   3/1/2017     1        3/1/2017               1
## 45  3/18/2010     1       3/18/2010               1
## 46   3/2/2015     1        3/2/2015               1
## 47  3/20/2012     1       3/20/2012               1
## 48  3/20/2014     1       3/20/2014               1
## 49  3/21/2011     1       3/21/2011               1
## 50  3/28/2013     1       3/28/2013               1
## 51  3/28/2017     1       3/28/2017               1
## 52  4/18/2015     1       4/18/2015               1
## 53  4/19/2014     1       4/19/2014               1
## 54  4/27/2011     1       4/27/2011               1
## 55  4/29/2016     1       4/29/2016               1
## 56   4/7/2012     1        4/7/2012               1
## 57  5/10/2010     1       5/10/2010               1
## 58  5/10/2016     1       5/10/2016               1
## 59  5/18/2012     1       5/18/2012               1
## 60  5/20/2013     1       5/20/2013               1
## 61  5/21/2017     1       5/21/2017               1
## 62  5/28/2015     1       5/28/2015               1
## 63  5/30/2014     1       5/30/2014               1
## 64   5/8/2012     1        5/8/2012               1
## 65   5/8/2014     1        5/8/2014               1
## 66  6/17/2017     1       6/17/2017               1
## 67   6/2/2012     1        6/2/2012               1
## 68  6/27/2010     1       6/27/2010               1
## 69  6/27/2012     1       6/27/2012               1
## 70  6/28/2014     1       6/28/2014               1
## 71  6/29/2016     1       6/29/2016               1
## 72   6/3/2012     1        6/3/2012               1
## 73   6/5/2017     1        6/5/2017               1
## 74   6/8/2012     1        6/8/2012               1
## 75   6/9/2012     1        6/9/2012               1
## 76   7/1/2013     1        7/1/2013               1
## 77  7/11/2014     1       7/11/2014               1
## 78  7/12/2011     1       7/12/2011               1
## 79  7/15/2011     1       7/15/2011               1
## 80   7/2/2013     1        7/2/2013               1
## 81  7/24/2012     1       7/24/2012               1
## 82  7/26/2016     1       7/26/2016               1
## 83  7/27/2012     1       7/27/2012               1
## 84  7/30/2014     1       7/30/2014               1
## 85   7/5/2014     1        7/5/2014               1
## 86   7/9/2012     1        7/9/2012               1
## 87   8/1/2010     1        8/1/2010               1
## 88  8/16/2013     1       8/16/2013               1
## 89  8/19/2014     1       8/19/2014               1
## 90  8/25/2015     1       8/25/2015               1
## 91   8/7/2013     1        8/7/2013               1
## 92   8/8/2015     1        8/8/2015               1
## 93  9/11/2012     1       9/11/2012               1
## 94  9/15/2012     1       9/15/2012               1
## 95  9/18/2013     1       9/18/2013               1
## 96   9/3/2011     1        9/3/2011               1
## 97   9/3/2015     1        9/3/2015               1
## 98  9/30/2015     1       9/30/2015               1
## 99   9/7/2016     1        9/7/2016               1

#summary

summary(data)

##     Region            Country           Item Type         Sales Channel     
##  Length:100         Length:100         Length:100         Length:100        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Order Priority      Order Date           Order ID          Ship Date        
##  Length:100         Length:100         Min.   :114606559   Length:100        
##  Class :character   Class :character   1st Qu.:338922488   Class :character  
##  Mode  :character   Mode  :character   Median :557708561   Mode  :character  
##                                        Mean   :555020412                     
##                                        3rd Qu.:790755081                     
##                                        Max.   :994022214                     
##    Units Sold     Unit Price       Unit Cost      Total Revenue    
##  Min.   : 124   Min.   :  9.33   Min.   :  6.92   Min.   :   4870  
##  1st Qu.:2836   1st Qu.: 81.73   1st Qu.: 35.84   1st Qu.: 268721  
##  Median :5382   Median :179.88   Median :107.28   Median : 752314  
##  Mean   :5129   Mean   :276.76   Mean   :191.05   Mean   :1373488  
##  3rd Qu.:7369   3rd Qu.:437.20   3rd Qu.:263.33   3rd Qu.:2212045  
##  Max.   :9925   Max.   :668.27   Max.   :524.96   Max.   :5997055  
##    Total Cost       Total Profit    
##  Min.   :   3612   Min.   :   1258  
##  1st Qu.: 168868   1st Qu.: 121444  
##  Median : 363566   Median : 290768  
##  Mean   : 931806   Mean   : 441682  
##  3rd Qu.:1613870   3rd Qu.: 635829  
##  Max.   :4509794   Max.   :1719922

Univariate Analysis

#bar graphs analysis
ggplot(data, aes_string(x = "`Item Type`")) +
  geom_bar() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +  
  labs(title = "Bar Graph of Item Type", x = "Item Type", y = "Count") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1)))

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggplot(data, aes_string(x = "`Sales Channel`")) +
  geom_bar() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +  
  labs(title = "Bar Graph of Sales Channel", x = "Sales Channel", y = "Count") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1)))

ggplot(data, aes_string(x = "`Order Priority`")) +
  geom_bar() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +  
  labs(title = "Bar Graph of Order Priority", x = "Order Priority", y = "Count") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1)))

Most of the orders, belong to the ‘Clothing’ and ‘Cosmetics’ catgeory. Many of the orders are of ‘high’ priority. The amount of ‘offline’ and ‘online’ orders are equal.

Bivariate Analysis

ggplot(data, aes(x = `Item Type`, y = `Total Profit` )) +
  geom_boxplot() +
  labs(x = "`Item Type`", y = "`Total Profit`", title = "Box Plot of Sales by Item Type") +
  theme_minimal()

ggplot(data, aes(x = `Order Priority`, y = `Total Profit` )) +
  geom_boxplot() +
  labs(x = "`Order Priority`", y = "`Total Profit`", title = "Box Plot of Sales by Order Priroity") +
  theme_minimal()

ggplot(data, aes(x = `Sales Channel`, y = `Total Profit` )) +
  geom_boxplot() +
  labs(x = "`Sales Channel`", y = "`Total Profit`", title = "Box Plot of Sales by Sales channel") +
  theme_minimal()

install.packages("hexbin")

## Installing package into 'C:/Users/Saptorshi Mondal/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

## package 'hexbin' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Saptorshi Mondal\AppData\Local\Temp\Rtmpauk5xT\downloaded_packages

library(hexbin)
ggplot(data, aes(x = `Units Sold`, y = `Total Profit` )) +
  geom_hex() +
  labs(x = "`Units Sold`", y = "`Total Profit`", title = "Hexbin Plot of Sales by Units Sold") +
  theme_minimal()

Cosmetics are the most profitable products. Most of the items have ‘high’ order priority. The ‘offline’ orders were highly profitable compared to ‘online’ based orders. Over 10000 single units, received the best profit.

cat_cols <- c("Region", "Country","`Item Type`","`Sales Channel`","`Order Priority`","`Order Date`","`Ship Date`")
# Function to perform label encoding
label_encode <- function(df, cat_cols) {
  for (col in cat_cols) {
    if (col %in% colnames(df)) {
      df[[col]] <- as.numeric(factor(df[[col]]))
    } else {
      stop(paste("Column", col, "not found in the dataframe"))
    }
  }
  return(df)
}
# Apply label encoding
new_df <- label_encode(data, categorical_columns)
print(head(new_df))

## # A tibble: 6 × 14
##   Region Country `Item Type` `Sales Channel` `Order Priority` `Order Date`
##    <dbl>   <dbl>       <dbl>           <dbl>            <dbl>        <dbl>
## 1      2      74           1               1                2           65
## 2      3      23           3               2                1           95
## 3      4      56           9               1                3           60
## 4      7      60           6               2                1           72
## 5      7      57           9               1                3           33
## 6      2      66           1               2                1           42
## # ℹ 8 more variables: `Order ID` <dbl>, `Ship Date` <dbl>, `Units Sold` <dbl>,
## #   `Unit Price` <dbl>, `Unit Cost` <dbl>, `Total Revenue` <dbl>,
## #   `Total Cost` <dbl>, `Total Profit` <dbl>

colnames(new_df) <- make.names(colnames(new_df))

# Function to create a boxplot for each numerical column in the dataframe
create_boxplot <- function(column_name) {
  ggplot(new_df, aes_string(x = "1", y = column_name)) +
    geom_boxplot(outlier.colour = "red", outlier.shape = 16,
                 outlier.size = 2, notch = FALSE) +
    labs(title = paste("Boxplot for", column_name), x = column_name, y = "Values")
}

# Identify numerical columns
numerical_cols <- sapply(new_df, is.numeric)

# Create boxplots for each numerical column
boxplots <- lapply(names(new_df)[numerical_cols], create_boxplot)

# Print each boxplot
for (plot in boxplots) {
  print(plot)
}

# Function to floor and cap outliers in a dataframe
floor_cap_outliers <- function(df) {
  for (col in names(df)) {
    if (is.numeric(df[[col]])) {
      Q1 <- quantile(df[[col]], 0.25, na.rm = TRUE)
      Q3 <- quantile(df[[col]], 0.75, na.rm = TRUE)
      IQR <- Q3 - Q1
      lower_bound <- Q1 - 1.5 * IQR
      upper_bound <- Q3 + 1.5 * IQR
      
      df[[col]] <- ifelse(df[[col]] < lower_bound, lower_bound, df[[col]])
      df[[col]] <- ifelse(df[[col]] > upper_bound, upper_bound, df[[col]])
    }
  }
  return(df)
}

# Floor and cap outliers in the sales dataframe
new_df <- floor_cap_outliers(new_df)

# Print the modified dataframe
print(head(new_df))

## # A tibble: 6 × 14
##   Region Country Item.Type Sales.Channel Order.Priority Order.Date  Order.ID
##    <dbl>   <dbl>     <dbl>         <dbl>          <dbl>      <dbl>     <dbl>
## 1      2      74         1             1              2         65 669165933
## 2      3      23         3             2              1         95 963881480
## 3      4      56         9             1              3         60 341417157
## 4      7      60         6             2              1         72 514321792
## 5      7      57         9             1              3         33 115456712
## 6      2      66         1             2              1         42 547995746
## # ℹ 7 more variables: Ship.Date <dbl>, Units.Sold <dbl>, Unit.Price <dbl>,
## #   Unit.Cost <dbl>, Total.Revenue <dbl>, Total.Cost <dbl>, Total.Profit <dbl>

cor_matrix <- cor(new_df[sapply(new_df, is.numeric)])
print(cor_matrix)

##                     Region       Country   Item.Type Sales.Channel
## Region          1.00000000  0.0903812850  0.08690327  -0.084148907
## Country         0.09038129  1.0000000000  0.01651612  -0.143683342
## Item.Type       0.08690327  0.0165161236  1.00000000   0.047308727
## Sales.Channel  -0.08414891 -0.1436833421  0.04730873   1.000000000
## Order.Priority  0.08815965  0.0585824524  0.23625804   0.161422314
## Order.Date      0.05814104 -0.1823098254 -0.08396653   0.041571298
## Order.ID        0.11698280  0.0833316091 -0.02238890   0.004589715
## Ship.Date       0.05929032 -0.2850124080 -0.16051768   0.116822499
## Units.Sold     -0.05838976 -0.0766098886 -0.23797564  -0.146352585
## Unit.Price     -0.05433658  0.0694213235  0.20658133  -0.144870564
## Unit.Cost      -0.04161618  0.0812824523  0.26925347  -0.137639339
## Total.Revenue  -0.13013230  0.0230168493  0.05625127  -0.137065488
## Total.Cost     -0.12068836  0.0275433120  0.12993992  -0.131283116
## Total.Profit   -0.13483535 -0.0007261285 -0.13055185  -0.135148715
##                Order.Priority  Order.Date     Order.ID   Ship.Date  Units.Sold
## Region             0.08815965  0.05814104  0.116982801  0.05929032 -0.05838976
## Country            0.05858245 -0.18230983  0.083331609 -0.28501241 -0.07660989
## Item.Type          0.23625804 -0.08396653 -0.022388900 -0.16051768 -0.23797564
## Sales.Channel      0.16142231  0.04157130  0.004589715  0.11682250 -0.14635258
## Order.Priority     1.00000000  0.05542770 -0.081379601  0.07642075 -0.07328794
## Order.Date         0.05542770  1.00000000  0.211468012  0.70259981  0.04349417
## Order.ID          -0.08137960  0.21146801  1.000000000  0.27977777 -0.22290682
## Ship.Date          0.07642075  0.70259981  0.279777770  1.00000000  0.02094666
## Units.Sold        -0.07328794  0.04349417 -0.222906818  0.02094666  1.00000000
## Unit.Price         0.17922767 -0.31834658 -0.190941212 -0.21516034 -0.07048559
## Unit.Cost          0.19492283 -0.32472142 -0.213200577 -0.22634324 -0.09223245
## Total.Revenue      0.13589890 -0.19854240 -0.316786994 -0.15209563  0.44782719
## Total.Cost         0.16191450 -0.22124275 -0.330534062 -0.16583751  0.37282111
## Total.Profit       0.06284622 -0.13726709 -0.235266808 -0.11337649  0.56210781
##                 Unit.Price   Unit.Cost Total.Revenue  Total.Cost  Total.Profit
## Region         -0.05433658 -0.04161618   -0.13013230 -0.12068836 -0.1348353541
## Country         0.06942132  0.08128245    0.02301685  0.02754331 -0.0007261285
## Item.Type       0.20658133  0.26925347    0.05625127  0.12993992 -0.1305518463
## Sales.Channel  -0.14487056 -0.13763934   -0.13706549 -0.13128312 -0.1351487154
## Order.Priority  0.17922767  0.19492283    0.13589890  0.16191450  0.0628462153
## Order.Date     -0.31834658 -0.32472142   -0.19854240 -0.22124275 -0.1372670930
## Order.ID       -0.19094121 -0.21320058   -0.31678699 -0.33053406 -0.2352668083
## Ship.Date      -0.21516034 -0.22634324   -0.15209563 -0.16583751 -0.1133764856
## Units.Sold     -0.07048559 -0.09223245    0.44782719  0.37282111  0.5621078132
## Unit.Price      1.00000000  0.98726981    0.75765257  0.79944011  0.5714697976
## Unit.Cost       0.98726981  1.00000000    0.71948636  0.78484573  0.4822356853
## Total.Revenue   0.75765257  0.71948636    1.00000000  0.98378547  0.9073819089
## Total.Cost      0.79944011  0.78484573    0.98378547  1.00000000  0.8199024016
## Total.Profit    0.57146980  0.48223569    0.90738191  0.81990240  1.0000000000

library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

melted_cor_matrix <- melt(cor_matrix)

# Plot the heatmap
ggplot(data = melted_cor_matrix, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1, 1), space = "Lab", 
                       name="Pearson\nCorrelation") +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                   size = 12, hjust = 1)) +
  coord_fixed() +
  ggtitle("Heatmap of Correlation Matrix")

The heatmap represents the correlation matrix visually.The ‘red’ represents positive correlation while ‘blue’ represents negative correlation. It is seen that the ‘Total Revenue’,‘Total cost’, ‘Units Sold’,‘Unit cost’ are all positively correlated with ‘Total Profit’ while the ‘Item type’, ‘Sales Channel’ and ‘Region’ are negatively correlated.

Model Building

Supervised Learning

# Set seed for reproducibility
set.seed(123)
train_index <- createDataPartition(new_df$Total.Profit, p = 0.7, list = FALSE)
train_data <- new_df[train_index, ]
test_data <- new_df[-train_index, ]
x_train <- as.matrix(train_data[ , !colnames(train_data) %in% "Total.Profit"])
y_train <- train_data$Total.Profit
x_test <- as.matrix(test_data[ , !colnames(test_data) %in% "Total.Profit"])
y_test <- test_data$Total.Profit

# Fit the linear model(base model)
linear_model <- lm(Total.Profit ~ ., data = train_data)

# View the model summary
summary(linear_model)

## 
## Call:
## lm(formula = Total.Profit ~ ., data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -199424  -11943    4223   13508  108036 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -3.147e+04  4.160e+04  -0.756  0.45242    
## Region          8.758e+02  2.525e+03   0.347  0.72992    
## Country        -1.331e+02  2.595e+02  -0.513  0.60985    
## Item.Type      -1.102e+03  2.074e+03  -0.531  0.59721    
## Sales.Channel  -7.736e+03  1.131e+04  -0.684  0.49660    
## Order.Priority  8.602e+03  5.774e+03   1.490  0.14174    
## Order.Date     -1.563e+02  2.955e+02  -0.529  0.59879    
## Order.ID        1.027e-05  2.607e-05   0.394  0.69518    
## Ship.Date      -8.004e+01  3.025e+02  -0.265  0.79227    
## Units.Sold      7.572e+00  3.681e+00   2.057  0.04416 *  
## Unit.Price      1.090e+03  3.319e+02   3.285  0.00173 ** 
## Unit.Cost      -1.313e+03  4.124e+02  -3.183  0.00234 ** 
## Total.Revenue   7.639e-01  5.405e-02  14.132  < 2e-16 ***
## Total.Cost     -7.190e-01  7.105e-02 -10.120 1.99e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 43560 on 58 degrees of freedom
## Multiple R-squared:  0.991,  Adjusted R-squared:  0.9889 
## F-statistic: 489.1 on 13 and 58 DF,  p-value: < 2.2e-16

linear_predictions <- predict(linear_model, test_data)
test_mse <- mean((test_data$Total.Profit - linear_predictions)^2)
train_mse <- mean((train_data$Total.Profit - linear_predictions)^2)

## Warning in train_data$Total.Profit - linear_predictions: longer object length
## is not a multiple of shorter object length

print(paste("Test MSE:", test_mse))

## [1] "Test MSE: 2155336761.48605"

print(paste("Train MSE:", train_mse))

## [1] "Train MSE: 346528612916.511"

if (train_mse > test_mse) {
  print("The model is potentially overfitting.")
} else if (train_mse < test_mse) {
  print("The model is potentially underfitting.")
} else {
  print("The model is performing similarly on both training and validation sets.")
}

## [1] "The model is potentially overfitting."

#Ridge regression
install.packages("glmnet")

## Warning: package 'glmnet' is in use and will not be installed

library(glmnet)
install.packages("Metrics")

## Installing package into 'C:/Users/Saptorshi Mondal/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

## package 'Metrics' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Saptorshi Mondal\AppData\Local\Temp\Rtmpauk5xT\downloaded_packages

library(Metrics)

## 
## Attaching package: 'Metrics'

## The following objects are masked from 'package:caret':
## 
##     precision, recall

## The following object is masked from 'package:rlang':
## 
##     ll

ridge_model <- glmnet(x_train, y_train, alpha = 0)
train_predictions <- predict(ridge_model, newx = x_train)
test_predictions <- predict(ridge_model, newx = x_test)
train_mse <- mse(y_train, train_predictions)
test_mse <- mse(y_test, test_predictions)
print(paste("Train MSE:", train_mse))

## [1] "Train MSE: 102097581433.859"

print(paste("Test MSE:", test_mse))

## [1] "Test MSE: 108512464344.428"

if (train_mse > test_mse) {
  print("The model is potentially overfitting.")
} else if (train_mse < test_mse) {
  print("The model is potentially underfitting.")
} else {
  print("The model is performing similarly on both training and validation sets.")
}

## [1] "The model is potentially underfitting."

#Lasso Model
lasso_model <- glmnet(x_train, y_train, alpha = 1)
train_predictions <- predict(lasso_model, newx = x_train)
test_predictions <- predict(lasso_model, newx = x_test)
train_mse <- mse(y_train, train_predictions)
test_mse <- mse(y_test, test_predictions)
print(paste("Train MSE:", train_mse))

## [1] "Train MSE: 24035100686.7128"

print(paste("Test MSE:", test_mse))

## [1] "Test MSE: 25535951816.1696"

if (train_mse > test_mse) {
  print("The model is potentially overfitting.")
} else if (train_mse < test_mse) {
  print("The model is potentially underfitting.")
} else {
  print("The model is performing similarly on both training and validation sets.")
}

## [1] "The model is potentially underfitting."

# Random Forest Regression
forest_model <- randomForest(Total.Profit ~ ., data = train_data)
train_predictions <- predict(forest_model,x_train)
test_predictions <- predict(forest_model,x_test)
train_mse <- mse(y_train, train_predictions)
test_mse <- mse(y_test, test_predictions)
print(paste("Train MSE:", train_mse))

## [1] "Train MSE: 3403475849.00583"

print(paste("Test MSE:", test_mse))

## [1] "Test MSE: 10984416925.4822"

if (train_mse > test_mse) {
  print("The model is potentially overfitting.")
} else if (train_mse < test_mse) {
  print("The model is potentially underfitting.")
} else {
  print("The model is performing similarly on both training and validation sets.")
}

## [1] "The model is potentially underfitting."

A good fit model is achieved with ‘Ridge Regression’ algorithm where, the train and test Mean Square error values are almost similar as compared to other models.

Feature Selection

install.packages("e1071")

## Warning: package 'e1071' is in use and will not be installed

library(e1071)
predictors <- new_df[, !colnames(new_df) %in% "Total.Profit"]
target <- new_df$Total.Profit
set.seed(123)
control <- rfeControl(functions = rfFuncs, method = "cv", number = 10)
results <- rfe(predictors, target, sizes = c(1:5, 10, 15, 20), rfeControl = control)
print(results)

## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables   RMSE Rsquared    MAE RMSESD RsquaredSD MAESD Selected
##          1 197763   0.8186 137436  46498    0.09748 35580         
##          2 122459   0.9270  92817  35254    0.05355 30199         
##          3 123633   0.9232  94676  31674    0.05035 25102         
##          4 104786   0.9424  78959  32783    0.04581 25070         
##          5  96373   0.9497  73756  32284    0.03960 22424        *
##         10 104684   0.9461  81357  34658    0.04626 25873         
##         13 113343   0.9380  88230  30381    0.04909 22861         
## 
## The top 5 variables (out of 5):
##    Total.Revenue, Units.Sold, Total.Cost, Unit.Price, Item.Type

The optimum features are found in Recursive Factor elimination method. They are: Total.Revenue, Units.Sold, Total.Cost, Unit.Price, Item.Type respectively.

Unsupervised Learning

library(cluster)
install.packages("factoextra")

## Installing package into 'C:/Users/Saptorshi Mondal/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

## package 'factoextra' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Saptorshi Mondal\AppData\Local\Temp\Rtmpauk5xT\downloaded_packages

library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

selected_features <- new_df [, c("Total.Revenue", "Units.Sold", "Total.Cost", "Unit.Price", "Item.Type")]
scaled_features <- scale(selected_features)
elbow_plot <- fviz_nbclust(scaled_features, kmeans, method = "wss")
print(elbow_plot)

Using the Elbow Plot, it is determined that, the no. of clusters should be 2.

k <- 2  # Number of clusters (adjust as needed)
kmeans_model <- kmeans(scaled_features, centers = k, nstart = 20)  # Run k-means clustering
cluster_labels <- kmeans_model$cluster 
silhouette_score <- silhouette(cluster_labels, dist(scaled_features))
print(silhouette_score)

##        cluster neighbor   sil_width
##   [1,]       2        1 0.103925232
##   [2,]       1        2 0.483373125
##   [3,]       2        1 0.069683338
##   [4,]       1        2 0.502509631
##   [5,]       2        1 0.571266618
##   [6,]       1        2 0.389183429
##   [7,]       2        1 0.529256390
##   [8,]       1        2 0.203194168
##   [9,]       1        2 0.449395967
##  [10,]       1        2 0.321971591
##  [11,]       1        2 0.369532726
##  [12,]       1        2 0.557198973
##  [13,]       1        2 0.434119488
##  [14,]       2        1 0.523466463
##  [15,]       1        2 0.473168091
##  [16,]       1        2 0.527069922
##  [17,]       2        1 0.462838514
##  [18,]       1        2 0.523942586
##  [19,]       2        1 0.485061761
##  [20,]       2        1 0.440068915
##  [21,]       1        2 0.073572234
##  [22,]       1        2 0.406225198
##  [23,]       1        2 0.415756417
##  [24,]       1        2 0.550133818
##  [25,]       1        2 0.470886200
##  [26,]       1        2 0.285923603
##  [27,]       1        2 0.572820784
##  [28,]       1        2 0.567595894
##  [29,]       1        2 0.527166231
##  [30,]       2        1 0.201291461
##  [31,]       2        1 0.512690805
##  [32,]       1        2 0.470497342
##  [33,]       2        1 0.323956692
##  [34,]       2        1 0.533475534
##  [35,]       1        2 0.267744706
##  [36,]       1        2 0.438855110
##  [37,]       1        2 0.573844295
##  [38,]       2        1 0.291975214
##  [39,]       2        1 0.556712459
##  [40,]       2        1 0.491881642
##  [41,]       1        2 0.478808963
##  [42,]       2        1 0.481701671
##  [43,]       1        2 0.483334760
##  [44,]       1        2 0.370174847
##  [45,]       1        2 0.533291660
##  [46,]       1        2 0.468370232
##  [47,]       2        1 0.511740749
##  [48,]       1        2 0.435186453
##  [49,]       1        2 0.457567801
##  [50,]       1        2 0.007535791
##  [51,]       1        2 0.540002259
##  [52,]       1        2 0.436417184
##  [53,]       1        2 0.461516016
##  [54,]       2        1 0.419582804
##  [55,]       1        2 0.519912490
##  [56,]       1        2 0.555617230
##  [57,]       1        2 0.332879646
##  [58,]       1        2 0.175304381
##  [59,]       1        2 0.165126821
##  [60,]       2        1 0.480697705
##  [61,]       1        2 0.462350990
##  [62,]       1        2 0.312339132
##  [63,]       2        1 0.577555547
##  [64,]       1        2 0.527922343
##  [65,]       1        2 0.293653275
##  [66,]       2        1 0.101754677
##  [67,]       1        2 0.339348057
##  [68,]       1        2 0.543120594
##  [69,]       2        1 0.520222311
##  [70,]       1        2 0.481245506
##  [71,]       2        1 0.569435799
##  [72,]       1        2 0.509925063
##  [73,]       1        2 0.519045839
##  [74,]       1        2 0.135626810
##  [75,]       2        1 0.491814086
##  [76,]       2        1 0.566387002
##  [77,]       1        2 0.397653214
##  [78,]       1        2 0.369883546
##  [79,]       2        1 0.005165212
##  [80,]       2        1 0.498378649
##  [81,]       2        1 0.558051417
##  [82,]       1        2 0.455383923
##  [83,]       2        1 0.420265461
##  [84,]       1        2 0.399025970
##  [85,]       1        2 0.285815223
##  [86,]       1        2 0.438397334
##  [87,]       1        2 0.422320166
##  [88,]       1        2 0.254500564
##  [89,]       1        2 0.505067991
##  [90,]       1        2 0.529360246
##  [91,]       1        2 0.068152545
##  [92,]       1        2 0.397174018
##  [93,]       2        1 0.127262790
##  [94,]       2        1 0.504625118
##  [95,]       1        2 0.449547485
##  [96,]       1        2 0.505169835
##  [97,]       1        2 0.559227962
##  [98,]       1        2 0.381593236
##  [99,]       1        2 0.457266891
## [100,]       2        1 0.593483399
## attr(,"Ordered")
## [1] FALSE
## attr(,"call")
## silhouette.default(x = cluster_labels, dist = dist(scaled_features))
## attr(,"class")
## [1] "silhouette"

clustered_data <- cbind(new_df, Cluster = cluster_labels)  # Add cluster labels to the original data
ggplot(clustered_data, aes(x = Total.Profit, y = Units.Sold, color = factor(Cluster))) +
  geom_point() +
  labs(title = "K-Means Clustering of Sales Data", x = "Total Profit", y = "Units Sold") +
  theme_minimal()

We are plotting ‘Units sold’ vs ‘Total Profit’, where the approximate no. of clusters equals to 2, Cluster 1 represents products with high units sold but low profit. Cluster 2 represents products with low units sold but high profit.

Amazon_Sales_Data_analysis

Saptorshi Mondal

2024-06-21