library(dplyr)
library(ggplot2) # visualisasi
library(ggpubr)
library(scales) # untuk tampilan digit (memberikan koma dll)
library(glue)
library(plotly) 
library(lubridate) # working with datetime
options(scipen = 100) # supaya output tidak menampilkan notasi ilmiah (10-e10)

Load Data

Data yang akan digunakan adalah data Superstore Sales Dataset source : kaggle.com

sales <- read.csv("data_input/Superstore_Sales.csv", stringsAsFactors = TRUE)

Deskripsi kolom:

  • Order ID : Nomor unik yang mengidentifikasi setiap pesanan
  • Order Date : Tanggal ketika pesanan dibuat
  • Ship Date : Tanggal ketika pesanan dikirim
  • Ship Mode : Metode pengiriman yang digunakan untuk mengirimkan pesanan
  • Customer ID : Nomor unik yang mengidentifikasi setiap pelanggan
  • Customer Name : Nama pelanggan yang melakukan pesanan
  • Segment : Kategori atau segmen pelanggan (misalnya, konsumen, korporat)
  • Country : Negara tempat pelanggan berada
  • City : Kota tempat pelanggan berada
  • State : Provinsi atau negara bagian tempat pelanggan berada
  • Postal Code : Kode pos untuk alamat pelanggan
  • Region : Wilayah geografis yang mencakup negara, kota, atau negara bagian
  • Product ID : Nomor unik yang mengidentifikasi setiap produk
  • Category : Kategori produk
  • Sub Category : Subkategori dari produk yang lebih spesifik
  • Product Name : Nama produk yang dijual
  • Sales : Total nilai penjualan dari produk yang dipesan

Data Inspection

head(sales)
tail(sales)
dim(sales)
#> [1] 9800   18
names(sales)
#>  [1] "Row.ID"        "Order.ID"      "Order.Date"    "Ship.Date"    
#>  [5] "Ship.Mode"     "Customer.ID"   "Customer.Name" "Segment"      
#>  [9] "Country"       "City"          "State"         "Postal.Code"  
#> [13] "Region"        "Product.ID"    "Category"      "Sub.Category" 
#> [17] "Product.Name"  "Sales"

Data Cleansing & Coertions

str(sales)
#> 'data.frame':    9800 obs. of  18 variables:
#>  $ Row.ID       : int  1 2 3 4 5 6 7 8 9 10 ...
#>  $ Order.ID     : Factor w/ 4922 levels "CA-2015-100006",..: 2457 2457 2255 4296 4296 195 195 195 195 195 ...
#>  $ Order.Date   : Factor w/ 1230 levels "01/01/2018","01/02/2015",..: 315 315 457 433 433 338 338 338 338 338 ...
#>  $ Ship.Date    : Factor w/ 1326 levels "01/01/2016","01/01/2017",..: 477 477 674 775 775 585 585 585 585 585 ...
#>  $ Ship.Mode    : Factor w/ 4 levels "First Class",..: 3 3 3 4 4 4 4 4 4 4 ...
#>  $ Customer.ID  : Factor w/ 793 levels "AA-10315","AA-10375",..: 144 144 240 706 706 89 89 89 89 89 ...
#>  $ Customer.Name: Factor w/ 793 levels "Aaron Bergman",..: 167 167 202 688 688 114 114 114 114 114 ...
#>  $ Segment      : Factor w/ 3 levels "Consumer","Corporate",..: 1 1 2 1 1 1 1 1 1 1 ...
#>  $ Country      : Factor w/ 1 level "United States": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ City         : Factor w/ 529 levels "Aberdeen","Abilene",..: 195 195 266 154 154 266 266 266 266 266 ...
#>  $ State        : Factor w/ 49 levels "Alabama","Arizona",..: 16 16 4 9 9 4 4 4 4 4 ...
#>  $ Postal.Code  : int  42420 42420 90036 33311 33311 90032 90032 90032 90032 90032 ...
#>  $ Region       : Factor w/ 4 levels "Central","East",..: 3 3 4 3 3 4 4 4 4 4 ...
#>  $ Product.ID   : Factor w/ 1861 levels "FUR-BO-10000112",..: 13 56 947 320 1317 186 563 1761 795 438 ...
#>  $ Category     : Factor w/ 3 levels "Furniture","Office Supplies",..: 1 1 2 1 2 1 2 3 2 2 ...
#>  $ Sub.Category : Factor w/ 17 levels "Accessories",..: 5 6 11 17 15 10 3 14 4 2 ...
#>  $ Product.Name : Factor w/ 1849 levels "\"While you Were Out\" Message Book, One Form per Page",..: 387 832 1439 368 574 570 1136 1098 535 295 ...
#>  $ Sales        : num  262 731.9 14.6 957.6 22.4 ...
sales_clean <- sales %>%
  
  # Menghapus kolom Row.ID
  select(-c(Row.ID)) %>% 
  
  # Data Coertion
  mutate(
    
    Order.Date = as.Date(as.character(Order.Date),format = "%d/%m/%Y"),
    Ship.Date = as.Date(as.character(Ship.Date),format = "%d/%m/%Y"),
    Ship.Day = as.numeric(difftime(Ship.Date, Order.Date, units = "days"))
  )

str(sales_clean)
#> 'data.frame':    9800 obs. of  18 variables:
#>  $ Order.ID     : Factor w/ 4922 levels "CA-2015-100006",..: 2457 2457 2255 4296 4296 195 195 195 195 195 ...
#>  $ Order.Date   : Date, format: "2017-11-08" "2017-11-08" ...
#>  $ Ship.Date    : Date, format: "2017-11-11" "2017-11-11" ...
#>  $ Ship.Mode    : Factor w/ 4 levels "First Class",..: 3 3 3 4 4 4 4 4 4 4 ...
#>  $ Customer.ID  : Factor w/ 793 levels "AA-10315","AA-10375",..: 144 144 240 706 706 89 89 89 89 89 ...
#>  $ Customer.Name: Factor w/ 793 levels "Aaron Bergman",..: 167 167 202 688 688 114 114 114 114 114 ...
#>  $ Segment      : Factor w/ 3 levels "Consumer","Corporate",..: 1 1 2 1 1 1 1 1 1 1 ...
#>  $ Country      : Factor w/ 1 level "United States": 1 1 1 1 1 1 1 1 1 1 ...
#>  $ City         : Factor w/ 529 levels "Aberdeen","Abilene",..: 195 195 266 154 154 266 266 266 266 266 ...
#>  $ State        : Factor w/ 49 levels "Alabama","Arizona",..: 16 16 4 9 9 4 4 4 4 4 ...
#>  $ Postal.Code  : int  42420 42420 90036 33311 33311 90032 90032 90032 90032 90032 ...
#>  $ Region       : Factor w/ 4 levels "Central","East",..: 3 3 4 3 3 4 4 4 4 4 ...
#>  $ Product.ID   : Factor w/ 1861 levels "FUR-BO-10000112",..: 13 56 947 320 1317 186 563 1761 795 438 ...
#>  $ Category     : Factor w/ 3 levels "Furniture","Office Supplies",..: 1 1 2 1 2 1 2 3 2 2 ...
#>  $ Sub.Category : Factor w/ 17 levels "Accessories",..: 5 6 11 17 15 10 3 14 4 2 ...
#>  $ Product.Name : Factor w/ 1849 levels "\"While you Were Out\" Message Book, One Form per Page",..: 387 832 1439 368 574 570 1136 1098 535 295 ...
#>  $ Sales        : num  262 731.9 14.6 957.6 22.4 ...
#>  $ Ship.Day     : num  3 3 4 7 7 5 5 5 5 5 ...

Check missing value:

sales_clean %>% is.na() %>% colSums()
#>      Order.ID    Order.Date     Ship.Date     Ship.Mode   Customer.ID 
#>             0             0             0             0             0 
#> Customer.Name       Segment       Country          City         State 
#>             0             0             0             0             0 
#>   Postal.Code        Region    Product.ID      Category  Sub.Category 
#>            11             0             0             0             0 
#>  Product.Name         Sales      Ship.Day 
#>             0             0             0
anyNA(sales_clean)
#> [1] TRUE

column Postal.Code has 11 missing values but we don’t use the column for this analysis so we can just ignore it

Data Explanation

summary(sales_clean)
#>            Order.ID      Order.Date           Ship.Date         
#>  CA-2018-100111:  14   Min.   :2015-01-03   Min.   :2015-01-07  
#>  CA-2018-157987:  12   1st Qu.:2016-05-24   1st Qu.:2016-05-27  
#>  CA-2017-165330:  11   Median :2017-06-26   Median :2017-06-29  
#>  US-2017-108504:  11   Mean   :2017-05-01   Mean   :2017-05-05  
#>  CA-2016-131338:  10   3rd Qu.:2018-05-15   3rd Qu.:2018-05-19  
#>  CA-2017-105732:  10   Max.   :2018-12-30   Max.   :2019-01-05  
#>  (Other)       :9732                                            
#>           Ship.Mode      Customer.ID               Customer.Name 
#>  First Class   :1501   WB-21850:  35   William Brown      :  35  
#>  Same Day      : 538   MA-17560:  34   Matt Abelman       :  34  
#>  Second Class  :1902   PP-18955:  34   Paul Prost         :  34  
#>  Standard Class:5859   JL-15835:  33   John Lee           :  33  
#>                        CK-12205:  32   Chloris Kastensmidt:  32  
#>                        JD-15895:  32   Jonathan Doherty   :  32  
#>                        (Other) :9600   (Other)            :9600  
#>         Segment              Country                City     
#>  Consumer   :5101   United States:9800   New York City: 891  
#>  Corporate  :2953                        Los Angeles  : 728  
#>  Home Office:1746                        Philadelphia : 532  
#>                                          San Francisco: 500  
#>                                          Seattle      : 426  
#>                                          Houston      : 374  
#>                                          (Other)      :6349  
#>           State       Postal.Code        Region               Product.ID  
#>  California  :1946   Min.   : 1040   Central:2277   OFF-PA-10001970:  19  
#>  New York    :1097   1st Qu.:23223   East   :2785   TEC-AC-10003832:  18  
#>  Texas       : 973   Median :58103   South  :1598   FUR-FU-10004270:  16  
#>  Pennsylvania: 582   Mean   :55273   West   :3140   FUR-CH-10002647:  15  
#>  Washington  : 504   3rd Qu.:90008                  TEC-AC-10002049:  15  
#>  Illinois    : 483   Max.   :99301                  TEC-AC-10003628:  15  
#>  (Other)     :4215   NA's   :11                     (Other)        :9702  
#>             Category         Sub.Category                   Product.Name 
#>  Furniture      :2078   Binders    :1492   Staple envelope        :  47  
#>  Office Supplies:5909   Paper      :1338   Staples                :  46  
#>  Technology     :1813   Furnishings: 931   Easy-staple paper      :  44  
#>                         Phones     : 876   Avery Non-Stick Binders:  20  
#>                         Storage    : 832   Staple remover         :  18  
#>                         Art        : 785   Staples in misc. colors:  18  
#>                         (Other)    :3546   (Other)                :9607  
#>      Sales              Ship.Day    
#>  Min.   :    0.444   Min.   :0.000  
#>  1st Qu.:   17.248   1st Qu.:3.000  
#>  Median :   54.490   Median :4.000  
#>  Mean   :  230.769   Mean   :3.961  
#>  3rd Qu.:  210.605   3rd Qu.:5.000  
#>  Max.   :22638.480   Max.   :7.000  
#> 

Summary : 1. First order occured in Jan 2015 2. Standart Class was the most popular one for shipping option and Same day delivery was the lowest one 3. The most buyers coming from cunsumer segment and home office segment was the lowest 4. Office Supply category was the best seller compare to others 5. In Sub category, Other stuff was the most favourite one then followed by Binders and paper. Art is the lowest 6. Company gained the average sales at 230.769; with max sales value at 22638.480 and minumum sales value at 0.444

Check the Outlier within profit

aggregate(Sales~Segment,sales_clean,mean)
aggregate(Sales~Segment,sales_clean,var)
aggregate(Sales~Segment,sales_clean,sd)
boxplot(sales_clean$Sales)

Data Manipulation & Transformation

  1. Which category gives the lowest sales number? how much profit?
sales_clean[sales_clean$Sales == 0.444,]

Answer : Sales 0.444 comes from segment ‘consumer’, office supplies in Houston, United States

  1. Which segment buying the most office supplies?
os <- sales_clean[sales_clean$Category=="Office Supplies",]
round(prop.table(table(os$Segment))*100,2)
#> 
#>    Consumer   Corporate Home Office 
#>       51.99       30.17       17.84

Answer : consumer segment –> 51.99%

  1. Which Category buyers buying the highest Sales and when it happened?
sales_clean[sales_clean$Sales == 22638.480,]

Answer : Highest Sales was on March 2015, from Technology Category

  1. How much total of sales from each segment and category, and which is the highest?
xtabs(Sales~Segment+Category,sales_clean)
#>              Category
#> Segment       Furniture Office Supplies Technology
#>   Consumer     387696.3        359352.6   401011.7
#>   Corporate    220321.7        224130.5   244041.8
#>   Home Office  120640.6        121939.2   182402.4
plot(xtabs(Sales~Segment+Category,sales_clean))

Answer : Based on result above: segmen consumer with category Technology make the highest Sales with 401011.7

  1. What is the correlation between Ship Mode and Ship Day?
# Plot using the columns from the data frame
plot(sales_clean$Ship.Mode, sales_clean$Ship.Day,
     main = "Scatter Plot of Ship Mode vs Ship Day",
     xlab = "Ship Mode",
     ylab = "Ship Day",
     pch = 19, col = "brown")

Answer : Based on result above: Ship mode aligns with the time required for the item to be delivered