For more information just find me out in instagram @gaostipkialpandi :D
This is a dataset that contains sales records and other informations from a retail company. This company sells some kind of stuff which is categorized into Technology, Furniture, and Office Supplies stuff. This data is gathered since 2014 to 2018 with almost 10.000 transactions.
Make sure you have placed the data in the same folder with your Rmarkdown file.
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.1.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
retail <- read.csv("retail.csv")
#Checking the rows and columns
dim(retail)
## [1] 9994 15
->Based on the result of dim(retail), we can see that Retail consists of 9994 rows and 15 columns.
-Next we have to check out the type of data in every columns.
#Checking the type of data
str(retail)
## 'data.frame': 9994 obs. of 15 variables:
## $ Row.ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Order.ID : chr "CA-2016-152156" "CA-2016-152156" "CA-2016-138688" "US-2015-108966" ...
## $ Order.Date : chr "11/8/16" "11/8/16" "6/12/16" "10/11/15" ...
## $ Ship.Date : chr "11/11/16" "11/11/16" "6/16/16" "10/18/15" ...
## $ Ship.Mode : chr "Second Class" "Second Class" "Second Class" "Standard Class" ...
## $ Customer.ID : chr "CG-12520" "CG-12520" "DV-13045" "SO-20335" ...
## $ Segment : chr "Consumer" "Consumer" "Corporate" "Consumer" ...
## $ Product.ID : chr "FUR-BO-10001798" "FUR-CH-10000454" "OFF-LA-10000240" "FUR-TA-10000577" ...
## $ Category : chr "Furniture" "Furniture" "Office Supplies" "Furniture" ...
## $ Sub.Category: chr "Bookcases" "Chairs" "Labels" "Tables" ...
## $ Product.Name: chr "Bush Somerset Collection Bookcase" "Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back" "Self-Adhesive Address Labels for Typewriters by Universal" "Bretford CR4500 Series Slim Rectangular Table" ...
## $ Sales : num 262 731.9 14.6 957.6 22.4 ...
## $ Quantity : int 2 3 2 5 2 7 4 6 3 5 ...
## $ Discount : num 0 0 0 0.45 0.2 0 0 0.2 0.2 0 ...
## $ Profit : num 41.91 219.58 6.87 -383.03 2.52 ...
-> We can see that there are several columns which are categorized in inappropriate type of data such as Row.ID, Order.ID, Order.Date, and so on.
#Reformatting the type of data
retail$Row.ID <- as.character(retail$Row.ID)
retail$Segment <- as.factor(retail$Segment)
retail$Ship.Mode <- as.factor(retail$Ship.Mode)
retail$Category <- as.factor(retail$Category)
retail$Sub.Category <- as.factor(retail$Sub.Category)
str(retail)
## 'data.frame': 9994 obs. of 15 variables:
## $ Row.ID : chr "1" "2" "3" "4" ...
## $ Order.ID : chr "CA-2016-152156" "CA-2016-152156" "CA-2016-138688" "US-2015-108966" ...
## $ Order.Date : chr "11/8/16" "11/8/16" "6/12/16" "10/11/15" ...
## $ Ship.Date : chr "11/11/16" "11/11/16" "6/16/16" "10/18/15" ...
## $ Ship.Mode : Factor w/ 4 levels "First Class",..: 3 3 3 4 4 4 4 4 4 4 ...
## $ Customer.ID : chr "CG-12520" "CG-12520" "DV-13045" "SO-20335" ...
## $ Segment : Factor w/ 3 levels "Consumer","Corporate",..: 1 1 2 1 1 1 1 1 1 1 ...
## $ Product.ID : chr "FUR-BO-10001798" "FUR-CH-10000454" "OFF-LA-10000240" "FUR-TA-10000577" ...
## $ Category : Factor w/ 3 levels "Furniture","Office Supplies",..: 1 1 2 1 2 1 2 3 2 2 ...
## $ Sub.Category: Factor w/ 17 levels "Accessories",..: 5 6 11 17 15 10 3 14 4 2 ...
## $ Product.Name: chr "Bush Somerset Collection Bookcase" "Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back" "Self-Adhesive Address Labels for Typewriters by Universal" "Bretford CR4500 Series Slim Rectangular Table" ...
## $ Sales : num 262 731.9 14.6 957.6 22.4 ...
## $ Quantity : int 2 3 2 5 2 7 4 6 3 5 ...
## $ Discount : num 0 0 0 0.45 0.2 0 0 0.2 0.2 0 ...
## $ Profit : num 41.91 219.58 6.87 -383.03 2.52 ...
#Using library Lubridate to change the type of data to be date.time
retail$Ship.Date <- mdy(retail$Ship.Date)
retail$Order.Date <- mdy(retail$Order.Date)
str(retail)
## 'data.frame': 9994 obs. of 15 variables:
## $ Row.ID : chr "1" "2" "3" "4" ...
## $ Order.ID : chr "CA-2016-152156" "CA-2016-152156" "CA-2016-138688" "US-2015-108966" ...
## $ Order.Date : Date, format: "2016-11-08" "2016-11-08" ...
## $ Ship.Date : Date, format: "2016-11-11" "2016-11-11" ...
## $ Ship.Mode : Factor w/ 4 levels "First Class",..: 3 3 3 4 4 4 4 4 4 4 ...
## $ Customer.ID : chr "CG-12520" "CG-12520" "DV-13045" "SO-20335" ...
## $ Segment : Factor w/ 3 levels "Consumer","Corporate",..: 1 1 2 1 1 1 1 1 1 1 ...
## $ Product.ID : chr "FUR-BO-10001798" "FUR-CH-10000454" "OFF-LA-10000240" "FUR-TA-10000577" ...
## $ Category : Factor w/ 3 levels "Furniture","Office Supplies",..: 1 1 2 1 2 1 2 3 2 2 ...
## $ Sub.Category: Factor w/ 17 levels "Accessories",..: 5 6 11 17 15 10 3 14 4 2 ...
## $ Product.Name: chr "Bush Somerset Collection Bookcase" "Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back" "Self-Adhesive Address Labels for Typewriters by Universal" "Bretford CR4500 Series Slim Rectangular Table" ...
## $ Sales : num 262 731.9 14.6 957.6 22.4 ...
## $ Quantity : int 2 3 2 5 2 7 4 6 3 5 ...
## $ Discount : num 0 0 0 0.45 0.2 0 0 0.2 0.2 0 ...
## $ Profit : num 41.91 219.58 6.87 -383.03 2.52 ...
->Now we can see that every column has the right type of data.
#Looking the NaN data in every columns
colSums(is.na(retail))
## Row.ID Order.ID Order.Date Ship.Date Ship.Mode Customer.ID
## 0 0 0 0 0 0
## Segment Product.ID Category Sub.Category Product.Name Sales
## 0 0 0 0 0 0
## Quantity Discount Profit
## 0 0 0
#First, take a look on a summary of every columns
summary(retail)
## Row.ID Order.ID Order.Date
## Length:9994 Length:9994 Min. :2014-01-03
## Class :character Class :character 1st Qu.:2015-05-23
## Mode :character Mode :character Median :2016-06-26
## Mean :2016-04-30
## 3rd Qu.:2017-05-14
## Max. :2017-12-30
##
## Ship.Date Ship.Mode Customer.ID
## Min. :2014-01-07 First Class :1538 Length:9994
## 1st Qu.:2015-05-27 Same Day : 543 Class :character
## Median :2016-06-29 Second Class :1945 Mode :character
## Mean :2016-05-03 Standard Class:5968
## 3rd Qu.:2017-05-18
## Max. :2018-01-05
##
## Segment Product.ID Category
## Consumer :5191 Length:9994 Furniture :2121
## Corporate :3020 Class :character Office Supplies:6026
## Home Office:1783 Mode :character Technology :1847
##
##
##
##
## Sub.Category Product.Name Sales Quantity
## Binders :1523 Length:9994 Min. : 0.444 Min. : 1.00
## Paper :1370 Class :character 1st Qu.: 17.280 1st Qu.: 2.00
## Furnishings: 957 Mode :character Median : 54.490 Median : 3.00
## Phones : 889 Mean : 229.858 Mean : 3.79
## Storage : 846 3rd Qu.: 209.940 3rd Qu.: 5.00
## Art : 796 Max. :22638.480 Max. :14.00
## (Other) :3613
## Discount Profit
## Min. :0.0000 Min. :-6599.978
## 1st Qu.:0.0000 1st Qu.: 1.729
## Median :0.2000 Median : 8.666
## Mean :0.1562 Mean : 28.657
## 3rd Qu.:0.2000 3rd Qu.: 29.364
## Max. :0.8000 Max. : 8399.976
##
From the summary above, we get: -> mean of sales is 229.858 -> mean of profit is 28.657 -> mean of quantity is 3.79
Business Questions
highsales <- aggregate(formula=Sales~Segment+Category, data=retail, FUN=sum)
head(highsales[order(highsales$Sales, decreasing = T),],1)
## Segment Category Sales
## 7 Consumer Technology 406399.9
highprof <- aggregate(formula=Profit~Segment+Category, data=retail, FUN=sum)
head(highprof[order(highprof$Profit, decreasing = T),],1)
## Segment Category Profit
## 7 Consumer Technology 70797.81
->Consumer segment with Technology category is the highest sales and the highest profit.
lowsales <- aggregate(formula=Sales~Segment+Category, data=retail, FUN=sum)
head(lowsales[order(lowsales$Sales, decreasing = F),],1)
## Segment Category Sales
## 3 Home Office Furniture 121930.7
lowprof <- aggregate(formula=Profit~Segment+Category, data=retail, FUN=sum)
head(lowprof[order(lowprof$Profit, decreasing = F),],1)
## Segment Category Profit
## 3 Home Office Furniture 3875.378
-> Home Office segment with Furniture category is the lowest sales and the highest profit.
tseg1 <- retail %>%
filter(Segment=="Consumer")
nrow(tseg1)
## [1] 5191
tseg2 <- retail %>%
filter(Segment=="Corporate")
nrow(tseg2)
## [1] 3020
tseg3 <- retail %>%
filter(Segment=="Home Office")
nrow(tseg3)
## [1] 1783
-> Consumer is a segment with the most transaction number that reach out 5191 of 9994 transactions
seg1 <- aggregate(Profit~Segment, retail, sum)
seg1
## Segment Profit
## 1 Consumer 134119.21
## 2 Corporate 91979.13
## 3 Home Office 60298.68
seg2 <- aggregate(Sales~Segment, retail, sum)
seg2
## Segment Sales
## 1 Consumer 1161401.3
## 2 Corporate 706146.4
## 3 Home Office 429653.1
seg3 <- aggregate(Quantity~Segment, retail, sum)
seg3
## Segment Quantity
## 1 Consumer 19521
## 2 Corporate 11608
## 3 Home Office 6744
cons <-134119.21/1161401.3
cons
## [1] 0.1154805
corp <-91979.13/706146.4
corp
## [1] 0.130255
homof <-60298.68/429653.1
homof
## [1] 0.1403427
-> Consumer is the segment with overall get the highest in sales, profit, and quantity.
cat1 <- aggregate(Profit~Category, retail, sum)
cat1
## Category Profit
## 1 Furniture 18451.27
## 2 Office Supplies 122490.80
## 3 Technology 145454.95
cat2 <- aggregate(Sales~Category, retail, sum)
cat2
## Category Sales
## 1 Furniture 741999.8
## 2 Office Supplies 719047.0
## 3 Technology 836154.0
cat3 <- aggregate(Quantity~Category, retail, sum)
cat3
## Category Quantity
## 1 Furniture 8028
## 2 Office Supplies 22906
## 3 Technology 6939
Technology is the most profitable and the highest sales number of category. Meanwhile, Office Supplies is the highest of quantity number
Tech <- 145454.95/836154.0
Tech
## [1] 0.1739571
Fur <-18451.27/741999.8
Fur
## [1] 0.02486695
Ofs <- 122490.80/719047.0
Ofs
## [1] 0.1703516
-> Overall, we see that Technology get the highest profit rates of sales with 17,4% of rates and Furniture is the lowest with 2,5% of rates
subcsales <-aggregate(Sales~Sub.Category, retail, sum)
subcsales[order(subcsales$Sales, decreasing = T),]
## Sub.Category Sales
## 14 Phones 330007.05
## 6 Chairs 328449.10
## 15 Storage 223843.61
## 17 Tables 206965.53
## 4 Binders 203412.73
## 12 Machines 189238.63
## 1 Accessories 167380.32
## 7 Copiers 149528.03
## 5 Bookcases 114880.00
## 2 Appliances 107532.16
## 10 Furnishings 91705.16
## 13 Paper 78479.21
## 16 Supplies 46673.54
## 3 Art 27118.79
## 8 Envelopes 16476.40
## 11 Labels 12486.31
## 9 Fasteners 3024.28
subcprofit <-aggregate(Profit~Sub.Category, retail, sum)
subcprofit[order(subcprofit$Profit, decreasing = T),]
## Sub.Category Profit
## 7 Copiers 55617.8249
## 14 Phones 44515.7306
## 1 Accessories 41936.6357
## 13 Paper 34053.5693
## 4 Binders 30221.7633
## 6 Chairs 26590.1663
## 15 Storage 21278.8264
## 2 Appliances 18138.0054
## 10 Furnishings 13059.1436
## 8 Envelopes 6964.1767
## 3 Art 6527.7870
## 11 Labels 5546.2540
## 12 Machines 3384.7569
## 9 Fasteners 949.5182
## 16 Supplies -1189.0995
## 5 Bookcases -3472.5560
## 17 Tables -17725.4811
subcdisc <-aggregate(Discount~Sub.Category, retail, mean)
subcdisc[order(subcdisc$Discount, decreasing = T),]
## Sub.Category Discount
## 4 Binders 0.37229153
## 12 Machines 0.30608696
## 17 Tables 0.26128527
## 5 Bookcases 0.21114035
## 6 Chairs 0.17017828
## 2 Appliances 0.16652361
## 7 Copiers 0.16176471
## 14 Phones 0.15455568
## 10 Furnishings 0.13834901
## 9 Fasteners 0.08202765
## 8 Envelopes 0.08031496
## 1 Accessories 0.07845161
## 16 Supplies 0.07684211
## 13 Paper 0.07489051
## 3 Art 0.07487437
## 15 Storage 0.07470449
## 11 Labels 0.06868132
subcqu <-aggregate(Quantity~Sub.Category, retail, sum)
subcqu[order(subcqu$Quantity, decreasing = T),]
## Sub.Category Quantity
## 4 Binders 5974
## 13 Paper 5178
## 10 Furnishings 3563
## 14 Phones 3289
## 15 Storage 3158
## 3 Art 3000
## 1 Accessories 2976
## 6 Chairs 2356
## 2 Appliances 1729
## 11 Labels 1400
## 17 Tables 1241
## 9 Fasteners 914
## 8 Envelopes 906
## 5 Bookcases 868
## 16 Supplies 647
## 12 Machines 440
## 7 Copiers 234
plot(retail$Order.Date, retail$Profit)
plot(retail$Order.Date, retail$Sales)