The data I will use is “Retail Dataset”, that contains of sales record included the profit or loss from a retail company for 4 years. Please take a look at the following glossary for your reference:
Order.ID: Id of order.Order.Date: Date of order.Ship.Date: Date of shipment.Ship.Mode: Type of shipment.Customer.ID: Id of customer.Segment : Customer’s segment.Product.ID: Id of product.Category : Product category with 3 levels (“Furniture”, “Office Supplies”, “Technology”)Sub.Category: more specific product categoryProduct.Name: Name of product that was sold.Sales: How much earning from each sale.Quantity: Quantity of item sold.Discount: How much discount was given for each sale.Profit: How much can a company earn from each sale.library(dplyr)## Warning: package 'dplyr' was built under R version 4.0.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)## Warning: package 'lubridate' was built under R version 4.0.5
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)## Warning: package 'ggplot2' was built under R version 4.0.5
library(tidyverse)## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.5 v purrr 0.3.4
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.2 v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'purrr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date() masks base::date()
## x dplyr::filter() masks stats::filter()
## x lubridate::intersect() masks base::intersect()
## x dplyr::lag() masks stats::lag()
## x lubridate::setdiff() masks base::setdiff()
## x lubridate::union() masks base::union()
retail <- read.csv("retail.csv")
head(retail)explicit coercion
str(retail)## 'data.frame': 9994 obs. of 15 variables:
## $ Row.ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Order.ID : chr "CA-2016-152156" "CA-2016-152156" "CA-2016-138688" "US-2015-108966" ...
## $ Order.Date : chr "11/8/16" "11/8/16" "6/12/16" "10/11/15" ...
## $ Ship.Date : chr "11/11/16" "11/11/16" "6/16/16" "10/18/15" ...
## $ Ship.Mode : chr "Second Class" "Second Class" "Second Class" "Standard Class" ...
## $ Customer.ID : chr "CG-12520" "CG-12520" "DV-13045" "SO-20335" ...
## $ Segment : chr "Consumer" "Consumer" "Corporate" "Consumer" ...
## $ Product.ID : chr "FUR-BO-10001798" "FUR-CH-10000454" "OFF-LA-10000240" "FUR-TA-10000577" ...
## $ Category : chr "Furniture" "Furniture" "Office Supplies" "Furniture" ...
## $ Sub.Category: chr "Bookcases" "Chairs" "Labels" "Tables" ...
## $ Product.Name: chr "Bush Somerset Collection Bookcase" "Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back" "Self-Adhesive Address Labels for Typewriters by Universal" "Bretford CR4500 Series Slim Rectangular Table" ...
## $ Sales : num 262 731.9 14.6 957.6 22.4 ...
## $ Quantity : int 2 3 2 5 2 7 4 6 3 5 ...
## $ Discount : num 0 0 0 0.45 0.2 0 0 0.2 0.2 0 ...
## $ Profit : num 41.91 219.58 6.87 -383.03 2.52 ...
Change the types of data into the appropriate data types
retail$Ship.Mode <- as.factor(retail$Ship.Mode)
retail$Segment <- as.factor(retail$Segment)
retail$Category <- as.factor(retail$Category)
retail$Sub.Category <- as.factor(retail$Sub.Category)
retail$Order.Date <- mdy(retail$Order.Date)
retail$Ship.Date <- mdy(retail$Ship.Date)
str(retail)## 'data.frame': 9994 obs. of 15 variables:
## $ Row.ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Order.ID : chr "CA-2016-152156" "CA-2016-152156" "CA-2016-138688" "US-2015-108966" ...
## $ Order.Date : Date, format: "2016-11-08" "2016-11-08" ...
## $ Ship.Date : Date, format: "2016-11-11" "2016-11-11" ...
## $ Ship.Mode : Factor w/ 4 levels "First Class",..: 3 3 3 4 4 4 4 4 4 4 ...
## $ Customer.ID : chr "CG-12520" "CG-12520" "DV-13045" "SO-20335" ...
## $ Segment : Factor w/ 3 levels "Consumer","Corporate",..: 1 1 2 1 1 1 1 1 1 1 ...
## $ Product.ID : chr "FUR-BO-10001798" "FUR-CH-10000454" "OFF-LA-10000240" "FUR-TA-10000577" ...
## $ Category : Factor w/ 3 levels "Furniture","Office Supplies",..: 1 1 2 1 2 1 2 3 2 2 ...
## $ Sub.Category: Factor w/ 17 levels "Accessories",..: 5 6 11 17 15 10 3 14 4 2 ...
## $ Product.Name: chr "Bush Somerset Collection Bookcase" "Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back" "Self-Adhesive Address Labels for Typewriters by Universal" "Bretford CR4500 Series Slim Rectangular Table" ...
## $ Sales : num 262 731.9 14.6 957.6 22.4 ...
## $ Quantity : int 2 3 2 5 2 7 4 6 3 5 ...
## $ Discount : num 0 0 0 0.45 0.2 0 0 0.2 0.2 0 ...
## $ Profit : num 41.91 219.58 6.87 -383.03 2.52 ...
Check Missing Value
anyNA(retail)## [1] FALSE
No missing values found
Drop the first column because we don’t need that information
retail <- retail[,-1]
head(retail)Make a new column named year that contains the year from Order Date
retail <- retail %>% mutate(year=year(Order.Date))summary(retail)## Order.ID Order.Date Ship.Date
## Length:9994 Min. :2014-01-03 Min. :2014-01-07
## Class :character 1st Qu.:2015-05-23 1st Qu.:2015-05-27
## Mode :character Median :2016-06-26 Median :2016-06-29
## Mean :2016-04-30 Mean :2016-05-03
## 3rd Qu.:2017-05-14 3rd Qu.:2017-05-18
## Max. :2017-12-30 Max. :2018-01-05
##
## Ship.Mode Customer.ID Segment Product.ID
## First Class :1538 Length:9994 Consumer :5191 Length:9994
## Same Day : 543 Class :character Corporate :3020 Class :character
## Second Class :1945 Mode :character Home Office:1783 Mode :character
## Standard Class:5968
##
##
##
## Category Sub.Category Product.Name
## Furniture :2121 Binders :1523 Length:9994
## Office Supplies:6026 Paper :1370 Class :character
## Technology :1847 Furnishings: 957 Mode :character
## Phones : 889
## Storage : 846
## Art : 796
## (Other) :3613
## Sales Quantity Discount Profit
## Min. : 0.444 Min. : 1.00 Min. :0.0000 Min. :-6599.978
## 1st Qu.: 17.280 1st Qu.: 2.00 1st Qu.:0.0000 1st Qu.: 1.729
## Median : 54.490 Median : 3.00 Median :0.2000 Median : 8.666
## Mean : 229.858 Mean : 3.79 Mean :0.1562 Mean : 28.657
## 3rd Qu.: 209.940 3rd Qu.: 5.00 3rd Qu.:0.2000 3rd Qu.: 29.364
## Max. :22638.480 Max. :14.00 Max. :0.8000 Max. : 8399.976
##
## year
## Min. :2014
## 1st Qu.:2015
## Median :2016
## Mean :2016
## 3rd Qu.:2017
## Max. :2017
##
Data Summary
1. Now we will check the relationship between Sales and Profit for each year
retail %>% ggplot(aes(x=Sales,y=Profit,col=as.factor(year)))+
geom_jitter()+
geom_smooth(method='lm',col='red',linetype='dashed',size=1)+
labs(title = "Scatter Plot Sales Vs Profit (2014-2017)")+
scale_colour_manual(name="Year",values = c("2014"="red","2015"="blue","2016"="green","2017"="gold"))+
theme_dark()## `geom_smooth()` using formula 'y ~ x'
Interpretations
2. we want to know in what year the biggest loss occurred
retail %>% filter(Profit <0) %>%
group_by(year) %>%
summarise(total_profit = abs(sum(Profit))) %>%
ggplot(aes(year,total_profit))+
geom_col(fill='darkred')+
labs(title="Total Lost (2014-2017)",
y="Total Lost",
x='Year')+
theme_dark()Interpretations
3. We will check the relationship between Discount and Profit for each year
retail %>% ggplot(aes(x=Discount,y=Profit,col=as.factor(year)))+
geom_jitter()+
# geom_smooth(method='lm',col='red',linetype='dashed',size=1)+
labs(title = "Scatter Plot Discount Vs Profit (2014-2017)")+
scale_colour_manual(name="Year",values = c("2014"="red","2015"="blue","2016"="green","2017"="gold"))+
theme_dark()Interpretations
4. Total Discount For each year
retail %>%
group_by(year) %>%
summarise(total_discount = sum(Discount)) %>%
ggplot(aes(year,total_discount))+
geom_col(fill="darkblue")+
labs(title="Total Discount (2014-2017)",
y="Total Discount",
x='Year')+
theme_dark()5. Total Order For each year
retail %>%
group_by(year) %>%
summarise(total_orders = n()) %>%
ggplot(aes(year,total_orders))+
geom_col(fill="darkgreen")+
labs(title="Total Order (2014-2017)",
y="Total Order",
x='Year')+
theme_dark()6. Total Profit For each year
retail %>%
group_by(year) %>%
summarise(total_profit = sum(Profit))%>%
ggplot(aes(year,total_profit))+
geom_col(fill="gold")+
labs(title="Total Profit (2014-2017)",
y="Total Profit",
x='Year')+
theme_dark()7. We want to know which categories and shipments were ordered the most
retail %>% group_by(Category,Ship.Mode) %>%
summarise(total = n()) %>%
ggplot(aes(x=total,y=Category))+
geom_col(aes(fill=Ship.Mode), position = "stack")## `summarise()` has grouped output by 'Category'. You can override using the `.groups` argument.
Interpretations