retail <- read.csv("data_input/retail.csv")library(dplyr)##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
retail <- retail %>%
mutate(Ship.Mode = as.factor(Ship.Mode), Category = as.factor(Category), Sub.Category = as.factor(Sub.Category), Segment = as.factor(Segment))
retail$Order.Date <- as.Date(retail$Order.Date,
"%m/%d/%y")
retail$Ship.Date <- as.Date(retail$Ship.Date,
"%m/%d/%y")
anyNA(retail)## [1] FALSE
head(retail)retail_clean <- retail %>%
select(Order.Date, Ship.Date, Ship.Mode, Segment, Category, Sub.Category, Product.Name, Sales, Quantity, Discount, Profit)
head(retail_clean)In this section, I clean my data using DPLYR Package to make it easier and faster.
While cleaning the data, I did some changing in data type and also selecting the needed columns and removing columns that I don’t need.
In this section I wanna know: 1. Products that give big profit 2. Products that give big loss
retail_clean %>%
filter(Profit > 4500 | Profit < -4500)From the result above, I wanna make a new column about the profit status
retail_clean$Profit.Loss <- as.factor(ifelse(retail_clean$Profit < 0, "Loss", "Profit"))
head(retail_clean)From the final data. I want to analyze the data.
retail_clean %>%
filter(Segment == "Consumer",
Profit >= 2000)retail_clean %>%
filter(Category == "Furniture",
Profit >= 500)retail_clean %>%
filter(Segment == "Consumer",
Profit >= 500)There are a few ways to pull a summary from a data and one of them is the “summary” function
summary(retail_clean)## Order.Date Ship.Date Ship.Mode
## Min. :2014-01-03 Min. :2014-01-07 First Class :1538
## 1st Qu.:2015-05-23 1st Qu.:2015-05-27 Same Day : 543
## Median :2016-06-26 Median :2016-06-29 Second Class :1945
## Mean :2016-04-30 Mean :2016-05-03 Standard Class:5968
## 3rd Qu.:2017-05-14 3rd Qu.:2017-05-18
## Max. :2017-12-30 Max. :2018-01-05
##
## Segment Category Sub.Category
## Consumer :5191 Furniture :2121 Binders :1523
## Corporate :3020 Office Supplies:6026 Paper :1370
## Home Office:1783 Technology :1847 Furnishings: 957
## Phones : 889
## Storage : 846
## Art : 796
## (Other) :3613
## Product.Name Sales Quantity Discount
## Length:9994 Min. : 0.444 Min. : 1.00 Min. :0.0000
## Class :character 1st Qu.: 17.280 1st Qu.: 2.00 1st Qu.:0.0000
## Mode :character Median : 54.490 Median : 3.00 Median :0.2000
## Mean : 229.858 Mean : 3.79 Mean :0.1562
## 3rd Qu.: 209.940 3rd Qu.: 5.00 3rd Qu.:0.2000
## Max. :22638.480 Max. :14.00 Max. :0.8000
##
## Profit Profit.Loss
## Min. :-6599.978 Loss :1871
## 1st Qu.: 1.729 Profit:8123
## Median : 8.666
## Mean : 28.657
## 3rd Qu.: 29.364
## Max. : 8399.976
##
Here you can see the min, median, max, mean, and etc.
Here are some examples for aggregating
table(retail_clean$Category, retail_clean$Segment == "Consumer")##
## FALSE TRUE
## Furniture 1008 1113
## Office Supplies 2899 3127
## Technology 896 951
round(prop.table(table(retail_clean$Category, retail_clean$Segment))*100, digits = 2)##
## Consumer Corporate Home Office
## Furniture 11.14 6.46 3.62
## Office Supplies 31.29 18.21 10.80
## Technology 9.52 5.54 3.42
# Profits each category and segment
xtabs(formula = Profit ~ Category + Segment, data = retail_clean)## Segment
## Category Consumer Corporate Home Office
## Furniture 6991.079 7584.816 3875.378
## Office Supplies 56330.321 40227.320 25933.160
## Technology 70797.810 44166.998 30490.141
# Profit's mean each category and segment
aggregate(formula = Profit ~ Category + Segment, data = retail_clean, FUN = mean)# Top profit's mean each segment and category
aggregate(formula = Profit ~ Category + Segment, data = retail_clean, FUN = max)# Top loss' mean each segment and category
aggregate(formula = Profit ~ Category + Segment, data = retail_clean, FUN = min)Maybe that is all for my little explanation of my retail data for my LBB. Thank you.