# load library ------------------------------------------------------------
::p_load("tidyverse", "janitor", "stringr", "lubridate", "patchwork")
pacman
# import dataset --------------------------------------------------------
"https://raw.githubusercontent.com/Ricky-s-a/business_analysis/main/data/ST2187_coursework_dataset_2018_19.csv"
path_to_github <- read_csv(path_to_github) df_raw <-
# inspect the dataset -----------------------------------------------------
glimpse(df_raw)
## Rows: 51,290
## Columns: 24
## $ `Row ID` <dbl> 32298, 26341, 25330, 13524, 47221, 22732, 30570, 3119…
## $ `Order ID` <chr> "CA-2012-124891", "IN-2013-77878", "IN-2013-71249", "…
## $ `Order Date` <chr> "7/31/2012", "2/5/2013", "10/17/2013", "1/28/2013", "…
## $ `Ship Date` <chr> "7/31/2012", "2/7/2013", "10/18/2013", "1/30/2013", "…
## $ `Ship Mode` <chr> "Same Day", "Second Class", "First Class", "First Cla…
## $ `Customer ID` <chr> "RH-19495", "JR-16210", "CR-12730", "KM-16375", "RH-9…
## $ `Customer Name` <chr> "Rick Hansen", "Justin Ritter", "Craig Reiter", "Kath…
## $ Segment <chr> "Consumer", "Corporate", "Consumer", "Home Office", "…
## $ City <chr> "New York City", "Wollongong", "Brisbane", "Berlin", …
## $ State <chr> "New York", "New South Wales", "Queensland", "Berlin"…
## $ Country <chr> "United States", "Australia", "Australia", "Germany",…
## $ `Postal Code` <dbl> 10024, NA, NA, NA, NA, NA, NA, NA, 95823, 28027, 2230…
## $ Market <chr> "US", "APAC", "APAC", "EU", "Africa", "APAC", "APAC",…
## $ Region <chr> "East", "Oceania", "Oceania", "Central", "Africa", "O…
## $ `Product ID` <chr> "TEC-AC-10003033", "FUR-CH-10003950", "TEC-PH-1000466…
## $ Category <chr> "Technology", "Furniture", "Technology", "Technology"…
## $ `Sub-Category` <chr> "Accessories", "Chairs", "Phones", "Phones", "Copiers…
## $ `Product Name` <chr> "Plantronics CS510 - Over-the-Head monaural Wireless …
## $ Sales <dbl> 2309.650, 3709.395, 5175.171, 2892.510, 2832.960, 286…
## $ Quantity <dbl> 7, 9, 9, 5, 8, 5, 4, 6, 5, 13, 5, 5, 4, 7, 12, 4, 9, …
## $ Discount <dbl> 0.0, 0.1, 0.1, 0.1, 0.0, 0.1, 0.0, 0.0, 0.2, 0.4, 0.0…
## $ Profit <dbl> 762.1845, -288.7650, 919.9710, -96.5400, 311.5200, 76…
## $ `Shipping Cost` <dbl> 933.57, 923.63, 915.49, 910.16, 903.04, 897.35, 894.7…
## $ `Order Priority` <chr> "Critical", "Critical", "Medium", "Medium", "Critical…
# tidy dataset ------------------------------------------------------------
df_raw %>%
df_tidy <- clean_names()
# tidy
df_tidy %>%
df <- mutate(
order_date = as.Date(order_date, format = "%m/%d/%Y"),
ship_date = as.Date(ship_date, format = "%m/%d/%Y"),
order_year = year(order_date),
order_month = month(order_date),
ship_year = year(ship_date),
ship_month = month(ship_date),
split_tf = round(runif(nrow(df_tidy), min = 1, max = 5)),
gap_date = as.numeric(difftime(ship_date, order_date, units = "days"))
%>%
) arrange(desc(order_date))
# show
glimpse(df)
## Rows: 51,290
## Columns: 30
## $ row_id <dbl> 1783, 26535, 44025, 26333, 12929, 26335, 15693, 1787, 1…
## $ order_id <chr> "MX-2014-116267", "IN-2014-43550", "RS-2014-1460", "IN-…
## $ order_date <date> 2014-12-31, 2014-12-31, 2014-12-31, 2014-12-31, 2014-1…
## $ ship_date <date> 2015-01-03, 2015-01-01, 2015-01-02, 2015-01-03, 2015-0…
## $ ship_mode <chr> "Second Class", "First Class", "Second Class", "First C…
## $ customer_id <chr> "EB-13975", "ML-17395", "PB-9105", "JD-16150", "JG-1580…
## $ customer_name <chr> "Erica Bern", "Marina Lichtenstein", "Peter Bühler", "J…
## $ segment <chr> "Corporate", "Corporate", "Consumer", "Corporate", "Cor…
## $ city <chr> "São Paulo", "Jakarta", "Ufa", "Bangkok", "Maidenhead",…
## $ state <chr> "São Paulo", "Jakarta", "Bashkortostan", "Bangkok", "En…
## $ country <chr> "Brazil", "Indonesia", "Russia", "Thailand", "United Ki…
## $ postal_code <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 10009, …
## $ market <chr> "LATAM", "APAC", "EMEA", "APAC", "EU", "APAC", "EU", "L…
## $ region <chr> "South", "Southeast Asia", "EMEA", "Southeast Asia", "N…
## $ product_id <chr> "TEC-CO-10000137", "FUR-BO-10004679", "TEC-HEW-10004652…
## $ category <chr> "Technology", "Furniture", "Technology", "Furniture", "…
## $ sub_category <chr> "Copiers", "Bookcases", "Copiers", "Tables", "Phones", …
## $ product_name <chr> "Canon Wireless Fax, Color", "Safco Library with Doors,…
## $ sales <dbl> 1264.4660, 1091.2806, 865.6200, 1048.7313, 867.3000, 29…
## $ quantity <dbl> 5, 3, 6, 9, 5, 3, 3, 3, 2, 9, 4, 4, 4, 5, 2, 3, 3, 4, 2…
## $ discount <dbl> 0.002, 0.070, 0.000, 0.570, 0.000, 0.270, 0.100, 0.000,…
## $ profit <dbl> 301.4660, 46.9206, 51.8400, -1195.2387, 251.4000, 68.11…
## $ shipping_cost <dbl> 253.25, 243.11, 138.18, 86.86, 53.16, 52.11, 51.79, 51.…
## $ order_priority <chr> "High", "High", "High", "High", "Medium", "High", "Medi…
## $ order_year <dbl> 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2…
## $ order_month <dbl> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,…
## $ ship_year <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2…
## $ ship_month <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 12, 1, 1, 1, 12, 1, 1, 1,…
## $ split_tf <dbl> 5, 3, 2, 3, 3, 2, 4, 4, 5, 1, 1, 4, 2, 3, 3, 4, 2, 5, 4…
## $ gap_date <dbl> 3, 1, 2, 3, 5, 3, 2, 3, 4, 5, 0, 4, 4, 1, 0, 4, 5, 5, 5…
# summary of numeric data
%>%
df select(where(is.numeric)) %>%
summary()
## row_id postal_code sales quantity
## Min. : 1 Min. : 1040 Min. : 0.444 Min. : 1.000
## 1st Qu.:12823 1st Qu.:23223 1st Qu.: 30.759 1st Qu.: 2.000
## Median :25646 Median :56431 Median : 85.053 Median : 3.000
## Mean :25646 Mean :55190 Mean : 246.491 Mean : 3.477
## 3rd Qu.:38468 3rd Qu.:90008 3rd Qu.: 251.053 3rd Qu.: 5.000
## Max. :51290 Max. :99301 Max. :22638.480 Max. :14.000
## NA's :41296
## discount profit shipping_cost order_year
## Min. :0.0000 Min. :-6599.98 Min. : 0.00 Min. :2011
## 1st Qu.:0.0000 1st Qu.: 0.00 1st Qu.: 2.61 1st Qu.:2012
## Median :0.0000 Median : 9.24 Median : 7.79 Median :2013
## Mean :0.1429 Mean : 28.61 Mean : 26.38 Mean :2013
## 3rd Qu.:0.2000 3rd Qu.: 36.81 3rd Qu.: 24.45 3rd Qu.:2014
## Max. :0.8500 Max. : 8399.98 Max. :933.57 Max. :2014
##
## order_month ship_year ship_month split_tf
## Min. : 1.000 Min. :2011 Min. : 1.000 Min. :1.000
## 1st Qu.: 5.000 1st Qu.:2012 1st Qu.: 5.000 1st Qu.:2.000
## Median : 8.000 Median :2013 Median : 8.000 Median :3.000
## Mean : 7.523 Mean :2013 Mean : 7.464 Mean :3.009
## 3rd Qu.:10.000 3rd Qu.:2014 3rd Qu.:10.000 3rd Qu.:4.000
## Max. :12.000 Max. :2015 Max. :12.000 Max. :5.000
##
## gap_date
## Min. :0.000
## 1st Qu.:3.000
## Median :4.000
## Mean :3.969
## 3rd Qu.:5.000
## Max. :7.000
##
# date range
%>%
df select(ends_with("date")) %>%
lapply(range)
## $order_date
## [1] "2011-01-01" "2014-12-31"
##
## $ship_date
## [1] "2011-01-03" "2015-01-07"
##
## $gap_date
## [1] 0 7
# check the number of unique categorical variables
%>%
df_tidy select(where(is.character)) %>%
lapply(unique) %>%
lapply(length)
## $order_id
## [1] 25035
##
## $order_date
## [1] 1430
##
## $ship_date
## [1] 1464
##
## $ship_mode
## [1] 4
##
## $customer_id
## [1] 1590
##
## $customer_name
## [1] 795
##
## $segment
## [1] 3
##
## $city
## [1] 3636
##
## $state
## [1] 1094
##
## $country
## [1] 147
##
## $market
## [1] 7
##
## $region
## [1] 13
##
## $product_id
## [1] 10292
##
## $category
## [1] 3
##
## $sub_category
## [1] 17
##
## $product_name
## [1] 3788
##
## $order_priority
## [1] 4
毎年11月、12月にかけてオーダー数が増えている。
g1 <- df %>%
group_by(order_year, order_month) %>%
ggplot() +
aes(order_date) +
geom_bar()
g2 <- g1 <-
df %>%
group_by(order_year, order_month) %>%
ggplot() +
aes(order_date) +
geom_bar()
/g2 g1
%>%
df group_by(order_year, order_date) %>%
summarise(total = n()) %>% filter(order_year == 2011, total > 50)
## # A tibble: 31 × 3
## # Groups: order_year [1]
## order_year order_date total
## <dbl> <date> <int>
## 1 2011 2011-03-01 53
## 2 2011 2011-06-07 60
## 3 2011 2011-06-22 55
## 4 2011 2011-08-25 53
## 5 2011 2011-09-02 60
## 6 2011 2011-09-08 76
## 7 2011 2011-09-14 63
## 8 2011 2011-09-23 54
## 9 2011 2011-09-26 56
## 10 2011 2011-09-27 57
## # … with 21 more rows
There is no significant difference across countires.
# there must be some variations in the gap between the order date and the ship date.
# Q. how is the gap between the order date and the ship date?
%>%
df ggplot(aes(gap_date, region)) +
geom_boxplot()
# there must be some variations in the shipping cost across countries.
# Q. how much is that?
df %>%
g_corporate <- filter(segment == "Corporate") %>%
ggplot(aes(shipping_cost, market)) +
geom_boxplot() +
labs(title = "Corporate")
df %>%
g_consumer <- filter(segment == "Consumer") %>%
ggplot(aes(shipping_cost, market)) +
geom_boxplot() +
labs(title = "Consumer")
df %>%
g_home_office <- filter(segment == "Home Office") %>%
ggplot(aes(shipping_cost, market)) +
geom_boxplot() +
labs(title = "Home Office")
/ g_consumer / g_home_office g_corporate
# basic info --------------------------------------------------------------
# the ration of regions in profit on the annual basis
%>%
df group_by(order_year, order_month, region) %>%
summarise(sum_profit = sum(profit)) %>%
ggplot(aes(order_year, sum_profit, fill = region)) +
geom_col()
# total profit by year
profit_by_year <- df %>% group_by(order_year) %>%
summarise(annual_profit = sum(profit))
# profit ratio by year
%>%
df group_by(order_year, order_month, market) %>%
summarise(sum_profit = sum(profit)) %>%
mutate(profit_ratio_by_year = sum_profit/filter(profit_by_year, order_year == order_year)[[2]]) %>%
arrange(order_year, desc(sum_profit))
## # A tibble: 335 × 5
## # Groups: order_year, order_month [48]
## order_year order_month market sum_profit profit_ratio_by_year
## <dbl> <dbl> <chr> <dbl> <dbl>
## 1 2011 9 EU 13805. 0.0555
## 2 2011 12 APAC 13516. 0.0440
## 3 2011 10 APAC 13496. 0.0439
## 4 2011 12 EU 11540. 0.0464
## 5 2011 11 US 9292. 0.0228
## 6 2011 6 APAC 9216. 0.0300
## 7 2011 12 US 8984. 0.0221
## 8 2011 11 APAC 8951. 0.0291
## 9 2011 9 US 8328. 0.0205
## 10 2011 6 EU 7799. 0.0313
## # … with 325 more rows
Especially the markets in APAc and EU are expanding.
# Q. the most profitable market, product, category, sub_category,
%>%
df group_by(order_year, market) %>%
summarise(profit = sum(profit)) %>%
arrange(order_year,desc(profit)) %>%
top_n(5, profit)
## # A tibble: 20 × 3
## # Groups: order_year [4]
## order_year market profit
## <dbl> <chr> <dbl>
## 1 2011 APAC 83032.
## 2 2011 EU 61626.
## 3 2011 US 49544.
## 4 2011 LATAM 36708.
## 5 2011 Africa 10944.
## 6 2012 APAC 89411.
## 7 2012 EU 83985.
## 8 2012 US 61619.
## 9 2012 LATAM 50185.
## 10 2012 Africa 11909.
## 11 2013 APAC 123103.
## 12 2013 EU 98275.
## 13 2013 US 81727.
## 14 2013 LATAM 61415.
## 15 2013 Africa 26687.
## 16 2014 APAC 140454.
## 17 2014 EU 128944.
## 18 2014 US 93508.
## 19 2014 LATAM 73335.
## 20 2014 Africa 39331.
The presence of China and India is growing.
# Q. which country?
%>%
df filter(market %in% c("APAC", "EU")) %>%
group_by(order_year, country) %>%
summarise(market_profit = sum(profit)) %>%
arrange(order_year, desc(market_profit)) %>%
top_n(3, market_profit)
## # A tibble: 12 × 3
## # Groups: order_year [4]
## order_year country market_profit
## <dbl> <chr> <dbl>
## 1 2011 China 33181.
## 2 2011 United Kingdom 20080.
## 3 2011 India 19929.
## 4 2012 India 27439.
## 5 2012 United Kingdom 27366.
## 6 2012 China 26234.
## 7 2013 China 44474.
## 8 2013 India 32897.
## 9 2013 France 32416.
## 10 2014 India 48808.
## 11 2014 China 46794.
## 12 2014 United Kingdom 36756.
technology, furniture, office suplliesで変らない。
# which category is sold in those region?
%>%
df filter(country %in% c("India", "China")) %>%
group_by(order_year, sub_category) %>%
summarise(profit_by_subcategory = sum(profit)) %>%
arrange(order_year, desc(profit_by_subcategory)) %>%
top_n(10, desc(profit_by_subcategory))
## # A tibble: 40 × 3
## # Groups: order_year [4]
## order_year sub_category profit_by_subcategory
## <dbl> <chr> <dbl>
## 1 2011 Machines 2599.
## 2 2011 Furnishings 2008.
## 3 2011 Envelopes 1187.
## 4 2011 Supplies 1073.
## 5 2011 Binders 1022.
## 6 2011 Art 819.
## 7 2011 Paper 763.
## 8 2011 Fasteners 478.
## 9 2011 Labels 224.
## 10 2011 Tables -1465.
## # … with 30 more rows
いるが、それぞれがバラバラ。もっとフォローアップを増やし、one-timeではなく、継続的な大手取引先を作るべき。
%>%
df group_by(order_year, customer_id) %>%
summarise(profit_by_customer = sum(profit)) %>%
arrange(order_year, desc(profit_by_customer)) %>%
top_n(10, profit_by_customer)
## # A tibble: 40 × 3
## # Groups: order_year [4]
## order_year customer_id profit_by_customer
## <dbl> <chr> <dbl>
## 1 2011 SC-20095 5716.
## 2 2011 CA-11965 3121.
## 3 2011 NM-18445 2950.
## 4 2011 GT-14710 2909.
## 5 2011 ON-18715 2689.
## 6 2011 ER-13855 2618.
## 7 2011 TB-21400 2549.
## 8 2011 KN-16390 2453.
## 9 2011 HL-15040 2405.
## 10 2011 RB-19330 2238.
## # … with 30 more rows
# top10 customers customer's id for every year.
major_customers <- df %>%
group_by(order_year, customer_id) %>%
summarise(profit_by_customer = sum(profit)) %>%
arrange(order_year, desc(profit_by_customer)) %>%
top_n(10, profit_by_customer) %>%
ungroup() %>%
select(customer_id) %>%
unlist() %>%
unname()
%>%
df filter(customer_id %in% major_customers) %>%
group_by(order_year, customer_id, sub_category) %>%
summarise(profit_by_subcategory = sum(profit)) %>%
arrange(order_year, desc(profit_by_subcategory)) %>%
top_n(1, profit_by_subcategory)
## # A tibble: 152 × 4
## # Groups: order_year, customer_id [152]
## order_year customer_id sub_category profit_by_subcategory
## <dbl> <chr> <chr> <dbl>
## 1 2011 SC-20095 Binders 5480.
## 2 2011 CA-11965 Phones 2939.
## 3 2011 ER-13855 Appliances 2476.
## 4 2011 TB-21400 Machines 2240.
## 5 2011 ON-18715 Chairs 2125.
## 6 2011 NM-18445 Machines 1996.
## 7 2011 HL-15040 Phones 1930.
## 8 2011 DR-12940 Appliances 1644.
## 9 2011 KN-16390 Tables 1528.
## 10 2011 GT-14710 Chairs 1474.
## # … with 142 more rows