• 準備
  • データセットの基本情報
  • データクレンジング
  • 探索的データ分析
    • 数値データの基本統計量
    • 日付データのレンジ
    • カテゴリカルデータの数
    • 月々のオーダーの数と出荷数
      • 2011年の例
    • オーダーを受けてから出荷するまでの日付に国ごとのばらつきはあるのか?
    • 出荷費用に関しては、どのような国ごとのばらつきはあるのか?
    • 年毎の利益と地域
    • 年ごとの利益
    • どのマーケットが一番儲かっているのか?
    • どの国が特に利益を上げているのか?
    • インドと中国では特にどのsubcategoryが売れているのか?
    • 大手取引先はいるのか?
    • 大手取引先の属性情報は?

準備

# load library ------------------------------------------------------------

pacman::p_load("tidyverse", "janitor", "stringr", "lubridate", "patchwork")


# import dataset --------------------------------------------------------

path_to_github <- "https://raw.githubusercontent.com/Ricky-s-a/business_analysis/main/data/ST2187_coursework_dataset_2018_19.csv"
df_raw <- read_csv(path_to_github)

データセットの基本情報

# inspect the dataset -----------------------------------------------------

glimpse(df_raw)
## Rows: 51,290
## Columns: 24
## $ `Row ID`         <dbl> 32298, 26341, 25330, 13524, 47221, 22732, 30570, 3119…
## $ `Order ID`       <chr> "CA-2012-124891", "IN-2013-77878", "IN-2013-71249", "…
## $ `Order Date`     <chr> "7/31/2012", "2/5/2013", "10/17/2013", "1/28/2013", "…
## $ `Ship Date`      <chr> "7/31/2012", "2/7/2013", "10/18/2013", "1/30/2013", "…
## $ `Ship Mode`      <chr> "Same Day", "Second Class", "First Class", "First Cla…
## $ `Customer ID`    <chr> "RH-19495", "JR-16210", "CR-12730", "KM-16375", "RH-9…
## $ `Customer Name`  <chr> "Rick Hansen", "Justin Ritter", "Craig Reiter", "Kath…
## $ Segment          <chr> "Consumer", "Corporate", "Consumer", "Home Office", "…
## $ City             <chr> "New York City", "Wollongong", "Brisbane", "Berlin", …
## $ State            <chr> "New York", "New South Wales", "Queensland", "Berlin"…
## $ Country          <chr> "United States", "Australia", "Australia", "Germany",…
## $ `Postal Code`    <dbl> 10024, NA, NA, NA, NA, NA, NA, NA, 95823, 28027, 2230…
## $ Market           <chr> "US", "APAC", "APAC", "EU", "Africa", "APAC", "APAC",…
## $ Region           <chr> "East", "Oceania", "Oceania", "Central", "Africa", "O…
## $ `Product ID`     <chr> "TEC-AC-10003033", "FUR-CH-10003950", "TEC-PH-1000466…
## $ Category         <chr> "Technology", "Furniture", "Technology", "Technology"…
## $ `Sub-Category`   <chr> "Accessories", "Chairs", "Phones", "Phones", "Copiers…
## $ `Product Name`   <chr> "Plantronics CS510 - Over-the-Head monaural Wireless …
## $ Sales            <dbl> 2309.650, 3709.395, 5175.171, 2892.510, 2832.960, 286…
## $ Quantity         <dbl> 7, 9, 9, 5, 8, 5, 4, 6, 5, 13, 5, 5, 4, 7, 12, 4, 9, …
## $ Discount         <dbl> 0.0, 0.1, 0.1, 0.1, 0.0, 0.1, 0.0, 0.0, 0.2, 0.4, 0.0…
## $ Profit           <dbl> 762.1845, -288.7650, 919.9710, -96.5400, 311.5200, 76…
## $ `Shipping Cost`  <dbl> 933.57, 923.63, 915.49, 910.16, 903.04, 897.35, 894.7…
## $ `Order Priority` <chr> "Critical", "Critical", "Medium", "Medium", "Critical…

データクレンジング

# tidy dataset ------------------------------------------------------------

df_tidy <- df_raw %>% 
  clean_names()

# tidy
df <- df_tidy %>% 
  mutate(
    order_date = as.Date(order_date, format = "%m/%d/%Y"),
    ship_date = as.Date(ship_date, format = "%m/%d/%Y"),
    order_year = year(order_date), 
    order_month = month(order_date),
    ship_year = year(ship_date),
    ship_month = month(ship_date),
    split_tf = round(runif(nrow(df_tidy), min = 1, max = 5)),
    gap_date = as.numeric(difftime(ship_date, order_date, units = "days"))
  ) %>% 
  arrange(desc(order_date))

# show 
glimpse(df)
## Rows: 51,290
## Columns: 30
## $ row_id         <dbl> 1783, 26535, 44025, 26333, 12929, 26335, 15693, 1787, 1…
## $ order_id       <chr> "MX-2014-116267", "IN-2014-43550", "RS-2014-1460", "IN-…
## $ order_date     <date> 2014-12-31, 2014-12-31, 2014-12-31, 2014-12-31, 2014-1…
## $ ship_date      <date> 2015-01-03, 2015-01-01, 2015-01-02, 2015-01-03, 2015-0…
## $ ship_mode      <chr> "Second Class", "First Class", "Second Class", "First C…
## $ customer_id    <chr> "EB-13975", "ML-17395", "PB-9105", "JD-16150", "JG-1580…
## $ customer_name  <chr> "Erica Bern", "Marina Lichtenstein", "Peter Bühler", "J…
## $ segment        <chr> "Corporate", "Corporate", "Consumer", "Corporate", "Cor…
## $ city           <chr> "São Paulo", "Jakarta", "Ufa", "Bangkok", "Maidenhead",…
## $ state          <chr> "São Paulo", "Jakarta", "Bashkortostan", "Bangkok", "En…
## $ country        <chr> "Brazil", "Indonesia", "Russia", "Thailand", "United Ki…
## $ postal_code    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 10009, …
## $ market         <chr> "LATAM", "APAC", "EMEA", "APAC", "EU", "APAC", "EU", "L…
## $ region         <chr> "South", "Southeast Asia", "EMEA", "Southeast Asia", "N…
## $ product_id     <chr> "TEC-CO-10000137", "FUR-BO-10004679", "TEC-HEW-10004652…
## $ category       <chr> "Technology", "Furniture", "Technology", "Furniture", "…
## $ sub_category   <chr> "Copiers", "Bookcases", "Copiers", "Tables", "Phones", …
## $ product_name   <chr> "Canon Wireless Fax, Color", "Safco Library with Doors,…
## $ sales          <dbl> 1264.4660, 1091.2806, 865.6200, 1048.7313, 867.3000, 29…
## $ quantity       <dbl> 5, 3, 6, 9, 5, 3, 3, 3, 2, 9, 4, 4, 4, 5, 2, 3, 3, 4, 2…
## $ discount       <dbl> 0.002, 0.070, 0.000, 0.570, 0.000, 0.270, 0.100, 0.000,…
## $ profit         <dbl> 301.4660, 46.9206, 51.8400, -1195.2387, 251.4000, 68.11…
## $ shipping_cost  <dbl> 253.25, 243.11, 138.18, 86.86, 53.16, 52.11, 51.79, 51.…
## $ order_priority <chr> "High", "High", "High", "High", "Medium", "High", "Medi…
## $ order_year     <dbl> 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2…
## $ order_month    <dbl> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,…
## $ ship_year      <dbl> 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2…
## $ ship_month     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 12, 1, 1, 1, 12, 1, 1, 1,…
## $ split_tf       <dbl> 5, 3, 2, 3, 3, 2, 4, 4, 5, 1, 1, 4, 2, 3, 3, 4, 2, 5, 4…
## $ gap_date       <dbl> 3, 1, 2, 3, 5, 3, 2, 3, 4, 5, 0, 4, 4, 1, 0, 4, 5, 5, 5…

探索的データ分析

数値データの基本統計量

# summary of numeric data
df %>% 
  select(where(is.numeric)) %>% 
  summary()
##      row_id       postal_code        sales              quantity     
##  Min.   :    1   Min.   : 1040   Min.   :    0.444   Min.   : 1.000  
##  1st Qu.:12823   1st Qu.:23223   1st Qu.:   30.759   1st Qu.: 2.000  
##  Median :25646   Median :56431   Median :   85.053   Median : 3.000  
##  Mean   :25646   Mean   :55190   Mean   :  246.491   Mean   : 3.477  
##  3rd Qu.:38468   3rd Qu.:90008   3rd Qu.:  251.053   3rd Qu.: 5.000  
##  Max.   :51290   Max.   :99301   Max.   :22638.480   Max.   :14.000  
##                  NA's   :41296                                       
##     discount          profit         shipping_cost      order_year  
##  Min.   :0.0000   Min.   :-6599.98   Min.   :  0.00   Min.   :2011  
##  1st Qu.:0.0000   1st Qu.:    0.00   1st Qu.:  2.61   1st Qu.:2012  
##  Median :0.0000   Median :    9.24   Median :  7.79   Median :2013  
##  Mean   :0.1429   Mean   :   28.61   Mean   : 26.38   Mean   :2013  
##  3rd Qu.:0.2000   3rd Qu.:   36.81   3rd Qu.: 24.45   3rd Qu.:2014  
##  Max.   :0.8500   Max.   : 8399.98   Max.   :933.57   Max.   :2014  
##                                                                     
##   order_month       ship_year      ship_month        split_tf    
##  Min.   : 1.000   Min.   :2011   Min.   : 1.000   Min.   :1.000  
##  1st Qu.: 5.000   1st Qu.:2012   1st Qu.: 5.000   1st Qu.:2.000  
##  Median : 8.000   Median :2013   Median : 8.000   Median :3.000  
##  Mean   : 7.523   Mean   :2013   Mean   : 7.464   Mean   :3.009  
##  3rd Qu.:10.000   3rd Qu.:2014   3rd Qu.:10.000   3rd Qu.:4.000  
##  Max.   :12.000   Max.   :2015   Max.   :12.000   Max.   :5.000  
##                                                                  
##     gap_date    
##  Min.   :0.000  
##  1st Qu.:3.000  
##  Median :4.000  
##  Mean   :3.969  
##  3rd Qu.:5.000  
##  Max.   :7.000  
## 

日付データのレンジ

# date range
df %>% 
  select(ends_with("date")) %>% 
  lapply(range)
## $order_date
## [1] "2011-01-01" "2014-12-31"
## 
## $ship_date
## [1] "2011-01-03" "2015-01-07"
## 
## $gap_date
## [1] 0 7

カテゴリカルデータの数

# check the number of unique categorical variables
df_tidy %>% 
  select(where(is.character)) %>% 
  lapply(unique) %>%
  lapply(length)
## $order_id
## [1] 25035
## 
## $order_date
## [1] 1430
## 
## $ship_date
## [1] 1464
## 
## $ship_mode
## [1] 4
## 
## $customer_id
## [1] 1590
## 
## $customer_name
## [1] 795
## 
## $segment
## [1] 3
## 
## $city
## [1] 3636
## 
## $state
## [1] 1094
## 
## $country
## [1] 147
## 
## $market
## [1] 7
## 
## $region
## [1] 13
## 
## $product_id
## [1] 10292
## 
## $category
## [1] 3
## 
## $sub_category
## [1] 17
## 
## $product_name
## [1] 3788
## 
## $order_priority
## [1] 4

月々のオーダーの数と出荷数

毎年11月、12月にかけてオーダー数が増えている。

g1 <- 
  df %>%
  group_by(order_year, order_month) %>% 
  ggplot() + 
  aes(order_date) +
  geom_bar() 

g2 <- 
  g1 <- 
  df %>%
  group_by(order_year, order_month) %>% 
  ggplot() + 
  aes(order_date) +
  geom_bar()

g1/g2

2011年の例

df %>% 
  group_by(order_year, order_date) %>% 
  summarise(total = n()) %>% filter(order_year == 2011, total > 50) 
## # A tibble: 31 × 3
## # Groups:   order_year [1]
##    order_year order_date total
##         <dbl> <date>     <int>
##  1       2011 2011-03-01    53
##  2       2011 2011-06-07    60
##  3       2011 2011-06-22    55
##  4       2011 2011-08-25    53
##  5       2011 2011-09-02    60
##  6       2011 2011-09-08    76
##  7       2011 2011-09-14    63
##  8       2011 2011-09-23    54
##  9       2011 2011-09-26    56
## 10       2011 2011-09-27    57
## # … with 21 more rows

オーダーを受けてから出荷するまでの日付に国ごとのばらつきはあるのか?

There is no significant difference across countires.

# there must be some variations in the gap between the order date and the ship date.
# Q. how is the gap between the order date and the ship date?
df %>% 
  ggplot(aes(gap_date, region)) +
  geom_boxplot()

出荷費用に関しては、どのような国ごとのばらつきはあるのか?

# there must be some variations in the shipping cost across countries.
# Q. how much is that? 
g_corporate <- df %>% 
  filter(segment == "Corporate") %>% 
  ggplot(aes(shipping_cost, market)) +
  geom_boxplot() +
  labs(title = "Corporate")

g_consumer <- df %>% 
  filter(segment == "Consumer") %>% 
  ggplot(aes(shipping_cost, market)) +
  geom_boxplot() + 
  labs(title = "Consumer")

g_home_office <- df %>% 
  filter(segment == "Home Office") %>% 
  ggplot(aes(shipping_cost, market)) +
  geom_boxplot() +
  labs(title = "Home Office")

g_corporate / g_consumer / g_home_office

年毎の利益と地域

# basic info --------------------------------------------------------------

# the ration of regions in profit on the annual basis
df %>% 
  group_by(order_year, order_month, region) %>%
  summarise(sum_profit = sum(profit)) %>% 
  ggplot(aes(order_year, sum_profit, fill = region)) + 
  geom_col()

年ごとの利益

# total profit by year
profit_by_year <- 
  df %>% group_by(order_year) %>% 
  summarise(annual_profit = sum(profit))

# profit ratio by year
df %>% 
  group_by(order_year, order_month, market) %>%
  summarise(sum_profit = sum(profit)) %>% 
  mutate(profit_ratio_by_year = sum_profit/filter(profit_by_year, order_year == order_year)[[2]]) %>% 
  arrange(order_year, desc(sum_profit)) 
## # A tibble: 335 × 5
## # Groups:   order_year, order_month [48]
##    order_year order_month market sum_profit profit_ratio_by_year
##         <dbl>       <dbl> <chr>       <dbl>                <dbl>
##  1       2011           9 EU         13805.               0.0555
##  2       2011          12 APAC       13516.               0.0440
##  3       2011          10 APAC       13496.               0.0439
##  4       2011          12 EU         11540.               0.0464
##  5       2011          11 US          9292.               0.0228
##  6       2011           6 APAC        9216.               0.0300
##  7       2011          12 US          8984.               0.0221
##  8       2011          11 APAC        8951.               0.0291
##  9       2011           9 US          8328.               0.0205
## 10       2011           6 EU          7799.               0.0313
## # … with 325 more rows

どのマーケットが一番儲かっているのか?

Especially the markets in APAc and EU are expanding.

# Q. the most profitable market, product, category, sub_category, 
df %>% 
  group_by(order_year, market) %>% 
  summarise(profit = sum(profit)) %>% 
  arrange(order_year,desc(profit)) %>% 
  top_n(5, profit)
## # A tibble: 20 × 3
## # Groups:   order_year [4]
##    order_year market  profit
##         <dbl> <chr>    <dbl>
##  1       2011 APAC    83032.
##  2       2011 EU      61626.
##  3       2011 US      49544.
##  4       2011 LATAM   36708.
##  5       2011 Africa  10944.
##  6       2012 APAC    89411.
##  7       2012 EU      83985.
##  8       2012 US      61619.
##  9       2012 LATAM   50185.
## 10       2012 Africa  11909.
## 11       2013 APAC   123103.
## 12       2013 EU      98275.
## 13       2013 US      81727.
## 14       2013 LATAM   61415.
## 15       2013 Africa  26687.
## 16       2014 APAC   140454.
## 17       2014 EU     128944.
## 18       2014 US      93508.
## 19       2014 LATAM   73335.
## 20       2014 Africa  39331.

どの国が特に利益を上げているのか?

The presence of China and India is growing.

# Q. which country?
df %>% 
  filter(market %in% c("APAC", "EU")) %>% 
  group_by(order_year, country) %>% 
  summarise(market_profit = sum(profit)) %>% 
  arrange(order_year, desc(market_profit)) %>% 
  top_n(3, market_profit)
## # A tibble: 12 × 3
## # Groups:   order_year [4]
##    order_year country        market_profit
##         <dbl> <chr>                  <dbl>
##  1       2011 China                 33181.
##  2       2011 United Kingdom        20080.
##  3       2011 India                 19929.
##  4       2012 India                 27439.
##  5       2012 United Kingdom        27366.
##  6       2012 China                 26234.
##  7       2013 China                 44474.
##  8       2013 India                 32897.
##  9       2013 France                32416.
## 10       2014 India                 48808.
## 11       2014 China                 46794.
## 12       2014 United Kingdom        36756.

インドと中国では特にどのsubcategoryが売れているのか?

technology, furniture, office suplliesで変らない。

# which category is sold in those region? 
df %>% 
  filter(country %in% c("India", "China")) %>% 
  group_by(order_year, sub_category) %>% 
  summarise(profit_by_subcategory = sum(profit)) %>% 
  arrange(order_year, desc(profit_by_subcategory)) %>% 
  top_n(10, desc(profit_by_subcategory)) 
## # A tibble: 40 × 3
## # Groups:   order_year [4]
##    order_year sub_category profit_by_subcategory
##         <dbl> <chr>                        <dbl>
##  1       2011 Machines                     2599.
##  2       2011 Furnishings                  2008.
##  3       2011 Envelopes                    1187.
##  4       2011 Supplies                     1073.
##  5       2011 Binders                      1022.
##  6       2011 Art                           819.
##  7       2011 Paper                         763.
##  8       2011 Fasteners                     478.
##  9       2011 Labels                        224.
## 10       2011 Tables                      -1465.
## # … with 30 more rows

大手取引先はいるのか?

いるが、それぞれがバラバラ。もっとフォローアップを増やし、one-timeではなく、継続的な大手取引先を作るべき。

df %>% 
  group_by(order_year, customer_id) %>% 
  summarise(profit_by_customer = sum(profit)) %>% 
  arrange(order_year, desc(profit_by_customer)) %>% 
  top_n(10, profit_by_customer) 
## # A tibble: 40 × 3
## # Groups:   order_year [4]
##    order_year customer_id profit_by_customer
##         <dbl> <chr>                    <dbl>
##  1       2011 SC-20095                 5716.
##  2       2011 CA-11965                 3121.
##  3       2011 NM-18445                 2950.
##  4       2011 GT-14710                 2909.
##  5       2011 ON-18715                 2689.
##  6       2011 ER-13855                 2618.
##  7       2011 TB-21400                 2549.
##  8       2011 KN-16390                 2453.
##  9       2011 HL-15040                 2405.
## 10       2011 RB-19330                 2238.
## # … with 30 more rows

大手取引先の属性情報は?

# top10 customers customer's id for every year.
major_customers <- 
  df %>% 
  group_by(order_year, customer_id) %>% 
  summarise(profit_by_customer = sum(profit)) %>% 
  arrange(order_year, desc(profit_by_customer)) %>% 
  top_n(10, profit_by_customer) %>%
  ungroup() %>%  
  select(customer_id) %>% 
  unlist() %>% 
  unname()


df %>% 
  filter(customer_id %in% major_customers) %>%
  group_by(order_year, customer_id, sub_category) %>% 
  summarise(profit_by_subcategory = sum(profit)) %>% 
  arrange(order_year, desc(profit_by_subcategory)) %>% 
  top_n(1, profit_by_subcategory)
## # A tibble: 152 × 4
## # Groups:   order_year, customer_id [152]
##    order_year customer_id sub_category profit_by_subcategory
##         <dbl> <chr>       <chr>                        <dbl>
##  1       2011 SC-20095    Binders                      5480.
##  2       2011 CA-11965    Phones                       2939.
##  3       2011 ER-13855    Appliances                   2476.
##  4       2011 TB-21400    Machines                     2240.
##  5       2011 ON-18715    Chairs                       2125.
##  6       2011 NM-18445    Machines                     1996.
##  7       2011 HL-15040    Phones                       1930.
##  8       2011 DR-12940    Appliances                   1644.
##  9       2011 KN-16390    Tables                       1528.
## 10       2011 GT-14710    Chairs                       1474.
## # … with 142 more rows