Task 1

# Load libraries
pacman::p_load(tidyverse, magrittr, GGally, lubridate, gridExtra,
               install = FALSE, update = FALSE)

# Function to remove/replace characters
convert <- function(x){
  x %>% str_replace_all(c("\\(" = "-",
                          "\\$" = "",
                          "\\)" = "",
                          "\\," = ""
                          )
                        ) %>%
    as.numeric()
}

# Load Global Sales 2016
GlobalSales <- read_csv("G:/My Drive/homework/Lovely J/M6_project_dataset.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_character(),
##   RowID = col_double(),
##   PostalCode = col_double(),
##   Quantity = col_double(),
##   Discount = col_double()
## )
## i Use `spec()` for the full column specifications.
# Variable Type Correction
GlobalSales %<>%
  mutate(across(.cols = c(OrderDate, ShipDate),
                mdy),
         across(.cols = c(Country, Region, Market, Segment,
                          Category, SubCategory, OrderPriority, ShipMode,
                          PostalCode),
                as_factor
                ), # City, State UTF-8 errors, iconv(., to = "UTF-8")?
         across(.cols = c(Sales, Profit, ShippingCost),
                convert
                ),
         Discount = 100 * Discount # proportion to %
         )
# Suggested project code
M6_project_dataset <-
  read_csv("Lovely J/M6_project_dataset.csv",
           col_types = cols(RowID = col_integer(),
                            OrderDate = col_date(format = "%m/%d/%Y"), 
                            ShipDate = col_date(format = "%m/%d/%Y"), 
                            Sales = col_number(), Discount = col_number(),
                            ShippingCost = col_number()
                            )
           )
# still need to fix Profit & add factor levels

Task 2

# Function to create statistic x variable (row x column) summary
sum.fun <- function(x){
  return(c(mean(x),
           median(x),
           sd(x),
           max(x) - min(x)
           )
         )
}

GlobalSales %>%
  select(Sales:ShippingCost) %>%
  lapply(., sum.fun) %>%
  as.data.frame() %>%
  round(digits = 2) ->
  sales.summary

row.names(sales.summary) <- c("Mean", "Median", "sd", "Range")
sales.summary
##          Sales Quantity Discount  Profit ShippingCost
## Mean    398.41     4.11    11.00   49.55        37.19
## Median  350.84     3.00     0.00   45.42        40.62
## sd      293.95     2.50    17.29  147.39        18.77
## Range  3406.28    13.00    80.00 1897.94       393.48

Task 3

# Sales
p1 <- 
  GlobalSales %>%
  ggplot(aes(y = Sales)) +
  geom_boxplot() +
  theme(axis.title = element_blank(),
        axis.text.x = element_blank())
p2 <- 
  GlobalSales %>% 
  ggplot(aes(Sales)) +
  geom_histogram() +
  theme(axis.title = element_blank()
        )
grid.arrange(p1, p2, nrow = 1, top = "Sales")

# Quantity
p1 <- 
  GlobalSales %>%
  ggplot(aes(y = Quantity)) +
  geom_boxplot() +
  theme(axis.title = element_blank(),
        axis.text.x = element_blank())
p2 <- 
  GlobalSales %>% 
  ggplot(aes(Quantity)) +
  geom_histogram() +
  theme(axis.title = element_blank()
        )
grid.arrange(p1, p2, nrow = 1, top = "Quantity")

# Discount
p1 <- 
  GlobalSales %>%
  ggplot(aes(y = Discount)) +
  geom_boxplot() +
  theme(axis.title = element_blank(),
        axis.text.x = element_blank())
p2 <- 
  GlobalSales %>% 
  ggplot(aes(Discount)) +
  geom_histogram() +
  theme(axis.title = element_blank()
        )
grid.arrange(p1, p2, nrow = 1, top = "Discount %")

# Profit
p1 <- 
  GlobalSales %>%
  ggplot(aes(y = Profit)) +
  geom_boxplot() +
  theme(axis.title = element_blank(),
        axis.text.x = element_blank())
p2 <- 
  GlobalSales %>% 
  ggplot(aes(Profit)) +
  geom_histogram() +
  theme(axis.title = element_blank()
        )
grid.arrange(p1, p2, nrow = 1, top = "Profit")

# ShippingCost
p1 <- 
  GlobalSales %>%
  ggplot(aes(y = ShippingCost)) +
  geom_boxplot() +
  theme(axis.title = element_blank(),
        axis.text.x = element_blank())
p2 <- 
  GlobalSales %>% 
  ggplot(aes(ShippingCost)) +
  geom_histogram() +
  theme(axis.title = element_blank()
        )
grid.arrange(p1, p2, nrow = 1, top = "Shipping Cost")

Task 4

# Choose 5 & graph 3
GlobalSales %>%
  select(where(is.factor)) %>%
  gather() %>%
  drop_na() %>%
  ggplot(aes(value)) +
  geom_bar() +
  facet_wrap(vars(key), scales = "free") 
## Warning: attributes are not identical across measure variables;
## they will be dropped

  # + theme(axis.text.x=element_text(angle = 90, hjust = 0))

Task 5

# Categorical: Category, OrderPriority, Segment, ShipMode, Market
# Numerical: Sales:ShippingCost

tapply(GlobalSales$Discount, INDEX = GlobalSales$Market, FUN = mean) %>%
  round(digits = 2)
##       Europe Asia Pacific        LATAM       Africa         USCA 
##         8.52        12.67        10.97         3.39        14.68
GlobalSales %>%
  select(Discount, Market) %>%
  group_by(Market) %>%
  summarise(Mean.Discount = mean(Discount) %>% round(digits = 2),
            Std.Dev.Discount = sd(Discount),
            Count = NROW(Discount)
            )
## # A tibble: 5 x 4
##   Market       Mean.Discount Std.Dev.Discount Count
## * <fct>                <dbl>            <dbl> <int>
## 1 Europe                8.52             15.0   261
## 2 Asia Pacific         12.7              17.9   302
## 3 LATAM                11.0              17.4   205
## 4 Africa                3.39             15.1    62
## 5 USCA                 14.7              18.7   170
GlobalSales %>%
  ggplot(aes(Discount, Market)) +
  geom_boxplot(varwidth = TRUE) +
  labs(title = "Discounts per Market", x = "Discount Percent")

Task 6

GlobalSales %>%
  filter(Market == "Africa") %>%
  select(Discount, ShipMode) %>%
  group_by(ShipMode) %>%
  summarize(Mean.Discount = mean(Discount) %>% round(digits = 2),
            Count.Discount = NROW(Discount)
            )
## # A tibble: 4 x 3
##   ShipMode       Mean.Discount Count.Discount
## * <fct>                  <dbl>          <int>
## 1 Standard Class          0                30
## 2 Second Class            5.83             12
## 3 First Class             9.33             15
## 4 Same Day                0                 5
GlobalSales %>%
  ggplot(aes(Market, Discount, fill = ShipMode)) +
  geom_boxplot(varwidth = TRUE) +
  labs(title = "Discounts per Market by Ship Mode")

p <- GlobalSales %>% ggplot(aes(Market, Discount)) + geom_boxplot(varwidth = TRUE)
p + facet_grid(rows = vars(ShipMode))

p + facet_grid(cols = vars(ShipMode)) + theme(axis.text.x=element_text(angle = 90, hjust = 0))

GlobalSales %>%
  filter(Market == "Africa") %>%
  ggplot(aes(ShipMode, fill = Discount)) +
  geom_bar() +
  labs(title = "Africa")

GlobalSales %>%
  filter(Market == "Africa") %>%
  ggplot(aes(Discount, fill = ShipMode)) +
  geom_bar() +
  labs(title = "Africa")
## Warning: position_stack requires non-overlapping x intervals

GlobalSales %>%
  group_by(Market) %>%
  summarize(Profit = sum(Profit))
## # A tibble: 5 x 2
##   Market       Profit
## * <fct>         <dbl>
## 1 Europe       15460.
## 2 Asia Pacific 16168.
## 3 LATAM         9418.
## 4 Africa        5295.
## 5 USCA          3206.
GlobalSales %>%
  group_by(Market) %>%
  summarize(Profit = sum(Profit)) %>%
  ggplot(aes(Market, Profit)) +
  geom_col() +
  labs(title = "Total Profit", y = "Dollars")

Extra EDA

GlobalSales %>%
  select(Sales:ShippingCost) %>%
  as.data.frame() %>%
  stargazer::stargazer(.,
            type = "text",
            summary.stat = c("mean", "median", "sd", "min", "max"))
## 
## ==========================================================
## Statistic     Mean   Median  St. Dev.    Min        Max   
## ----------------------------------------------------------
## Sales        398.406 350.835 293.948    1.720    3,408.000
## Quantity      4.113     3     2.504       1         14    
## Discount     11.004     0     17.288      0         80    
## Profit       49.549  45.420  147.395  -1,121.690  776.250 
## ShippingCost 37.191  40.625   18.772    1.090     394.570 
## ----------------------------------------------------------
GlobalSales %>%
  select(Sales:ShippingCost) %>%
  gather(key = Variable) %>%
  group_by(Variable) %>%
  summarize(Mean = mean(value),
            Median = median(value),
            Std.Dev. = sd(value),
            Range = max(value) - min(value),
            )
## # A tibble: 5 x 5
##   Variable       Mean Median Std.Dev. Range
## * <chr>         <dbl>  <dbl>    <dbl> <dbl>
## 1 Discount      11.0     0      17.3    80 
## 2 Profit        49.5    45.4   147.   1898.
## 3 Quantity       4.11    3       2.50   13 
## 4 Sales        398.    351.    294.   3406.
## 5 ShippingCost  37.2    40.6    18.8   393.
# GlobalSales %>% glimpse()
GlobalSales %>%
  lapply(., class) %>%
  as.matrix() %>%
  as.data.frame()
##                      V1
## RowID           numeric
## OrderID       character
## OrderDate          Date
## ShipDate           Date
## CustomerID    character
## CustomerName  character
## PostalCode       factor
## City          character
## State         character
## Country          factor
## Region           factor
## Market           factor
## Segment          factor
## ProductID     character
## Category         factor
## SubCategory      factor
## ProductName   character
## OrderPriority    factor
## ShipMode         factor
## Sales           numeric
## Quantity        numeric
## Discount        numeric
## Profit          numeric
## ShippingCost    numeric
# Summary
# GlobalSales %>% select(Sales:ShippingCost) %>% summary()
GlobalSales %>% 
  mutate(across(.cols = c(City, State, Quantity),
                as_factor
                )
         ) %>%
  summary()
##      RowID         OrderID            OrderDate             ShipDate         
##  Min.   :  213   Length:1000        Min.   :2012-01-04   Min.   :2012-01-09  
##  1st Qu.:12218   Class :character   1st Qu.:2013-07-02   1st Qu.:2013-07-05  
##  Median :23220   Mode  :character   Median :2014-08-09   Median :2014-08-13  
##  Mean   :23953                      Mean   :2014-05-30   Mean   :2014-06-03  
##  3rd Qu.:34668                      3rd Qu.:2015-06-03   3rd Qu.:2015-06-08  
##  Max.   :51273                      Max.   :2015-12-31   Max.   :2016-01-04  
##                                                                              
##   CustomerID        CustomerName         PostalCode             City    
##  Length:1000        Length:1000        10011  : 10   New York City: 21  
##  Class :character   Class :character   90032  :  9   Manila       : 17  
##  Mode  :character   Mode  :character   19140  :  7   Los Angeles  : 16  
##                                        43229  :  6   Philadelphia : 16  
##                                        98026  :  6   Seattle      : 12  
##                                        (Other):129   Santo Domingo: 11  
##                                        NA's   :833   (Other)      :907  
##               State              Country                  Region   
##  England         : 30   United States:167   Western Europe   :136  
##  California      : 29   France       : 67   Central America  :100  
##  New York        : 25   Australia    : 65   Oceania          : 78  
##  National Capital: 20   Mexico       : 51   Southeastern Asia: 72  
##  Ile-de-France   : 19   China        : 50   South America    : 63  
##  Victoria        : 18   Germany      : 45   Eastern US       : 61  
##  (Other)         :859   (Other)      :555   (Other)          :490  
##           Market           Segment     ProductID                    Category  
##  Europe      :261   Corporate  :292   Length:1000        Office Supplies:336  
##  Asia Pacific:302   Home Office:155   Class :character   Technology     :355  
##  LATAM       :205   Consumer   :553   Mode  :character   Furniture      :309  
##  Africa      : 62                                                             
##  USCA        :170                                                             
##                                                                               
##                                                                               
##       SubCategory  ProductName         OrderPriority           ShipMode  
##  Chairs     :123   Length:1000        Medium  :508   Standard Class:551  
##  Copiers    :120   Class :character   Low     : 45   Second Class  :204  
##  Bookcases  :114   Mode  :character   High    :342   First Class   :181  
##  Phones     :113                      Critical:105   Same Day      : 64  
##  Storage    :109                                                         
##  Accessories: 72                                                         
##  (Other)    :349                                                         
##      Sales            Quantity      Discount      Profit         
##  Min.   :   1.72   2      :211   Min.   : 0   Min.   :-1121.690  
##  1st Qu.: 210.82   3      :200   1st Qu.: 0   1st Qu.:    4.963  
##  Median : 350.83   4      :145   Median : 0   Median :   45.420  
##  Mean   : 398.41   5      :106   Mean   :11   Mean   :   49.549  
##  3rd Qu.: 533.58   1      : 92   3rd Qu.:20   3rd Qu.:  113.782  
##  Max.   :3408.00   6      : 86   Max.   :80   Max.   :  776.250  
##                    (Other):160                                   
##   ShippingCost   
##  Min.   :  1.09  
##  1st Qu.: 39.09  
##  Median : 40.62  
##  Mean   : 37.19  
##  3rd Qu.: 42.27  
##  Max.   :394.57  
## 
# Count incomplete cases
GlobalSales %>% filter(!complete.cases(.)) %>% nrow()
## [1] 833
# Random columns and rows
GlobalSales %>% slice_sample(n = 10) %>% select(sample(1:ncol(GlobalSales), 7))
## # A tibble: 10 x 7
##    Country     ShipMode      Profit OrderID          Discount Quantity Segment  
##    <fct>       <fct>          <dbl> <chr>               <dbl>    <dbl> <fct>    
##  1 Cuba        Standard Cl~   17.8  MX-2014-SM20950~        0        3 Corporate
##  2 United Sta~ Standard Cl~   18.6  CA-2013-BS11800~        0        3 Home Off~
##  3 United Kin~ Second Class   46.2  ES-2013-RO19780~       10        5 Consumer 
##  4 Australia   Second Class   -6.03 IN-2014-JR16210~       10        5 Corporate
##  5 Australia   Second Class    0    ID-2015-RB19330~       10       14 Consumer 
##  6 New Zealand Second Class   76.1  IN-2014-SF20065~        0        6 Consumer 
##  7 Australia   Same Day       73.3  IN-2014-SB20170~       10        9 Consumer 
##  8 France      Second Class   48.4  ES-2013-CG12520~        0        5 Consumer 
##  9 Indonesia   Second Class -143.   ID-2014-TH21100~       27        4 Consumer 
## 10 Italy       Standard Cl~  -76.8  ES-2014-LC17140~       50        2 Consumer
GlobalSales %>%
  select(Sales:ShippingCost) %>%
  gather() %>%
  ggplot(aes(value)) +
  geom_histogram() +
  facet_wrap(vars(key), scales = "free")

GlobalSales %>%
  select(Sales:ShippingCost) %>%
  gather() %>%
  ggplot(aes(y = value)) +
  geom_boxplot() +
  facet_wrap(vars(key), scales = "free")

# All pairs of numerical and categorical data
GlobalSales %>%
  select(Category, OrderPriority, Segment, ShipMode, Market, Sales:ShippingCost) %>%
  ggpairs(axisLabels = "none", title = "All Markets")

# All pairs of numerical and categorical data filtered for Africa
GlobalSales %>%
  filter(Market == "Africa") %>%
  select(Category, OrderPriority, Segment, ShipMode, Market, Sales:ShippingCost) %>%
  ggpairs(axisLabels = "none", title = "Africa")