# Load libraries
pacman::p_load(tidyverse, magrittr, GGally, lubridate, gridExtra,
install = FALSE, update = FALSE)
# Function to remove/replace characters
convert <- function(x){
x %>% str_replace_all(c("\\(" = "-",
"\\$" = "",
"\\)" = "",
"\\," = ""
)
) %>%
as.numeric()
}
# Load Global Sales 2016
GlobalSales <- read_csv("G:/My Drive/homework/Lovely J/M6_project_dataset.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_character(),
## RowID = col_double(),
## PostalCode = col_double(),
## Quantity = col_double(),
## Discount = col_double()
## )
## i Use `spec()` for the full column specifications.
# Variable Type Correction
GlobalSales %<>%
mutate(across(.cols = c(OrderDate, ShipDate),
mdy),
across(.cols = c(Country, Region, Market, Segment,
Category, SubCategory, OrderPriority, ShipMode,
PostalCode),
as_factor
), # City, State UTF-8 errors, iconv(., to = "UTF-8")?
across(.cols = c(Sales, Profit, ShippingCost),
convert
),
Discount = 100 * Discount # proportion to %
)
# Suggested project code
M6_project_dataset <-
read_csv("Lovely J/M6_project_dataset.csv",
col_types = cols(RowID = col_integer(),
OrderDate = col_date(format = "%m/%d/%Y"),
ShipDate = col_date(format = "%m/%d/%Y"),
Sales = col_number(), Discount = col_number(),
ShippingCost = col_number()
)
)
# still need to fix Profit & add factor levels
# Function to create statistic x variable (row x column) summary
sum.fun <- function(x){
return(c(mean(x),
median(x),
sd(x),
max(x) - min(x)
)
)
}
GlobalSales %>%
select(Sales:ShippingCost) %>%
lapply(., sum.fun) %>%
as.data.frame() %>%
round(digits = 2) ->
sales.summary
row.names(sales.summary) <- c("Mean", "Median", "sd", "Range")
sales.summary
## Sales Quantity Discount Profit ShippingCost
## Mean 398.41 4.11 11.00 49.55 37.19
## Median 350.84 3.00 0.00 45.42 40.62
## sd 293.95 2.50 17.29 147.39 18.77
## Range 3406.28 13.00 80.00 1897.94 393.48
# Sales
p1 <-
GlobalSales %>%
ggplot(aes(y = Sales)) +
geom_boxplot() +
theme(axis.title = element_blank(),
axis.text.x = element_blank())
p2 <-
GlobalSales %>%
ggplot(aes(Sales)) +
geom_histogram() +
theme(axis.title = element_blank()
)
grid.arrange(p1, p2, nrow = 1, top = "Sales")
# Quantity
p1 <-
GlobalSales %>%
ggplot(aes(y = Quantity)) +
geom_boxplot() +
theme(axis.title = element_blank(),
axis.text.x = element_blank())
p2 <-
GlobalSales %>%
ggplot(aes(Quantity)) +
geom_histogram() +
theme(axis.title = element_blank()
)
grid.arrange(p1, p2, nrow = 1, top = "Quantity")
# Discount
p1 <-
GlobalSales %>%
ggplot(aes(y = Discount)) +
geom_boxplot() +
theme(axis.title = element_blank(),
axis.text.x = element_blank())
p2 <-
GlobalSales %>%
ggplot(aes(Discount)) +
geom_histogram() +
theme(axis.title = element_blank()
)
grid.arrange(p1, p2, nrow = 1, top = "Discount %")
# Profit
p1 <-
GlobalSales %>%
ggplot(aes(y = Profit)) +
geom_boxplot() +
theme(axis.title = element_blank(),
axis.text.x = element_blank())
p2 <-
GlobalSales %>%
ggplot(aes(Profit)) +
geom_histogram() +
theme(axis.title = element_blank()
)
grid.arrange(p1, p2, nrow = 1, top = "Profit")
# ShippingCost
p1 <-
GlobalSales %>%
ggplot(aes(y = ShippingCost)) +
geom_boxplot() +
theme(axis.title = element_blank(),
axis.text.x = element_blank())
p2 <-
GlobalSales %>%
ggplot(aes(ShippingCost)) +
geom_histogram() +
theme(axis.title = element_blank()
)
grid.arrange(p1, p2, nrow = 1, top = "Shipping Cost")
# Choose 5 & graph 3
GlobalSales %>%
select(where(is.factor)) %>%
gather() %>%
drop_na() %>%
ggplot(aes(value)) +
geom_bar() +
facet_wrap(vars(key), scales = "free")
## Warning: attributes are not identical across measure variables;
## they will be dropped
# + theme(axis.text.x=element_text(angle = 90, hjust = 0))
# Categorical: Category, OrderPriority, Segment, ShipMode, Market
# Numerical: Sales:ShippingCost
tapply(GlobalSales$Discount, INDEX = GlobalSales$Market, FUN = mean) %>%
round(digits = 2)
## Europe Asia Pacific LATAM Africa USCA
## 8.52 12.67 10.97 3.39 14.68
GlobalSales %>%
select(Discount, Market) %>%
group_by(Market) %>%
summarise(Mean.Discount = mean(Discount) %>% round(digits = 2),
Std.Dev.Discount = sd(Discount),
Count = NROW(Discount)
)
## # A tibble: 5 x 4
## Market Mean.Discount Std.Dev.Discount Count
## * <fct> <dbl> <dbl> <int>
## 1 Europe 8.52 15.0 261
## 2 Asia Pacific 12.7 17.9 302
## 3 LATAM 11.0 17.4 205
## 4 Africa 3.39 15.1 62
## 5 USCA 14.7 18.7 170
GlobalSales %>%
ggplot(aes(Discount, Market)) +
geom_boxplot(varwidth = TRUE) +
labs(title = "Discounts per Market", x = "Discount Percent")
GlobalSales %>%
filter(Market == "Africa") %>%
select(Discount, ShipMode) %>%
group_by(ShipMode) %>%
summarize(Mean.Discount = mean(Discount) %>% round(digits = 2),
Count.Discount = NROW(Discount)
)
## # A tibble: 4 x 3
## ShipMode Mean.Discount Count.Discount
## * <fct> <dbl> <int>
## 1 Standard Class 0 30
## 2 Second Class 5.83 12
## 3 First Class 9.33 15
## 4 Same Day 0 5
GlobalSales %>%
ggplot(aes(Market, Discount, fill = ShipMode)) +
geom_boxplot(varwidth = TRUE) +
labs(title = "Discounts per Market by Ship Mode")
p <- GlobalSales %>% ggplot(aes(Market, Discount)) + geom_boxplot(varwidth = TRUE)
p + facet_grid(rows = vars(ShipMode))
p + facet_grid(cols = vars(ShipMode)) + theme(axis.text.x=element_text(angle = 90, hjust = 0))
GlobalSales %>%
filter(Market == "Africa") %>%
ggplot(aes(ShipMode, fill = Discount)) +
geom_bar() +
labs(title = "Africa")
GlobalSales %>%
filter(Market == "Africa") %>%
ggplot(aes(Discount, fill = ShipMode)) +
geom_bar() +
labs(title = "Africa")
## Warning: position_stack requires non-overlapping x intervals
GlobalSales %>%
group_by(Market) %>%
summarize(Profit = sum(Profit))
## # A tibble: 5 x 2
## Market Profit
## * <fct> <dbl>
## 1 Europe 15460.
## 2 Asia Pacific 16168.
## 3 LATAM 9418.
## 4 Africa 5295.
## 5 USCA 3206.
GlobalSales %>%
group_by(Market) %>%
summarize(Profit = sum(Profit)) %>%
ggplot(aes(Market, Profit)) +
geom_col() +
labs(title = "Total Profit", y = "Dollars")
GlobalSales %>%
select(Sales:ShippingCost) %>%
as.data.frame() %>%
stargazer::stargazer(.,
type = "text",
summary.stat = c("mean", "median", "sd", "min", "max"))
##
## ==========================================================
## Statistic Mean Median St. Dev. Min Max
## ----------------------------------------------------------
## Sales 398.406 350.835 293.948 1.720 3,408.000
## Quantity 4.113 3 2.504 1 14
## Discount 11.004 0 17.288 0 80
## Profit 49.549 45.420 147.395 -1,121.690 776.250
## ShippingCost 37.191 40.625 18.772 1.090 394.570
## ----------------------------------------------------------
GlobalSales %>%
select(Sales:ShippingCost) %>%
gather(key = Variable) %>%
group_by(Variable) %>%
summarize(Mean = mean(value),
Median = median(value),
Std.Dev. = sd(value),
Range = max(value) - min(value),
)
## # A tibble: 5 x 5
## Variable Mean Median Std.Dev. Range
## * <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Discount 11.0 0 17.3 80
## 2 Profit 49.5 45.4 147. 1898.
## 3 Quantity 4.11 3 2.50 13
## 4 Sales 398. 351. 294. 3406.
## 5 ShippingCost 37.2 40.6 18.8 393.
# GlobalSales %>% glimpse()
GlobalSales %>%
lapply(., class) %>%
as.matrix() %>%
as.data.frame()
## V1
## RowID numeric
## OrderID character
## OrderDate Date
## ShipDate Date
## CustomerID character
## CustomerName character
## PostalCode factor
## City character
## State character
## Country factor
## Region factor
## Market factor
## Segment factor
## ProductID character
## Category factor
## SubCategory factor
## ProductName character
## OrderPriority factor
## ShipMode factor
## Sales numeric
## Quantity numeric
## Discount numeric
## Profit numeric
## ShippingCost numeric
# Summary
# GlobalSales %>% select(Sales:ShippingCost) %>% summary()
GlobalSales %>%
mutate(across(.cols = c(City, State, Quantity),
as_factor
)
) %>%
summary()
## RowID OrderID OrderDate ShipDate
## Min. : 213 Length:1000 Min. :2012-01-04 Min. :2012-01-09
## 1st Qu.:12218 Class :character 1st Qu.:2013-07-02 1st Qu.:2013-07-05
## Median :23220 Mode :character Median :2014-08-09 Median :2014-08-13
## Mean :23953 Mean :2014-05-30 Mean :2014-06-03
## 3rd Qu.:34668 3rd Qu.:2015-06-03 3rd Qu.:2015-06-08
## Max. :51273 Max. :2015-12-31 Max. :2016-01-04
##
## CustomerID CustomerName PostalCode City
## Length:1000 Length:1000 10011 : 10 New York City: 21
## Class :character Class :character 90032 : 9 Manila : 17
## Mode :character Mode :character 19140 : 7 Los Angeles : 16
## 43229 : 6 Philadelphia : 16
## 98026 : 6 Seattle : 12
## (Other):129 Santo Domingo: 11
## NA's :833 (Other) :907
## State Country Region
## England : 30 United States:167 Western Europe :136
## California : 29 France : 67 Central America :100
## New York : 25 Australia : 65 Oceania : 78
## National Capital: 20 Mexico : 51 Southeastern Asia: 72
## Ile-de-France : 19 China : 50 South America : 63
## Victoria : 18 Germany : 45 Eastern US : 61
## (Other) :859 (Other) :555 (Other) :490
## Market Segment ProductID Category
## Europe :261 Corporate :292 Length:1000 Office Supplies:336
## Asia Pacific:302 Home Office:155 Class :character Technology :355
## LATAM :205 Consumer :553 Mode :character Furniture :309
## Africa : 62
## USCA :170
##
##
## SubCategory ProductName OrderPriority ShipMode
## Chairs :123 Length:1000 Medium :508 Standard Class:551
## Copiers :120 Class :character Low : 45 Second Class :204
## Bookcases :114 Mode :character High :342 First Class :181
## Phones :113 Critical:105 Same Day : 64
## Storage :109
## Accessories: 72
## (Other) :349
## Sales Quantity Discount Profit
## Min. : 1.72 2 :211 Min. : 0 Min. :-1121.690
## 1st Qu.: 210.82 3 :200 1st Qu.: 0 1st Qu.: 4.963
## Median : 350.83 4 :145 Median : 0 Median : 45.420
## Mean : 398.41 5 :106 Mean :11 Mean : 49.549
## 3rd Qu.: 533.58 1 : 92 3rd Qu.:20 3rd Qu.: 113.782
## Max. :3408.00 6 : 86 Max. :80 Max. : 776.250
## (Other):160
## ShippingCost
## Min. : 1.09
## 1st Qu.: 39.09
## Median : 40.62
## Mean : 37.19
## 3rd Qu.: 42.27
## Max. :394.57
##
# Count incomplete cases
GlobalSales %>% filter(!complete.cases(.)) %>% nrow()
## [1] 833
# Random columns and rows
GlobalSales %>% slice_sample(n = 10) %>% select(sample(1:ncol(GlobalSales), 7))
## # A tibble: 10 x 7
## Country ShipMode Profit OrderID Discount Quantity Segment
## <fct> <fct> <dbl> <chr> <dbl> <dbl> <fct>
## 1 Cuba Standard Cl~ 17.8 MX-2014-SM20950~ 0 3 Corporate
## 2 United Sta~ Standard Cl~ 18.6 CA-2013-BS11800~ 0 3 Home Off~
## 3 United Kin~ Second Class 46.2 ES-2013-RO19780~ 10 5 Consumer
## 4 Australia Second Class -6.03 IN-2014-JR16210~ 10 5 Corporate
## 5 Australia Second Class 0 ID-2015-RB19330~ 10 14 Consumer
## 6 New Zealand Second Class 76.1 IN-2014-SF20065~ 0 6 Consumer
## 7 Australia Same Day 73.3 IN-2014-SB20170~ 10 9 Consumer
## 8 France Second Class 48.4 ES-2013-CG12520~ 0 5 Consumer
## 9 Indonesia Second Class -143. ID-2014-TH21100~ 27 4 Consumer
## 10 Italy Standard Cl~ -76.8 ES-2014-LC17140~ 50 2 Consumer
GlobalSales %>%
select(Sales:ShippingCost) %>%
gather() %>%
ggplot(aes(value)) +
geom_histogram() +
facet_wrap(vars(key), scales = "free")
GlobalSales %>%
select(Sales:ShippingCost) %>%
gather() %>%
ggplot(aes(y = value)) +
geom_boxplot() +
facet_wrap(vars(key), scales = "free")
# All pairs of numerical and categorical data
GlobalSales %>%
select(Category, OrderPriority, Segment, ShipMode, Market, Sales:ShippingCost) %>%
ggpairs(axisLabels = "none", title = "All Markets")
# All pairs of numerical and categorical data filtered for Africa
GlobalSales %>%
filter(Market == "Africa") %>%
select(Category, OrderPriority, Segment, ShipMode, Market, Sales:ShippingCost) %>%
ggpairs(axisLabels = "none", title = "Africa")