store <- read.csv("data-input/Global_Superstore.csv",sep = ";")
head(store)library(dplyr)
glimpse(store)#> Rows: 51,290
#> Columns: 24
#> $ Row.ID <int> 32298, 26341, 25330, 13524, 47221, 22732, 30570, 31192,…
#> $ Order.ID <chr> "CA-2012-124891", "IN-2013-77878", "IN-2013-71249", "ES…
#> $ Order.Date <chr> "31/07/2012", "05/02/2013", "17/10/2013", "28/01/2013",…
#> $ Ship.Date <chr> "31/07/2012", "07/02/2013", "18/10/2013", "30/01/2013",…
#> $ Ship.Mode <chr> "Same Day", "Second Class", "First Class", "First Class…
#> $ Customer.ID <chr> "RH-19495", "JR-16210", "CR-12730", "KM-16375", "RH-949…
#> $ Customer.Name <chr> "Rick Hansen", "Justin Ritter", "Craig Reiter", "Kather…
#> $ Segment <chr> "Consumer", "Corporate", "Consumer", "Home Office", "Co…
#> $ City <chr> "New York City", "Wollongong", "Brisbane", "Berlin", "D…
#> $ State <chr> "New York", "New South Wales", "Queensland", "Berlin", …
#> $ Country <chr> "United States", "Australia", "Australia", "Germany", "…
#> $ Postal.Code <int> 10024, NA, NA, NA, NA, NA, NA, NA, 95823, 28027, 22304,…
#> $ Market <chr> "US", "APAC", "APAC", "EU", "Africa", "APAC", "APAC", "…
#> $ Region <chr> "East", "Oceania", "Oceania", "Central", "Africa", "Oce…
#> $ Product.ID <chr> "TEC-AC-10003033", "FUR-CH-10003950", "TEC-PH-10004664"…
#> $ Category <chr> "Technology", "Furniture", "Technology", "Technology", …
#> $ Sub.Category <chr> "Accessories", "Chairs", "Phones", "Phones", "Copiers",…
#> $ Product.Name <chr> "Plantronics CS510 - Over-the-Head monaural Wireless He…
#> $ Sales <chr> "2309,65", "3709,395", "5175,171", "2892,51", "2832,96"…
#> $ Quantity <int> 7, 9, 9, 5, 8, 5, 4, 6, 5, 13, 5, 5, 4, 7, 12, 4, 9, 14…
#> $ Discount <chr> "0", "0,1", "0,1", "0,1", "0", "0,1", "0", "0", "0,2", …
#> $ Profit <chr> "762,1845", "-288,765", "919,971", "-96,54", "311,52", …
#> $ Shipping.Cost <chr> "933,57", "923,63", "915,49", "910,16", "903,04", "897,…
#> $ Order.Priority <chr> "Critical", "Critical", "Medium", "Medium", "Critical",…
library(readr)
store_clean <- store %>%
mutate(Postal.Code = if_else(is.na(Postal.Code), as.integer("0"), Postal.Code)) %>%
mutate(Order.Date = as.Date(Order.Date, "%d/%m/%Y"),
Ship.Date = as.Date(Ship.Date, "%d/%m/%Y"),
Ship.Mode = as.factor(Ship.Mode),
Segment = as.factor(Segment),
Market = as.factor(Market),
Region = as.factor(Region),
Category = as.factor(Category),
Sub.Category = as.factor(Sub.Category),
Order.Priority = as.factor(Order.Priority),
Postal.Code = as.integer(Postal.Code),
Profit = parse_number(Profit),
Shipping.Cost = parse_number(Shipping.Cost),
Sales = parse_number(Sales)
)In this analysis, we are aggregating data to extract meaningful insights about the sub-categories of products based on sales, profit, and net income for the year 2011.
library(lubridate)
agg_1 <- store_clean %>%
mutate(order_year = year(ymd(Order.Date))) %>%
filter(order_year == 2011) %>%
group_by(Sub.Category) %>%
summarise(Sales, Profit, net_income = Sales - Profit) %>%
arrange(desc(Profit)) %>%
slice(1)
agg_1dataframe provides us with the sub-category that generated the highest profit in 2011, along with its corresponding sales and net income figures. By aggregating the data in this way, we can gain insights into the most profitable sub-category and assess its financial performance compared to others.
The objective is to identify the top shipping cost for each country in the year 2011, considering additional conditions on shipping cost and sales.
agg_2 <- store_clean %>%
mutate(order_year = year(ymd(Order.Date))) %>%
filter(order_year == 2011 & Shipping.Cost <= 200000 & Sales <= 9000000) %>%
group_by(Country) %>%
slice(1) %>%
arrange(desc(Shipping.Cost))
agg_2The resulting dataset provides us with the top shipping cost for each country that satisfies the specified conditions. This analysis allows us to identify countries with the highest shipping costs within the given criteria, helping us gain insights into the shipping dynamics and costs associated with different countries in 2011.
library(ggplot2)
library(scales)
library(plotly)
library(glue)
plot_1<- agg_1 %>%
ggplot(mapping = aes(y = reorder(Sub.Category,Profit),
x = Profit,
fill = Profit,
text = glue("Sales: ${comma(Sales)}\nProfit: ${comma(Profit)}\nNet Income: ${comma(net_income)}"))) +
geom_col() +
labs(title = "Profit of Sub Category Product in 2011",
x = "Profit",
y = "Sub Category Product",
caption = "Source: Global Superstore") +
scale_x_continuous(labels = dollar_format(prefix = "$ ")) +
theme_minimal() +
theme(legend.position = "none")
ggplotly(p = plot_1, tooltip = "text")plot_2 <- agg_2 %>%
ggplot(mapping = aes(x = Shipping.Cost,
y = Sales)) +
geom_jitter(aes(col = Ship.Mode,
text = glue("Product Name: {Product.Name}
Shipping Mode: {Ship.Mode}
Order Priority: {Order.Priority}
Shipping Cost: ${comma(Shipping.Cost)}"))) +
geom_smooth(method = loess, formula = y ~ x) +
labs(title = "Scatter Plot of Sales vs Shipping Cost",
x = "Shipping Cost",
y = "Sales") +
scale_x_continuous(labels = scales::comma) +
scale_y_continuous(labels = scales::comma) +
theme_minimal() +
theme(legend.position = "none")ggplotly(plot_2, tooltip= "text")