Merging

Author

WD

Setup + Importing

remove(list = ls()) 
sales <- read.csv("~/Downloads/sales.csv")
products <- read.csv("~/Downloads/products (1).csv")

Checking ID

?unique
length(unique(sales$Sale_ID))
[1] 829262
  • Sale_ID is unique in sales df

  • Product_ID is unique in products df

  • Cannot merge sales data with products data based on their unique ID’s

  • Thus we use product ID

Merging Data

sales_products <- merge(
  x     = sales,
  y     = products,
  by    = "Product_ID",
  all.x = TRUE
)

Viewing Data

glimpse(sales_products)
Rows: 829,262
Columns: 9
$ Product_ID       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ Sale_ID          <int> 209473, 818753, 1956, 443121, 561047, 147861, 800529,…
$ Date             <chr> "2017-07-11", "2018-09-23", "2017-01-02", "2018-01-16…
$ Store_ID         <int> 46, 2, 10, 17, 34, 41, 4, 43, 45, 36, 9, 22, 39, 17, …
$ Units            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ Product_Name     <chr> "Action Figure", "Action Figure", "Action Figure", "A…
$ Product_Category <chr> "Toys", "Toys", "Toys", "Toys", "Toys", "Toys", "Toys…
$ Product_Cost     <chr> "$9.99 ", "$9.99 ", "$9.99 ", "$9.99 ", "$9.99 ", "$9…
$ Product_Price    <chr> "$15.99 ", "$15.99 ", "$15.99 ", "$15.99 ", "$15.99 "…

Importing 2

stores <- read.csv("~/Downloads/stores.csv")
inventory <- read.csv("~/Downloads/inventory.csv")

Merging Data

# Merging Data 2
sales_products_stores <- merge(
  x     = sales_products,
  y     = stores,
  by    = "Store_ID",   # <- was Product_ID before, now correct
  all.x = TRUE
)

# Bring in inventory (by both store and product)
sales_products_stores_inventory <- merge(
  x     = sales_products_stores,
  y     = inventory,
  by    = c("Store_ID", "Product_ID"),
  all.x = TRUE
)
remove(sales)
remove(inventory)
remove(stores)
remove(products)
remove(sales_products)
remove(sales_products_stores)

Charts

library(ggplot2)
ggplot(sales_products_stores_inventory, aes(x = Stock_On_Hand)) +
  geom_density() 
Warning: Removed 6232 rows containing non-finite outside the scale range
(`stat_density()`).