Merging

Import

  • Four datasets
remove (list=ls())

library(readxl)
?readxl

#inventory <- read_csv("~/Downloads/inventory.csv")

# View(inventory)

# dim(inventory)
products <- read.csv("~/Desktop/products.csv")
sales <- read.csv("~/Desktop/sales.csv")
stores <- read.csv("~/Desktop/stores.csv")
`inventory.(1)` <- read.csv("~/Desktop/inventory (1).csv")

Basic EDA on each file

Stores

names(stores)
[1] "Store_ID"        "Store_Name"      "Store_City"      "Store_Location" 
[5] "Store_Open_Date"
length(stores$Store_ID)
[1] 50
table(stores$Store_City)

  Aguascalientes         Campeche         Chetumal        Chihuahua 
               1                2                1                2 
    Chilpancingo  Ciudad Victoria       Cuernavaca Cuidad de Mexico 
               1                1                1                4 
        Culiacan          Durango      Guadalajara       Guanajuato 
               1                1                4                3 
      Hermosillo           La Paz           Merida         Mexicali 
               3                1                1                2 
       Monterrey          Morelia           Oaxaca          Pachuca 
               4                1                1                1 
          Puebla         Saltillo  San Luis Potosi         Santiago 
               3                2                1                1 
          Toluca Tuxtla Gutierrez     Villahermosa           Xalapa 
               2                1                1                2 
       Zacatecas 
               1 
table(stores$Store_Location)

    Airport  Commercial    Downtown Residential 
          3          12          29           6 
# simple Bar Plot
count_Store_Location <- table(stores$Store_Location)
barplot(count_Store_Location, main = "Barplot of Store Location")

unique(stores$Store_ID)
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
[26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
length(stores$Store_ID)
[1] 50

Sales

  • Sales ID is a unique identifier, not stores ID
names(sales)
[1] "Sale_ID"    "Date"       "Store_ID"   "Product_ID" "Units"     
?unique
length(unique(sales$Sale_ID)) # Confirmed sales ID is unique
[1] 829262
unique(sales$Store_ID)
 [1] 24 28  6 48 44  1 40 19 38 21 34 37  5  8  3 33 30  4 41 10 16 36 39 32 25
[26] 46 23 50 12 22 27  7 47 49 42 29 18  9 31 35 13 14 17  2 11 43 26 45 15 20
sort(unique(sales$Store_ID))
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
[26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50

Merge

?merge
stores_sales <- 
merge(x = sales,
      y = stores,
      by = "Store_ID"
      )

# install.packages(Amelia)
require(Amelia)
Loading required package: Amelia
Loading required package: Rcpp
## 
## Amelia II: Multiple Imputation
## (Version 1.8.2, built: 2024-04-10)
## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## Refer to http://gking.harvard.edu/amelia/ for more information
## 
missmap(stores_sales)

  • Many to one merge !

Merge with Product

  • One ( products ) to many ( store_sales ) on Product_ID.

  • Save it as product_stores_sales.

Product

unique(products$Product_ID)
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
[26] 26 27 28 29 30 31 32 33 34 35
length(unique(products$Product_ID))
[1] 35
product_stores_sales <- 
merge(x = stores_sales, 
      y = products, 
      by = c("Product_ID")
      )

remove(stores_sales)

Merge with Inventory

  • One ( products ) to many ( stores_sales ) on the unique identifier Product_ID and Product_ID.

  • Save it as inventory_product_stores_sales.

unique(`inventory.(1)`$Store_ID)
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
[26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
store_id <- as.character(`inventory.(1)`$Store_ID)
product_id <- as.character(`inventory.(1)`$Product_ID)

# Test <- `inventory.(1)`$Store_ID + inventory$Product_ID

inventory_product_stores_sales <- 
merge(x = product_stores_sales,
      y = `inventory.(1)`,
      by = c("Store_ID", "Product_ID")
      )

Summary Statistics

Product Prices

?gsub

# removing dollar sign from string
inventory_product_stores_sales$Product_Price <- 
              gsub(pattern = "\\$",
               replacement = "",
               x = inventory_product_stores_sales$Product_Price
               )
?as.numeric
inventory_product_stores_sales$Product_Price <- as.numeric(inventory_product_stores_sales$Product_Price)
  
?head
head(as.numeric(inventory_product_stores_sales$Product_Price))
[1] 15.99 15.99 15.99 15.99 15.99 15.99
?gsub

# removing dollar sign from string
inventory_product_stores_sales$Product_Cost <- 
              gsub(pattern = "\\$",
               replacement = "",
               x = inventory_product_stores_sales$Product_Cost
               )
?as.numeric
inventory_product_stores_sales$Product_Cost <- as.numeric(inventory_product_stores_sales$Product_Cost)
  
?head
head(as.numeric(inventory_product_stores_sales$Product_Cost))
[1] 9.99 9.99 9.99 9.99 9.99 9.99

Product Cost

library("psych")
describe(inventory_product_stores_sales)
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
                 vars      n      mean        sd    median  min       max
Store_ID            1 823030     25.27     14.36     26.00 1.00     50.00
Product_ID          2 823030     15.01      9.90     14.00 1.00     35.00
Sale_ID             3 823030 414855.48 239754.24 415225.50 1.00 829262.00
Date                4 823030       NaN        NA        NA  Inf      -Inf
Units               5 823030      1.32      0.83      1.00 1.00     30.00
Store_Name          6 823030       NaN        NA        NA  Inf      -Inf
Store_City          7 823030       NaN        NA        NA  Inf      -Inf
Store_Location      8 823030       NaN        NA        NA  Inf      -Inf
Store_Open_Date     9 823030       NaN        NA        NA  Inf      -Inf
Product_Name       10 823030       NaN        NA        NA  Inf      -Inf
Product_Category   11 823030       NaN        NA        NA  Inf      -Inf
Product_Cost       12 823030     10.03      7.82      7.99 1.99     34.99
Product_Price      13 823030     13.80      8.69     12.99 2.99     39.99
Stock_On_Hand      14 823030     25.32     22.96     18.00 0.00    139.00
                  range skew kurtosis     se
Store_ID             49 0.01    -1.21   0.02
Product_ID           34 0.26    -1.22   0.01
Sale_ID          829261 0.00    -1.20 264.28
Date               -Inf   NA       NA     NA
Units                29 4.49    47.82   0.00
Store_Name         -Inf   NA       NA     NA
Store_City         -Inf   NA       NA     NA
Store_Location     -Inf   NA       NA     NA
Store_Open_Date    -Inf   NA       NA     NA
Product_Name       -Inf   NA       NA     NA
Product_Category   -Inf   NA       NA     NA
Product_Cost         33 1.78     3.41   0.01
Product_Price        37 1.37     2.32   0.01
Stock_On_Hand       139 1.58     2.85   0.03

Import R Dataset to Excel

# # Take a random sample of 10 elements from the population
# sample_inventory_product_stores_sales <- sample(x = inventory_product_stores_sales, size = 1000)

#dplyr::sample_n()
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
?sample_n(x, n)

#install.packages("openxlsx")
library(openxlsx)

sample_inventory_product_stores_sales <- sample_n(tbl = inventory_product_stores_sales, size = 10000)

# install.packages("writexl")
library(writexl)
?writexl

write_xlsx(sample_inventory_product_stores_sales, "~/Downloads/inventory.csv")