Merging

Author

AS

Import

  • 4 datasets
remove(list=ls())
library(readxl)
?readxl
inventory <- read_excel(path = "Maven+Toys+Data/inventory.xlsx",
                        range = "A1:C1594"
                        )
dim(inventory)
[1] 1593    3
products <- read.csv("Maven+Toys+Data/products.csv")
sales <- read.csv("Maven+Toys+Data/sales.csv")
stores <- read.csv("Maven+Toys+Data/stores.csv")

Basic EDA on each file

Stores

  • Stores id is unique identifier
names(stores)
[1] "Store_ID"        "Store_Name"      "Store_City"      "Store_Location" 
[5] "Store_Open_Date"
length(stores$Store_ID)
[1] 50
table(stores$Store_City)

  Aguascalientes         Campeche         Chetumal        Chihuahua 
               1                2                1                2 
    Chilpancingo  Ciudad Victoria       Cuernavaca Cuidad de Mexico 
               1                1                1                4 
        Culiacan          Durango      Guadalajara       Guanajuato 
               1                1                4                3 
      Hermosillo           La Paz           Merida         Mexicali 
               3                1                1                2 
       Monterrey          Morelia           Oaxaca          Pachuca 
               4                1                1                1 
          Puebla         Saltillo  San Luis Potosi         Santiago 
               3                2                1                1 
          Toluca Tuxtla Gutierrez     Villahermosa           Xalapa 
               2                1                1                2 
       Zacatecas 
               1 
# Simple Bar Plot
count_Store_Location <- table(stores$Store_Location)
barplot(count_Store_Location, main = "Barplot of Store Location")

unique(stores$Store_ID)
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
[26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
length(stores$Store_ID)
[1] 50

Sales

  • Sales id is unique identifier, not stores id !
names(sales)
[1] "Sale_ID"    "Date"       "Store_ID"   "Product_ID" "Units"     
?unique
length(unique(sales$Sale_ID)) # confirmed sales ID is unique
[1] 829262
unique(sales$Store_ID)
 [1] 24 28  6 48 44  1 40 19 38 21 34 37  5  8  3 33 30  4 41 10 16 36 39 32 25
[26] 46 23 50 12 22 27  7 47 49 42 29 18  9 31 35 13 14 17  2 11 43 26 45 15 20
sort(unique(sales$Store_ID))
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
[26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
  • See merge basics in Appendix.
?merge
stores_sales <-
merge(x = sales,
      y = stores,
      by = "Store_ID"
      )
  • Many to one merge !
require(Amelia)
Loading required package: Amelia
Loading required package: Rcpp
## 
## Amelia II: Multiple Imputation
## (Version 1.8.2, built: 2024-04-10)
## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## Refer to http://gking.harvard.edu/amelia/ for more information
## 
missmap(stores_sales)

https://metudatascience.github.io/datascience/amelia.html

Merge with PRODUCT

One ( products ) to many ( stores_sales ) on Product_ID.

Save it as product_stores_sales.

Product

length(unique(products$Product_ID))
[1] 35
product_stores_sales <-
merge(x = stores_sales,
      y = products, 
      by = c("Product_ID")
      )

remove(stores_sales)

Merge with INVENTORY

One ( products ) to many ( stores_sales ) on the unique identifier StoreID - Product_ID.

Save it as inventory_product_stores_sales.

Inventory

store_id <- as.character(inventory$Store_ID)
product_id <- as.character(inventory$Product_ID)

# test <- cbind(store_id , product_id) # combined vector.

inventory_product_stores_sales <-
merge(x = product_stores_sales,
      y = inventory, 
      by = c("Store_ID","Product_ID")
      )

Summary Statistics

Product Cost

?gsub

# removing dollar sign from string 
inventory_product_stores_sales$Product_Cost   <- 
                gsub(pattern  = "\\$", 
                replacement   = "", 
                x             = inventory_product_stores_sales$Product_Cost
                ) 

?as.numeric
inventory_product_stores_sales$Product_Cost <- as.numeric(inventory_product_stores_sales$Product_Cost)

Product Prices

# removing dollar sign from string 
inventory_product_stores_sales$Product_Price   <- 
                gsub(pattern  = "\\$", 
                replacement   = "", 
                x             = inventory_product_stores_sales$Product_Price
                ) 

?as.numeric
inventory_product_stores_sales$Product_Price <- as.numeric(inventory_product_stores_sales$Product_Price)
library("psych")
describe(inventory_product_stores_sales)
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
                 vars      n      mean        sd    median  min       max
Store_ID            1 823030     25.27     14.36     26.00 1.00     50.00
Product_ID          2 823030     15.01      9.90     14.00 1.00     35.00
Sale_ID             3 823030 414855.48 239754.24 415225.50 1.00 829262.00
Date                4 823030       NaN        NA        NA  Inf      -Inf
Units               5 823030      1.32      0.83      1.00 1.00     30.00
Store_Name          6 823030       NaN        NA        NA  Inf      -Inf
Store_City          7 823030       NaN        NA        NA  Inf      -Inf
Store_Location      8 823030       NaN        NA        NA  Inf      -Inf
Store_Open_Date     9 823030       NaN        NA        NA  Inf      -Inf
Product_Name       10 823030       NaN        NA        NA  Inf      -Inf
Product_Category   11 823030       NaN        NA        NA  Inf      -Inf
Product_Cost       12 823030     10.03      7.82      7.99 1.99     34.99
Product_Price      13 823030     13.80      8.69     12.99 2.99     39.99
Stock_On_Hand      14 823030     25.32     22.96     18.00 0.00    139.00
                  range skew kurtosis     se
Store_ID             49 0.01    -1.21   0.02
Product_ID           34 0.26    -1.22   0.01
Sale_ID          829261 0.00    -1.20 264.28
Date               -Inf   NA       NA     NA
Units                29 4.49    47.82   0.00
Store_Name         -Inf   NA       NA     NA
Store_City         -Inf   NA       NA     NA
Store_Location     -Inf   NA       NA     NA
Store_Open_Date    -Inf   NA       NA     NA
Product_Name       -Inf   NA       NA     NA
Product_Category   -Inf   NA       NA     NA
Product_Cost         33 1.78     3.41   0.01
Product_Price        37 1.37     2.32   0.01
Stock_On_Hand       139 1.58     2.85   0.03

Sample the data

  • You the full data if your computers can handle it.
?sample
# # Take a random sample of 10 elements from the population
# sampled_inventory_product_stores_sales <- sample(x = inventory_product_stores_sales, 
#                                                  size = 100
#                                                  )

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
?sample_n
sampled_inventory_product_stores_sales <- sample_n(tbl  = inventory_product_stores_sales,
                                                   size = 10000
                                                   )

Export Data

# install.packages("openxlsx")
library("openxlsx")
?openxlsx

# Write the data frame to an Excel file
write.xlsx(sampled_inventory_product_stores_sales, 
           file = "final_toy_merged_sample.xlsx"
           )

write.xlsx(inventory_product_stores_sales, 
           file = "final_toy_merged_data.xlsx"
           )

# # install.packages("writexl")
# library(writexl)
# ?writexl
# write_xlsx(sampled_inventory_product_stores_sales, 
#            path = "merged_data_sample_writexl.xlsx"
#            )
# 
?describe

Customized Tables

Starting with an example that works, we change the data to our own source.

mtcars
                     mpg cyl  disp  hp drat    wt  qsec vs am gear carb
Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
table(mtcars$cyl)

 4  6  8 
11  7 14 
mtcars |>
  group_by(cyl) |>
  summarize(avg = mean(mpg))
# A tibble: 3 × 2
    cyl   avg
  <dbl> <dbl>
1     4  26.7
2     6  19.7
3     8  15.1
mean(mtcars$mpg)
[1] 20.09062
inventory_product_stores_sales |>
  group_by(Store_Location) |>
  summarize(avg = mean(Product_Cost))
# A tibble: 4 × 2
  Store_Location   avg
  <chr>          <dbl>
1 Airport        10.0 
2 Commercial     10.1 
3 Downtown       10.0 
4 Residential     9.89

Appendix - Merge Basics with dplyr

Merging datasets in R can be done using various functions, primarily from the dplyr package, which provides intuitive functions for different types of merges: one-to-one, one-to-many, and many-to-many.

Below are examples of each type of merge.

1. One-to-One Merge

In a one-to-one merge, each row in one dataset corresponds to exactly one row in another dataset.

# Load the dplyr package
library(dplyr)

# Create two example data frames
df1 <- data.frame(ID = 1:3, Name = c("Alice", "Bob", "Charlie"))
df2 <- data.frame(ID = 1:3, Age = c(25, 30, 35))

# Perform a one-to-one merge
merged_df <- df1 %>%
  inner_join(df2, by = "ID")

print(merged_df)
  ID    Name Age
1  1   Alice  25
2  2     Bob  30
3  3 Charlie  35

2. One-to-Many Merge

In a one-to-many merge, each row in one dataset corresponds to multiple rows in another dataset.

# Create two example data frames
df1 <- data.frame(ID = 1:3, Name = c("Alice", "Bob", "Charlie"))
df2 <- data.frame(ID = c(1, 1, 2, 2, 3), Score = c(90, 85, 80, 75, 95))

# Perform a one-to-many merge
merged_df <- df1 %>%
  inner_join(df2, by = "ID")

print(merged_df)
  ID    Name Score
1  1   Alice    90
2  1   Alice    85
3  2     Bob    80
4  2     Bob    75
5  3 Charlie    95

3. Many-to-Many Merge

In a many-to-many merge, multiple rows in one dataset correspond to multiple rows in another dataset.

# Create two example data frames
df1 <- data.frame(StudentID = c(1, 1, 2, 3), CourseID = c(101, 102, 101, 103))
df2 <- data.frame(CourseID = c(101, 101, 102, 103), CourseName = c("Math", "Science", "History", "Art"))

# Perform a many-to-many merge
merged_df <- df1 %>%
  inner_join(df2, by = "CourseID")
Warning in inner_join(., df2, by = "CourseID"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
  "many-to-many"` to silence this warning.
print(merged_df)
  StudentID CourseID CourseName
1         1      101       Math
2         1      101    Science
3         1      102    History
4         2      101       Math
5         2      101    Science
6         3      103        Art

Notes on Merge Types

  • Inner Join (inner_join): Returns only the rows with matching keys in both datasets.

  • Left Join (left_join): Returns all rows from the left dataset and matching rows from the right dataset. If there is no match, NA values are returned for columns from the right dataset.

  • Right Join (right_join): Returns all rows from the right dataset and matching rows from the left dataset. If there is no match, NA values are returned for columns from the left dataset.

  • Full Join (full_join): Returns all rows when there is a match in one of the datasets. Non-matching rows will have NA values for columns from the dataset without a match.

You can choose the appropriate type of join based on your specific requirements.

CHEATSHEET