Import

remove(list=ls())
library(readxl)
?readxl

getwd()
## [1] "/Users/zhj/Desktop/DA/day 4"
inventory <- read_excel(path = "/Users/zhj/Desktop/DA/inventory.xlsx",
                        range = "A1:C1594"
                        )
dim(inventory)
## [1] 1593    3
products <- read.csv("/Users/zhj/Desktop/DA/products.csv")
sales <- read.csv("/Users/zhj/Desktop/DA/sales.csv")
stores <- read.csv("/Users/zhj/Desktop/DA/stores.csv")

Basic EDA on each file

Stores

  • Stores id is unique identifier
names(stores)
## [1] "Store_ID"        "Store_Name"      "Store_City"      "Store_Location" 
## [5] "Store_Open_Date"
length(stores$Store_ID)
## [1] 50
table(stores$Store_City)
## 
##   Aguascalientes         Campeche         Chetumal        Chihuahua 
##                1                2                1                2 
##     Chilpancingo  Ciudad Victoria       Cuernavaca Cuidad de Mexico 
##                1                1                1                4 
##         Culiacan          Durango      Guadalajara       Guanajuato 
##                1                1                4                3 
##       Hermosillo           La Paz           Merida         Mexicali 
##                3                1                1                2 
##        Monterrey          Morelia           Oaxaca          Pachuca 
##                4                1                1                1 
##           Puebla         Saltillo  San Luis Potosi         Santiago 
##                3                2                1                1 
##           Toluca Tuxtla Gutierrez     Villahermosa           Xalapa 
##                2                1                1                2 
##        Zacatecas 
##                1
# Simple Bar Plot
count_Store_Location <- table(stores$Store_Location)
barplot(count_Store_Location, main = "Barplot of Store Location")

unique(stores$Store_ID)
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
length(stores$Store_ID)
## [1] 50

Sales

  • Sales id is unique identifier, not stores id!
names(sales)
## [1] "Sale_ID"    "Date"       "Store_ID"   "Product_ID" "Units"
?unique
length(unique(sales$Sale_ID)) # confirmed sales ID is unique
## [1] 829262
unique(sales$Store_ID)
##  [1] 24 28  6 48 44  1 40 19 38 21 34 37  5  8  3 33 30  4 41 10 16 36 39 32 25
## [26] 46 23 50 12 22 27  7 47 49 42 29 18  9 31 35 13 14 17  2 11 43 26 45 15 20
sort(unique(sales$Store_ID))
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
  • See merge basics in Appendix.
?merge
stores_sales <-
merge(x = sales,
      y = stores,
      by = "Store_ID"
      )
  • Many to one merge !
require(Amelia)
## Loading required package: Amelia
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.2, built: 2024-04-10)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(stores_sales)

Merge with PRODUCT

One ( products ) to many ( stores_sales ) on Product_ID.

Save it as product_stores_sales.

Product

length(unique(products$Product_ID))
## [1] 35
product_stores_sales <-
merge(x = stores_sales,
      y = products, 
      by = c("Product_ID")
      )

remove(stores_sales)

Merge with INVENTORY

One ( products ) to many ( stores_sales ) on the unique identifier StoreID - Product_ID.

Save it as inventory_product_stores_sales.

Inventory

store_id <- as.character(inventory$Store_ID)
product_id <- as.character(inventory$Product_ID)

# test <- cbind(store_id , product_id) # combined vector.

inventory_product_stores_sales <-
merge(x = product_stores_sales,
      y = inventory, 
      by = c("Store_ID","Product_ID")
      )

Summary Statistics

Product Cost

?gsub

# removing dollar sign from string 
inventory_product_stores_sales$Product_Cost   <- 
                gsub(pattern  = "\\$", 
                replacement   = "", 
                x             = inventory_product_stores_sales$Product_Cost
                ) 

?as.numeric
inventory_product_stores_sales$Product_Cost <- as.numeric(inventory_product_stores_sales$Product_Cost)

Product Prices

# removing dollar sign from string 
inventory_product_stores_sales$Product_Price   <- 
                gsub(pattern  = "\\$", 
                replacement   = "", 
                x             = inventory_product_stores_sales$Product_Price
                ) 

?as.numeric
inventory_product_stores_sales$Product_Price <- as.numeric(inventory_product_stores_sales$Product_Price)
library("psych")
describe(inventory_product_stores_sales)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
##                  vars      n      mean        sd    median  min       max
## Store_ID            1 823030     25.27     14.36     26.00 1.00     50.00
## Product_ID          2 823030     15.01      9.90     14.00 1.00     35.00
## Sale_ID             3 823030 414855.48 239754.24 415225.50 1.00 829262.00
## Date                4 823030       NaN        NA        NA  Inf      -Inf
## Units               5 823030      1.32      0.83      1.00 1.00     30.00
## Store_Name          6 823030       NaN        NA        NA  Inf      -Inf
## Store_City          7 823030       NaN        NA        NA  Inf      -Inf
## Store_Location      8 823030       NaN        NA        NA  Inf      -Inf
## Store_Open_Date     9 823030       NaN        NA        NA  Inf      -Inf
## Product_Name       10 823030       NaN        NA        NA  Inf      -Inf
## Product_Category   11 823030       NaN        NA        NA  Inf      -Inf
## Product_Cost       12 823030     10.03      7.82      7.99 1.99     34.99
## Product_Price      13 823030     13.80      8.69     12.99 2.99     39.99
## Stock_On_Hand      14 823030     25.32     22.96     18.00 0.00    139.00
##                   range skew kurtosis     se
## Store_ID             49 0.01    -1.21   0.02
## Product_ID           34 0.26    -1.22   0.01
## Sale_ID          829261 0.00    -1.20 264.28
## Date               -Inf   NA       NA     NA
## Units                29 4.49    47.82   0.00
## Store_Name         -Inf   NA       NA     NA
## Store_City         -Inf   NA       NA     NA
## Store_Location     -Inf   NA       NA     NA
## Store_Open_Date    -Inf   NA       NA     NA
## Product_Name       -Inf   NA       NA     NA
## Product_Category   -Inf   NA       NA     NA
## Product_Cost         33 1.78     3.41   0.01
## Product_Price        37 1.37     2.32   0.01
## Stock_On_Hand       139 1.58     2.85   0.03

Sample the data

  • You the full data if your computers can handle it.
?sample
# # Take a random sample of 10 elements from the population
# sampled_inventory_product_stores_sales <- sample(x = inventory_product_stores_sales, 
#                                                  size = 100
#                                                  )

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
?sample_n
sampled_inventory_product_stores_sales <- sample_n(tbl  = inventory_product_stores_sales,
                                                   size = 10000
                                                   )

Export Data

# install.packages("openxlsx")
library("openxlsx")
?openxlsx

# Write the data frame to an Excel file
write.xlsx(sampled_inventory_product_stores_sales, 
           file = "final_toy_merged_sample.xlsx"
           )

write.xlsx(inventory_product_stores_sales, 
           file = "final_toy_merged_data.xlsx"
           )

# # install.packages("writexl")
# library(writexl)
# ?writexl
# write_xlsx(sampled_inventory_product_stores_sales, 
#            path = "merged_data_sample_writexl.xlsx"
#            )
# 
?describe

Customized Tables

Starting with an example that works, we change the data to our own source.

mtcars
##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2
table(mtcars$cyl)
## 
##  4  6  8 
## 11  7 14
mtcars |>
  group_by(cyl) |>
  summarize(avg = mean(mpg))
## # A tibble: 3 × 2
##     cyl   avg
##   <dbl> <dbl>
## 1     4  26.7
## 2     6  19.7
## 3     8  15.1
mean(mtcars$mpg)
## [1] 20.09062
inventory_product_stores_sales |>
  group_by(Store_Location) |>
  summarize(avg = mean(Product_Cost))
## # A tibble: 4 × 2
##   Store_Location   avg
##   <chr>          <dbl>
## 1 Airport        10.0 
## 2 Commercial     10.1 
## 3 Downtown       10.0 
## 4 Residential     9.89

Appendix - Merge Basics with dplyr

Merging datasets in R can be done using various functions, primarily from the dplyr package, which provides intuitive functions for different types of merges: one-to-one, one-to-many, and many-to-many.

Below are examples of each type of merge.

1. One-to-One Merge

In a one-to-one merge, each row in one dataset corresponds to exactly one row in another dataset.

# Load the dplyr package
library(dplyr)

# Create two example data frames
df1 <- data.frame(ID = 1:3, Name = c("Alice", "Bob", "Charlie"))
df2 <- data.frame(ID = 1:3, Age = c(25, 30, 35))

df1
##   ID    Name
## 1  1   Alice
## 2  2     Bob
## 3  3 Charlie
df2
##   ID Age
## 1  1  25
## 2  2  30
## 3  3  35
# Perform a one-to-one merge
merged_df <- df1 %>%
  inner_join(df2, by = "ID")

print(merged_df)
##   ID    Name Age
## 1  1   Alice  25
## 2  2     Bob  30
## 3  3 Charlie  35

2. One-to-Many Merge

In a one-to-many merge, each row in one dataset corresponds to multiple rows in another dataset.

# Create two example data frames
df1 <- data.frame(ID = 1:3, Name = c("Alice", "Bob", "Charlie"))
df2 <- data.frame(ID = c(1, 1, 2, 2, 3), Score = c(90, 85, 80, 75, 95))

df1
##   ID    Name
## 1  1   Alice
## 2  2     Bob
## 3  3 Charlie
df2
##   ID Score
## 1  1    90
## 2  1    85
## 3  2    80
## 4  2    75
## 5  3    95
# Perform a one-to-many merge
merged_df <- df1 %>%
  inner_join(df2, by = "ID")

print(merged_df)
##   ID    Name Score
## 1  1   Alice    90
## 2  1   Alice    85
## 3  2     Bob    80
## 4  2     Bob    75
## 5  3 Charlie    95

3. Many-to-Many Merge

In a many-to-many merge, multiple rows in one dataset correspond to multiple rows in another dataset.

# Create two example data frames
df1 <- data.frame(StudentID = c(1, 1, 2, 3), CourseID = c(101, 102, 101, 103))
df2 <- data.frame(CourseID = c(101, 101, 102, 103), CourseName = c("Math", "Science", "History", "Art"))

df1
##   StudentID CourseID
## 1         1      101
## 2         1      102
## 3         2      101
## 4         3      103
df2
##   CourseID CourseName
## 1      101       Math
## 2      101    Science
## 3      102    History
## 4      103        Art
# Perform a many-to-many merge
merged_df <- df1 %>%
  inner_join(df2, by = "CourseID")
## Warning in inner_join(., df2, by = "CourseID"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
print(merged_df)
##   StudentID CourseID CourseName
## 1         1      101       Math
## 2         1      101    Science
## 3         1      102    History
## 4         2      101       Math
## 5         2      101    Science
## 6         3      103        Art

Notes on Merge Types

  • Inner Join (inner_join): Returns only the rows with matching keys in both datasets.

  • Left Join (left_join): Returns all rows from the left dataset and matching rows from the right dataset. If there is no match, NA values are returned for columns from the right dataset.

  • Right Join (right_join): Returns all rows from the right dataset and matching rows from the left dataset. If there is no match, NA values are returned for columns from the left dataset.

  • Full Join (full_join): Returns all rows when there is a match in one of the datasets. Non-matching rows will have NA values for columns from the dataset without a match.

You can choose the appropriate type of join based on your specific requirements.