Import

remove(list = ls())
gc() 
##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 541521 29.0    1197082   64         NA   700240 37.4
## Vcells 999522  7.7    8388608   64      16384  1963231 15.0
cat("\f") 
getwd() 
## [1] "/Users/edward.zhu/Project"
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(ggplot2) 
dir()
##  [1] "~$final_toy_merged_data.xlsx"     "data"                            
##  [3] "DAY 2 Solution_files"             "DAY 2 Solution.html"             
##  [5] "DAY 2 Solution.qmd"               "Day 3 HW (Titanic)_files"        
##  [7] "Day 3 HW (Titanic).html"          "Day 3 HW (Titanic).Rmd"          
##  [9] "DAY-2-Solution_files"             "Day-3-HW--Titanic-_files"        
## [11] "Day-3-HW--Titanic-.html"          "final_toy_merged_data.xlsx"      
## [13] "First_RMarkdown_File.html"        "Import.Rmd"                      
## [15] "merged-data_sample_openxlsx.xlsx" "Project.Rproj"                   
## [17] "R_basics.html"                    "R_basics.Rmd"                    
## [19] "rsconnect"                        "summary_statistics.txt"          
## [21] "Toy_files"                        "Toy.Rmd"                         
## [23] "Untitled.pdf"                     "Untitled.Rmd"
df.inventory <- read.csv("~/Downloads/inventory.csv")
df.product <- read.csv("~/Downloads/products.csv")
df.sales <- read.csv("~/Downloads/sales.csv")
df.stores <- read.csv("~/Downloads/stores.csv")

dim(df.inventory)
## [1] 1593    3

Stores

names(df.stores)
## [1] "Store_ID"        "Store_Name"      "Store_City"      "Store_Location" 
## [5] "Store_Open_Date"
length(df.stores$Store_ID)
## [1] 50
table(df.stores$Store_City)
## 
##   Aguascalientes         Campeche         Chetumal        Chihuahua 
##                1                2                1                2 
##     Chilpancingo  Ciudad Victoria       Cuernavaca Cuidad de Mexico 
##                1                1                1                4 
##         Culiacan          Durango      Guadalajara       Guanajuato 
##                1                1                4                3 
##       Hermosillo           La Paz           Merida         Mexicali 
##                3                1                1                2 
##        Monterrey          Morelia           Oaxaca          Pachuca 
##                4                1                1                1 
##           Puebla         Saltillo  San Luis Potosi         Santiago 
##                3                2                1                1 
##           Toluca Tuxtla Gutierrez     Villahermosa           Xalapa 
##                2                1                1                2 
##        Zacatecas 
##                1
count_Store_Location <- table(df.stores$Store_Location)
barplot(count_Store_Location, main = "Barplot of Store Location")

Sales

names(df.sales)
## [1] "Sale_ID"    "Date"       "Store_ID"   "Product_ID" "Units"
length(unique(df.sales$Sale_ID))
## [1] 829262
unique(df.sales$Store_ID)
##  [1] 24 28  6 48 44  1 40 19 38 21 34 37  5  8  3 33 30  4 41 10 16 36 39 32 25
## [26] 46 23 50 12 22 27  7 47 49 42 29 18  9 31 35 13 14 17  2 11 43 26 45 15 20
sort(unique(df.sales$Store_ID))
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50

Merge

store_sales <- merge(x = df.sales, 
       y = df.stores, 
     by = "Store_ID"
      )
require(Amelia)
## Loading required package: Amelia
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.2, built: 2024-04-10)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(store_sales)

Product

One (product) to many (stores_sales) on Product_ID

Save it as product_stores_sales

Product

length(unique(df.product$Product_ID))
## [1] 35
product_store_sales <- merge(x = store_sales, y = df.product, by = "Product_ID")

remove(store_sales)

Merge with INVENTORY

Product

store_id <- as.character(df.inventory$Store_ID)
product_id <- as.character(df.inventory$Product_ID)

inventory_product_store_sales <- merge(x = product_store_sales, y = df.inventory, by = c("Store_ID", "Product_ID")
                                       )
rm(df.inventory, product_store_sales)
#test <- cbind(store_id, product_id)

Summary Stats

library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
inventory_product_store_sales$Product_Cost = as.numeric(gsub("\\$", "", inventory_product_store_sales$Product_Cost))

describe(inventory_product_store_sales)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
##                  vars      n      mean        sd    median  min       max
## Store_ID            1 823030     25.27     14.36     26.00 1.00     50.00
## Product_ID          2 823030     15.01      9.90     14.00 1.00     35.00
## Sale_ID             3 823030 414855.48 239754.24 415225.50 1.00 829262.00
## Date                4 823030       NaN        NA        NA  Inf      -Inf
## Units               5 823030      1.32      0.83      1.00 1.00     30.00
## Store_Name          6 823030       NaN        NA        NA  Inf      -Inf
## Store_City          7 823030       NaN        NA        NA  Inf      -Inf
## Store_Location      8 823030       NaN        NA        NA  Inf      -Inf
## Store_Open_Date     9 823030       NaN        NA        NA  Inf      -Inf
## Product_Name       10 823030       NaN        NA        NA  Inf      -Inf
## Product_Category   11 823030       NaN        NA        NA  Inf      -Inf
## Product_Cost       12 823030     10.03      7.82      7.99 1.99     34.99
## Product_Price      13 823030       NaN        NA        NA  Inf      -Inf
## Stock_On_Hand      14 823030     25.32     22.96     18.00 0.00    139.00
##                   range skew kurtosis     se
## Store_ID             49 0.01    -1.21   0.02
## Product_ID           34 0.26    -1.22   0.01
## Sale_ID          829261 0.00    -1.20 264.28
## Date               -Inf   NA       NA     NA
## Units                29 4.49    47.82   0.00
## Store_Name         -Inf   NA       NA     NA
## Store_City         -Inf   NA       NA     NA
## Store_Location     -Inf   NA       NA     NA
## Store_Open_Date    -Inf   NA       NA     NA
## Product_Name       -Inf   NA       NA     NA
## Product_Category   -Inf   NA       NA     NA
## Product_Cost         33 1.78     3.41   0.01
## Product_Price      -Inf   NA       NA     NA
## Stock_On_Hand       139 1.58     2.85   0.03
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(writexl)

?sample
?sample_n
sample_inventory_product_store_sales <- sample_n(tbl = inventory_product_store_sales,
                                                 size = 10000
                                                 )
library(openxlsx)

write.xlsx(sample_inventory_product_store_sales, file = "merged_data_sample_openxlsx.xlsx")

write.xlsx(inventory_product_store_sales, file = "final_toy_merged_data.xlsx"
)