Import

remove(list = ls())
gc()

##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 541521 29.0    1197082   64         NA   700240 37.4
## Vcells 999522  7.7    8388608   64      16384  1963231 15.0

cat("\f")

getwd()

## [1] "/Users/edward.zhu/Project"

library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

library(ggplot2) 
dir()

##  [1] "~$final_toy_merged_data.xlsx"     "data"                            
##  [3] "DAY 2 Solution_files"             "DAY 2 Solution.html"             
##  [5] "DAY 2 Solution.qmd"               "Day 3 HW (Titanic)_files"        
##  [7] "Day 3 HW (Titanic).html"          "Day 3 HW (Titanic).Rmd"          
##  [9] "DAY-2-Solution_files"             "Day-3-HW--Titanic-_files"        
## [11] "Day-3-HW--Titanic-.html"          "final_toy_merged_data.xlsx"      
## [13] "First_RMarkdown_File.html"        "Import.Rmd"                      
## [15] "merged-data_sample_openxlsx.xlsx" "Project.Rproj"                   
## [17] "R_basics.html"                    "R_basics.Rmd"                    
## [19] "rsconnect"                        "summary_statistics.txt"          
## [21] "Toy_files"                        "Toy.Rmd"                         
## [23] "Untitled.pdf"                     "Untitled.Rmd"

df.inventory <- read.csv("~/Downloads/inventory.csv")
df.product <- read.csv("~/Downloads/products.csv")
df.sales <- read.csv("~/Downloads/sales.csv")
df.stores <- read.csv("~/Downloads/stores.csv")

dim(df.inventory)

## [1] 1593    3

Stores

names(df.stores)

## [1] "Store_ID"        "Store_Name"      "Store_City"      "Store_Location" 
## [5] "Store_Open_Date"

length(df.stores$Store_ID)

## [1] 50

table(df.stores$Store_City)

## 
##   Aguascalientes         Campeche         Chetumal        Chihuahua 
##                1                2                1                2 
##     Chilpancingo  Ciudad Victoria       Cuernavaca Cuidad de Mexico 
##                1                1                1                4 
##         Culiacan          Durango      Guadalajara       Guanajuato 
##                1                1                4                3 
##       Hermosillo           La Paz           Merida         Mexicali 
##                3                1                1                2 
##        Monterrey          Morelia           Oaxaca          Pachuca 
##                4                1                1                1 
##           Puebla         Saltillo  San Luis Potosi         Santiago 
##                3                2                1                1 
##           Toluca Tuxtla Gutierrez     Villahermosa           Xalapa 
##                2                1                1                2 
##        Zacatecas 
##                1

count_Store_Location <- table(df.stores$Store_Location)
barplot(count_Store_Location, main = "Barplot of Store Location")

Sales

names(df.sales)

## [1] "Sale_ID"    "Date"       "Store_ID"   "Product_ID" "Units"

length(unique(df.sales$Sale_ID))

## [1] 829262

unique(df.sales$Store_ID)

##  [1] 24 28  6 48 44  1 40 19 38 21 34 37  5  8  3 33 30  4 41 10 16 36 39 32 25
## [26] 46 23 50 12 22 27  7 47 49 42 29 18  9 31 35 13 14 17  2 11 43 26 45 15 20

sort(unique(df.sales$Store_ID))

##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50

Merge

store_sales <- merge(x = df.sales, 
       y = df.stores, 
     by = "Store_ID"
      )

require(Amelia)

## Loading required package: Amelia

## Loading required package: Rcpp

## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.2, built: 2024-04-10)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

missmap(store_sales)

Product

One (product) to many (stores_sales) on Product_ID

Save it as product_stores_sales

Product

length(unique(df.product$Product_ID))

## [1] 35

product_store_sales <- merge(x = store_sales, y = df.product, by = "Product_ID")

remove(store_sales)

Merge with INVENTORY

Product

store_id <- as.character(df.inventory$Store_ID)
product_id <- as.character(df.inventory$Product_ID)

inventory_product_store_sales <- merge(x = product_store_sales, y = df.inventory, by = c("Store_ID", "Product_ID")
                                       )
rm(df.inventory, product_store_sales)
#test <- cbind(store_id, product_id)

Summary Stats

library(psych)

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

inventory_product_store_sales$Product_Cost = as.numeric(gsub("\\$", "", inventory_product_store_sales$Product_Cost))

describe(inventory_product_store_sales)

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf

##                  vars      n      mean        sd    median  min       max
## Store_ID            1 823030     25.27     14.36     26.00 1.00     50.00
## Product_ID          2 823030     15.01      9.90     14.00 1.00     35.00
## Sale_ID             3 823030 414855.48 239754.24 415225.50 1.00 829262.00
## Date                4 823030       NaN        NA        NA  Inf      -Inf
## Units               5 823030      1.32      0.83      1.00 1.00     30.00
## Store_Name          6 823030       NaN        NA        NA  Inf      -Inf
## Store_City          7 823030       NaN        NA        NA  Inf      -Inf
## Store_Location      8 823030       NaN        NA        NA  Inf      -Inf
## Store_Open_Date     9 823030       NaN        NA        NA  Inf      -Inf
## Product_Name       10 823030       NaN        NA        NA  Inf      -Inf
## Product_Category   11 823030       NaN        NA        NA  Inf      -Inf
## Product_Cost       12 823030     10.03      7.82      7.99 1.99     34.99
## Product_Price      13 823030       NaN        NA        NA  Inf      -Inf
## Stock_On_Hand      14 823030     25.32     22.96     18.00 0.00    139.00
##                   range skew kurtosis     se
## Store_ID             49 0.01    -1.21   0.02
## Product_ID           34 0.26    -1.22   0.01
## Sale_ID          829261 0.00    -1.20 264.28
## Date               -Inf   NA       NA     NA
## Units                29 4.49    47.82   0.00
## Store_Name         -Inf   NA       NA     NA
## Store_City         -Inf   NA       NA     NA
## Store_Location     -Inf   NA       NA     NA
## Store_Open_Date    -Inf   NA       NA     NA
## Product_Name       -Inf   NA       NA     NA
## Product_Category   -Inf   NA       NA     NA
## Product_Cost         33 1.78     3.41   0.01
## Product_Price      -Inf   NA       NA     NA
## Stock_On_Hand       139 1.58     2.85   0.03

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(writexl)

?sample
?sample_n
sample_inventory_product_store_sales <- sample_n(tbl = inventory_product_store_sales,
                                                 size = 10000
                                                 )

library(openxlsx)

write.xlsx(sample_inventory_product_store_sales, file = "merged_data_sample_openxlsx.xlsx")

write.xlsx(inventory_product_store_sales, file = "final_toy_merged_data.xlsx"
)

Merging

E

2024-07-25

Import

Stores

Sales

Merge

Product

Merge with INVENTORY

Summary Stats