remove(list = ls())
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 541521 29.0 1197082 64 NA 700240 37.4
## Vcells 999522 7.7 8388608 64 16384 1963231 15.0
cat("\f")
getwd()
## [1] "/Users/edward.zhu/Project"
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(ggplot2)
dir()
## [1] "~$final_toy_merged_data.xlsx" "data"
## [3] "DAY 2 Solution_files" "DAY 2 Solution.html"
## [5] "DAY 2 Solution.qmd" "Day 3 HW (Titanic)_files"
## [7] "Day 3 HW (Titanic).html" "Day 3 HW (Titanic).Rmd"
## [9] "DAY-2-Solution_files" "Day-3-HW--Titanic-_files"
## [11] "Day-3-HW--Titanic-.html" "final_toy_merged_data.xlsx"
## [13] "First_RMarkdown_File.html" "Import.Rmd"
## [15] "merged-data_sample_openxlsx.xlsx" "Project.Rproj"
## [17] "R_basics.html" "R_basics.Rmd"
## [19] "rsconnect" "summary_statistics.txt"
## [21] "Toy_files" "Toy.Rmd"
## [23] "Untitled.pdf" "Untitled.Rmd"
df.inventory <- read.csv("~/Downloads/inventory.csv")
df.product <- read.csv("~/Downloads/products.csv")
df.sales <- read.csv("~/Downloads/sales.csv")
df.stores <- read.csv("~/Downloads/stores.csv")
dim(df.inventory)
## [1] 1593 3
names(df.stores)
## [1] "Store_ID" "Store_Name" "Store_City" "Store_Location"
## [5] "Store_Open_Date"
length(df.stores$Store_ID)
## [1] 50
table(df.stores$Store_City)
##
## Aguascalientes Campeche Chetumal Chihuahua
## 1 2 1 2
## Chilpancingo Ciudad Victoria Cuernavaca Cuidad de Mexico
## 1 1 1 4
## Culiacan Durango Guadalajara Guanajuato
## 1 1 4 3
## Hermosillo La Paz Merida Mexicali
## 3 1 1 2
## Monterrey Morelia Oaxaca Pachuca
## 4 1 1 1
## Puebla Saltillo San Luis Potosi Santiago
## 3 2 1 1
## Toluca Tuxtla Gutierrez Villahermosa Xalapa
## 2 1 1 2
## Zacatecas
## 1
count_Store_Location <- table(df.stores$Store_Location)
barplot(count_Store_Location, main = "Barplot of Store Location")
names(df.sales)
## [1] "Sale_ID" "Date" "Store_ID" "Product_ID" "Units"
length(unique(df.sales$Sale_ID))
## [1] 829262
unique(df.sales$Store_ID)
## [1] 24 28 6 48 44 1 40 19 38 21 34 37 5 8 3 33 30 4 41 10 16 36 39 32 25
## [26] 46 23 50 12 22 27 7 47 49 42 29 18 9 31 35 13 14 17 2 11 43 26 45 15 20
sort(unique(df.sales$Store_ID))
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
store_sales <- merge(x = df.sales,
y = df.stores,
by = "Store_ID"
)
require(Amelia)
## Loading required package: Amelia
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.2, built: 2024-04-10)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(store_sales)
One (product) to many (stores_sales) on Product_ID
Save it as product_stores_sales
Product
length(unique(df.product$Product_ID))
## [1] 35
product_store_sales <- merge(x = store_sales, y = df.product, by = "Product_ID")
remove(store_sales)
Product
store_id <- as.character(df.inventory$Store_ID)
product_id <- as.character(df.inventory$Product_ID)
inventory_product_store_sales <- merge(x = product_store_sales, y = df.inventory, by = c("Store_ID", "Product_ID")
)
rm(df.inventory, product_store_sales)
#test <- cbind(store_id, product_id)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
inventory_product_store_sales$Product_Cost = as.numeric(gsub("\\$", "", inventory_product_store_sales$Product_Cost))
describe(inventory_product_store_sales)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## vars n mean sd median min max
## Store_ID 1 823030 25.27 14.36 26.00 1.00 50.00
## Product_ID 2 823030 15.01 9.90 14.00 1.00 35.00
## Sale_ID 3 823030 414855.48 239754.24 415225.50 1.00 829262.00
## Date 4 823030 NaN NA NA Inf -Inf
## Units 5 823030 1.32 0.83 1.00 1.00 30.00
## Store_Name 6 823030 NaN NA NA Inf -Inf
## Store_City 7 823030 NaN NA NA Inf -Inf
## Store_Location 8 823030 NaN NA NA Inf -Inf
## Store_Open_Date 9 823030 NaN NA NA Inf -Inf
## Product_Name 10 823030 NaN NA NA Inf -Inf
## Product_Category 11 823030 NaN NA NA Inf -Inf
## Product_Cost 12 823030 10.03 7.82 7.99 1.99 34.99
## Product_Price 13 823030 NaN NA NA Inf -Inf
## Stock_On_Hand 14 823030 25.32 22.96 18.00 0.00 139.00
## range skew kurtosis se
## Store_ID 49 0.01 -1.21 0.02
## Product_ID 34 0.26 -1.22 0.01
## Sale_ID 829261 0.00 -1.20 264.28
## Date -Inf NA NA NA
## Units 29 4.49 47.82 0.00
## Store_Name -Inf NA NA NA
## Store_City -Inf NA NA NA
## Store_Location -Inf NA NA NA
## Store_Open_Date -Inf NA NA NA
## Product_Name -Inf NA NA NA
## Product_Category -Inf NA NA NA
## Product_Cost 33 1.78 3.41 0.01
## Product_Price -Inf NA NA NA
## Stock_On_Hand 139 1.58 2.85 0.03
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(writexl)
?sample
?sample_n
sample_inventory_product_store_sales <- sample_n(tbl = inventory_product_store_sales,
size = 10000
)
library(openxlsx)
write.xlsx(sample_inventory_product_store_sales, file = "merged_data_sample_openxlsx.xlsx")
write.xlsx(inventory_product_store_sales, file = "final_toy_merged_data.xlsx"
)