remove (list=ls())
library(readxl)
?readxl
#inventory <- read_csv("~/Downloads/inventory.csv")
# View(inventory)
# dim(inventory)Merging
Import
- Four datasets
products <- read.csv("~/Desktop/products.csv")
sales <- read.csv("~/Desktop/sales.csv")
stores <- read.csv("~/Desktop/stores.csv")
`inventory.(1)` <- read.csv("~/Desktop/inventory (1).csv")Basic EDA on each file
Stores
names(stores)[1] "Store_ID" "Store_Name" "Store_City" "Store_Location"
[5] "Store_Open_Date"
length(stores$Store_ID)[1] 50
table(stores$Store_City)
Aguascalientes Campeche Chetumal Chihuahua
1 2 1 2
Chilpancingo Ciudad Victoria Cuernavaca Cuidad de Mexico
1 1 1 4
Culiacan Durango Guadalajara Guanajuato
1 1 4 3
Hermosillo La Paz Merida Mexicali
3 1 1 2
Monterrey Morelia Oaxaca Pachuca
4 1 1 1
Puebla Saltillo San Luis Potosi Santiago
3 2 1 1
Toluca Tuxtla Gutierrez Villahermosa Xalapa
2 1 1 2
Zacatecas
1
table(stores$Store_Location)
Airport Commercial Downtown Residential
3 12 29 6
# simple Bar Plot
count_Store_Location <- table(stores$Store_Location)
barplot(count_Store_Location, main = "Barplot of Store Location")unique(stores$Store_ID) [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
[26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
length(stores$Store_ID)[1] 50
Sales
- Sales ID is a unique identifier, not stores ID
names(sales)[1] "Sale_ID" "Date" "Store_ID" "Product_ID" "Units"
?unique
length(unique(sales$Sale_ID)) # Confirmed sales ID is unique[1] 829262
unique(sales$Store_ID) [1] 24 28 6 48 44 1 40 19 38 21 34 37 5 8 3 33 30 4 41 10 16 36 39 32 25
[26] 46 23 50 12 22 27 7 47 49 42 29 18 9 31 35 13 14 17 2 11 43 26 45 15 20
sort(unique(sales$Store_ID)) [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
[26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
Merge
?merge
stores_sales <-
merge(x = sales,
y = stores,
by = "Store_ID"
)
# install.packages(Amelia)
require(Amelia)Loading required package: Amelia
Loading required package: Rcpp
##
## Amelia II: Multiple Imputation
## (Version 1.8.2, built: 2024-04-10)
## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## Refer to http://gking.harvard.edu/amelia/ for more information
##
missmap(stores_sales)- Many to one merge !
Merge with Product
One ( products ) to many ( store_sales ) on Product_ID.
Save it as product_stores_sales.
Product
unique(products$Product_ID) [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
[26] 26 27 28 29 30 31 32 33 34 35
length(unique(products$Product_ID))[1] 35
product_stores_sales <-
merge(x = stores_sales,
y = products,
by = c("Product_ID")
)
remove(stores_sales)Merge with Inventory
One ( products ) to many ( stores_sales ) on the unique identifier Product_ID and Product_ID.
Save it as inventory_product_stores_sales.
unique(`inventory.(1)`$Store_ID) [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
[26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
store_id <- as.character(`inventory.(1)`$Store_ID)
product_id <- as.character(`inventory.(1)`$Product_ID)
# Test <- `inventory.(1)`$Store_ID + inventory$Product_ID
inventory_product_stores_sales <-
merge(x = product_stores_sales,
y = `inventory.(1)`,
by = c("Store_ID", "Product_ID")
)Summary Statistics
Product Prices
?gsub
# removing dollar sign from string
inventory_product_stores_sales$Product_Price <-
gsub(pattern = "\\$",
replacement = "",
x = inventory_product_stores_sales$Product_Price
)
?as.numeric
inventory_product_stores_sales$Product_Price <- as.numeric(inventory_product_stores_sales$Product_Price)
?head
head(as.numeric(inventory_product_stores_sales$Product_Price))[1] 15.99 15.99 15.99 15.99 15.99 15.99
?gsub
# removing dollar sign from string
inventory_product_stores_sales$Product_Cost <-
gsub(pattern = "\\$",
replacement = "",
x = inventory_product_stores_sales$Product_Cost
)
?as.numeric
inventory_product_stores_sales$Product_Cost <- as.numeric(inventory_product_stores_sales$Product_Cost)
?head
head(as.numeric(inventory_product_stores_sales$Product_Cost))[1] 9.99 9.99 9.99 9.99 9.99 9.99
Product Cost
library("psych")
describe(inventory_product_stores_sales)Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
vars n mean sd median min max
Store_ID 1 823030 25.27 14.36 26.00 1.00 50.00
Product_ID 2 823030 15.01 9.90 14.00 1.00 35.00
Sale_ID 3 823030 414855.48 239754.24 415225.50 1.00 829262.00
Date 4 823030 NaN NA NA Inf -Inf
Units 5 823030 1.32 0.83 1.00 1.00 30.00
Store_Name 6 823030 NaN NA NA Inf -Inf
Store_City 7 823030 NaN NA NA Inf -Inf
Store_Location 8 823030 NaN NA NA Inf -Inf
Store_Open_Date 9 823030 NaN NA NA Inf -Inf
Product_Name 10 823030 NaN NA NA Inf -Inf
Product_Category 11 823030 NaN NA NA Inf -Inf
Product_Cost 12 823030 10.03 7.82 7.99 1.99 34.99
Product_Price 13 823030 13.80 8.69 12.99 2.99 39.99
Stock_On_Hand 14 823030 25.32 22.96 18.00 0.00 139.00
range skew kurtosis se
Store_ID 49 0.01 -1.21 0.02
Product_ID 34 0.26 -1.22 0.01
Sale_ID 829261 0.00 -1.20 264.28
Date -Inf NA NA NA
Units 29 4.49 47.82 0.00
Store_Name -Inf NA NA NA
Store_City -Inf NA NA NA
Store_Location -Inf NA NA NA
Store_Open_Date -Inf NA NA NA
Product_Name -Inf NA NA NA
Product_Category -Inf NA NA NA
Product_Cost 33 1.78 3.41 0.01
Product_Price 37 1.37 2.32 0.01
Stock_On_Hand 139 1.58 2.85 0.03
Import R Dataset to Excel
# # Take a random sample of 10 elements from the population
# sample_inventory_product_stores_sales <- sample(x = inventory_product_stores_sales, size = 1000)
#dplyr::sample_n()
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
?sample_n(x, n)
#install.packages("openxlsx")
library(openxlsx)
sample_inventory_product_stores_sales <- sample_n(tbl = inventory_product_stores_sales, size = 10000)
# install.packages("writexl")
library(writexl)
?writexl
write_xlsx(sample_inventory_product_stores_sales, "~/Downloads/inventory.csv")