remove(list=ls())
library(readxl)
?readxl
inventory <- read_excel(path = "Maven+Toys+Data/inventory.xlsx",
range = "A1:C1594"
)
dim(inventory)[1] 1593 3
remove(list=ls())
library(readxl)
?readxl
inventory <- read_excel(path = "Maven+Toys+Data/inventory.xlsx",
range = "A1:C1594"
)
dim(inventory)[1] 1593 3
products <- read.csv("Maven+Toys+Data/products.csv")
sales <- read.csv("Maven+Toys+Data/sales.csv")
stores <- read.csv("Maven+Toys+Data/stores.csv")names(stores)[1] "Store_ID" "Store_Name" "Store_City" "Store_Location"
[5] "Store_Open_Date"
length(stores$Store_ID)[1] 50
table(stores$Store_City)
Aguascalientes Campeche Chetumal Chihuahua
1 2 1 2
Chilpancingo Ciudad Victoria Cuernavaca Cuidad de Mexico
1 1 1 4
Culiacan Durango Guadalajara Guanajuato
1 1 4 3
Hermosillo La Paz Merida Mexicali
3 1 1 2
Monterrey Morelia Oaxaca Pachuca
4 1 1 1
Puebla Saltillo San Luis Potosi Santiago
3 2 1 1
Toluca Tuxtla Gutierrez Villahermosa Xalapa
2 1 1 2
Zacatecas
1
# Simple Bar Plot
count_Store_Location <- table(stores$Store_Location)
barplot(count_Store_Location, main = "Barplot of Store Location")unique(stores$Store_ID) [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
[26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
length(stores$Store_ID)[1] 50
names(sales)[1] "Sale_ID" "Date" "Store_ID" "Product_ID" "Units"
?unique
length(unique(sales$Sale_ID)) # confirmed sales ID is unique[1] 829262
unique(sales$Store_ID) [1] 24 28 6 48 44 1 40 19 38 21 34 37 5 8 3 33 30 4 41 10 16 36 39 32 25
[26] 46 23 50 12 22 27 7 47 49 42 29 18 9 31 35 13 14 17 2 11 43 26 45 15 20
sort(unique(sales$Store_ID)) [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
[26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
?merge
stores_sales <-
merge(x = sales,
y = stores,
by = "Store_ID"
)require(Amelia)Loading required package: Amelia
Loading required package: Rcpp
##
## Amelia II: Multiple Imputation
## (Version 1.8.2, built: 2024-04-10)
## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## Refer to http://gking.harvard.edu/amelia/ for more information
##
missmap(stores_sales)One ( products ) to many ( stores_sales ) on Product_ID.
Save it as product_stores_sales.
length(unique(products$Product_ID))[1] 35
product_stores_sales <-
merge(x = stores_sales,
y = products,
by = c("Product_ID")
)
remove(stores_sales)One ( products ) to many ( stores_sales ) on the unique identifier StoreID - Product_ID.
Save it as inventory_product_stores_sales.
store_id <- as.character(inventory$Store_ID)
product_id <- as.character(inventory$Product_ID)
# test <- cbind(store_id , product_id) # combined vector.
inventory_product_stores_sales <-
merge(x = product_stores_sales,
y = inventory,
by = c("Store_ID","Product_ID")
)?gsub
# removing dollar sign from string
inventory_product_stores_sales$Product_Cost <-
gsub(pattern = "\\$",
replacement = "",
x = inventory_product_stores_sales$Product_Cost
)
?as.numeric
inventory_product_stores_sales$Product_Cost <- as.numeric(inventory_product_stores_sales$Product_Cost)# removing dollar sign from string
inventory_product_stores_sales$Product_Price <-
gsub(pattern = "\\$",
replacement = "",
x = inventory_product_stores_sales$Product_Price
)
?as.numeric
inventory_product_stores_sales$Product_Price <- as.numeric(inventory_product_stores_sales$Product_Price)library("psych")
describe(inventory_product_stores_sales)Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
vars n mean sd median min max
Store_ID 1 823030 25.27 14.36 26.00 1.00 50.00
Product_ID 2 823030 15.01 9.90 14.00 1.00 35.00
Sale_ID 3 823030 414855.48 239754.24 415225.50 1.00 829262.00
Date 4 823030 NaN NA NA Inf -Inf
Units 5 823030 1.32 0.83 1.00 1.00 30.00
Store_Name 6 823030 NaN NA NA Inf -Inf
Store_City 7 823030 NaN NA NA Inf -Inf
Store_Location 8 823030 NaN NA NA Inf -Inf
Store_Open_Date 9 823030 NaN NA NA Inf -Inf
Product_Name 10 823030 NaN NA NA Inf -Inf
Product_Category 11 823030 NaN NA NA Inf -Inf
Product_Cost 12 823030 10.03 7.82 7.99 1.99 34.99
Product_Price 13 823030 13.80 8.69 12.99 2.99 39.99
Stock_On_Hand 14 823030 25.32 22.96 18.00 0.00 139.00
range skew kurtosis se
Store_ID 49 0.01 -1.21 0.02
Product_ID 34 0.26 -1.22 0.01
Sale_ID 829261 0.00 -1.20 264.28
Date -Inf NA NA NA
Units 29 4.49 47.82 0.00
Store_Name -Inf NA NA NA
Store_City -Inf NA NA NA
Store_Location -Inf NA NA NA
Store_Open_Date -Inf NA NA NA
Product_Name -Inf NA NA NA
Product_Category -Inf NA NA NA
Product_Cost 33 1.78 3.41 0.01
Product_Price 37 1.37 2.32 0.01
Stock_On_Hand 139 1.58 2.85 0.03
?sample
# # Take a random sample of 10 elements from the population
# sampled_inventory_product_stores_sales <- sample(x = inventory_product_stores_sales,
# size = 100
# )
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
?sample_n
sampled_inventory_product_stores_sales <- sample_n(tbl = inventory_product_stores_sales,
size = 10000
)# install.packages("openxlsx")
library("openxlsx")
?openxlsx
# Write the data frame to an Excel file
write.xlsx(sampled_inventory_product_stores_sales,
file = "final_toy_merged_sample.xlsx"
)
write.xlsx(inventory_product_stores_sales,
file = "final_toy_merged_data.xlsx"
)
# # install.packages("writexl")
# library(writexl)
# ?writexl
# write_xlsx(sampled_inventory_product_stores_sales,
# path = "merged_data_sample_writexl.xlsx"
# )
#
?describeStarting with an example that works, we change the data to our own source.
mtcars mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
table(mtcars$cyl)
4 6 8
11 7 14
mtcars |>
group_by(cyl) |>
summarize(avg = mean(mpg))# A tibble: 3 × 2
cyl avg
<dbl> <dbl>
1 4 26.7
2 6 19.7
3 8 15.1
mean(mtcars$mpg)[1] 20.09062
inventory_product_stores_sales |>
group_by(Store_Location) |>
summarize(avg = mean(Product_Cost))# A tibble: 4 × 2
Store_Location avg
<chr> <dbl>
1 Airport 10.0
2 Commercial 10.1
3 Downtown 10.0
4 Residential 9.89
dplyrMerging datasets in R can be done using various functions, primarily from the dplyr package, which provides intuitive functions for different types of merges: one-to-one, one-to-many, and many-to-many.
Below are examples of each type of merge.
In a one-to-one merge, each row in one dataset corresponds to exactly one row in another dataset.
# Load the dplyr package
library(dplyr)
# Create two example data frames
df1 <- data.frame(ID = 1:3, Name = c("Alice", "Bob", "Charlie"))
df2 <- data.frame(ID = 1:3, Age = c(25, 30, 35))
# Perform a one-to-one merge
merged_df <- df1 %>%
inner_join(df2, by = "ID")
print(merged_df) ID Name Age
1 1 Alice 25
2 2 Bob 30
3 3 Charlie 35
In a one-to-many merge, each row in one dataset corresponds to multiple rows in another dataset.
# Create two example data frames
df1 <- data.frame(ID = 1:3, Name = c("Alice", "Bob", "Charlie"))
df2 <- data.frame(ID = c(1, 1, 2, 2, 3), Score = c(90, 85, 80, 75, 95))
# Perform a one-to-many merge
merged_df <- df1 %>%
inner_join(df2, by = "ID")
print(merged_df) ID Name Score
1 1 Alice 90
2 1 Alice 85
3 2 Bob 80
4 2 Bob 75
5 3 Charlie 95
In a many-to-many merge, multiple rows in one dataset correspond to multiple rows in another dataset.
# Create two example data frames
df1 <- data.frame(StudentID = c(1, 1, 2, 3), CourseID = c(101, 102, 101, 103))
df2 <- data.frame(CourseID = c(101, 101, 102, 103), CourseName = c("Math", "Science", "History", "Art"))
# Perform a many-to-many merge
merged_df <- df1 %>%
inner_join(df2, by = "CourseID")Warning in inner_join(., df2, by = "CourseID"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 1 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship =
"many-to-many"` to silence this warning.
print(merged_df) StudentID CourseID CourseName
1 1 101 Math
2 1 101 Science
3 1 102 History
4 2 101 Math
5 2 101 Science
6 3 103 Art
Inner Join (inner_join): Returns only the rows with matching keys in both datasets.
Left Join (left_join): Returns all rows from the left dataset and matching rows from the right dataset. If there is no match, NA values are returned for columns from the right dataset.
Right Join (right_join): Returns all rows from the right dataset and matching rows from the left dataset. If there is no match, NA values are returned for columns from the left dataset.
Full Join (full_join): Returns all rows when there is a match in one of the datasets. Non-matching rows will have NA values for columns from the dataset without a match.
You can choose the appropriate type of join based on your specific requirements.