Notes: Setting up my environment by loading the ‘tidyverse’, ‘skimr’ and ‘janitor’ packages.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
sales_df <- read_csv('BlackFriday.csv')
## Rows: 537577 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Product_ID, Gender, Age, City_Category, Stay_In_Current_City_Years
## dbl (7): User_ID, Occupation, Marital_Status, Product_Category_1, Product_Ca...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(sales_df)
## # A tibble: 6 × 12
## User_ID Product_ID Gender Age Occupation City_Category
## <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 1000001 P00069042 F 0-17 10 A
## 2 1000001 P00248942 F 0-17 10 A
## 3 1000001 P00087842 F 0-17 10 A
## 4 1000001 P00085442 F 0-17 10 A
## 5 1000002 P00285442 M 55+ 16 C
## 6 1000003 P00193542 M 26-35 15 A
## # ℹ 6 more variables: Stay_In_Current_City_Years <chr>, Marital_Status <dbl>,
## # Product_Category_1 <dbl>, Product_Category_2 <dbl>,
## # Product_Category_3 <dbl>, Purchase <dbl>
glimpse(sales_df)
## Rows: 537,577
## Columns: 12
## $ User_ID <dbl> 1000001, 1000001, 1000001, 1000001, 1000002…
## $ Product_ID <chr> "P00069042", "P00248942", "P00087842", "P00…
## $ Gender <chr> "F", "F", "F", "F", "M", "M", "M", "M", "M"…
## $ Age <chr> "0-17", "0-17", "0-17", "0-17", "55+", "26-…
## $ Occupation <dbl> 10, 10, 10, 10, 16, 15, 7, 7, 7, 20, 20, 20…
## $ City_Category <chr> "A", "A", "A", "A", "C", "A", "B", "B", "B"…
## $ Stay_In_Current_City_Years <chr> "2", "2", "2", "2", "4+", "3", "2", "2", "2…
## $ Marital_Status <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0…
## $ Product_Category_1 <dbl> 3, 1, 12, 12, 8, 1, 1, 1, 1, 8, 5, 8, 8, 1,…
## $ Product_Category_2 <dbl> NA, 6, NA, 14, NA, 2, 8, 15, 16, NA, 11, NA…
## $ Product_Category_3 <dbl> NA, 14, NA, NA, NA, NA, 17, NA, NA, NA, NA,…
## $ Purchase <dbl> 8370, 15200, 1422, 1057, 7969, 15227, 19215…
sum(is.na(sales_df))
## [1] 540285
colSums(is.na(sales_df))
## User_ID Product_ID
## 0 0
## Gender Age
## 0 0
## Occupation City_Category
## 0 0
## Stay_In_Current_City_Years Marital_Status
## 0 0
## Product_Category_1 Product_Category_2
## 0 166986
## Product_Category_3 Purchase
## 373299 0
sales_df$Product_Category_2 <- NULL
sales_df$Product_Category_3 <- NULL
contingency_table <- table(sales_df$Gender, sales_df$Product_Category_1)
chisq.test(contingency_table)
##
## Pearson's Chi-squared test
##
## data: contingency_table
## X-squared = 7902.1, df = 17, p-value < 2.2e-16
Compare purchase spread and outliers across different occupations.
library(ggplot2)
ggplot(sales_df, aes(x=factor(Occupation), y=Purchase))+
geom_boxplot(fill = 'skyblue') +
labs(title = "Purchase by Occupation", x = "Occupation", y = "Purchase Amount")
Analyzing purchasing patterns using grouped summarization of age and marital status.
sales_df %>%
group_by(Age, Marital_Status) %>%
summarise(Average_Purchase = mean(Purchase), .groups = 'drop') %>%
arrange(desc(Average_Purchase))
## # A tibble: 13 × 3
## Age Marital_Status Average_Purchase
## <chr> <dbl> <dbl>
## 1 51-55 0 9664.
## 2 55+ 0 9660.
## 3 51-55 1 9604.
## 4 36-45 0 9470.
## 5 46-50 1 9381.
## 6 55+ 1 9335.
## 7 26-35 1 9316.
## 8 26-35 0 9313.
## 9 36-45 1 9296.
## 10 18-25 0 9281.
## 11 18-25 1 9065.
## 12 46-50 0 9035.
## 13 0-17 0 9020.