Notes: Set up the environment by loading the `tidyverse`, `skimr`, and `janitor` packages.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
# Read the cleaned customer behavior data set; the path is relative to the
# current working directory.
customerbehavior_df <- read_csv(file = "CleanedCustomerBehavior.csv")
## Rows: 3900 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): Gender, Item Purchased, Category, Location, Size, Color, Season, S...
## dbl (6): Customer ID, Age, Purchase Amount, Review Rating, Previous Purchas...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows to sanity-check the import.
head(customerbehavior_df, n = 6L)
## # A tibble: 6 × 21
## `Customer ID` Age Gender `Item Purchased` Category `Purchase Amount`
## <dbl> <dbl> <chr> <chr> <chr> <dbl>
## 1 1 55 Male Blouse Clothing 53
## 2 2 19 Male Sweater Clothing 64
## 3 3 50 Male Jeans Clothing 73
## 4 4 21 Male Sandals Footwear 90
## 5 5 45 Male Blouse Clothing 49
## 6 6 46 Male Sneakers Footwear 20
## # ℹ 15 more variables: Location <chr>, Size <chr>, Color <chr>, Season <chr>,
## # `Review Rating` <dbl>, `Subscription Status` <chr>, `Shipping Type` <chr>,
## # `Discount Applied` <chr>, `Promo Code Used` <chr>,
## # `Previous Purchases` <dbl>, `Payment Method` <chr>,
## # `Frequency of Purchases` <chr>, Sentiment <chr>, `Age Group` <chr>,
## # CLV <dbl>
# One-way ANOVA: does `Purchase Amount` differ across `Category` levels?
anova_result1 <- aov(
  formula = `Purchase Amount` ~ Category,
  data = customerbehavior_df
)
# Print the ANOVA table (Df, sums of squares, F value, p-value).
summary(anova_result1)
## Df Sum Sq Mean Sq F value Pr(>F)
## Category 3 2446 815.2 1.454 0.225
## Residuals 3896 2184885 560.8
# Boxplot of `Purchase Amount` for each `Category`.
ggplot(
  data = customerbehavior_df,
  mapping = aes(x = Category, y = `Purchase Amount`)
) +
  geom_boxplot(fill = "lightpink") +
  labs(
    title = "Purchase Amount by Category",
    x = "Category",
    y = "Purchase Amount"
  ) +
  theme_classic()
# One-way ANOVA: does `Purchase Amount` differ across `Season` levels?
anova_result2 <- aov(
  formula = `Purchase Amount` ~ Season,
  data = customerbehavior_df
)
# Print the ANOVA table (Df, sums of squares, F value, p-value).
summary(anova_result2)
## Df Sum Sq Mean Sq F value Pr(>F)
## Season 3 6291 2097.1 3.746 0.0106 *
## Residuals 3896 2181039 559.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Boxplot of `Purchase Amount` for each `Season`.
ggplot(
  data = customerbehavior_df,
  mapping = aes(x = Season, y = `Purchase Amount`)
) +
  geom_boxplot(fill = "purple") +
  labs(
    title = "Purchase Amount by Season",
    x = "Season",
    y = "Purchase Amount"
  ) +
  theme_classic()
# Chi-squared test of independence on the 2-way contingency table of
# `Subscription Status` against `Promo Code Used`.
# NOTE(review): janitor is attached and masks stats::chisq.test, so this call
# goes through janitor's version -- confirm that is intended.
chisq.test(
  table(
    customerbehavior_df$`Subscription Status`,
    customerbehavior_df$`Promo Code Used`
  )
)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(customerbehavior_df$`Subscription Status`, customerbehavior_df$`Promo Code Used`)
## X-squared = 1908.9, df = 1, p-value < 2.2e-16
# Proportional stacked bar chart: within each `Subscription Status`, show the
# share of rows for each `Promo Code Used` value.
ggplot(
  data = customerbehavior_df,
  mapping = aes(x = `Subscription Status`, fill = `Promo Code Used`)
) +
  # position = "fill" rescales every bar to the same height, so the y axis
  # reads as a proportion rather than a count.
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  theme_minimal() +
  labs(
    title = "Promo Code Usage by Subscription Status",
    x = "Subscription Status", y = "Proportion", fill = "Promo Code Used"
  )