# Path to the raw Online Retail transactions CSV on the local machine
retail_csv_path <- 'C:/Users/laasy/Documents/Fall 2023/Intro to Statistics in R/Datasets for Final Project/OnlineRetail.csv'

# Read the CSV into a data frame using read.csv() defaults
Online_Retail <- read.csv(file = retail_csv_path)

# Per-column summary: five-number summaries for numeric columns,
# length/class/mode for character columns
summary(object = Online_Retail)
##   InvoiceNo          StockCode         Description           Quantity        
##  Length:541909      Length:541909      Length:541909      Min.   :-80995.00  
##  Class :character   Class :character   Class :character   1st Qu.:     1.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :     3.00  
##                                                           Mean   :     9.55  
##                                                           3rd Qu.:    10.00  
##                                                           Max.   : 80995.00  
##                                                                              
##  InvoiceDate          UnitPrice           CustomerID       Country         
##  Length:541909      Min.   :-11062.06   Min.   :12346    Length:541909     
##  Class :character   1st Qu.:     1.25   1st Qu.:13953    Class :character  
##  Mode  :character   Median :     2.08   Median :15152    Mode  :character  
##                     Mean   :     4.61   Mean   :15288                      
##                     3rd Qu.:     4.13   3rd Qu.:16791                      
##                     Max.   : 38970.00   Max.   :18287                      
##                                         NA's   :135080
# Load packages
library(dplyr) 
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(infer)
# Derive the per-row transaction value: total_price = Quantity * UnitPrice.
# All original columns of Online_Retail are kept; total_price is appended.
data <- mutate(
  Online_Retail,
  total_price = Quantity * UnitPrice
)

SET 1

# Quantity vs. total spend: keep only the three columns used in this set
qty_totalspend_set <- select(data, total_price, Quantity, UnitPrice)

# Scatterplot of total_price (y) against Quantity (x), one point per row
qty_totalspend_set %>%
  ggplot(mapping = aes(x = Quantity, y = total_price)) +
  geom_point()

Insight: Positive linear correlation between Quantity and total_price. Total spend increases as more items are purchased.

# Histogram of total_price across all rows, using 30 equal-width bins
qty_totalspend_set %>%
  ggplot(mapping = aes(x = total_price)) +
  geom_histogram(bins = 30)

Insight: The distribution of transaction total_price is right-skewed, with most transactions below $200.

# Correlation between Quantity and total_price (cor() default method)
with(qty_totalspend_set, cor(Quantity, total_price))
## [1] 0.8866811

0.89 - High positive correlation

Insight: A correlation of approximately 0.89 indicates a strong linear relationship between the number of items purchased and total transaction spend, which aligns with the scatterplot.

SET 2

# Quantity vs. unit price: keep only the three columns used in this set
qty_unitprice_set <- select(data, Quantity, UnitPrice, total_price)

# Scatterplot of UnitPrice (y) against Quantity (x), one point per row
qty_unitprice_set %>%
  ggplot(mapping = aes(x = Quantity, y = UnitPrice)) +
  geom_point()

Insight: Slight downward trend between Quantity and UnitPrice. Larger purchases have lower UnitPrice.

# Boxplot of UnitPrice by Quantity range.
# Quantity is continuous, so mapping it straight to x makes geom_boxplot()
# draw a single box for the whole dataset and warn about a
# "Continuous x aesthetic". Binning Quantity into ordered intervals gives one
# box per range, so medians can actually be compared across purchase sizes.
# Break points follow the Quantity quartiles from summary() (1, 3, 10), plus
# a bin for non-positive quantities and bins for large orders.
quantity_breaks <- c(-Inf, 0, 1, 3, 10, 100, Inf)

ggplot(
  qty_unitprice_set,
  aes(x = cut(Quantity, breaks = quantity_breaks), y = UnitPrice)
) +
  geom_boxplot() +
  labs(x = "Quantity (binned)")

Insight: Median UnitPrice appears to decrease as Quantity increases.

# Correlation between Quantity and UnitPrice (cor() default method)
with(qty_unitprice_set, cor(Quantity, UnitPrice))
## [1] -0.001234925

-0.001 - Negligible (essentially zero) correlation

Insight: The correlation is negative, matching the direction of the downward trend in the scatterplot, which could suggest that customers get bulk-purchase discounts when buying larger quantities. However, at -0.001 the correlation is essentially zero, so there is effectively no linear relationship between Quantity and UnitPrice, and any such discounting effect is negligible in this data.

SET 3

# Total spend vs. unit price: keep only the three columns used in this set
totalspend_expensiveness_set <- select(data, total_price, UnitPrice, Quantity)

# Scatterplot of UnitPrice (y) against total_price (x), one point per row
totalspend_expensiveness_set %>%
  ggplot(mapping = aes(x = total_price, y = UnitPrice)) +
  geom_point()

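Insight: In the scatterplot, customers with higher total spend appear to purchase more expensive items on average; that is, higher total spend transactions appear to have UnitPrice distributions shifted towards more expensive items. This visual impression should be checked against the correlation below.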

# Correlation between total_price and UnitPrice (cor() default method)
with(totalspend_expensiveness_set, cor(total_price, UnitPrice))
## [1] -0.1620286

-0.16 - Weak negative correlation

Insight: Correlation of -0.16 for this analysis indicates a weak, slightly negative linear relationship between total spend and unit price in the dataset.