R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Load the Online Retail transactions from a local CSV file.
# NOTE(review): this absolute path is specific to one machine; point it at
# your own copy of OnlineRetail.csv before re-running the document.
Online_Retail <- read.csv(
  file = 'C:/Users/laasy/Documents/Fall 2023/Intro to Statistics in R/Datasets for Final Project/OnlineRetail.csv'
)
# Column-by-column overview of the raw data (types, ranges, NA counts).
summary(object = Online_Retail)
##   InvoiceNo          StockCode         Description           Quantity        
##  Length:541909      Length:541909      Length:541909      Min.   :-80995.00  
##  Class :character   Class :character   Class :character   1st Qu.:     1.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :     3.00  
##                                                           Mean   :     9.55  
##                                                           3rd Qu.:    10.00  
##                                                           Max.   : 80995.00  
##                                                                              
##  InvoiceDate          UnitPrice           CustomerID       Country         
##  Length:541909      Min.   :-11062.06   Min.   :12346    Length:541909     
##  Class :character   1st Qu.:     1.25   1st Qu.:13953    Class :character  
##  Mode  :character   Median :     2.08   Median :15152    Mode  :character  
##                     Mean   :     4.61   Mean   :15288                      
##                     3rd Qu.:     4.13   3rd Qu.:16791                      
##                     Max.   : 38970.00   Max.   :18287                      
##                                         NA's   :135080

###Part-1:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

##GROUP BY & Expected Probability:

# Group 1: average Quantity per Country.
group_summary1 <- Online_Retail %>%
  group_by(Country) %>%
  summarize(
    AvgQuantity = mean(Quantity)
  )

# "Probability" for Group 1: each country's average divided by the sum of all
# country averages.
# NOTE(review): this is a normalised average, not a frequency-based
# probability, and it can become negative if a group average is negative --
# confirm this is the intended definition.
group_summary1 <- group_summary1 %>%
  mutate(Probability = AvgQuantity / sum(AvgQuantity))

# View() opens a data-viewer window, which is only meaningful in an
# interactive session; skip it when the document is rendered.
if (interactive()) {
  View(group_summary1)
}

# Single-row heatmap: one tile per country, coloured by AvgQuantity. The y
# position is a constant placeholder, so the y axis is left unlabelled and the
# legend (fill) carries the AvgQuantity label instead.
ggplot(group_summary1, aes(x = Country, y = 1, fill = AvgQuantity)) +
  geom_tile() +
  labs(
    title = "Heatmap of Numerical vs. Categorical",
    x = "Country",
    y = NULL,
    fill = "AvgQuantity"
  ) +
  scale_fill_gradient(low = "pink", high = "blue") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )

# Group 2: average UnitPrice per StockCode.
group_summary2 <- Online_Retail %>%
  group_by(StockCode) %>%
  summarize(
    AvgUnitPrice = mean(UnitPrice)
  )

# "Probability" for Group 2: each stock code's average divided by the sum of
# all stock-code averages.
# NOTE(review): negative average prices produce negative values here, so this
# is not a true probability -- confirm the intended definition.
group_summary2 <- group_summary2 %>%
  mutate(Probability = AvgUnitPrice / sum(AvgUnitPrice))

# View() opens a data-viewer window, which is only meaningful in an
# interactive session; skip it when the document is rendered.
if (interactive()) {
  View(group_summary2)
}

# Box plot of the per-stock-code average prices. The summarised value is
# mapped to y so the box shows its distribution; mapping y to a constant
# collapses every box to a flat line.
ggplot(group_summary2, aes(x = "", y = AvgUnitPrice)) +
  geom_boxplot() +
  labs(title = "Box Plot", x = NULL, y = "AvgUnitPrice") +
  theme_minimal()

# Group 3: average UnitPrice per InvoiceNo.
group_summary3 <- Online_Retail %>%
  group_by(InvoiceNo) %>%
  summarize(
    AvgUnitPrice = mean(UnitPrice)
  )

# "Probability" for Group 3: each invoice's average divided by the sum of all
# invoice averages.
# NOTE(review): negative average prices produce negative values here, so this
# is not a true probability -- confirm the intended definition.
group_summary3 <- group_summary3 %>%
  mutate(Probability = AvgUnitPrice / sum(AvgUnitPrice))

# View() opens a data-viewer window, which is only meaningful in an
# interactive session; skip it when the document is rendered.
if (interactive()) {
  View(group_summary3)
}

# Single-row heatmap: one tile per invoice, coloured by AvgUnitPrice. The y
# position is a constant placeholder, so the y axis is left unlabelled and the
# legend (fill) carries the AvgUnitPrice label instead.
ggplot(group_summary3, aes(x = InvoiceNo, y = 1, fill = AvgUnitPrice)) +
  geom_tile() +
  labs(
    title = "Heatmap of Numerical vs. Categorical",
    x = "Invoice",
    y = NULL,
    fill = "AvgUnitPrice"
  ) +
  scale_fill_gradient(low = "pink", high = "blue") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )

# Group 4: average UnitPrice per distinct Quantity value.
group_summary4 <- Online_Retail %>%
  group_by(Quantity) %>%
  summarize(
    AvgUnitPrice = mean(UnitPrice)
  )

# "Probability" for Group 4: each quantity's average price divided by the sum
# of all averages.
# NOTE(review): this is a normalised average, not a frequency-based
# probability -- confirm the intended definition.
group_summary4 <- group_summary4 %>%
  mutate(Probability = AvgUnitPrice / sum(AvgUnitPrice))

# View() opens a data-viewer window, which is only meaningful in an
# interactive session; skip it when the document is rendered.
if (interactive()) {
  View(group_summary4)
}

# Box plot of the per-quantity average prices. Mapping the summarised value
# to y (instead of a constant y plus a continuous fill) gives the box real
# content and avoids the "aesthetics were dropped" warning that ggplot2
# raised for the continuous fill.
ggplot(group_summary4, aes(x = "", y = AvgUnitPrice)) +
  geom_boxplot() +
  labs(title = "Box Plot", x = NULL, y = "AvgUnitPrice") +
  theme_minimal()

Lowest Probabilities:

# Keep the country (or countries, on ties) whose Probability equals the
# smallest value in group_summary1, and tag the row(s) with an anomaly label.
min_group_summary1 <- mutate(
  filter(group_summary1, Probability == min(Probability)),
  Anomaly_1 = "Targeted Prob anomaly"
)
summary(object = min_group_summary1)
##    Country           AvgQuantity     Probability       Anomaly_1        
##  Length:1           Min.   :3.553   Min.   :0.00487   Length:1          
##  Class :character   1st Qu.:3.553   1st Qu.:0.00487   Class :character  
##  Mode  :character   Median :3.553   Median :0.00487   Mode  :character  
##                     Mean   :3.553   Mean   :0.00487                     
##                     3rd Qu.:3.553   3rd Qu.:0.00487                     
##                     Max.   :3.553   Max.   :0.00487
# Keep the stock code(s) whose Probability equals the smallest value in
# group_summary2, and tag the row(s) with an anomaly label.
min_group_summary2 <- mutate(
  filter(group_summary2, Probability == min(Probability)),
  Anomaly_2 = "Targeted Prob anomaly"
)
summary(object = min_group_summary2)
##   StockCode          AvgUnitPrice    Probability       Anomaly_2        
##  Length:1           Min.   :-3687   Min.   :-0.1837   Length:1          
##  Class :character   1st Qu.:-3687   1st Qu.:-0.1837   Class :character  
##  Mode  :character   Median :-3687   Median :-0.1837   Mode  :character  
##                     Mean   :-3687   Mean   :-0.1837                     
##                     3rd Qu.:-3687   3rd Qu.:-0.1837                     
##                     Max.   :-3687   Max.   :-0.1837
# Keep the invoice(s) whose Probability equals the smallest value in
# group_summary3, and tag the row(s) with an anomaly label.
min_group_summary3 <- mutate(
  filter(group_summary3, Probability == min(Probability)),
  Anomaly_3 = "Targeted Prob anomaly"
)
summary(object = min_group_summary3)
##   InvoiceNo          AvgUnitPrice     Probability        Anomaly_3        
##  Length:2           Min.   :-11062   Min.   :-0.01967   Length:2          
##  Class :character   1st Qu.:-11062   1st Qu.:-0.01967   Class :character  
##  Mode  :character   Median :-11062   Median :-0.01967   Mode  :character  
##                     Mean   :-11062   Mean   :-0.01967                     
##                     3rd Qu.:-11062   3rd Qu.:-0.01967                     
##                     Max.   :-11062   Max.   :-0.01967
# Keep the quantity value(s) whose Probability equals the smallest value in
# group_summary4, and tag the row(s) with an anomaly label.
min_group_summary4 <- mutate(
  filter(group_summary4, Probability == min(Probability)),
  Anomaly_4 = "Targeted Prob anomaly"
)
summary(object = min_group_summary4)
##     Quantity        AvgUnitPrice  Probability  Anomaly_4        
##  Min.   :-9600.0   Min.   :0     Min.   :0    Length:212        
##  1st Qu.: -676.8   1st Qu.:0     1st Qu.:0    Class :character  
##  Median : -263.0   Median :0     Median :0    Mode  :character  
##  Mean   : -484.8   Mean   :0     Mean   :0                      
##  3rd Qu.: -110.5   3rd Qu.:0     3rd Qu.:0                      
##  Max.   :12540.0   Max.   :0     Max.   :0

#Anomaly and combining data result:

# Attach each group's lowest-probability row(s) to the transactions through
# the corresponding grouping key. Transactions without a match receive NA in
# the joined columns; for the four anomaly flags those NAs are then replaced
# with "Not Anomaly".
# NOTE(review): the joined summaries share column names (Probability,
# AvgUnitPrice), so dplyr disambiguates them with .x/.y suffixes.
data <- Online_Retail %>%
  left_join(min_group_summary1, by = "Country") %>%
  left_join(min_group_summary2, by = "StockCode") %>%
  left_join(min_group_summary3, by = "InvoiceNo") %>%
  left_join(min_group_summary4, by = "Quantity") %>%
  mutate(across(
    all_of(paste0("Anomaly_", 1:4)),
    function(flag) coalesce(flag, "Not Anomaly")
  ))
summary(data)
##   InvoiceNo          StockCode         Description           Quantity        
##  Length:541909      Length:541909      Length:541909      Min.   :-80995.00  
##  Class :character   Class :character   Class :character   1st Qu.:     1.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :     3.00  
##                                                           Mean   :     9.55  
##                                                           3rd Qu.:    10.00  
##                                                           Max.   : 80995.00  
##                                                                              
##  InvoiceDate          UnitPrice           CustomerID       Country         
##  Length:541909      Min.   :-11062.06   Min.   :12346    Length:541909     
##  Class :character   1st Qu.:     1.25   1st Qu.:13953    Class :character  
##  Mode  :character   Median :     2.08   Median :15152    Mode  :character  
##                     Mean   :     4.61   Mean   :15288                      
##                     3rd Qu.:     4.13   3rd Qu.:16791                      
##                     Max.   : 38970.00   Max.   :18287                      
##                                         NA's   :135080                     
##   AvgQuantity     Probability.x     Anomaly_1         AvgUnitPrice.x  
##  Min.   :3.6      Min.   :0        Length:541909      Min.   :-3687   
##  1st Qu.:3.6      1st Qu.:0        Class :character   1st Qu.:-3687   
##  Median :3.6      Median :0        Mode  :character   Median :-3687   
##  Mean   :3.6      Mean   :0                           Mean   :-3687   
##  3rd Qu.:3.6      3rd Qu.:0                           3rd Qu.:-3687   
##  Max.   :3.6      Max.   :0                           Max.   :-3687   
##  NA's   :541618   NA's   :541618                      NA's   :541906  
##  Probability.y     Anomaly_2         AvgUnitPrice.y   Probability.x.x 
##  Min.   :-0.2     Length:541909      Min.   :-11062   Min.   :0       
##  1st Qu.:-0.2     Class :character   1st Qu.:-11062   1st Qu.:0       
##  Median :-0.2     Mode  :character   Median :-11062   Median :0       
##  Mean   :-0.2                        Mean   :-11062   Mean   :0       
##  3rd Qu.:-0.2                        3rd Qu.:-11062   3rd Qu.:0       
##  Max.   :-0.2                        Max.   :-11062   Max.   :0       
##  NA's   :541906                      NA's   :541907   NA's   :541907  
##   Anomaly_3          AvgUnitPrice    Probability.y.y   Anomaly_4        
##  Length:541909      Min.   :0        Min.   :0        Length:541909     
##  Class :character   1st Qu.:0        1st Qu.:0        Class :character  
##  Mode  :character   Median :0        Median :0        Mode  :character  
##                     Mean   :0        Mean   :0                          
##                     3rd Qu.:0        3rd Qu.:0                          
##                     Max.   :0        Max.   :0                          
##                     NA's   :541584   NA's   :541584

##Conclusion: This summary reports the probabilities and anomaly flags for the grouped data. The number of rows flagged as anomalies differs from group to group, depending on the category used for grouping. The flagged anomalies are rare, which makes them worth further investigation.

####Part-2:

# Count how many transactions fall into each observed Country x StockCode
# combination. Only combinations that actually occur are needed, so the
# exhaustive expand.grid() over Country x StockCode x Quantity was dropped:
# its result was never used and materialising every possible triple is very
# memory-hungry.
# summarize() peels off only the last grouping level, so the result stays
# grouped by Country.
data <- Online_Retail
combination_counts <- data %>%
  group_by(Country, StockCode) %>%
  summarize(Count = n())
## `summarise()` has grouped output by 'Country'. You can override using the
## `.groups` argument.
# View() opens a data-viewer window, which is only meaningful in an
# interactive session; skip it when the document is rendered.
if (interactive()) {
  View(combination_counts)
}
# combination_counts is still grouped by Country, so max(Count) is evaluated
# within each country: this keeps every country's most frequent stock code(s)
# rather than the single most frequent combination overall.
most_common_combinations <- combination_counts %>%
  filter(Count == max(Count))
cat("\nMost Common Combinations:\n")
## 
## Most Common Combinations:
print(most_common_combinations)
## # A tibble: 252 × 3
## # Groups:   Country [38]
##    Country   StockCode Count
##    <chr>     <chr>     <int>
##  1 Australia 22720        10
##  2 Austria   POST         14
##  3 Bahrain   72802B        3
##  4 Belgium   POST         98
##  5 Brazil    15056BL       1
##  6 Brazil    15056N        1
##  7 Brazil    15056P        1
##  8 Brazil    20679         1
##  9 Brazil    21166         1
## 10 Brazil    21181         1
## # ℹ 242 more rows
# Within each Country group of combination_counts, keep the stock code(s)
# with the smallest Count, then print them under a heading.
least_common_combinations <- filter(
  combination_counts,
  Count == min(Count)
)
cat("\nLeast Common Combinations:\n")
## 
## Least Common Combinations:
print(x = least_common_combinations)
## # A tibble: 8,464 × 3
## # Groups:   Country [38]
##    Country   StockCode Count
##    <chr>     <chr>     <int>
##  1 Australia 15036         1
##  2 Australia 15056BL       1
##  3 Australia 16161P        1
##  4 Australia 16169E        1
##  5 Australia 20665         1
##  6 Australia 20711         1
##  7 Australia 20713         1
##  8 Australia 20717         1
##  9 Australia 20837         1
## 10 Australia 20838         1
## # ℹ 8,454 more rows
library(ggplot2)
# Heatmap of transaction counts: one tile per Country x StockCode pair,
# shaded by Count. Columns are referenced by bare name inside aes() so they
# are looked up in the data passed to ggplot(), and the y axis is labelled
# with the variable it actually shows (StockCode).
ggplot(combination_counts, aes(x = Country, y = StockCode, fill = Count)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "blue") +
  labs(x = "Country", y = "StockCode", fill = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Count how many transactions fall into each observed
# Country x StockCode x Quantity combination. Only combinations that actually
# occur are needed, so the exhaustive expand.grid() over the three columns was
# dropped: its result was never used and materialising every possible triple
# is very memory-hungry.
# summarize() peels off only the last grouping level, so the result stays
# grouped by Country and StockCode.
data <- Online_Retail
combination_counts <- data %>%
  group_by(Country, StockCode, Quantity) %>%
  summarize(Count = n())
## `summarise()` has grouped output by 'Country', 'StockCode'. You can override
## using the `.groups` argument.
# View() opens a data-viewer window, which is only meaningful in an
# interactive session; skip it when the document is rendered.
if (interactive()) {
  View(combination_counts)
}

# Bar chart of combination counts per Country, coloured by Quantity. Columns
# are referenced by bare name inside aes() so they are looked up in the data
# passed to ggplot() and the default axis/legend titles are the column names.
# NOTE(review): Quantity is numeric, so it yields a continuous fill scale and
# does not define dodge groups -- convert it to a factor (or bin it) if
# side-by-side bars are intended.
ggplot(combination_counts, aes(x = Country, y = Count, fill = Quantity)) +
  geom_bar(stat = "identity", position = position_dodge(width = 1), width = 1) +
  labs(title = "Grouped Barplot with Different Colors") +
  theme_minimal()

# Stacked bar chart of combination counts per StockCode, coloured by Quantity.
ggplot(combination_counts, aes(x = StockCode, y = Count, fill = Quantity)) +
  geom_bar(stat = "identity")

# Scatter plot with one point per stock code, placed at that code's average
# unit price taken from group_summary2.
ggplot(data = group_summary2, mapping = aes(x = StockCode, y = AvgUnitPrice)) +
  geom_point() +
  ggtitle("Scatterplot Example") +
  xlab("StockCode") +
  ylab("AvgUnitPrice")

#Conclusion:

InvoiceDate is not treated as a categorical variable because it is recorded in MM/DD/YYYY format, so combinations involving InvoiceDate could not be computed for the dataset I have chosen.