This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
Online_Retail <- read.csv('C:/Users/laasy/Documents/Fall 2023/Intro to Statistics in R/Datasets for Final Project/OnlineRetail.csv')
summary(Online_Retail)
## InvoiceNo StockCode Description Quantity
## Length:541909 Length:541909 Length:541909 Min. :-80995.00
## Class :character Class :character Class :character 1st Qu.: 1.00
## Mode :character Mode :character Mode :character Median : 3.00
## Mean : 9.55
## 3rd Qu.: 10.00
## Max. : 80995.00
##
## InvoiceDate UnitPrice CustomerID Country
## Length:541909 Min. :-11062.06 Min. :12346 Length:541909
## Class :character 1st Qu.: 1.25 1st Qu.:13953 Class :character
## Mode :character Median : 2.08 Median :15152 Mode :character
## Mean : 4.61 Mean :15288
## 3rd Qu.: 4.13 3rd Qu.:16791
## Max. : 38970.00 Max. :18287
## NA's :135080
###Part-1:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
##GROUP BY & Expected Probability:
# Group 1: Group by Country and summarize Quantity
group_summary1 <- Online_Retail %>%
group_by(Country) %>%
summarize(
AvgQuantity = mean(Quantity)
)
# Calculate probability for Group 1
group_summary1 <- group_summary1 %>%
mutate(Probability = AvgQuantity / sum(AvgQuantity))
View(group_summary1)
ggplot(group_summary1, aes(x = Country, y = 1, fill = AvgQuantity)) +
geom_tile() +
labs(title = "Heatmap of Numerical vs. Categorical", x = "Country", y = "AvgQuantity") +
scale_fill_gradient(low = "pink", high = "blue") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Group 2: Group by StockCode and summarize UnitPrice
group_summary2 <- Online_Retail %>%
group_by(StockCode) %>%
summarize(
AvgUnitPrice = mean(UnitPrice)
)
# Calculate probability for Group 2
group_summary2 <- group_summary2 %>%
mutate(Probability = AvgUnitPrice / sum(AvgUnitPrice))
View(group_summary2)
ggplot(group_summary2, aes(x = StockCode, y = 1, fill = AvgUnitPrice)) +
geom_boxplot() +
labs(title = "Box Plot") +
theme_minimal()
# Group 3: Group by Invoice and summarize Unit Price
group_summary3 <- Online_Retail %>%
group_by(InvoiceNo) %>%
summarize(
AvgUnitPrice = mean(UnitPrice)
)
# Calculate probability for Group 3
group_summary3 <- group_summary3 %>%
mutate(Probability = AvgUnitPrice / sum(AvgUnitPrice))
View(group_summary3)
ggplot(group_summary3, aes(x = InvoiceNo, y = 1, fill = AvgUnitPrice)) +
geom_tile() +
labs(title = "Heatmap of Numerical vs. Categorical", x = "Invoice", y = "AvgUnitPrice") +
scale_fill_gradient(low = "pink", high = "blue") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Group 4: Group by Quantity and summarize Unit Price
group_summary4 <- Online_Retail %>%
group_by(Quantity) %>%
summarize(
AvgUnitPrice = mean(UnitPrice)
)
# Calculate probability for Group 4
group_summary4 <- group_summary4 %>%
mutate(Probability = AvgUnitPrice / sum(AvgUnitPrice))
View(group_summary4)
ggplot(group_summary4, aes(x = Quantity, y = 50, fill = AvgUnitPrice)) +
geom_boxplot() +
labs(title = "Box Plot") +
theme_minimal()
## Warning: The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
min_group_summary1 <- group_summary1 %>%
filter(Probability == min(Probability)) %>%
mutate(Anomaly_1 ="Targeted Prob anomaly")
summary(min_group_summary1)
## Country AvgQuantity Probability Anomaly_1
## Length:1 Min. :3.553 Min. :0.00487 Length:1
## Class :character 1st Qu.:3.553 1st Qu.:0.00487 Class :character
## Mode :character Median :3.553 Median :0.00487 Mode :character
## Mean :3.553 Mean :0.00487
## 3rd Qu.:3.553 3rd Qu.:0.00487
## Max. :3.553 Max. :0.00487
min_group_summary2 <- group_summary2 %>%
filter(Probability == min(Probability)) %>%
mutate(Anomaly_2 ="Targeted Prob anomaly")
summary(min_group_summary2)
## StockCode AvgUnitPrice Probability Anomaly_2
## Length:1 Min. :-3687 Min. :-0.1837 Length:1
## Class :character 1st Qu.:-3687 1st Qu.:-0.1837 Class :character
## Mode :character Median :-3687 Median :-0.1837 Mode :character
## Mean :-3687 Mean :-0.1837
## 3rd Qu.:-3687 3rd Qu.:-0.1837
## Max. :-3687 Max. :-0.1837
min_group_summary3 <- group_summary3 %>%
filter(Probability == min(Probability)) %>%
mutate(Anomaly_3 ="Targeted Prob anomaly")
summary(min_group_summary3)
## InvoiceNo AvgUnitPrice Probability Anomaly_3
## Length:2 Min. :-11062 Min. :-0.01967 Length:2
## Class :character 1st Qu.:-11062 1st Qu.:-0.01967 Class :character
## Mode :character Median :-11062 Median :-0.01967 Mode :character
## Mean :-11062 Mean :-0.01967
## 3rd Qu.:-11062 3rd Qu.:-0.01967
## Max. :-11062 Max. :-0.01967
min_group_summary4 <- group_summary4 %>%
filter(Probability == min(Probability)) %>%
mutate(Anomaly_4 ="Targeted Prob anomaly")
summary(min_group_summary4)
## Quantity AvgUnitPrice Probability Anomaly_4
## Min. :-9600.0 Min. :0 Min. :0 Length:212
## 1st Qu.: -676.8 1st Qu.:0 1st Qu.:0 Class :character
## Median : -263.0 Median :0 Median :0 Mode :character
## Mean : -484.8 Mean :0 Mean :0
## 3rd Qu.: -110.5 3rd Qu.:0 3rd Qu.:0
## Max. :12540.0 Max. :0 Max. :0
#Anomaly and combining data result:
data <- Online_Retail %>%
left_join(min_group_summary1, by="Country") %>%
left_join(min_group_summary2, by="StockCode") %>%
left_join(min_group_summary3, by="InvoiceNo") %>%
left_join(min_group_summary4, by="Quantity")
data$Anomaly_1[is.na(data$Anomaly_1)] <- "Not Anomaly"
data$Anomaly_2[is.na(data$Anomaly_2)] <- "Not Anomaly"
data$Anomaly_3[is.na(data$Anomaly_3)] <- "Not Anomaly"
data$Anomaly_4[is.na(data$Anomaly_4)] <- "Not Anomaly"
summary(data)
## InvoiceNo StockCode Description Quantity
## Length:541909 Length:541909 Length:541909 Min. :-80995.00
## Class :character Class :character Class :character 1st Qu.: 1.00
## Mode :character Mode :character Mode :character Median : 3.00
## Mean : 9.55
## 3rd Qu.: 10.00
## Max. : 80995.00
##
## InvoiceDate UnitPrice CustomerID Country
## Length:541909 Min. :-11062.06 Min. :12346 Length:541909
## Class :character 1st Qu.: 1.25 1st Qu.:13953 Class :character
## Mode :character Median : 2.08 Median :15152 Mode :character
## Mean : 4.61 Mean :15288
## 3rd Qu.: 4.13 3rd Qu.:16791
## Max. : 38970.00 Max. :18287
## NA's :135080
## AvgQuantity Probability.x Anomaly_1 AvgUnitPrice.x
## Min. :3.6 Min. :0 Length:541909 Min. :-3687
## 1st Qu.:3.6 1st Qu.:0 Class :character 1st Qu.:-3687
## Median :3.6 Median :0 Mode :character Median :-3687
## Mean :3.6 Mean :0 Mean :-3687
## 3rd Qu.:3.6 3rd Qu.:0 3rd Qu.:-3687
## Max. :3.6 Max. :0 Max. :-3687
## NA's :541618 NA's :541618 NA's :541906
## Probability.y Anomaly_2 AvgUnitPrice.y Probability.x.x
## Min. :-0.2 Length:541909 Min. :-11062 Min. :0
## 1st Qu.:-0.2 Class :character 1st Qu.:-11062 1st Qu.:0
## Median :-0.2 Mode :character Median :-11062 Median :0
## Mean :-0.2 Mean :-11062 Mean :0
## 3rd Qu.:-0.2 3rd Qu.:-11062 3rd Qu.:0
## Max. :-0.2 Max. :-11062 Max. :0
## NA's :541906 NA's :541907 NA's :541907
## Anomaly_3 AvgUnitPrice Probability.y.y Anomaly_4
## Length:541909 Min. :0 Min. :0 Length:541909
## Class :character 1st Qu.:0 1st Qu.:0 Class :character
## Mode :character Median :0 Median :0 Mode :character
## Mean :0 Mean :0
## 3rd Qu.:0 3rd Qu.:0
## Max. :0 Max. :0
## NA's :541584 NA's :541584
##Conclusion: The summary here is about the probabilities and anomalies for the grouped data. As we can see, the length of anomalies for individual group differs based on the category considered. The anomalies here are rare, which are worth a further investigation.
####Part-2:
# Generate all combinations of the categorical variables
data <- Online_Retail
combinations <-expand.grid(cat1 = unique(data$Country), cat2 = unique(data$StockCode), cat3 = unique(data$Quantity))
combination_counts <- data %>%
group_by(Country,StockCode) %>%
summarize(Count = n())
## `summarise()` has grouped output by 'Country'. You can override using the
## `.groups` argument.
View(combination_counts)
most_common_combinations <- combination_counts %>%
filter(Count == max(Count))
cat("\nMost Common Combinations:\n")
##
## Most Common Combinations:
print(most_common_combinations)
## # A tibble: 252 × 3
## # Groups: Country [38]
## Country StockCode Count
## <chr> <chr> <int>
## 1 Australia 22720 10
## 2 Austria POST 14
## 3 Bahrain 72802B 3
## 4 Belgium POST 98
## 5 Brazil 15056BL 1
## 6 Brazil 15056N 1
## 7 Brazil 15056P 1
## 8 Brazil 20679 1
## 9 Brazil 21166 1
## 10 Brazil 21181 1
## # ℹ 242 more rows
least_common_combinations <- combination_counts %>%
filter(Count == min(Count))
cat("\nLeast Common Combinations:\n")
##
## Least Common Combinations:
print(least_common_combinations)
## # A tibble: 8,464 × 3
## # Groups: Country [38]
## Country StockCode Count
## <chr> <chr> <int>
## 1 Australia 15036 1
## 2 Australia 15056BL 1
## 3 Australia 16161P 1
## 4 Australia 16169E 1
## 5 Australia 20665 1
## 6 Australia 20711 1
## 7 Australia 20713 1
## 8 Australia 20717 1
## 9 Australia 20837 1
## 10 Australia 20838 1
## # ℹ 8,454 more rows
library(ggplot2)
ggplot(combination_counts, aes(x = combination_counts$Country, y = combination_counts$StockCode, fill = combination_counts$Count)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "blue") +
labs(x = "Country", y = "Quantity", fill = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45,hjust = 1))
# Generate all combinations of the categorical variables
data <- Online_Retail
combinations <-expand.grid(cat1 = unique(data$Country), cat2 = unique(data$StockCode), cat3 = unique(data$Quantity))
combination_counts <- data %>%
group_by(Country,StockCode,Quantity) %>%
summarize(Count = n())
## `summarise()` has grouped output by 'Country', 'StockCode'. You can override
## using the `.groups` argument.
View(combination_counts)
ggplot(combination_counts, aes(x = combination_counts$Country, y = combination_counts$Count, fill = combination_counts$Quantity)) +
geom_bar(stat = "identity", position = position_dodge(width = 1), width = 1) +
labs(title = "Grouped Barplot with Different Colors") +
theme_minimal()
ggplot(combination_counts, aes(x = combination_counts$StockCode, y = combination_counts$Count, fill = combination_counts$Quantity)) +
geom_bar(stat = "identity")
ggplot(group_summary2, aes(x = StockCode, y = AvgUnitPrice )) +
geom_point() +
labs(title = "Scatterplot Example", x = "StockCode", y = "AvgUnitPrice")
#Conclusion:
The Invoice date is not considering as a categorical value as it in the MM/DD/YYYY, so the combination with the Invoice Date cannot be done for the dataset I have taken.