R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Load the Online Retail transactions from the local CSV file.
OnlineRetail <- read.csv(
  file = "C:/Users/laasy/Documents/Fall 2023/Intro to Statistics in R/Datasets for Final Project/OnlineRetail.csv"
)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(purrr)
# Print a per-column summary of the full dataset.
summary(OnlineRetail)
##   InvoiceNo          StockCode         Description           Quantity        
##  Length:541909      Length:541909      Length:541909      Min.   :-80995.00  
##  Class :character   Class :character   Class :character   1st Qu.:     1.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :     3.00  
##                                                           Mean   :     9.55  
##                                                           3rd Qu.:    10.00  
##                                                           Max.   : 80995.00  
##                                                                              
##  InvoiceDate          UnitPrice           CustomerID       Country         
##  Length:541909      Min.   :-11062.06   Min.   :12346    Length:541909     
##  Class :character   1st Qu.:     1.25   1st Qu.:13953    Class :character  
##  Mode  :character   Median :     2.08   Median :15152    Mode  :character  
##                     Mean   :     4.61   Mean   :15288                      
##                     3rd Qu.:     4.13   3rd Qu.:16791                      
##                     Max.   : 38970.00   Max.   :18287                      
##                                         NA's   :135080
# Draw a sample of half the rows, with replacement, and open it in the viewer.
online_sample_1 <- sample_frac(OnlineRetail, 0.5, replace = TRUE)
View(online_sample_1)
# Mean unit price within the sample.
mean(pluck(online_sample_1, "UnitPrice"))
## [1] 4.421394
# Largest quantity within the sample.
max(pluck(online_sample_1, "Quantity"))
## [1] 74215

Random sampling of Online Retail dataset

# Number of subsamples to draw: one random integer between 5 and 10.
num <- sample(5:10, 1)
# Columns carried into every subsample, in a randomly shuffled order.
# NOTE(review): "Description" is not listed here -- confirm the omission is
# intended.
columns <- sample(c(
  "InvoiceNo", "StockCode", "InvoiceDate", "Quantity",
  "UnitPrice", "CustomerID", "Country"
))
# Sample size (approximately 50% of the data). It is the same for every
# subsample, so it is computed once instead of on every iteration.
n_rows <- nrow(OnlineRetail)
s_size <- round(0.5 * n_rows)
# Preallocate the result list rather than growing it inside the loop.
online_subsample_list <- vector("list", num)
for (i in seq_len(num)) {
  # Randomly select row indices with replacement. sample.int() draws from
  # 1..n_rows directly and, unlike 1:nrow(), is well defined for an empty
  # data frame.
  s_index <- sample.int(n_rows, size = s_size, replace = TRUE)
  # Create the subsample data frame and store it in the list.
  online_subsample <- OnlineRetail[s_index, columns]
  online_subsample_list[[i]] <- online_subsample
}
View(online_subsample_list)

Displaying the dimensions of each subsample

# Rows and columns of every subsample, one column per subsample.
# vapply() fixes the result shape (two integers per subsample), so the output
# is always a 2 x k integer matrix, whereas sapply() changes its return type
# depending on the input.
vapply(online_subsample_list, dim, integer(2))
##        [,1]   [,2]   [,3]   [,4]   [,5]   [,6]   [,7]
## [1,] 270954 270954 270954 270954 270954 270954 270954
## [2,]      7      7      7      7      7      7      7

Summaries of the sample data

# Build one kable-formatted summary table per subsample.
summary_table <- lapply(online_subsample_list, function(online_subsample) {
  knitr::kable(summary(online_subsample), caption = "summary statistics")
})
# Print a heading line followed by the table for each subsample. The loop
# indexes the tables that were actually built instead of relying on the
# separate counter `num`, so it cannot run past the end of the list.
for (i in seq_along(summary_table)) {
  cat("### Subsample", i, "summary statisics \n")
  print(summary_table[[i]])
}
## ### Subsample 1 summary statisics 
## 
## 
## Table: summary statistics
## 
## |   |InvoiceDate      |  UnitPrice       |  CustomerID  | StockCode       | InvoiceNo       |   Quantity       |  Country        |
## |:--|:----------------|:-----------------|:-------------|:----------------|:----------------|:-----------------|:----------------|
## |   |Length:270954    |Min.   :-11062.06 |Min.   :12346 |Length:270954    |Length:270954    |Min.   :-74215.00 |Length:270954    |
## |   |Class :character |1st Qu.:     1.25 |1st Qu.:13939 |Class :character |Class :character |1st Qu.:     1.00 |Class :character |
## |   |Mode  :character |Median :     2.08 |Median :15146 |Mode  :character |Mode  :character |Median :     3.00 |Mode  :character |
## |   |NA               |Mean   :     4.62 |Mean   :15284 |NA               |NA               |Mean   :    10.25 |NA               |
## |   |NA               |3rd Qu.:     4.13 |3rd Qu.:16791 |NA               |NA               |3rd Qu.:    10.00 |NA               |
## |   |NA               |Max.   : 38970.00 |Max.   :18287 |NA               |NA               |Max.   : 80995.00 |NA               |
## |   |NA               |NA                |NA's   :67327 |NA               |NA               |NA                |NA               |
## ### Subsample 2 summary statisics 
## 
## 
## Table: summary statistics
## 
## |   |InvoiceDate      |  UnitPrice       |  CustomerID  | StockCode       | InvoiceNo       |   Quantity       |  Country        |
## |:--|:----------------|:-----------------|:-------------|:----------------|:----------------|:-----------------|:----------------|
## |   |Length:270954    |Min.   :-11062.06 |Min.   :12347 |Length:270954    |Length:270954    |Min.   :-80995.00 |Length:270954    |
## |   |Class :character |1st Qu.:     1.25 |1st Qu.:13952 |Class :character |Class :character |1st Qu.:     1.00 |Class :character |
## |   |Mode  :character |Median :     2.08 |Median :15150 |Mode  :character |Mode  :character |Median :     3.00 |Mode  :character |
## |   |NA               |Mean   :     4.52 |Mean   :15289 |NA               |NA               |Mean   :     9.45 |NA               |
## |   |NA               |3rd Qu.:     4.13 |3rd Qu.:16794 |NA               |NA               |3rd Qu.:    10.00 |NA               |
## |   |NA               |Max.   : 38970.00 |Max.   :18287 |NA               |NA               |Max.   : 12540.00 |NA               |
## |   |NA               |NA                |NA's   :67765 |NA               |NA               |NA                |NA               |
## ### Subsample 3 summary statisics 
## 
## 
## Table: summary statistics
## 
## |   |InvoiceDate      |  UnitPrice      |  CustomerID  | StockCode       | InvoiceNo       |   Quantity      |  Country        |
## |:--|:----------------|:----------------|:-------------|:----------------|:----------------|:----------------|:----------------|
## |   |Length:270954    |Min.   :    0.00 |Min.   :12346 |Length:270954    |Length:270954    |Min.   :-9600.00 |Length:270954    |
## |   |Class :character |1st Qu.:    1.25 |1st Qu.:13949 |Class :character |Class :character |1st Qu.:    1.00 |Class :character |
## |   |Mode  :character |Median :    2.08 |Median :15150 |Mode  :character |Mode  :character |Median :    3.00 |Mode  :character |
## |   |NA               |Mean   :    4.88 |Mean   :15285 |NA               |NA               |Mean   :   10.17 |NA               |
## |   |NA               |3rd Qu.:    4.13 |3rd Qu.:16790 |NA               |NA               |3rd Qu.:   10.00 |NA               |
## |   |NA               |Max.   :38970.00 |Max.   :18287 |NA               |NA               |Max.   :74215.00 |NA               |
## |   |NA               |NA               |NA's   :67490 |NA               |NA               |NA               |NA               |
## ### Subsample 4 summary statisics 
## 
## 
## Table: summary statistics
## 
## |   |InvoiceDate      |  UnitPrice       |  CustomerID  | StockCode       | InvoiceNo       |   Quantity      |  Country        |
## |:--|:----------------|:-----------------|:-------------|:----------------|:----------------|:----------------|:----------------|
## |   |Length:270954    |Min.   :-11062.06 |Min.   :12346 |Length:270954    |Length:270954    |Min.   :-9600.00 |Length:270954    |
## |   |Class :character |1st Qu.:     1.25 |1st Qu.:13959 |Class :character |Class :character |1st Qu.:    1.00 |Class :character |
## |   |Mode  :character |Median :     2.08 |Median :15152 |Mode  :character |Mode  :character |Median :    3.00 |Mode  :character |
## |   |NA               |Mean   :     5.01 |Mean   :15290 |NA               |NA               |Mean   :   10.71 |NA               |
## |   |NA               |3rd Qu.:     4.13 |3rd Qu.:16791 |NA               |NA               |3rd Qu.:   10.00 |NA               |
## |   |NA               |Max.   : 38970.00 |Max.   :18287 |NA               |NA               |Max.   :80995.00 |NA               |
## |   |NA               |NA                |NA's   :67621 |NA               |NA               |NA               |NA               |
## ### Subsample 5 summary statisics 
## 
## 
## Table: summary statistics
## 
## |   |InvoiceDate      |  UnitPrice       |  CustomerID  | StockCode       | InvoiceNo       |   Quantity       |  Country        |
## |:--|:----------------|:-----------------|:-------------|:----------------|:----------------|:-----------------|:----------------|
## |   |Length:270954    |Min.   :    0.000 |Min.   :12346 |Length:270954    |Length:270954    |Min.   :-80995.00 |Length:270954    |
## |   |Class :character |1st Qu.:    1.250 |1st Qu.:13955 |Class :character |Class :character |1st Qu.:     1.00 |Class :character |
## |   |Mode  :character |Median :    2.080 |Median :15152 |Mode  :character |Mode  :character |Median :     3.00 |Mode  :character |
## |   |NA               |Mean   :    4.743 |Mean   :15286 |NA               |NA               |Mean   :     8.94 |NA               |
## |   |NA               |3rd Qu.:    4.130 |3rd Qu.:16788 |NA               |NA               |3rd Qu.:    10.00 |NA               |
## |   |NA               |Max.   :17836.460 |Max.   :18287 |NA               |NA               |Max.   : 12540.00 |NA               |
## |   |NA               |NA                |NA's   :67384 |NA               |NA               |NA                |NA               |
## ### Subsample 6 summary statisics 
## 
## 
## Table: summary statistics
## 
## |   |InvoiceDate      |  UnitPrice      |  CustomerID  | StockCode       | InvoiceNo       |   Quantity       |  Country        |
## |:--|:----------------|:----------------|:-------------|:----------------|:----------------|:-----------------|:----------------|
## |   |Length:270954    |Min.   :    0.00 |Min.   :12346 |Length:270954    |Length:270954    |Min.   :-74215.00 |Length:270954    |
## |   |Class :character |1st Qu.:    1.25 |1st Qu.:13946 |Class :character |Class :character |1st Qu.:     1.00 |Class :character |
## |   |Mode  :character |Median :    2.08 |Median :15152 |Mode  :character |Mode  :character |Median :     3.00 |Mode  :character |
## |   |NA               |Mean   :    4.70 |Mean   :15288 |NA               |NA               |Mean   :     9.85 |NA               |
## |   |NA               |3rd Qu.:    4.13 |3rd Qu.:16794 |NA               |NA               |3rd Qu.:    10.00 |NA               |
## |   |NA               |Max.   :38970.00 |Max.   :18287 |NA               |NA               |Max.   : 74215.00 |NA               |
## |   |NA               |NA               |NA's   :67796 |NA               |NA               |NA                |NA               |
## ### Subsample 7 summary statisics 
## 
## 
## Table: summary statistics
## 
## |   |InvoiceDate      |  UnitPrice       |  CustomerID  | StockCode       | InvoiceNo       |   Quantity      |  Country        |
## |:--|:----------------|:-----------------|:-------------|:----------------|:----------------|:----------------|:----------------|
## |   |Length:270954    |Min.   :-11062.06 |Min.   :12346 |Length:270954    |Length:270954    |Min.   :-9600.00 |Length:270954    |
## |   |Class :character |1st Qu.:     1.25 |1st Qu.:13969 |Class :character |Class :character |1st Qu.:    1.00 |Class :character |
## |   |Mode  :character |Median :     2.08 |Median :15159 |Mode  :character |Mode  :character |Median :    3.00 |Mode  :character |
## |   |NA               |Mean   :     4.46 |Mean   :15297 |NA               |NA               |Mean   :   10.03 |NA               |
## |   |NA               |3rd Qu.:     4.13 |3rd Qu.:16801 |NA               |NA               |3rd Qu.:   10.00 |NA               |
## |   |NA               |Max.   : 38970.00 |Max.   :18287 |NA               |NA               |Max.   :74215.00 |NA               |
## |   |NA               |NA                |NA's   :67365 |NA               |NA               |NA               |NA               |

Scrutinizing subsamples

# Summary statistics for each subsample.
summary_stats <- lapply(online_subsample_list, summary)
# One CustomerID histogram per subsample (bins one ID wide).
histograms <- lapply(online_subsample_list, function(online_subsample) {
  ggplot(online_subsample, aes(x = CustomerID)) +
    geom_histogram(binwidth = 1, fill = "pink", color = "maroon") +
    labs(title = "Histogram for CustomerID", x = "Value", y = "Frequency")
})
# Display the summary and the histogram of every subsample. Indexing over the
# subsample list itself (rather than `1:num`) keeps the loop in step with the
# two lists built above, both of which have one element per subsample.
for (i in seq_along(online_subsample_list)) {
  cat("online_Subsample", i, "summary statistics:\n")
  print(summary_stats[[i]])
  print(histograms[[i]])
}
## online_Subsample 1 summary statistics:
##  InvoiceDate          UnitPrice           CustomerID     StockCode        
##  Length:270954      Min.   :-11062.06   Min.   :12346   Length:270954     
##  Class :character   1st Qu.:     1.25   1st Qu.:13939   Class :character  
##  Mode  :character   Median :     2.08   Median :15146   Mode  :character  
##                     Mean   :     4.62   Mean   :15284                     
##                     3rd Qu.:     4.13   3rd Qu.:16791                     
##                     Max.   : 38970.00   Max.   :18287                     
##                                         NA's   :67327                     
##   InvoiceNo            Quantity           Country         
##  Length:270954      Min.   :-74215.00   Length:270954     
##  Class :character   1st Qu.:     1.00   Class :character  
##  Mode  :character   Median :     3.00   Mode  :character  
##                     Mean   :    10.25                     
##                     3rd Qu.:    10.00                     
##                     Max.   : 80995.00                     
## 
## Warning: Removed 67327 rows containing non-finite values (`stat_bin()`).

## online_Subsample 2 summary statistics:
##  InvoiceDate          UnitPrice           CustomerID     StockCode        
##  Length:270954      Min.   :-11062.06   Min.   :12347   Length:270954     
##  Class :character   1st Qu.:     1.25   1st Qu.:13952   Class :character  
##  Mode  :character   Median :     2.08   Median :15150   Mode  :character  
##                     Mean   :     4.52   Mean   :15289                     
##                     3rd Qu.:     4.13   3rd Qu.:16794                     
##                     Max.   : 38970.00   Max.   :18287                     
##                                         NA's   :67765                     
##   InvoiceNo            Quantity           Country         
##  Length:270954      Min.   :-80995.00   Length:270954     
##  Class :character   1st Qu.:     1.00   Class :character  
##  Mode  :character   Median :     3.00   Mode  :character  
##                     Mean   :     9.45                     
##                     3rd Qu.:    10.00                     
##                     Max.   : 12540.00                     
## 
## Warning: Removed 67765 rows containing non-finite values (`stat_bin()`).

## online_Subsample 3 summary statistics:
##  InvoiceDate          UnitPrice          CustomerID     StockCode        
##  Length:270954      Min.   :    0.00   Min.   :12346   Length:270954     
##  Class :character   1st Qu.:    1.25   1st Qu.:13949   Class :character  
##  Mode  :character   Median :    2.08   Median :15150   Mode  :character  
##                     Mean   :    4.88   Mean   :15285                     
##                     3rd Qu.:    4.13   3rd Qu.:16790                     
##                     Max.   :38970.00   Max.   :18287                     
##                                        NA's   :67490                     
##   InvoiceNo            Quantity          Country         
##  Length:270954      Min.   :-9600.00   Length:270954     
##  Class :character   1st Qu.:    1.00   Class :character  
##  Mode  :character   Median :    3.00   Mode  :character  
##                     Mean   :   10.17                     
##                     3rd Qu.:   10.00                     
##                     Max.   :74215.00                     
## 
## Warning: Removed 67490 rows containing non-finite values (`stat_bin()`).

## online_Subsample 4 summary statistics:
##  InvoiceDate          UnitPrice           CustomerID     StockCode        
##  Length:270954      Min.   :-11062.06   Min.   :12346   Length:270954     
##  Class :character   1st Qu.:     1.25   1st Qu.:13959   Class :character  
##  Mode  :character   Median :     2.08   Median :15152   Mode  :character  
##                     Mean   :     5.01   Mean   :15290                     
##                     3rd Qu.:     4.13   3rd Qu.:16791                     
##                     Max.   : 38970.00   Max.   :18287                     
##                                         NA's   :67621                     
##   InvoiceNo            Quantity          Country         
##  Length:270954      Min.   :-9600.00   Length:270954     
##  Class :character   1st Qu.:    1.00   Class :character  
##  Mode  :character   Median :    3.00   Mode  :character  
##                     Mean   :   10.71                     
##                     3rd Qu.:   10.00                     
##                     Max.   :80995.00                     
## 
## Warning: Removed 67621 rows containing non-finite values (`stat_bin()`).

## online_Subsample 5 summary statistics:
##  InvoiceDate          UnitPrice           CustomerID     StockCode        
##  Length:270954      Min.   :    0.000   Min.   :12346   Length:270954     
##  Class :character   1st Qu.:    1.250   1st Qu.:13955   Class :character  
##  Mode  :character   Median :    2.080   Median :15152   Mode  :character  
##                     Mean   :    4.743   Mean   :15286                     
##                     3rd Qu.:    4.130   3rd Qu.:16788                     
##                     Max.   :17836.460   Max.   :18287                     
##                                         NA's   :67384                     
##   InvoiceNo            Quantity           Country         
##  Length:270954      Min.   :-80995.00   Length:270954     
##  Class :character   1st Qu.:     1.00   Class :character  
##  Mode  :character   Median :     3.00   Mode  :character  
##                     Mean   :     8.94                     
##                     3rd Qu.:    10.00                     
##                     Max.   : 12540.00                     
## 
## Warning: Removed 67384 rows containing non-finite values (`stat_bin()`).

## online_Subsample 6 summary statistics:
##  InvoiceDate          UnitPrice          CustomerID     StockCode        
##  Length:270954      Min.   :    0.00   Min.   :12346   Length:270954     
##  Class :character   1st Qu.:    1.25   1st Qu.:13946   Class :character  
##  Mode  :character   Median :    2.08   Median :15152   Mode  :character  
##                     Mean   :    4.70   Mean   :15288                     
##                     3rd Qu.:    4.13   3rd Qu.:16794                     
##                     Max.   :38970.00   Max.   :18287                     
##                                        NA's   :67796                     
##   InvoiceNo            Quantity           Country         
##  Length:270954      Min.   :-74215.00   Length:270954     
##  Class :character   1st Qu.:     1.00   Class :character  
##  Mode  :character   Median :     3.00   Mode  :character  
##                     Mean   :     9.85                     
##                     3rd Qu.:    10.00                     
##                     Max.   : 74215.00                     
## 
## Warning: Removed 67796 rows containing non-finite values (`stat_bin()`).

## online_Subsample 7 summary statistics:
##  InvoiceDate          UnitPrice           CustomerID     StockCode        
##  Length:270954      Min.   :-11062.06   Min.   :12346   Length:270954     
##  Class :character   1st Qu.:     1.25   1st Qu.:13969   Class :character  
##  Mode  :character   Median :     2.08   Median :15159   Mode  :character  
##                     Mean   :     4.46   Mean   :15297                     
##                     3rd Qu.:     4.13   3rd Qu.:16801                     
##                     Max.   : 38970.00   Max.   :18287                     
##                                         NA's   :67365                     
##   InvoiceNo            Quantity          Country         
##  Length:270954      Min.   :-9600.00   Length:270954     
##  Class :character   1st Qu.:    1.00   Class :character  
##  Mode  :character   Median :    3.00   Mode  :character  
##                     Mean   :   10.03                     
##                     3rd Qu.:   10.00                     
##                     Max.   :74215.00                     
## 
## Warning: Removed 67365 rows containing non-finite values (`stat_bin()`).

### Summary: The histogram is drawn on CustomerID to give some insight into how frequent and how consistent the orders are. After taking several random samples from the data, I observed that customer orders are consistent across all the samples, with a few deviations.

Anomalies and consistency

# Mean of Quantity for each subsample (a list with one value per subsample).
means <- lapply(online_subsample_list, function(online_subsample) {
  mean(online_subsample$Quantity)
})
View(means)

# Standard deviation of Quantity for each subsample.
sds <- lapply(online_subsample_list, function(online_subsample) {
  sd(online_subsample$Quantity)
})
View(sds)

# Median of Quantity for each subsample.
# NOTE(review): this list shares its name with the stats function `median`.
# The call below is namespace-qualified so the function and the list cannot
# be confused; consider renaming the list once all of its users are known.
median <- lapply(online_subsample_list, function(online_subsample) {
  stats::median(online_subsample$Quantity)
})
View(median)
# Report the three statistics per subsample. Indexing over the subsample list
# (rather than `1:num`) keeps the loop within the bounds of the lists above.
for (i in seq_along(online_subsample_list)) {
  cat("online_Subsample", i, "Mean of Quantity", means[[i]], "\n")
  cat("online_Subsample", i, "SD of Quantity", sds[[i]], "\n")
  cat("online_Subsample", i, "Median of Quantity", median[[i]], "\n")
}
## online_Subsample 1 Mean of Quantity 10.24929 
## online_Subsample 1 SD of Quantity 297.9788 
## online_Subsample 1 Median of Quantity 3 
## online_Subsample 2 Mean of Quantity 9.445622 
## online_Subsample 2 SD of Quantity 165.2793 
## online_Subsample 2 Median of Quantity 3 
## online_Subsample 3 Mean of Quantity 10.16995 
## online_Subsample 3 SD of Quantity 209.4496 
## online_Subsample 3 Median of Quantity 3 
## online_Subsample 4 Mean of Quantity 10.70518 
## online_Subsample 4 SD of Quantity 308.9987 
## online_Subsample 4 Median of Quantity 3 
## online_Subsample 5 Mean of Quantity 8.93686 
## online_Subsample 5 SD of Quantity 219.7432 
## online_Subsample 5 Median of Quantity 3 
## online_Subsample 6 Mean of Quantity 9.850746 
## online_Subsample 6 SD of Quantity 210.9712 
## online_Subsample 6 Median of Quantity 3 
## online_Subsample 7 Mean of Quantity 10.02634 
## online_Subsample 7 SD of Quantity 207.8675 
## online_Subsample 7 Median of Quantity 3