This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Load the Online Retail data set into a data frame.
# NOTE(review): this is a machine-specific absolute path; the document will
# only knit on a machine where this exact file exists.
OnlineRetail <- read.csv(
  file = "C:/Users/laasy/Documents/Fall 2023/Intro to Statistics in R/Datasets for Final Project/OnlineRetail.csv"
)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
# Print column-wise summary statistics for the full data set.
OnlineRetail |>
  summary()
## InvoiceNo StockCode Description Quantity
## Length:541909 Length:541909 Length:541909 Min. :-80995.00
## Class :character Class :character Class :character 1st Qu.: 1.00
## Mode :character Mode :character Mode :character Median : 3.00
## Mean : 9.55
## 3rd Qu.: 10.00
## Max. : 80995.00
##
## InvoiceDate UnitPrice CustomerID Country
## Length:541909 Min. :-11062.06 Min. :12346 Length:541909
## Class :character 1st Qu.: 1.25 1st Qu.:13953 Class :character
## Mode :character Median : 2.08 Median :15152 Mode :character
## Mean : 4.61 Mean :15288
## 3rd Qu.: 4.13 3rd Qu.:16791
## Max. : 38970.00 Max. :18287
## NA's :135080
# Draw a random sample of the rows (fraction 0.5 of the data), with
# replacement.
online_sample_1 <- OnlineRetail |>
  sample_frac(0.5, replace = TRUE)

# View() opens a data viewer, which is only meaningful in an interactive
# session; skip it when the document is knitted or run with Rscript.
if (interactive()) {
  View(online_sample_1)
}

# Mean unit price in the sample (auto-printed).
online_sample_1 |>
  pluck("UnitPrice") |>
  mean()
## [1] 4.421394
# Largest Quantity value in the sample (auto-printed).
max(pluck(online_sample_1, "Quantity"))
## [1] 74215
# Number of subsamples to draw: a random integer between 5 and 10.
num <- sample(5:10, 1)

# Columns kept in every subsample, in a randomly shuffled order.
columns <- sample(c(
  "InvoiceNo", "StockCode", "InvoiceDate", "Quantity",
  "UnitPrice", "CustomerID", "Country"
))

# Loop invariants: total row count and the subsample size (about 50% of the
# data), computed once instead of on every iteration.
n_rows <- nrow(OnlineRetail)
s_size <- round(0.5 * n_rows)

# Preallocate the result list instead of growing it inside the loop.
online_subsample_list <- vector("list", num)
for (i in seq_len(num)) {
  # Randomly select row indices with replacement.
  s_index <- sample.int(n_rows, size = s_size, replace = TRUE)
  # Create the subsample data frame restricted to the chosen columns.
  online_subsample <- OnlineRetail[s_index, columns]
  # Store the subsample in the list.
  online_subsample_list[[i]] <- online_subsample
}

# View() opens a data viewer, which is only meaningful in an interactive
# session; skip it when the document is knitted or run with Rscript.
if (interactive()) {
  View(online_subsample_list)
}

# Dimensions (rows, columns) of each subsample, one matrix column per
# subsample. vapply() pins the result to a 2-row integer matrix.
vapply(online_subsample_list, dim, integer(2))
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 270954 270954 270954 270954 270954 270954 270954
## [2,] 7 7 7 7 7 7 7
# Build one knitr table per subsample from its summary() output.
summary_table <- lapply(online_subsample_list, function(online_subsample) {
  summary_df <- summary(online_subsample)
  knitr::kable(summary_df, caption = "summary statistics")
})

# Print a heading line followed by the table for every subsample. Iterating
# over the list itself keeps the loop correct even if the list length and
# `num` ever diverge.
for (i in seq_along(summary_table)) {
  cat("### Subsample", i, "summary statisics \n")
  print(summary_table[[i]])
}
## ### Subsample 1 summary statisics
##
##
## Table: summary statistics
##
## | |InvoiceDate | UnitPrice | CustomerID | StockCode | InvoiceNo | Quantity | Country |
## |:--|:----------------|:-----------------|:-------------|:----------------|:----------------|:-----------------|:----------------|
## | |Length:270954 |Min. :-11062.06 |Min. :12346 |Length:270954 |Length:270954 |Min. :-74215.00 |Length:270954 |
## | |Class :character |1st Qu.: 1.25 |1st Qu.:13939 |Class :character |Class :character |1st Qu.: 1.00 |Class :character |
## | |Mode :character |Median : 2.08 |Median :15146 |Mode :character |Mode :character |Median : 3.00 |Mode :character |
## | |NA |Mean : 4.62 |Mean :15284 |NA |NA |Mean : 10.25 |NA |
## | |NA |3rd Qu.: 4.13 |3rd Qu.:16791 |NA |NA |3rd Qu.: 10.00 |NA |
## | |NA |Max. : 38970.00 |Max. :18287 |NA |NA |Max. : 80995.00 |NA |
## | |NA |NA |NA's :67327 |NA |NA |NA |NA |
## ### Subsample 2 summary statisics
##
##
## Table: summary statistics
##
## | |InvoiceDate | UnitPrice | CustomerID | StockCode | InvoiceNo | Quantity | Country |
## |:--|:----------------|:-----------------|:-------------|:----------------|:----------------|:-----------------|:----------------|
## | |Length:270954 |Min. :-11062.06 |Min. :12347 |Length:270954 |Length:270954 |Min. :-80995.00 |Length:270954 |
## | |Class :character |1st Qu.: 1.25 |1st Qu.:13952 |Class :character |Class :character |1st Qu.: 1.00 |Class :character |
## | |Mode :character |Median : 2.08 |Median :15150 |Mode :character |Mode :character |Median : 3.00 |Mode :character |
## | |NA |Mean : 4.52 |Mean :15289 |NA |NA |Mean : 9.45 |NA |
## | |NA |3rd Qu.: 4.13 |3rd Qu.:16794 |NA |NA |3rd Qu.: 10.00 |NA |
## | |NA |Max. : 38970.00 |Max. :18287 |NA |NA |Max. : 12540.00 |NA |
## | |NA |NA |NA's :67765 |NA |NA |NA |NA |
## ### Subsample 3 summary statisics
##
##
## Table: summary statistics
##
## | |InvoiceDate | UnitPrice | CustomerID | StockCode | InvoiceNo | Quantity | Country |
## |:--|:----------------|:----------------|:-------------|:----------------|:----------------|:----------------|:----------------|
## | |Length:270954 |Min. : 0.00 |Min. :12346 |Length:270954 |Length:270954 |Min. :-9600.00 |Length:270954 |
## | |Class :character |1st Qu.: 1.25 |1st Qu.:13949 |Class :character |Class :character |1st Qu.: 1.00 |Class :character |
## | |Mode :character |Median : 2.08 |Median :15150 |Mode :character |Mode :character |Median : 3.00 |Mode :character |
## | |NA |Mean : 4.88 |Mean :15285 |NA |NA |Mean : 10.17 |NA |
## | |NA |3rd Qu.: 4.13 |3rd Qu.:16790 |NA |NA |3rd Qu.: 10.00 |NA |
## | |NA |Max. :38970.00 |Max. :18287 |NA |NA |Max. :74215.00 |NA |
## | |NA |NA |NA's :67490 |NA |NA |NA |NA |
## ### Subsample 4 summary statisics
##
##
## Table: summary statistics
##
## | |InvoiceDate | UnitPrice | CustomerID | StockCode | InvoiceNo | Quantity | Country |
## |:--|:----------------|:-----------------|:-------------|:----------------|:----------------|:----------------|:----------------|
## | |Length:270954 |Min. :-11062.06 |Min. :12346 |Length:270954 |Length:270954 |Min. :-9600.00 |Length:270954 |
## | |Class :character |1st Qu.: 1.25 |1st Qu.:13959 |Class :character |Class :character |1st Qu.: 1.00 |Class :character |
## | |Mode :character |Median : 2.08 |Median :15152 |Mode :character |Mode :character |Median : 3.00 |Mode :character |
## | |NA |Mean : 5.01 |Mean :15290 |NA |NA |Mean : 10.71 |NA |
## | |NA |3rd Qu.: 4.13 |3rd Qu.:16791 |NA |NA |3rd Qu.: 10.00 |NA |
## | |NA |Max. : 38970.00 |Max. :18287 |NA |NA |Max. :80995.00 |NA |
## | |NA |NA |NA's :67621 |NA |NA |NA |NA |
## ### Subsample 5 summary statisics
##
##
## Table: summary statistics
##
## | |InvoiceDate | UnitPrice | CustomerID | StockCode | InvoiceNo | Quantity | Country |
## |:--|:----------------|:-----------------|:-------------|:----------------|:----------------|:-----------------|:----------------|
## | |Length:270954 |Min. : 0.000 |Min. :12346 |Length:270954 |Length:270954 |Min. :-80995.00 |Length:270954 |
## | |Class :character |1st Qu.: 1.250 |1st Qu.:13955 |Class :character |Class :character |1st Qu.: 1.00 |Class :character |
## | |Mode :character |Median : 2.080 |Median :15152 |Mode :character |Mode :character |Median : 3.00 |Mode :character |
## | |NA |Mean : 4.743 |Mean :15286 |NA |NA |Mean : 8.94 |NA |
## | |NA |3rd Qu.: 4.130 |3rd Qu.:16788 |NA |NA |3rd Qu.: 10.00 |NA |
## | |NA |Max. :17836.460 |Max. :18287 |NA |NA |Max. : 12540.00 |NA |
## | |NA |NA |NA's :67384 |NA |NA |NA |NA |
## ### Subsample 6 summary statisics
##
##
## Table: summary statistics
##
## | |InvoiceDate | UnitPrice | CustomerID | StockCode | InvoiceNo | Quantity | Country |
## |:--|:----------------|:----------------|:-------------|:----------------|:----------------|:-----------------|:----------------|
## | |Length:270954 |Min. : 0.00 |Min. :12346 |Length:270954 |Length:270954 |Min. :-74215.00 |Length:270954 |
## | |Class :character |1st Qu.: 1.25 |1st Qu.:13946 |Class :character |Class :character |1st Qu.: 1.00 |Class :character |
## | |Mode :character |Median : 2.08 |Median :15152 |Mode :character |Mode :character |Median : 3.00 |Mode :character |
## | |NA |Mean : 4.70 |Mean :15288 |NA |NA |Mean : 9.85 |NA |
## | |NA |3rd Qu.: 4.13 |3rd Qu.:16794 |NA |NA |3rd Qu.: 10.00 |NA |
## | |NA |Max. :38970.00 |Max. :18287 |NA |NA |Max. : 74215.00 |NA |
## | |NA |NA |NA's :67796 |NA |NA |NA |NA |
## ### Subsample 7 summary statisics
##
##
## Table: summary statistics
##
## | |InvoiceDate | UnitPrice | CustomerID | StockCode | InvoiceNo | Quantity | Country |
## |:--|:----------------|:-----------------|:-------------|:----------------|:----------------|:----------------|:----------------|
## | |Length:270954 |Min. :-11062.06 |Min. :12346 |Length:270954 |Length:270954 |Min. :-9600.00 |Length:270954 |
## | |Class :character |1st Qu.: 1.25 |1st Qu.:13969 |Class :character |Class :character |1st Qu.: 1.00 |Class :character |
## | |Mode :character |Median : 2.08 |Median :15159 |Mode :character |Mode :character |Median : 3.00 |Mode :character |
## | |NA |Mean : 4.46 |Mean :15297 |NA |NA |Mean : 10.03 |NA |
## | |NA |3rd Qu.: 4.13 |3rd Qu.:16801 |NA |NA |3rd Qu.: 10.00 |NA |
## | |NA |Max. : 38970.00 |Max. :18287 |NA |NA |Max. :74215.00 |NA |
## | |NA |NA |NA's :67365 |NA |NA |NA |NA |
# Summary statistics for each subsample.
summary_stats <- lapply(online_subsample_list, summary)

# One CustomerID histogram per subsample. CustomerID contains missing values;
# na.rm = TRUE drops them explicitly instead of emitting a "removed rows"
# warning for every plot.
histograms <- lapply(online_subsample_list, function(online_subsample) {
  ggplot(online_subsample, aes(x = CustomerID)) +
    geom_histogram(
      binwidth = 1,
      fill = "pink",
      color = "maroon",
      na.rm = TRUE
    ) +
    labs(title = "Histogram for CustomerID", x = "Value", y = "Frequency")
})

# Display the summary and the histogram for every subsample.
for (i in seq_along(online_subsample_list)) {
  cat("online_Subsample", i, "summary statistics:\n")
  print(summary_stats[[i]])
  print(histograms[[i]])
}
## online_Subsample 1 summary statistics:
## InvoiceDate UnitPrice CustomerID StockCode
## Length:270954 Min. :-11062.06 Min. :12346 Length:270954
## Class :character 1st Qu.: 1.25 1st Qu.:13939 Class :character
## Mode :character Median : 2.08 Median :15146 Mode :character
## Mean : 4.62 Mean :15284
## 3rd Qu.: 4.13 3rd Qu.:16791
## Max. : 38970.00 Max. :18287
## NA's :67327
## InvoiceNo Quantity Country
## Length:270954 Min. :-74215.00 Length:270954
## Class :character 1st Qu.: 1.00 Class :character
## Mode :character Median : 3.00 Mode :character
## Mean : 10.25
## 3rd Qu.: 10.00
## Max. : 80995.00
##
## Warning: Removed 67327 rows containing non-finite values (`stat_bin()`).
## online_Subsample 2 summary statistics:
## InvoiceDate UnitPrice CustomerID StockCode
## Length:270954 Min. :-11062.06 Min. :12347 Length:270954
## Class :character 1st Qu.: 1.25 1st Qu.:13952 Class :character
## Mode :character Median : 2.08 Median :15150 Mode :character
## Mean : 4.52 Mean :15289
## 3rd Qu.: 4.13 3rd Qu.:16794
## Max. : 38970.00 Max. :18287
## NA's :67765
## InvoiceNo Quantity Country
## Length:270954 Min. :-80995.00 Length:270954
## Class :character 1st Qu.: 1.00 Class :character
## Mode :character Median : 3.00 Mode :character
## Mean : 9.45
## 3rd Qu.: 10.00
## Max. : 12540.00
##
## Warning: Removed 67765 rows containing non-finite values (`stat_bin()`).
## online_Subsample 3 summary statistics:
## InvoiceDate UnitPrice CustomerID StockCode
## Length:270954 Min. : 0.00 Min. :12346 Length:270954
## Class :character 1st Qu.: 1.25 1st Qu.:13949 Class :character
## Mode :character Median : 2.08 Median :15150 Mode :character
## Mean : 4.88 Mean :15285
## 3rd Qu.: 4.13 3rd Qu.:16790
## Max. :38970.00 Max. :18287
## NA's :67490
## InvoiceNo Quantity Country
## Length:270954 Min. :-9600.00 Length:270954
## Class :character 1st Qu.: 1.00 Class :character
## Mode :character Median : 3.00 Mode :character
## Mean : 10.17
## 3rd Qu.: 10.00
## Max. :74215.00
##
## Warning: Removed 67490 rows containing non-finite values (`stat_bin()`).
## online_Subsample 4 summary statistics:
## InvoiceDate UnitPrice CustomerID StockCode
## Length:270954 Min. :-11062.06 Min. :12346 Length:270954
## Class :character 1st Qu.: 1.25 1st Qu.:13959 Class :character
## Mode :character Median : 2.08 Median :15152 Mode :character
## Mean : 5.01 Mean :15290
## 3rd Qu.: 4.13 3rd Qu.:16791
## Max. : 38970.00 Max. :18287
## NA's :67621
## InvoiceNo Quantity Country
## Length:270954 Min. :-9600.00 Length:270954
## Class :character 1st Qu.: 1.00 Class :character
## Mode :character Median : 3.00 Mode :character
## Mean : 10.71
## 3rd Qu.: 10.00
## Max. :80995.00
##
## Warning: Removed 67621 rows containing non-finite values (`stat_bin()`).
## online_Subsample 5 summary statistics:
## InvoiceDate UnitPrice CustomerID StockCode
## Length:270954 Min. : 0.000 Min. :12346 Length:270954
## Class :character 1st Qu.: 1.250 1st Qu.:13955 Class :character
## Mode :character Median : 2.080 Median :15152 Mode :character
## Mean : 4.743 Mean :15286
## 3rd Qu.: 4.130 3rd Qu.:16788
## Max. :17836.460 Max. :18287
## NA's :67384
## InvoiceNo Quantity Country
## Length:270954 Min. :-80995.00 Length:270954
## Class :character 1st Qu.: 1.00 Class :character
## Mode :character Median : 3.00 Mode :character
## Mean : 8.94
## 3rd Qu.: 10.00
## Max. : 12540.00
##
## Warning: Removed 67384 rows containing non-finite values (`stat_bin()`).
## online_Subsample 6 summary statistics:
## InvoiceDate UnitPrice CustomerID StockCode
## Length:270954 Min. : 0.00 Min. :12346 Length:270954
## Class :character 1st Qu.: 1.25 1st Qu.:13946 Class :character
## Mode :character Median : 2.08 Median :15152 Mode :character
## Mean : 4.70 Mean :15288
## 3rd Qu.: 4.13 3rd Qu.:16794
## Max. :38970.00 Max. :18287
## NA's :67796
## InvoiceNo Quantity Country
## Length:270954 Min. :-74215.00 Length:270954
## Class :character 1st Qu.: 1.00 Class :character
## Mode :character Median : 3.00 Mode :character
## Mean : 9.85
## 3rd Qu.: 10.00
## Max. : 74215.00
##
## Warning: Removed 67796 rows containing non-finite values (`stat_bin()`).
## online_Subsample 7 summary statistics:
## InvoiceDate UnitPrice CustomerID StockCode
## Length:270954 Min. :-11062.06 Min. :12346 Length:270954
## Class :character 1st Qu.: 1.25 1st Qu.:13969 Class :character
## Mode :character Median : 2.08 Median :15159 Mode :character
## Mean : 4.46 Mean :15297
## 3rd Qu.: 4.13 3rd Qu.:16801
## Max. : 38970.00 Max. :18287
## NA's :67365
## InvoiceNo Quantity Country
## Length:270954 Min. :-9600.00 Length:270954
## Class :character 1st Qu.: 1.00 Class :character
## Mode :character Median : 3.00 Mode :character
## Mean : 10.03
## 3rd Qu.: 10.00
## Max. :74215.00
##
## Warning: Removed 67365 rows containing non-finite values (`stat_bin()`).
### Summary: The histogram is drawn over CustomerID to give some
insight into how frequently and how consistently customers place orders.
After taking several random samples from the data, I observed that
the customer orders are consistent across all the samples, with only a few
deviations.
# Mean, standard deviation and median of Quantity for each subsample.
# vapply() guarantees one numeric value per subsample.
means <- vapply(online_subsample_list, function(online_subsample) {
  mean(online_subsample$Quantity)
}, numeric(1))

sds <- vapply(online_subsample_list, function(online_subsample) {
  sd(online_subsample$Quantity)
}, numeric(1))

# Named `medians` (not `median`) so the result does not shadow the
# stats::median() function it is computed with.
medians <- vapply(online_subsample_list, function(online_subsample) {
  median(online_subsample$Quantity)
}, numeric(1))

# View() opens a data viewer, which is only meaningful in an interactive
# session; skip it when the document is knitted or run with Rscript.
if (interactive()) {
  View(means)
  View(sds)
  View(medians)
}

# Report the three statistics for every subsample.
for (i in seq_along(online_subsample_list)) {
  cat("online_Subsample", i, "Mean of Quantity", means[[i]], "\n")
  cat("online_Subsample", i, "SD of Quantity", sds[[i]], "\n")
  cat("online_Subsample", i, "Median of Quantity", medians[[i]], "\n")
}
## online_Subsample 1 Mean of Quantity 10.24929
## online_Subsample 1 SD of Quantity 297.9788
## online_Subsample 1 Median of Quantity 3
## online_Subsample 2 Mean of Quantity 9.445622
## online_Subsample 2 SD of Quantity 165.2793
## online_Subsample 2 Median of Quantity 3
## online_Subsample 3 Mean of Quantity 10.16995
## online_Subsample 3 SD of Quantity 209.4496
## online_Subsample 3 Median of Quantity 3
## online_Subsample 4 Mean of Quantity 10.70518
## online_Subsample 4 SD of Quantity 308.9987
## online_Subsample 4 Median of Quantity 3
## online_Subsample 5 Mean of Quantity 8.93686
## online_Subsample 5 SD of Quantity 219.7432
## online_Subsample 5 Median of Quantity 3
## online_Subsample 6 Mean of Quantity 9.850746
## online_Subsample 6 SD of Quantity 210.9712
## online_Subsample 6 Median of Quantity 3
## online_Subsample 7 Mean of Quantity 10.02634
## online_Subsample 7 SD of Quantity 207.8675
## online_Subsample 7 Median of Quantity 3