Descriptive Statistics & Boxplot


Data Import

Import the Discussion2Data.csv file

# Root folder of the course project on the author's machine.
# NOTE(review): this absolute path is machine-specific; the script only runs
# as-is where this folder exists.
path <- "C:/Users/rajak/OneDrive/Desktop/Study/University/ALY6000 - Into to Analytics/Assignments/ALY6000 R Project"

# Build the CSV location with file.path() instead of pasting separators by
# hand, then load it into a data frame.
data <- read.csv(file.path(path, "DataSets", "Discussion2Data.csv"))


Descriptive statistics

Compute and display the descriptive statistics of the Sales variable

#' Compute a fixed set of descriptive statistics for a numeric vector
#'
#' @param x A numeric vector.
#' @param na.rm Should missing values be removed before computing each
#'   statistic? Defaults to `FALSE`, which reproduces the original behaviour
#'   (including the error `quantile()` raises when `x` contains `NA`).
#' @return A numeric vector of length 9, in this order: mean, standard
#'   deviation, median, minimum, maximum, range (max - min), and the 25th,
#'   50th and 75th percentiles. Only the three percentile entries carry names
#'   ("25%", "50%", "75%"), as produced by `quantile()`.
mul_fun <- function(x, na.rm = FALSE) {
  # range() yields c(min, max) in one pass; the spread is their difference.
  extremes <- range(x, na.rm = na.rm)
  c(
    mean(x, na.rm = na.rm),
    sd(x, na.rm = na.rm),
    median(x, na.rm = na.rm),
    extremes[1],
    extremes[2],
    extremes[2] - extremes[1],
    quantile(x, probs = c(0.25, 0.5, 0.75), na.rm = na.rm)
  )
}

# Labels for the statistics, in the same order mul_fun() returns them.
var_names <- c(
  "mean", "std_dev", "median", "min", "max", "range",
  "percentile_25", "percentile_50", "percentile_75"
)

var_name <- "Sales"
obs_var <- list(data[[var_name]])

# One column of statistics per element of obs_var; vapply() enforces that
# mul_fun() returns exactly one numeric value per label.
stat_values <- vapply(obs_var, mul_fun, numeric(length(var_names)))

for (idx in seq_along(var_names)) {
  stat_label <- var_names[idx]
  # Format each value on its own so format() does not pad the values to a
  # common width.
  stat_text <- format(round(stat_values[[idx]], digits = 0), big.mark = ",")
  # The summary table further down reads these variables by name, so each
  # formatted value is still stored under its label. Some labels ("mean",
  # "min", "max", ...) coincide with base function names; calls such as
  # mean(x) keep working because R skips non-function bindings when it looks
  # up a function.
  assign(stat_label, stat_text)
  print(paste("The value of ", stat_label, "is ", stat_text))
}
## [1] "The value of  mean is  1,711"
## [1] "The value of  std_dev is  1,259"
## [1] "The value of  median is  1,585"
## [1] "The value of  min is  2"
## [1] "The value of  max is  9,893"
## [1] "The value of  range is  9,891"
## [1] "The value of  percentile_25 is  827"
## [1] "The value of  percentile_50 is  1,585"
## [1] "The value of  percentile_75 is  2,478"


Descriptive Statistics Table

Display the descriptive statistics in a tabular format

library(knitr)
# Gather the formatted statistics (character strings created by the loop
# above) into a one-row data frame, one column per statistic.
desc_stats <- data.frame(
  mean = mean,
  std_dev = std_dev,
  median = median,
  min = min,
  max = max,
  range = range,
  percentile_25 = percentile_25,
  percentile_50 = percentile_50,
  percentile_75 = percentile_75
)

# Render the summary as a plain-text table with a caption.
kable(
  desc_stats,
  caption = "Descriptive Statistics Summary",
  format = "simple"
)
Descriptive Statistics Summary
mean std_dev median min max range percentile_25 percentile_50 percentile_75
1,711 1,259 1,585 2 9,893 9,891 827 1,585 2,478


Boxplot of Sales across Product categories

Compare the distribution of the Sales variable across product categories using boxplots

library(ggplot2)
library(scales)
# Horizontal notched boxplots of Sales, one per product category, with the
# individual observations overlaid as faint jittered points.
ggplot(data = data) +
  geom_boxplot(
    mapping = aes(x = Sales, y = Product_Category, fill = Product_Category),
    notch = TRUE
  ) +
  # Thousands separators on the Sales axis.
  scale_x_continuous(labels = comma) +
  geom_jitter(
    mapping = aes(x = Sales, y = Product_Category),
    alpha = 0.05,
    width = 0.1
  ) +
  labs(
    title = "Sales Comparison across Categories",
    y = "Product Categories",
    x = "Sales",
    caption = "Source: Discussion2Dataset.csv"
  ) +
  theme(
    panel.background = element_rect(fill = "white", colour = "grey50"),
    # The fill legend only repeats the axis labels, so hide it. The documented
    # value is lowercase "none".
    legend.position = "none"
  )


Note: All Sales values above ~USD 5k are flagged as outliers by the boxplot rule (more than 1.5 × IQR above the upper quartile)

# List the Sales values that boxplot.stats() reports as lying beyond the
# whiskers (its `out` component).
boxplot.stats(data[["Sales"]])[["out"]]
##  [1] 5175.17 5244.84 7958.58 5273.70 5667.87 5785.02 5049.00 5729.35 5301.24
## [10] 5211.12 5276.99 5451.30 6998.64 5211.12 5751.54 5486.67 5451.30 9892.74
## [19] 5725.35 5737.50 6439.80