Import the Discussion2Data.csv file
# Absolute, machine-specific project folder; the CSV is read from its
# "DataSets" sub-folder.
path <- "C:/Users/rajak/OneDrive/Desktop/Study/University/ALY6000 - Into to Analytics/Assignments/ALY6000 R Project"
data <- read.csv(file.path(path, "DataSets", "Discussion2Data.csv"))
Compute and display the descriptive statistics of the Sales variable
# Compute nine descriptive statistics for a numeric vector.
#
# Args:
#   x:     Numeric vector to summarise.
#   na.rm: If TRUE, missing values are dropped before every statistic is
#          computed. Defaults to FALSE, which keeps the previous behaviour
#          (quantile() stops with an error when x contains NA).
#
# Returns:
#   Numeric vector of length 9, in this order: mean, standard deviation,
#   median, minimum, maximum, range (max - min), and the 25th, 50th and 75th
#   percentiles. Only the three percentile elements carry names
#   ("25%", "50%", "75%").
mul_fun <- function(x, na.rm = FALSE) {
  lowest <- min(x, na.rm = na.rm)
  highest <- max(x, na.rm = na.rm)
  c(
    mean(x, na.rm = na.rm),
    sd(x, na.rm = na.rm),
    median(x, na.rm = na.rm),
    lowest,
    highest,
    highest - lowest,
    # One vectorised call returns all three percentiles.
    quantile(x, probs = c(0.25, 0.5, 0.75), na.rm = na.rm)
  )
}
# Names under which each statistic is stored and reported. The order must
# match the order of the values returned by mul_fun().
var_names <- c(
  "mean", "std_dev", "median", "min", "max", "range",
  "percentile_25", "percentile_50", "percentile_75"
)
var_name <- "Sales"
obs_var <- list(data[[var_name]])

# vapply() requires exactly one numeric value per name, so a change in the
# number of statistics fails loudly instead of silently misaligning labels.
stat_values <- vapply(obs_var, mul_fun, numeric(length(var_names)))

for (val in seq_along(var_names)) {
  stat_text <- format(round(stat_values[[val]], digits = 0), big.mark = ",")
  # Each formatted value is also stored as a top-level variable, because the
  # summary table built later in the report reads these variables by name.
  # NOTE(review): several of these names (mean, median, min, max, range) are
  # also base function names; function calls still resolve to the functions.
  assign(var_names[val], stat_text)
  print(paste("The value of ", var_names[val], "is ", stat_text))
}
## [1] "The value of mean is 1,711"
## [1] "The value of std_dev is 1,259"
## [1] "The value of median is 1,585"
## [1] "The value of min is 2"
## [1] "The value of max is 9,893"
## [1] "The value of range is 9,891"
## [1] "The value of percentile_25 is 827"
## [1] "The value of percentile_50 is 1,585"
## [1] "The value of percentile_75 is 2,478"
Display the descriptive statistics in a tabular format
library(knitr)

# One-row summary table: every column holds the formatted statistic stored in
# the top-level variable of the same name.
desc_stats <- data.frame(
  mean = mean,
  std_dev = std_dev,
  median = median,
  min = min,
  max = max,
  range = range,
  percentile_25 = percentile_25,
  percentile_50 = percentile_50,
  percentile_75 = percentile_75
)
kable(
  desc_stats,
  format = "simple",
  caption = "Descriptive Statistics Summary"
)
| mean | std_dev | median | min | max | range | percentile_25 | percentile_50 | percentile_75 |
|---|---|---|---|---|---|---|---|---|
| 1,711 | 1,259 | 1,585 | 2 | 9,893 | 9,891 | 827 | 1,585 | 2,478 |
Compare the distribution of the Sales variable across product categories using boxplots
library(ggplot2)
library(scales)

# Horizontal notched boxplots of Sales, one per product category, with the
# individual observations overlaid as faint jittered points.
# NOTE(review): the caption names "Discussion2Dataset.csv" while the import
# step reads "Discussion2Data.csv" - confirm which label is intended.
# NOTE(review): `width` jitters along the continuous Sales axis; the spread
# within each category comes from geom_jitter()'s default `height` - confirm
# this is the intended direction.
ggplot(data = data, mapping = aes(x = Sales, y = Product_Category)) +
  geom_boxplot(mapping = aes(fill = Product_Category), notch = TRUE) +
  scale_x_continuous(labels = comma) +
  # Low alpha keeps the boxes readable underneath the raw points.
  geom_jitter(alpha = 0.05, width = 0.1) +
  labs(
    title = "Sales Comparison across Categories",
    y = "Product Categories",
    x = "Sales",
    caption = "Source: Discussion2Dataset.csv"
  ) +
  theme(
    panel.background = element_rect(fill = "white", colour = "grey50"),
    # "none" is the documented value for hiding the legend; the fill colours
    # only repeat the categories already shown on the y axis.
    legend.position = "none"
  )
Note: Across the whole data set, all Sales values above ~USD 5k (beyond the upper whisker limit, Q3 + 1.5 × IQR) are outliers
# Sales values lying beyond the boxplot whiskers (boxplot.stats() default
# coef = 1.5), computed over all categories together.
boxplot.stats(data[["Sales"]])[["out"]]
## [1] 5175.17 5244.84 7958.58 5273.70 5667.87 5785.02 5049.00 5729.35 5301.24
## [10] 5211.12 5276.99 5451.30 6998.64 5211.12 5751.54 5486.67 5451.30 9892.74
## [19] 5725.35 5737.50 6439.80