library(ggplot2)
library(vcd)
## Loading required package: grid
# Read the Superstore extract into a data frame.
df <- read.csv(
  file = '/Users/fahadmehfooz/Desktop/IUPUI/First Semester/Intro to Statistics/Intro to Stats Dataset/Dataset 1/Superstore.csv'
)
# Derive the Cost column requested in the question: Cost = Sales - Profit.
df[["Cost"]] <- df[["Sales"]] - df[["Profit"]]
# Preview the first ten rows, including the new Cost column.
head(df, n = 10)
## Row.ID Order.ID Order.Date Ship.Date Ship.Mode Customer.ID
## 1 1 CA-2013-152156 09-11-2013 12-11-2013 Second Class CG-12520
## 2 2 CA-2013-152156 09-11-2013 12-11-2013 Second Class CG-12520
## 3 3 CA-2013-138688 13-06-2013 17-06-2013 Second Class DV-13045
## 4 4 US-2012-108966 11-10-2012 18-10-2012 Standard Class SO-20335
## 5 5 US-2012-108966 11-10-2012 18-10-2012 Standard Class SO-20335
## 6 6 CA-2011-115812 09-06-2011 14-06-2011 Standard Class BH-11710
## 7 7 CA-2011-115812 09-06-2011 14-06-2011 Standard Class BH-11710
## 8 8 CA-2011-115812 09-06-2011 14-06-2011 Standard Class BH-11710
## 9 9 CA-2011-115812 09-06-2011 14-06-2011 Standard Class BH-11710
## 10 10 CA-2011-115812 09-06-2011 14-06-2011 Standard Class BH-11710
## Customer.Name Segment Country City State
## 1 Claire Gute Consumer United States Henderson Kentucky
## 2 Claire Gute Consumer United States Henderson Kentucky
## 3 Darrin Van Huff Corporate United States Los Angeles California
## 4 Sean O'Donnell Consumer United States Fort Lauderdale Florida
## 5 Sean O'Donnell Consumer United States Fort Lauderdale Florida
## 6 Brosina Hoffman Consumer United States Los Angeles California
## 7 Brosina Hoffman Consumer United States Los Angeles California
## 8 Brosina Hoffman Consumer United States Los Angeles California
## 9 Brosina Hoffman Consumer United States Los Angeles California
## 10 Brosina Hoffman Consumer United States Los Angeles California
## Postal.Code Region Product.ID Category Sub.Category
## 1 42420 South FUR-BO-10001798 Furniture Bookcases
## 2 42420 South FUR-CH-10000454 Furniture Chairs
## 3 90036 West OFF-LA-10000240 Office Supplies Labels
## 4 33311 South FUR-TA-10000577 Furniture Tables
## 5 33311 South OFF-ST-10000760 Office Supplies Storage
## 6 90032 West FUR-FU-10001487 Furniture Furnishings
## 7 90032 West OFF-AR-10002833 Office Supplies Art
## 8 90032 West TEC-PH-10002275 Technology Phones
## 9 90032 West OFF-BI-10003910 Office Supplies Binders
## 10 90032 West OFF-AP-10002892 Office Supplies Appliances
## Product.Name Sales
## 1 Bush Somerset Collection Bookcase 261.9600
## 2 Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back 731.9400
## 3 Self-Adhesive Address Labels for Typewriters by Universal 14.6200
## 4 Bretford CR4500 Series Slim Rectangular Table 957.5775
## 5 Eldon Fold 'N Roll Cart System 22.3680
## 6 Eldon Expressions Wood and Plastic Desk Accessories, Cherry Wood 48.8600
## 7 Newell 322 7.2800
## 8 Mitel 5320 IP Phone VoIP phone 907.1520
## 9 DXL Angle-View Binders with Locking Rings by Samsill 18.5040
## 10 Belkin F5C206VTEL 6 Outlet Surge 114.9000
## Quantity Discount Profit Cost
## 1 2 0.00 41.9136 220.0464
## 2 3 0.00 219.5820 512.3580
## 3 2 0.00 6.8714 7.7486
## 4 5 0.45 -383.0310 1340.6085
## 5 2 0.20 2.5164 19.8516
## 6 7 0.00 14.1694 34.6906
## 7 4 0.00 1.9656 5.3144
## 8 6 0.20 90.7152 816.4368
## 9 3 0.20 5.7825 12.7215
## 10 5 0.00 34.4700 80.4300
Ship.Mode coding: Same Day = 4, First Class = 3, Second Class = 2, Standard Class = 1.
# Show the distinct Ship.Mode values; indexing with df["..."] keeps a
# one-column data frame, so the row labels are the first row of each value.
unique(df["Ship.Mode"])
## Ship.Mode
## 1 Second Class
## 4 Standard Class
## 36 First Class
## 367 Same Day
# Three candidate variable subsets. Each one contains Ship.Mode and Cost
# plus two of Discount, Quantity and Sales.
set_1 <- c(
  "Discount", "Quantity", "Ship.Mode", "Cost"
)
set_2 <- c(
  "Quantity", "Sales", "Ship.Mode", "Cost"
)
set_3 <- c(
  "Discount", "Sales", "Ship.Mode", "Cost"
)
# One working data frame per subset, with columns in the order listed above.
df_1 <- df[, set_1]
df_2 <- df[, set_2]
df_3 <- df[, set_3]
Important: Ship.Mode is set as the response variable for all three sample data frames.
# Treat Ship.Mode as categorical so each shipping mode gets its own box.
df_1$Ship.Mode <- as.factor(df_1$Ship.Mode)
# Draw one boxplot per remaining variable in set_1, grouped by Ship.Mode.
for (variable in setdiff(set_1, "Ship.Mode")) {
  # .data[[variable]] looks the column up by name inside the plot's data.
  # It replaces df_1[[variable]] inside aes(), which ggplot2 warns against.
  plot_data <- ggplot(df_1, aes(x = Ship.Mode, y = .data[[variable]])) +
    geom_boxplot() +
    ggtitle(paste("Boxplot of", variable, "by Ship.Mode")) +
    xlab("Ship.Mode") +
    ylab(variable)
  # Plots created inside a for loop are not auto-printed, so print explicitly.
  print(plot_data)
}
## Warning: Use of `df_1[[variable]]` is discouraged.
## ℹ Use `.data[[variable]]` instead.
## Warning: Use of `df_1[[variable]]` is discouraged.
## ℹ Use `.data[[variable]]` instead.
## Warning: Use of `df_1[[variable]]` is discouraged.
## ℹ Use `.data[[variable]]` instead.
> For all ship modes, some outliers can be observed above the upper
whisker.
# Scatter plots for every pair of distinct numeric columns in set_1.
# Ship.Mode drops out of the pairing because it is not numeric.
for (i in seq_len(length(set_1) - 1)) {
  for (j in (i + 1):length(set_1)) {
    x_var <- set_1[i]
    y_var <- set_1[j]
    if (is.numeric(df_1[[x_var]]) && is.numeric(df_1[[y_var]]) &&
          !identical(df_1[[x_var]], df_1[[y_var]])) {
      # .data[[name]] maps a column given its name as a string; it replaces
      # aes_string(), which is deprecated since ggplot2 3.0.0.
      plot_data <- ggplot(df_1, aes(x = .data[[x_var]], y = .data[[y_var]])) +
        geom_point() +
        ggtitle(paste("Scatter plot of", x_var, "vs", y_var)) +
        xlab(x_var) +
        ylab(y_var)
      print(plot_data)
    }
  }
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
> Nothing very different can be seen in the scatter plots; the pattern is
mostly uniform for all the variables.
# Pearson correlation matrix for the numeric columns of set_1
# (every column except the categorical Ship.Mode).
cor_matrix <- cor(df_1[setdiff(set_1, "Ship.Mode")])
# Show the matrix.
print(cor_matrix)
## Discount Quantity Cost
## Discount 1.00000000 0.00862297 0.06144767
## Quantity 0.00862297 1.00000000 0.19901241
## Cost 0.06144767 0.19901241 1.00000000
The strongest relationship here is between Cost and Quantity (r ≈ 0.20), which makes sense. However, I would not call this a strong relationship either.
# Two-sided 95% confidence interval for the mean of each numeric variable
# in set_1 (Ship.Mode is excluded because it is categorical).
# Row 1 of the result holds the lower bounds, row 2 the upper bounds.
confidence_intervals <- vapply(
  df_1[, set_1[set_1 != "Ship.Mode"]],
  function(var) {
    mean_value <- mean(var, na.rm = TRUE)
    std_dev <- sd(var, na.rm = TRUE)
    # Count only non-missing values so n matches what mean() and sd() used.
    n <- sum(!is.na(var))
    # t critical value (n - 1 degrees of freedom) times the standard error.
    margin_error <- qt(0.975, df = n - 1) * (std_dev / sqrt(n))
    c(mean_value - margin_error, mean_value + margin_error)
  },
  numeric(2)
)
print(confidence_intervals)
## Discount Quantity Cost
## [1,] 0.1521546 3.745944 190.4003
## [2,] 0.1602508 3.833203 212.0019
The first row gives the lower bound of the 95% confidence interval for each variable's mean, and the second row gives the upper bound.
A hypothesised population mean outside these bounds would be rejected by a two-sided test at the 5% significance level.
Here Cost is the response variable: a hypothesised mean Cost outside the interval 190.40 to 212.00 is not consistent with the sample mean.
# Treat Ship.Mode as categorical so each shipping mode gets its own box.
df_2$Ship.Mode <- as.factor(df_2$Ship.Mode)
# Draw one boxplot per remaining variable in set_2, grouped by Ship.Mode.
for (variable in setdiff(set_2, "Ship.Mode")) {
  # .data[[variable]] looks the column up by name inside the plot's data.
  # It replaces df_2[[variable]] inside aes(), which ggplot2 warns against.
  plot_data <- ggplot(df_2, aes(x = Ship.Mode, y = .data[[variable]])) +
    geom_boxplot() +
    ggtitle(paste("Boxplot of", variable, "by Ship.Mode")) +
    xlab("Ship.Mode") +
    ylab(variable)
  # Plots created inside a for loop are not auto-printed, so print explicitly.
  print(plot_data)
}
## Warning: Use of `df_2[[variable]]` is discouraged.
## ℹ Use `.data[[variable]]` instead.
## Warning: Use of `df_2[[variable]]` is discouraged.
## ℹ Use `.data[[variable]]` instead.
## Warning: Use of `df_2[[variable]]` is discouraged.
## ℹ Use `.data[[variable]]` instead.
Standard Class has the largest number of outliers here.
# Scatter plots for every pair of distinct numeric columns in set_2.
# Ship.Mode drops out of the pairing because it is not numeric.
for (i in seq_len(length(set_2) - 1)) {
  for (j in (i + 1):length(set_2)) {
    x_var <- set_2[i]
    y_var <- set_2[j]
    # Both names must come from set_2. Comparing against set_1[j] made the
    # Quantity/Sales pair look identical (Quantity vs Quantity), so that
    # plot was silently skipped.
    if (is.numeric(df_2[[x_var]]) && is.numeric(df_2[[y_var]]) &&
          !identical(df_2[[x_var]], df_2[[y_var]])) {
      # .data[[name]] maps a column given its name as a string; it replaces
      # aes_string(), which is deprecated since ggplot2 3.0.0.
      plot_data <- ggplot(df_2, aes(x = .data[[x_var]], y = .data[[y_var]])) +
        geom_point() +
        ggtitle(paste("Scatter plot of", x_var, "vs", y_var)) +
        xlab(x_var) +
        ylab(y_var)
      print(plot_data)
    }
  }
}
> The Sales vs. Cost plot shows an interesting pattern. For sales
between 0 and 5000, with a similar cost, we see a very large number of
data points. This suggests that the majority of consumer behaviour falls
in this range.
# Pearson correlation matrix for the numeric columns of set_2
# (every column except the categorical Ship.Mode).
cor_matrix <- cor(df_2[setdiff(set_2, "Ship.Mode")])
# Show the matrix.
print(cor_matrix)
## Quantity Sales Cost
## Quantity 1.0000000 0.2007948 0.1990124
## Sales 0.2007948 1.0000000 0.9277104
## Cost 0.1990124 0.9277104 1.0000000
Cost and Sales have a very strong correlation (r ≈ 0.93), but that is expected: the Cost variable was created from Sales and Profit.
# Two-sided 95% confidence interval for the mean of each numeric variable
# in set_2 (Ship.Mode is excluded because it is categorical).
# Row 1 of the result holds the lower bounds, row 2 the upper bounds.
confidence_intervals <- vapply(
  df_2[, set_2[set_2 != "Ship.Mode"]],
  function(var) {
    mean_value <- mean(var, na.rm = TRUE)
    std_dev <- sd(var, na.rm = TRUE)
    # Count only non-missing values so n matches what mean() and sd() used.
    n <- sum(!is.na(var))
    # t critical value (n - 1 degrees of freedom) times the standard error.
    margin_error <- qt(0.975, df = n - 1) * (std_dev / sqrt(n))
    c(mean_value - margin_error, mean_value + margin_error)
  },
  numeric(2)
)
print(confidence_intervals)
## Quantity Sales Cost
## [1,] 3.745944 217.6375 190.4003
## [2,] 3.833203 242.0785 212.0019
The first row gives the lower bound of the 95% confidence interval for each variable's mean, and the second row gives the upper bound.
A hypothesised population mean outside these bounds would be rejected by a two-sided test at the 5% significance level.
Here Cost is the response variable: a hypothesised mean Cost outside the interval 190.40 to 212.00 is not consistent with the sample mean.
# Treat Ship.Mode as categorical so each shipping mode gets its own box.
df_3$Ship.Mode <- as.factor(df_3$Ship.Mode)
# Draw one boxplot per remaining variable in set_3, grouped by Ship.Mode.
for (variable in setdiff(set_3, "Ship.Mode")) {
  # .data[[variable]] looks the column up by name inside the plot's data.
  # It replaces df_3[[variable]] inside aes(), which ggplot2 warns against.
  plot_data <- ggplot(df_3, aes(x = Ship.Mode, y = .data[[variable]])) +
    geom_boxplot() +
    ggtitle(paste("Boxplot of", variable, "by Ship.Mode")) +
    xlab("Ship.Mode") +
    ylab(variable)
  # Plots created inside a for loop are not auto-printed, so print explicitly.
  print(plot_data)
}
## Warning: Use of `df_3[[variable]]` is discouraged.
## ℹ Use `.data[[variable]]` instead.
## Warning: Use of `df_3[[variable]]` is discouraged.
## ℹ Use `.data[[variable]]` instead.
## Warning: Use of `df_3[[variable]]` is discouraged.
## ℹ Use `.data[[variable]]` instead.
Similar trends can be observed here as well.
# Scatter plots for every pair of distinct numeric columns in set_3.
# Ship.Mode drops out of the pairing because it is not numeric.
for (i in seq_len(length(set_3) - 1)) {
  # The inner bound uses set_3's own length; it previously borrowed
  # length(set_1), which only worked because both sets have four entries.
  for (j in (i + 1):length(set_3)) {
    x_var <- set_3[i]
    y_var <- set_3[j]
    if (is.numeric(df_3[[x_var]]) && is.numeric(df_3[[y_var]]) &&
          !identical(df_3[[x_var]], df_3[[y_var]])) {
      # .data[[name]] maps a column given its name as a string; it replaces
      # aes_string(), which is deprecated since ggplot2 3.0.0.
      plot_data <- ggplot(df_3, aes(x = .data[[x_var]], y = .data[[y_var]])) +
        geom_point() +
        ggtitle(paste("Scatter plot of", x_var, "vs", y_var)) +
        xlab(x_var) +
        ylab(y_var)
      print(plot_data)
    }
  }
}
> These are the same trends we saw for set 2.
# Pearson correlation matrix for the numeric columns of set_3
# (every column except the categorical Ship.Mode).
cor_matrix <- cor(df_3[setdiff(set_3, "Ship.Mode")])
# Show the matrix.
print(cor_matrix)
## Discount Sales Cost
## Discount 1.00000000 -0.02819012 0.06144767
## Sales -0.02819012 1.00000000 0.92771037
## Cost 0.06144767 0.92771037 1.00000000
We see a very slight negative correlation between Discount and Sales (r ≈ -0.03), which is too close to zero to indicate a meaningful linear relationship.
# Two-sided 95% confidence interval for the mean of each numeric variable
# in set_3 (Ship.Mode is excluded because it is categorical).
# Row 1 of the result holds the lower bounds, row 2 the upper bounds.
confidence_intervals <- vapply(
  df_3[, set_3[set_3 != "Ship.Mode"]],
  function(var) {
    mean_value <- mean(var, na.rm = TRUE)
    std_dev <- sd(var, na.rm = TRUE)
    # Count only non-missing values so n matches what mean() and sd() used.
    n <- sum(!is.na(var))
    # t critical value (n - 1 degrees of freedom) times the standard error.
    margin_error <- qt(0.975, df = n - 1) * (std_dev / sqrt(n))
    c(mean_value - margin_error, mean_value + margin_error)
  },
  numeric(2)
)
print(confidence_intervals)
## Discount Sales Cost
## [1,] 0.1521546 217.6375 190.4003
## [2,] 0.1602508 242.0785 212.0019
The first row gives the lower bound of the 95% confidence interval for each variable's mean, and the second row gives the upper bound.
A hypothesised population mean outside these bounds would be rejected by a two-sided test at the 5% significance level.
Here Cost is the response variable: a hypothesised mean Cost outside the interval 190.40 to 212.00 is not consistent with the sample mean.