Loading data

library(ggplot2)
library(vcd)
## Loading required package: grid
# Read the Superstore export into `df`.
df <- read.csv(
  "/Users/fahadmehfooz/Desktop/IUPUI/First Semester/Intro to Statistics/Intro to Stats Dataset/Dataset 1/Superstore.csv"
)

# Derive Cost for every row as Sales minus Profit (requested by the assignment).
df[["Cost"]] <- df[["Sales"]] - df[["Profit"]]

# Preview the first ten rows, including the new Cost column.
head(df, n = 10)
##    Row.ID       Order.ID Order.Date  Ship.Date      Ship.Mode Customer.ID
## 1       1 CA-2013-152156 09-11-2013 12-11-2013   Second Class    CG-12520
## 2       2 CA-2013-152156 09-11-2013 12-11-2013   Second Class    CG-12520
## 3       3 CA-2013-138688 13-06-2013 17-06-2013   Second Class    DV-13045
## 4       4 US-2012-108966 11-10-2012 18-10-2012 Standard Class    SO-20335
## 5       5 US-2012-108966 11-10-2012 18-10-2012 Standard Class    SO-20335
## 6       6 CA-2011-115812 09-06-2011 14-06-2011 Standard Class    BH-11710
## 7       7 CA-2011-115812 09-06-2011 14-06-2011 Standard Class    BH-11710
## 8       8 CA-2011-115812 09-06-2011 14-06-2011 Standard Class    BH-11710
## 9       9 CA-2011-115812 09-06-2011 14-06-2011 Standard Class    BH-11710
## 10     10 CA-2011-115812 09-06-2011 14-06-2011 Standard Class    BH-11710
##      Customer.Name   Segment       Country            City      State
## 1      Claire Gute  Consumer United States       Henderson   Kentucky
## 2      Claire Gute  Consumer United States       Henderson   Kentucky
## 3  Darrin Van Huff Corporate United States     Los Angeles California
## 4   Sean O'Donnell  Consumer United States Fort Lauderdale    Florida
## 5   Sean O'Donnell  Consumer United States Fort Lauderdale    Florida
## 6  Brosina Hoffman  Consumer United States     Los Angeles California
## 7  Brosina Hoffman  Consumer United States     Los Angeles California
## 8  Brosina Hoffman  Consumer United States     Los Angeles California
## 9  Brosina Hoffman  Consumer United States     Los Angeles California
## 10 Brosina Hoffman  Consumer United States     Los Angeles California
##    Postal.Code Region      Product.ID        Category Sub.Category
## 1        42420  South FUR-BO-10001798       Furniture    Bookcases
## 2        42420  South FUR-CH-10000454       Furniture       Chairs
## 3        90036   West OFF-LA-10000240 Office Supplies       Labels
## 4        33311  South FUR-TA-10000577       Furniture       Tables
## 5        33311  South OFF-ST-10000760 Office Supplies      Storage
## 6        90032   West FUR-FU-10001487       Furniture  Furnishings
## 7        90032   West OFF-AR-10002833 Office Supplies          Art
## 8        90032   West TEC-PH-10002275      Technology       Phones
## 9        90032   West OFF-BI-10003910 Office Supplies      Binders
## 10       90032   West OFF-AP-10002892 Office Supplies   Appliances
##                                                        Product.Name    Sales
## 1                                 Bush Somerset Collection Bookcase 261.9600
## 2       Hon Deluxe Fabric Upholstered Stacking Chairs, Rounded Back 731.9400
## 3         Self-Adhesive Address Labels for Typewriters by Universal  14.6200
## 4                     Bretford CR4500 Series Slim Rectangular Table 957.5775
## 5                                    Eldon Fold 'N Roll Cart System  22.3680
## 6  Eldon Expressions Wood and Plastic Desk Accessories, Cherry Wood  48.8600
## 7                                                        Newell 322   7.2800
## 8                                    Mitel 5320 IP Phone VoIP phone 907.1520
## 9              DXL Angle-View Binders with Locking Rings by Samsill  18.5040
## 10                                 Belkin F5C206VTEL 6 Outlet Surge 114.9000
##    Quantity Discount    Profit      Cost
## 1         2     0.00   41.9136  220.0464
## 2         3     0.00  219.5820  512.3580
## 3         2     0.00    6.8714    7.7486
## 4         5     0.45 -383.0310 1340.6085
## 5         2     0.20    2.5164   19.8516
## 6         7     0.00   14.1694   34.6906
## 7         4     0.00    1.9656    5.3144
## 8         6     0.20   90.7152  816.4368
## 9         3     0.20    5.7825   12.7215
## 10        5     0.00   34.4700   80.4300

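Assumption: Ship.Mode is the only ordered (ordinal) variable in this dataset. Let's assume the following ordering/weights for its classes:

Same Day - 4, First Class - 3, Second Class - 2, Standard Class - 1

Note: the code below converts Ship.Mode with as.factor(), which orders the levels alphabetically, so this assumed ordering is not encoded in the factor itself.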

# List the distinct shipping modes (kept as a one-column data frame).
unique(df[, "Ship.Mode", drop = FALSE])
##          Ship.Mode
## 1     Second Class
## 4   Standard Class
## 36     First Class
## 367       Same Day
# Three column subsets to analyse; every subset keeps Ship.Mode and Cost.
set_1 <- c("Discount", "Quantity", "Ship.Mode", "Cost")
set_2 <- c("Quantity", "Sales", "Ship.Mode", "Cost")
set_3 <- c("Discount", "Sales", "Ship.Mode", "Cost")

# One data frame per subset, holding only that subset's columns.
df_1 <- df[, set_1, drop = FALSE]
df_2 <- df[, set_2, drop = FALSE]
df_3 <- df[, set_3, drop = FALSE]

Important: “Ship.Mode” is used as the response variable for all three sample data frames.

Set 1 Analysis

# Ship.Mode is the grouping variable on the x-axis, so make sure it is a factor.
df_1$Ship.Mode <- as.factor(df_1$Ship.Mode)

# Draw one boxplot per remaining column of set_1, split by Ship.Mode.
# The y aesthetic goes through the `.data` pronoun rather than
# `df_1[[variable]]`, which ggplot2 warns against.
for (variable in setdiff(set_1, "Ship.Mode")) {
  plot_data <- ggplot(df_1, aes(x = Ship.Mode, y = .data[[variable]])) +
    geom_boxplot() +
    ggtitle(paste("Boxplot of", variable, "by Ship.Mode")) +
    xlab("Ship.Mode") +
    ylab(variable)

  # Plots built inside a loop are not auto-printed, so print explicitly.
  print(plot_data)
}
## Warning: Use of `df_1[[variable]]` is discouraged.
## ℹ Use `.data[[variable]]` instead.

## Warning: Use of `df_1[[variable]]` is discouraged.
## ℹ Use `.data[[variable]]` instead.

## Warning: Use of `df_1[[variable]]` is discouraged.
## ℹ Use `.data[[variable]]` instead.

> Outliers can be observed above the upper whisker for every ship mode.

# Scatter plots for every pair of numeric columns in set_1; non-numeric
# columns such as Ship.Mode are filtered out here.
# Columns are addressed through the `.data` pronoun because `aes_string()`
# has been deprecated since ggplot2 3.0.0.
numeric_vars <- set_1[vapply(df_1[set_1], is.numeric, logical(1))]

# combn() needs at least two candidates; with fewer there is nothing to plot.
if (length(numeric_vars) >= 2) {
  for (pair in combn(numeric_vars, 2, simplify = FALSE)) {
    x_var <- pair[1]
    y_var <- pair[2]

    # Skip pairs whose two columns hold exactly the same data.
    if (identical(df_1[[x_var]], df_1[[y_var]])) {
      next
    }

    plot_data <- ggplot(df_1, aes(x = .data[[x_var]], y = .data[[y_var]])) +
      geom_point() +
      ggtitle(paste("Scatter plot of", x_var, "vs", y_var)) +
      xlab(x_var) +
      ylab(y_var)

    print(plot_data)
  }
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

> Nothing stands out in the scatter plots; the points are mostly uniformly spread for all pairs of variables.

Correlation Matrix for set 1

# Pearson correlation between all numeric columns of set_1
# (every column except Ship.Mode).
cor_matrix <- cor(df_1[, setdiff(set_1, "Ship.Mode")], method = "pearson")

# Show the resulting matrix.
print(cor_matrix)
##            Discount   Quantity       Cost
## Discount 1.00000000 0.00862297 0.06144767
## Quantity 0.00862297 1.00000000 0.19901241
## Cost     0.06144767 0.19901241 1.00000000

The strongest relationship here is between Cost and Quantity (r ≈ 0.20), which makes sense. Even so, I would not call this a strong relationship.

Confidence intervals for set 1

# 95% two-sided, t-based confidence intervals for the mean of each column of
# set_1 other than Ship.Mode.
confidence_intervals <- vapply(
  df_1[setdiff(set_1, "Ship.Mode")],
  function(var) {
    # Count only observed values so n matches the NA-stripped mean and
    # standard deviation below.
    n <- sum(!is.na(var))
    mean_value <- mean(var, na.rm = TRUE)
    std_dev <- sd(var, na.rm = TRUE)

    # Half-width: t quantile leaving 2.5% in each tail, times the standard
    # error of the mean.
    margin_error <- qt(0.975, df = n - 1) * (std_dev / sqrt(n))
    c(mean_value - margin_error, mean_value + margin_error)
  },
  numeric(2)
)

# Row 1 holds the lower bounds, row 2 the upper bounds; one column per variable.
print(confidence_intervals)
##       Discount Quantity     Cost
## [1,] 0.1521546 3.745944 190.4003
## [2,] 0.1602508 3.833203 212.0019

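The first row gives the lower bound of each 95% confidence interval and the second row gives the upper bound.

Each interval is a range of plausible values for the population mean of that variable: if the sampling were repeated many times, about 95% of the intervals built this way would contain the true mean. A hypothesised mean outside the interval would be rejected by a two-sided test at the 5% level.

Taking Cost as the variable of interest here, we are 95% confident that the mean cost lies between about 190.40 and 212.00. The interval describes the mean, not individual observations, so individual orders can still fall outside this range.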

Set 2 Analysis

# Ship.Mode is the grouping variable on the x-axis, so make sure it is a factor.
df_2$Ship.Mode <- as.factor(df_2$Ship.Mode)

# Draw one boxplot per remaining column of set_2, split by Ship.Mode.
# The y aesthetic goes through the `.data` pronoun rather than
# `df_2[[variable]]`, which ggplot2 warns against.
for (variable in setdiff(set_2, "Ship.Mode")) {
  plot_data <- ggplot(df_2, aes(x = Ship.Mode, y = .data[[variable]])) +
    geom_boxplot() +
    ggtitle(paste("Boxplot of", variable, "by Ship.Mode")) +
    xlab("Ship.Mode") +
    ylab(variable)

  # Plots built inside a loop are not auto-printed, so print explicitly.
  print(plot_data)
}
## Warning: Use of `df_2[[variable]]` is discouraged.
## ℹ Use `.data[[variable]]` instead.

## Warning: Use of `df_2[[variable]]` is discouraged.
## ℹ Use `.data[[variable]]` instead.

## Warning: Use of `df_2[[variable]]` is discouraged.
## ℹ Use `.data[[variable]]` instead.

Standard Class has the largest number of outliers here.

# Scatter plots for every pair of numeric columns in set_2; non-numeric
# columns such as Ship.Mode are filtered out here.
# Columns are addressed through the `.data` pronoun because `aes_string()`
# has been deprecated since ggplot2 3.0.0.
numeric_vars <- set_2[vapply(df_2[set_2], is.numeric, logical(1))]

# combn() needs at least two candidates; with fewer there is nothing to plot.
if (length(numeric_vars) >= 2) {
  for (pair in combn(numeric_vars, 2, simplify = FALSE)) {
    x_var <- pair[1]
    y_var <- pair[2]

    # Skip pairs whose two columns hold exactly the same data. Both names come
    # from set_2; comparing against a set_1 name here would wrongly drop the
    # Quantity vs Sales plot.
    if (identical(df_2[[x_var]], df_2[[y_var]])) {
      next
    }

    plot_data <- ggplot(df_2, aes(x = .data[[x_var]], y = .data[[y_var]])) +
      geom_point() +
      ggtitle(paste("Scatter plot of", x_var, "vs", y_var)) +
      xlab(x_var) +
      ylab(y_var)

    print(plot_data)
  }
}

> Sales vs Cost shows an interesting pattern: a very large number of data points have Sales between 0 and 5000 and a Cost in a similar range. This suggests that most consumer purchases fall in this range.

Correlation Matrix for set 2

# Pearson correlation between all numeric columns of set_2
# (every column except Ship.Mode).
cor_matrix <- cor(df_2[, setdiff(set_2, "Ship.Mode")], method = "pearson")

# Show the resulting matrix.
print(cor_matrix)
##           Quantity     Sales      Cost
## Quantity 1.0000000 0.2007948 0.1990124
## Sales    0.2007948 1.0000000 0.9277104
## Cost     0.1990124 0.9277104 1.0000000

Cost and Sales have a very strong correlation (r ≈ 0.93), but that is expected: we created the Cost variable from Sales and Profit (Cost = Sales − Profit).

Confidence intervals for set 2

# 95% two-sided, t-based confidence intervals for the mean of each column of
# set_2 other than Ship.Mode.
confidence_intervals <- vapply(
  df_2[setdiff(set_2, "Ship.Mode")],
  function(var) {
    # Count only observed values so n matches the NA-stripped mean and
    # standard deviation below.
    n <- sum(!is.na(var))
    mean_value <- mean(var, na.rm = TRUE)
    std_dev <- sd(var, na.rm = TRUE)

    # Half-width: t quantile leaving 2.5% in each tail, times the standard
    # error of the mean.
    margin_error <- qt(0.975, df = n - 1) * (std_dev / sqrt(n))
    c(mean_value - margin_error, mean_value + margin_error)
  },
  numeric(2)
)

# Row 1 holds the lower bounds, row 2 the upper bounds; one column per variable.
print(confidence_intervals)
##      Quantity    Sales     Cost
## [1,] 3.745944 217.6375 190.4003
## [2,] 3.833203 242.0785 212.0019

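The first row gives the lower bound of each 95% confidence interval and the second row gives the upper bound.

Each interval is a range of plausible values for the population mean of that variable: if the sampling were repeated many times, about 95% of the intervals built this way would contain the true mean. A hypothesised mean outside the interval would be rejected by a two-sided test at the 5% level.

Taking Cost as the variable of interest here, we are 95% confident that the mean cost lies between about 190.40 and 212.00. The interval describes the mean, not individual observations, so individual orders can still fall outside this range.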

Set 3 Analysis

# Ship.Mode is the grouping variable on the x-axis, so make sure it is a factor.
df_3$Ship.Mode <- as.factor(df_3$Ship.Mode)

# Draw one boxplot per remaining column of set_3, split by Ship.Mode.
# The y aesthetic goes through the `.data` pronoun rather than
# `df_3[[variable]]`, which ggplot2 warns against.
for (variable in setdiff(set_3, "Ship.Mode")) {
  plot_data <- ggplot(df_3, aes(x = Ship.Mode, y = .data[[variable]])) +
    geom_boxplot() +
    ggtitle(paste("Boxplot of", variable, "by Ship.Mode")) +
    xlab("Ship.Mode") +
    ylab(variable)

  # Plots built inside a loop are not auto-printed, so print explicitly.
  print(plot_data)
}
## Warning: Use of `df_3[[variable]]` is discouraged.
## ℹ Use `.data[[variable]]` instead.

## Warning: Use of `df_3[[variable]]` is discouraged.
## ℹ Use `.data[[variable]]` instead.

## Warning: Use of `df_3[[variable]]` is discouraged.
## ℹ Use `.data[[variable]]` instead.

Similar trends can be observed here as well.

# Scatter plots for every pair of numeric columns in set_3; non-numeric
# columns such as Ship.Mode are filtered out here.
# Columns are addressed through the `.data` pronoun because `aes_string()`
# has been deprecated since ggplot2 3.0.0.
# The pairs are derived from set_3 alone, so the loop bounds no longer depend
# on the length of set_1.
numeric_vars <- set_3[vapply(df_3[set_3], is.numeric, logical(1))]

# combn() needs at least two candidates; with fewer there is nothing to plot.
if (length(numeric_vars) >= 2) {
  for (pair in combn(numeric_vars, 2, simplify = FALSE)) {
    x_var <- pair[1]
    y_var <- pair[2]

    # Skip pairs whose two columns hold exactly the same data.
    if (identical(df_3[[x_var]], df_3[[y_var]])) {
      next
    }

    plot_data <- ggplot(df_3, aes(x = .data[[x_var]], y = .data[[y_var]])) +
      geom_point() +
      ggtitle(paste("Scatter plot of", x_var, "vs", y_var)) +
      xlab(x_var) +
      ylab(y_var)

    print(plot_data)
  }
}

> We saw the same trends in set 2.

Correlation Matrix for set 3

# Pearson correlation between all numeric columns of set_3
# (every column except Ship.Mode).
cor_matrix <- cor(df_3[, setdiff(set_3, "Ship.Mode")], method = "pearson")

# Show the resulting matrix.
print(cor_matrix)
##             Discount       Sales       Cost
## Discount  1.00000000 -0.02819012 0.06144767
## Sales    -0.02819012  1.00000000 0.92771037
## Cost      0.06144767  0.92771037 1.00000000

We see a very weak negative correlation between Discount and Sales (r ≈ −0.03), i.e. essentially no linear relationship.

Confidence intervals for set 3

# 95% two-sided, t-based confidence intervals for the mean of each column of
# set_3 other than Ship.Mode.
confidence_intervals <- vapply(
  df_3[setdiff(set_3, "Ship.Mode")],
  function(var) {
    # Count only observed values so n matches the NA-stripped mean and
    # standard deviation below.
    n <- sum(!is.na(var))
    mean_value <- mean(var, na.rm = TRUE)
    std_dev <- sd(var, na.rm = TRUE)

    # Half-width: t quantile leaving 2.5% in each tail, times the standard
    # error of the mean.
    margin_error <- qt(0.975, df = n - 1) * (std_dev / sqrt(n))
    c(mean_value - margin_error, mean_value + margin_error)
  },
  numeric(2)
)

# Row 1 holds the lower bounds, row 2 the upper bounds; one column per variable.
print(confidence_intervals)
##       Discount    Sales     Cost
## [1,] 0.1521546 217.6375 190.4003
## [2,] 0.1602508 242.0785 212.0019

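The first row gives the lower bound of each 95% confidence interval and the second row gives the upper bound.

Each interval is a range of plausible values for the population mean of that variable: if the sampling were repeated many times, about 95% of the intervals built this way would contain the true mean. A hypothesised mean outside the interval would be rejected by a two-sided test at the 5% level.

Taking Cost as the variable of interest here, we are 95% confident that the mean cost lies between about 190.40 and 212.00. The interval describes the mean, not individual observations, so individual orders can still fall outside this range.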