library("tidymodels")
library("tidyverse")
library("GGally")
library("flextable")
library(ggplot2)
library(dplyr)
library(caret)
library(skimr)
library("knitr")

Data Loading and Inspecting Data

# Read the tab-delimited file from the URL
customers <- read_csv("~/631/Ecommerce.csv")

head(customers,5)
## # A tibble: 5 × 8
##   Email     Address Avatar `Avg. Session Length` `Time on App` `Time on Website`
##   <chr>     <chr>   <chr>                  <dbl>         <dbl>             <dbl>
## 1 mstephen… "835 F… Violet                  34.5          12.7              39.6
## 2 hduke@ho… "4547 … DarkG…                  31.9          11.1              37.3
## 3 pallen@y… "24645… Bisque                  33.0          11.3              37.1
## 4 riverare… "1414 … Saddl…                  34.3          13.7              36.7
## 5 mstephen… "14023… Mediu…                  33.3          12.8              37.5
## # ℹ 2 more variables: `Length of Membership` <dbl>, `Yearly Amount Spent` <dbl>
glimpse(customers)
## Rows: 500
## Columns: 8
## $ Email                  <chr> "mstephenson@fernandez.com", "hduke@hotmail.com…
## $ Address                <chr> "835 Frank Tunnel\nWrightmouth, MI 82180-9605",…
## $ Avatar                 <chr> "Violet", "DarkGreen", "Bisque", "SaddleBrown",…
## $ `Avg. Session Length`  <dbl> 34.49727, 31.92627, 33.00091, 34.30556, 33.3306…
## $ `Time on App`          <dbl> 12.65565, 11.10946, 11.33028, 13.71751, 12.7951…
## $ `Time on Website`      <dbl> 39.57767, 37.26896, 37.11060, 36.72128, 37.5366…
## $ `Length of Membership` <dbl> 4.082621, 2.664034, 4.104543, 3.120179, 4.44630…
## $ `Yearly Amount Spent`  <dbl> 587.9511, 392.2049, 487.5475, 581.8523, 599.406…
skim(customers)
Data summary
Name customers
Number of rows 500
Number of columns 8
_______________________
Column type frequency:
character 3
numeric 5
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Email 0 1 13 39 0 500 0
Address 0 1 23 67 0 500 0
Avatar 0 1 3 20 0 138 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Avg. Session Length 0 1 33.05 0.99 29.53 32.34 33.08 33.71 36.14 ▁▃▇▆▁
Time on App 0 1 12.05 0.99 8.51 11.39 11.98 12.75 15.13 ▁▃▇▅▁
Time on Website 0 1 37.06 1.01 33.91 36.35 37.07 37.72 40.01 ▁▃▇▅▁
Length of Membership 0 1 3.53 1.00 0.27 2.93 3.53 4.13 6.92 ▁▃▇▃▁
Yearly Amount Spent 0 1 499.31 79.31 256.67 445.04 498.89 549.31 765.52 ▁▅▇▃▁

Data Preprocessing

# Create a subset of the 'customers' data frame without the specified columns
customers_subset <- customers %>%
  select(-Email, -Address, -Avatar)
customers_subset <- customers_subset %>%
  rename("Avg_Session_Length" = "Avg. Session Length",
         "Time_on_App"  = "Time on App",
        "Time_on_Website" = "Time on Website",
         "Length_of_Membership" = "Length of Membership",
        "Yearly_Amount_Spent" = "Yearly Amount Spent")
kable(head(customers_subset))
Avg_Session_Length Time_on_App Time_on_Website Length_of_Membership Yearly_Amount_Spent
34.49727 12.65565 39.57767 4.082621 587.9511
31.92627 11.10946 37.26896 2.664034 392.2049
33.00091 11.33028 37.11060 4.104543 487.5475
34.30556 13.71751 36.72128 3.120179 581.8523
33.33067 12.79519 37.53665 4.446308 599.4061
33.87104 12.02693 34.47688 5.493507 637.1024

Exploratory Data Analysis

A joint plot to compare Time on Website and Yearly Amount Spent

# Load the ggplot2 package


# Create a joint plot
ggplot(customers, aes(x = `Time on Website`, y = `Yearly Amount Spent`)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +  # Add a linear regression line
  labs(x = "Time on Website", y = "Yearly Amount Spent")  # Set axis labels

A joint plot to compare Time on App and Yearly Amount Spent

# Create a joint plot
ggplot(customers, aes(x = `Time on App`, y = `Yearly Amount Spent`)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +  # Add a linear regression line
  labs(x = "Time on App", y = "Yearly Amount Spent")  # Set axis labels

hexbin joint plot comparing Time on App and Length of Membership

# Create a hexbin joint plot
ggplot(customers, aes(x = `Time on App`, y = `Length of Membership`)) +
  geom_hex() +  # Use hexagonal binning
  labs(x = "Time on App", y = "Length of Membership")  # Set axis labels

I explore these types of relationships across the entire data set. I am going to use ggpairs to recreate the plot below

# Create a pairplot
ggpairs(customers_subset)

# Create a scatter plot with a linear regression line
ggplot(customers_subset, aes(x = `Length_of_Membership`, y = `Yearly_Amount_Spent`)) +
  geom_point() +  # Add points
  geom_smooth(method = "lm", se = FALSE) +  # Add a linear regression line
  labs(x = "Length of Membership", y = "Yearly Amount Spent")  # Set axis labels

X <- customers_subset[, c("Avg_Session_Length", "Time_on_App", "Time_on_Website", "Length_of_Membership")]
# Extract the "Yearly Amount Spent" column using the $ operator
y <- customers_subset$`Yearly_Amount_Spent`
# Set the random seed for reproducibility
set.seed(101)

# Split the data into training and testing sets
splits <- createDataPartition(y, p = 0.8, list = FALSE, times = 1)

# Use the same split indices for both X and y
X_train <- X[splits, ]
X_test <- X[-splits, ]
y_train <- y[splits]
y_test <- y[-splits]

# Print the dimensions of the training and testing sets
print(dim(X_train))
## [1] 400   4
print(dim(X_test))
## [1] 100   4
# Fit a linear regression model
lm_model <- lm(y_train ~ ., data = data.frame(y = y_train, X_train))

# Print the summary of the model
summary(lm_model)
## Warning in summary.lm(lm_model): essentially perfect fit: summary may be
## unreliable
## 
## Call:
## lm(formula = y_train ~ ., data = data.frame(y = y_train, X_train))
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.529e-13 -1.056e-15  5.750e-16  2.256e-15  1.731e-14 
## 
## Coefficients:
##                        Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)          -1.994e-14  5.031e-14 -3.960e-01    0.692    
## y                     1.000e+00  4.315e-17  2.318e+16   <2e-16 ***
## Avg_Session_Length   -1.694e-15  1.192e-15 -1.421e+00    0.156    
## Time_on_App          -2.258e-15  1.724e-15 -1.310e+00    0.191    
## Time_on_Website       1.039e-16  4.302e-16  2.420e-01    0.809    
## Length_of_Membership -4.076e-15  2.694e-15 -1.513e+00    0.131    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.75e-15 on 394 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 6.253e+33 on 5 and 394 DF,  p-value: < 2.2e-16
# Create a linear regression model
lm <- lm(y_train ~ ., data = data.frame(y = y_train, X_train))
lm
## 
## Call:
## lm(formula = y_train ~ ., data = data.frame(y = y_train, X_train))
## 
## Coefficients:
##          (Intercept)                     y    Avg_Session_Length  
##           -1.994e-14             1.000e+00            -1.694e-15  
##          Time_on_App       Time_on_Website  Length_of_Membership  
##           -2.258e-15             1.039e-16            -4.076e-15
# Fit a linear regression model
lm <- lm(y_train ~ ., data = data.frame(y = y_train, X_train))
# Fit a linear regression model using the entire dataset
lm_model_full <- lm(y ~ ., data = data.frame(y = y, X))

# Make predictions using the updated linear regression model
predictions <- predict(lm_model_full, newdata = X_test)

# Check the dimensions of the predictions
dim(predictions)
## NULL
# Evaluate model performance
mse <- mean((predictions - y_test)^2)
rmse <- sqrt(mse)
mae <- mean(abs(predictions - y_test))
rsquared <- cor(predictions, y_test)^2

# Print evaluation metrics
cat("Mean Squared Error (MSE):", mse, "\n")
## Mean Squared Error (MSE): 80.40227
cat("Root Mean Squared Error (RMSE):", rmse, "\n")
## Root Mean Squared Error (RMSE): 8.966731
cat("Mean Absolute Error (MAE):", mae, "\n")
## Mean Absolute Error (MAE): 7.059969
cat("R-squared:", rsquared, "\n")
## R-squared: 0.98954
# Visualize results
plot(predictions, y_test, main = "Predicted vs Actual", xlab = "Predicted", ylab = "Actual")
abline(0, 1, col = "red")

# Calculate residuals
residuals <- y_test - predictions

# Plot histogram of residuals
hist(residuals, breaks = 50, main = "Histogram of Residuals", xlab = "Residuals", ylab = "Frequency")

# Calculate residuals
residuals <- y_test - predictions

# Plot histogram of residuals
hist(residuals, breaks = 50, freq = FALSE, main = "Histogram of Residuals", xlab = "Residuals", ylab = "Density")

# Overlay density plot
lines(density(residuals), col = "blue", lwd = 2)

# Extract coefficients from the model summary
coefficients <- coef(summary(lm_model_full))[, 1]

# Remove the intercept coefficient
coefficients <- coefficients[-1]

# Create dataframe with coefficients
coefficients_df <- data.frame(Coefficient = coefficients)

# Print the dataframe
print(coefficients_df)
##                      Coefficient
## Avg_Session_Length    25.7342711
## Time_on_App           38.7091538
## Time_on_Website        0.4367388
## Length_of_Membership  61.5773238