library("tidymodels")
library("tidyverse")
library("GGally")
library("flextable")
library(ggplot2)
library(dplyr)
library(caret)
library(skimr)
library("knitr")Data Loading and Inspecting Data
# Read the tab-delimited file from the URL
customers <- read_csv("~/631/Ecommerce.csv")
head(customers,5)## # A tibble: 5 × 8
## Email Address Avatar `Avg. Session Length` `Time on App` `Time on Website`
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 mstephen… "835 F… Violet 34.5 12.7 39.6
## 2 hduke@ho… "4547 … DarkG… 31.9 11.1 37.3
## 3 pallen@y… "24645… Bisque 33.0 11.3 37.1
## 4 riverare… "1414 … Saddl… 34.3 13.7 36.7
## 5 mstephen… "14023… Mediu… 33.3 12.8 37.5
## # ℹ 2 more variables: `Length of Membership` <dbl>, `Yearly Amount Spent` <dbl>
## Rows: 500
## Columns: 8
## $ Email <chr> "mstephenson@fernandez.com", "hduke@hotmail.com…
## $ Address <chr> "835 Frank Tunnel\nWrightmouth, MI 82180-9605",…
## $ Avatar <chr> "Violet", "DarkGreen", "Bisque", "SaddleBrown",…
## $ `Avg. Session Length` <dbl> 34.49727, 31.92627, 33.00091, 34.30556, 33.3306…
## $ `Time on App` <dbl> 12.65565, 11.10946, 11.33028, 13.71751, 12.7951…
## $ `Time on Website` <dbl> 39.57767, 37.26896, 37.11060, 36.72128, 37.5366…
## $ `Length of Membership` <dbl> 4.082621, 2.664034, 4.104543, 3.120179, 4.44630…
## $ `Yearly Amount Spent` <dbl> 587.9511, 392.2049, 487.5475, 581.8523, 599.406…
| Name | customers |
| Number of rows | 500 |
| Number of columns | 8 |
| _______________________ | |
| Column type frequency: | |
| character | 3 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 13 | 39 | 0 | 500 | 0 | |
| Address | 0 | 1 | 23 | 67 | 0 | 500 | 0 |
| Avatar | 0 | 1 | 3 | 20 | 0 | 138 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Avg. Session Length | 0 | 1 | 33.05 | 0.99 | 29.53 | 32.34 | 33.08 | 33.71 | 36.14 | ▁▃▇▆▁ |
| Time on App | 0 | 1 | 12.05 | 0.99 | 8.51 | 11.39 | 11.98 | 12.75 | 15.13 | ▁▃▇▅▁ |
| Time on Website | 0 | 1 | 37.06 | 1.01 | 33.91 | 36.35 | 37.07 | 37.72 | 40.01 | ▁▃▇▅▁ |
| Length of Membership | 0 | 1 | 3.53 | 1.00 | 0.27 | 2.93 | 3.53 | 4.13 | 6.92 | ▁▃▇▃▁ |
| Yearly Amount Spent | 0 | 1 | 499.31 | 79.31 | 256.67 | 445.04 | 498.89 | 549.31 | 765.52 | ▁▅▇▃▁ |
Data Preprocessing
# Create a subset of the 'customers' data frame without the specified columns
customers_subset <- customers %>%
select(-Email, -Address, -Avatar)customers_subset <- customers_subset %>%
rename("Avg_Session_Length" = "Avg. Session Length",
"Time_on_App" = "Time on App",
"Time_on_Website" = "Time on Website",
"Length_of_Membership" = "Length of Membership",
"Yearly_Amount_Spent" = "Yearly Amount Spent")| Avg_Session_Length | Time_on_App | Time_on_Website | Length_of_Membership | Yearly_Amount_Spent |
|---|---|---|---|---|
| 34.49727 | 12.65565 | 39.57767 | 4.082621 | 587.9511 |
| 31.92627 | 11.10946 | 37.26896 | 2.664034 | 392.2049 |
| 33.00091 | 11.33028 | 37.11060 | 4.104543 | 487.5475 |
| 34.30556 | 13.71751 | 36.72128 | 3.120179 | 581.8523 |
| 33.33067 | 12.79519 | 37.53665 | 4.446308 | 599.4061 |
| 33.87104 | 12.02693 | 34.47688 | 5.493507 | 637.1024 |
Exploratory Data Analysis
A joint plot to compare Time on Website and Yearly Amount Spent
# Load the ggplot2 package
# Create a joint plot
ggplot(customers, aes(x = `Time on Website`, y = `Yearly Amount Spent`)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) + # Add a linear regression line
labs(x = "Time on Website", y = "Yearly Amount Spent") # Set axis labels
A joint plot to compare Time on App and Yearly Amount Spent
# Create a joint plot
ggplot(customers, aes(x = `Time on App`, y = `Yearly Amount Spent`)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) + # Add a linear regression line
labs(x = "Time on App", y = "Yearly Amount Spent") # Set axis labels
hexbin joint plot comparing Time on App and Length of
Membership
# Create a hexbin joint plot
ggplot(customers, aes(x = `Time on App`, y = `Length of Membership`)) +
geom_hex() + # Use hexagonal binning
labs(x = "Time on App", y = "Length of Membership") # Set axis labelsI explore these types of relationships across the entire data set. I am going to use ggpairs to recreate the plot below
# Create a scatter plot with a linear regression line
ggplot(customers_subset, aes(x = `Length_of_Membership`, y = `Yearly_Amount_Spent`)) +
geom_point() + # Add points
geom_smooth(method = "lm", se = FALSE) + # Add a linear regression line
labs(x = "Length of Membership", y = "Yearly Amount Spent") # Set axis labelsX <- customers_subset[, c("Avg_Session_Length", "Time_on_App", "Time_on_Website", "Length_of_Membership")]# Extract the "Yearly Amount Spent" column using the $ operator
y <- customers_subset$`Yearly_Amount_Spent`# Set the random seed for reproducibility
set.seed(101)
# Split the data into training and testing sets
splits <- createDataPartition(y, p = 0.8, list = FALSE, times = 1)
# Use the same split indices for both X and y
X_train <- X[splits, ]
X_test <- X[-splits, ]
y_train <- y[splits]
y_test <- y[-splits]
# Print the dimensions of the training and testing sets
print(dim(X_train))## [1] 400 4
## [1] 100 4
# Fit a linear regression model
lm_model <- lm(y_train ~ ., data = data.frame(y = y_train, X_train))
# Print the summary of the model
summary(lm_model)## Warning in summary.lm(lm_model): essentially perfect fit: summary may be
## unreliable
##
## Call:
## lm(formula = y_train ~ ., data = data.frame(y = y_train, X_train))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.529e-13 -1.056e-15 5.750e-16 2.256e-15 1.731e-14
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.994e-14 5.031e-14 -3.960e-01 0.692
## y 1.000e+00 4.315e-17 2.318e+16 <2e-16 ***
## Avg_Session_Length -1.694e-15 1.192e-15 -1.421e+00 0.156
## Time_on_App -2.258e-15 1.724e-15 -1.310e+00 0.191
## Time_on_Website 1.039e-16 4.302e-16 2.420e-01 0.809
## Length_of_Membership -4.076e-15 2.694e-15 -1.513e+00 0.131
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.75e-15 on 394 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 6.253e+33 on 5 and 394 DF, p-value: < 2.2e-16
# Create a linear regression model
lm <- lm(y_train ~ ., data = data.frame(y = y_train, X_train))
lm##
## Call:
## lm(formula = y_train ~ ., data = data.frame(y = y_train, X_train))
##
## Coefficients:
## (Intercept) y Avg_Session_Length
## -1.994e-14 1.000e+00 -1.694e-15
## Time_on_App Time_on_Website Length_of_Membership
## -2.258e-15 1.039e-16 -4.076e-15
# Fit a linear regression model using the entire dataset
lm_model_full <- lm(y ~ ., data = data.frame(y = y, X))
# Make predictions using the updated linear regression model
predictions <- predict(lm_model_full, newdata = X_test)
# Check the dimensions of the predictions
dim(predictions)## NULL
# Evaluate model performance
mse <- mean((predictions - y_test)^2)
rmse <- sqrt(mse)
mae <- mean(abs(predictions - y_test))
rsquared <- cor(predictions, y_test)^2
# Print evaluation metrics
cat("Mean Squared Error (MSE):", mse, "\n")## Mean Squared Error (MSE): 80.40227
## Root Mean Squared Error (RMSE): 8.966731
## Mean Absolute Error (MAE): 7.059969
## R-squared: 0.98954
# Visualize results
plot(predictions, y_test, main = "Predicted vs Actual", xlab = "Predicted", ylab = "Actual")
abline(0, 1, col = "red")# Calculate residuals
residuals <- y_test - predictions
# Plot histogram of residuals
hist(residuals, breaks = 50, main = "Histogram of Residuals", xlab = "Residuals", ylab = "Frequency")# Calculate residuals
residuals <- y_test - predictions
# Plot histogram of residuals
hist(residuals, breaks = 50, freq = FALSE, main = "Histogram of Residuals", xlab = "Residuals", ylab = "Density")
# Overlay density plot
lines(density(residuals), col = "blue", lwd = 2)# Extract coefficients from the model summary
coefficients <- coef(summary(lm_model_full))[, 1]
# Remove the intercept coefficient
coefficients <- coefficients[-1]
# Create dataframe with coefficients
coefficients_df <- data.frame(Coefficient = coefficients)
# Print the dataframe
print(coefficients_df)## Coefficient
## Avg_Session_Length 25.7342711
## Time_on_App 38.7091538
## Time_on_Website 0.4367388
## Length_of_Membership 61.5773238