library("tidymodels")
library("tidyverse")
library("GGally")
library("flextable")
library(ggplot2)
library(dplyr)
library(caret)
library(skimr)
library("knitr")

Data Loading and Inspecting Data

# Read the tab-delimited file from the URL
customers <- read_csv("~/631/Ecommerce.csv")

head(customers,5)

## # A tibble: 5 × 8
##   Email     Address Avatar `Avg. Session Length` `Time on App` `Time on Website`
##   <chr>     <chr>   <chr>                  <dbl>         <dbl>             <dbl>
## 1 mstephen… "835 F… Violet                  34.5          12.7              39.6
## 2 hduke@ho… "4547 … DarkG…                  31.9          11.1              37.3
## 3 pallen@y… "24645… Bisque                  33.0          11.3              37.1
## 4 riverare… "1414 … Saddl…                  34.3          13.7              36.7
## 5 mstephen… "14023… Mediu…                  33.3          12.8              37.5
## # ℹ 2 more variables: `Length of Membership` <dbl>, `Yearly Amount Spent` <dbl>

glimpse(customers)

## Rows: 500
## Columns: 8
## $ Email                  <chr> "mstephenson@fernandez.com", "hduke@hotmail.com…
## $ Address                <chr> "835 Frank Tunnel\nWrightmouth, MI 82180-9605",…
## $ Avatar                 <chr> "Violet", "DarkGreen", "Bisque", "SaddleBrown",…
## $ `Avg. Session Length`  <dbl> 34.49727, 31.92627, 33.00091, 34.30556, 33.3306…
## $ `Time on App`          <dbl> 12.65565, 11.10946, 11.33028, 13.71751, 12.7951…
## $ `Time on Website`      <dbl> 39.57767, 37.26896, 37.11060, 36.72128, 37.5366…
## $ `Length of Membership` <dbl> 4.082621, 2.664034, 4.104543, 3.120179, 4.44630…
## $ `Yearly Amount Spent`  <dbl> 587.9511, 392.2049, 487.5475, 581.8523, 599.406…

skim(customers)

Data summary
Name	customers
Number of rows	500
Number of columns	8
_______________________
Column type frequency:
character	3
numeric	5
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
Email	1	13	39	500
Address	1	23	67	500
Avatar	1	3	20	138

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Avg. Session Length	1	33.05	0.99	29.53	32.34	33.08	33.71	36.14	▁▃▇▆▁
Time on App	1	12.05	0.99	8.51	11.39	11.98	12.75	15.13	▁▃▇▅▁
Time on Website	1	37.06	1.01	33.91	36.35	37.07	37.72	40.01	▁▃▇▅▁
Length of Membership	1	3.53	1.00	0.27	2.93	3.53	4.13	6.92	▁▃▇▃▁
Yearly Amount Spent	1	499.31	79.31	256.67	445.04	498.89	549.31	765.52	▁▅▇▃▁

Data Preprocessing

# Create a subset of the 'customers' data frame without the specified columns
customers_subset <- customers %>%
  select(-Email, -Address, -Avatar)

customers_subset <- customers_subset %>%
  rename("Avg_Session_Length" = "Avg. Session Length",
         "Time_on_App"  = "Time on App",
        "Time_on_Website" = "Time on Website",
         "Length_of_Membership" = "Length of Membership",
        "Yearly_Amount_Spent" = "Yearly Amount Spent")

kable(head(customers_subset))

Avg_Session_Length	Time_on_App	Time_on_Website	Length_of_Membership	Yearly_Amount_Spent
34.49727	12.65565	39.57767	4.082621	587.9511
31.92627	11.10946	37.26896	2.664034	392.2049
33.00091	11.33028	37.11060	4.104543	487.5475
34.30556	13.71751	36.72128	3.120179	581.8523
33.33067	12.79519	37.53665	4.446308	599.4061
33.87104	12.02693	34.47688	5.493507	637.1024

Exploratory Data Analysis

A joint plot to compare Time on Website and Yearly Amount Spent

# Load the ggplot2 package


# Create a joint plot
ggplot(customers, aes(x = `Time on Website`, y = `Yearly Amount Spent`)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +  # Add a linear regression line
  labs(x = "Time on Website", y = "Yearly Amount Spent")  # Set axis labels

A joint plot to compare Time on App and Yearly Amount Spent

# Create a joint plot
ggplot(customers, aes(x = `Time on App`, y = `Yearly Amount Spent`)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +  # Add a linear regression line
  labs(x = "Time on App", y = "Yearly Amount Spent")  # Set axis labels

hexbin joint plot comparing Time on App and Length of Membership

# Create a hexbin joint plot
ggplot(customers, aes(x = `Time on App`, y = `Length of Membership`)) +
  geom_hex() +  # Use hexagonal binning
  labs(x = "Time on App", y = "Length of Membership")  # Set axis labels

I explore these types of relationships across the entire data set. I am going to use ggpairs to recreate the plot below

# Create a pairplot
ggpairs(customers_subset)

# Create a scatter plot with a linear regression line
ggplot(customers_subset, aes(x = `Length_of_Membership`, y = `Yearly_Amount_Spent`)) +
  geom_point() +  # Add points
  geom_smooth(method = "lm", se = FALSE) +  # Add a linear regression line
  labs(x = "Length of Membership", y = "Yearly Amount Spent")  # Set axis labels

X <- customers_subset[, c("Avg_Session_Length", "Time_on_App", "Time_on_Website", "Length_of_Membership")]

# Extract the "Yearly Amount Spent" column using the $ operator
y <- customers_subset$`Yearly_Amount_Spent`

# Set the random seed for reproducibility
set.seed(101)

# Split the data into training and testing sets
splits <- createDataPartition(y, p = 0.8, list = FALSE, times = 1)

# Use the same split indices for both X and y
X_train <- X[splits, ]
X_test <- X[-splits, ]
y_train <- y[splits]
y_test <- y[-splits]

# Print the dimensions of the training and testing sets
print(dim(X_train))

## [1] 400   4

print(dim(X_test))

## [1] 100   4

# Fit a linear regression model
lm_model <- lm(y_train ~ ., data = data.frame(y = y_train, X_train))

# Print the summary of the model
summary(lm_model)

## Warning in summary.lm(lm_model): essentially perfect fit: summary may be
## unreliable

## 
## Call:
## lm(formula = y_train ~ ., data = data.frame(y = y_train, X_train))
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.529e-13 -1.056e-15  5.750e-16  2.256e-15  1.731e-14 
## 
## Coefficients:
##                        Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)          -1.994e-14  5.031e-14 -3.960e-01    0.692    
## y                     1.000e+00  4.315e-17  2.318e+16   <2e-16 ***
## Avg_Session_Length   -1.694e-15  1.192e-15 -1.421e+00    0.156    
## Time_on_App          -2.258e-15  1.724e-15 -1.310e+00    0.191    
## Time_on_Website       1.039e-16  4.302e-16  2.420e-01    0.809    
## Length_of_Membership -4.076e-15  2.694e-15 -1.513e+00    0.131    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.75e-15 on 394 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 6.253e+33 on 5 and 394 DF,  p-value: < 2.2e-16

# Create a linear regression model
lm <- lm(y_train ~ ., data = data.frame(y = y_train, X_train))
lm

## 
## Call:
## lm(formula = y_train ~ ., data = data.frame(y = y_train, X_train))
## 
## Coefficients:
##          (Intercept)                     y    Avg_Session_Length  
##           -1.994e-14             1.000e+00            -1.694e-15  
##          Time_on_App       Time_on_Website  Length_of_Membership  
##           -2.258e-15             1.039e-16            -4.076e-15

# Fit a linear regression model
lm <- lm(y_train ~ ., data = data.frame(y = y_train, X_train))

# Fit a linear regression model using the entire dataset
lm_model_full <- lm(y ~ ., data = data.frame(y = y, X))

# Make predictions using the updated linear regression model
predictions <- predict(lm_model_full, newdata = X_test)

# Check the dimensions of the predictions
dim(predictions)

## NULL

# Evaluate model performance
mse <- mean((predictions - y_test)^2)
rmse <- sqrt(mse)
mae <- mean(abs(predictions - y_test))
rsquared <- cor(predictions, y_test)^2

# Print evaluation metrics
cat("Mean Squared Error (MSE):", mse, "\n")

## Mean Squared Error (MSE): 80.40227

cat("Root Mean Squared Error (RMSE):", rmse, "\n")

## Root Mean Squared Error (RMSE): 8.966731

cat("Mean Absolute Error (MAE):", mae, "\n")

## Mean Absolute Error (MAE): 7.059969

cat("R-squared:", rsquared, "\n")

## R-squared: 0.98954

# Visualize results
plot(predictions, y_test, main = "Predicted vs Actual", xlab = "Predicted", ylab = "Actual")
abline(0, 1, col = "red")

# Calculate residuals
residuals <- y_test - predictions

# Plot histogram of residuals
hist(residuals, breaks = 50, main = "Histogram of Residuals", xlab = "Residuals", ylab = "Frequency")

# Calculate residuals
residuals <- y_test - predictions

# Plot histogram of residuals
hist(residuals, breaks = 50, freq = FALSE, main = "Histogram of Residuals", xlab = "Residuals", ylab = "Density")

# Overlay density plot
lines(density(residuals), col = "blue", lwd = 2)

# Extract coefficients from the model summary
coefficients <- coef(summary(lm_model_full))[, 1]

# Remove the intercept coefficient
coefficients <- coefficients[-1]

# Create dataframe with coefficients
coefficients_df <- data.frame(Coefficient = coefficients)

# Print the dataframe
print(coefficients_df)

##                      Coefficient
## Avg_Session_Length    25.7342711
## Time_on_App           38.7091538
## Time_on_Website        0.4367388
## Length_of_Membership  61.5773238

portofolio

Enockevans Kirui-G02504972

2024-04-18

Data Loading and Inspecting Data

Data Preprocessing

Exploratory Data Analysis