1.Lazada is a leading e-commerce platform in Southeast Asia, attracts global sellers and consumers through diverse promotional activities.
2.For merchants, understanding the effectiveness of these promotions is crucial for optimizing marketing strategies and enhancing platform competitiveness.
This analysis focuses on Lazada’s cross-border e-commerce promotions, aiming to assess factors such as discount intensity and brand influence on sales, providing data-driven insights and support for marketing decisions in cross-border e-commerce.
Here are the questions we going to address:
Predict product sales and analyze the impact of factors.( eg. discount rates, ratings, and the number of reviews).
Identify which categories of products are more likely to achieve high sales through promotions.
For those who are looking to enter the cross-border e-commerce market or optimize promotional strategies to enhance the effectiveness of promotions.
For companies who are aiming to refine their promotional policies and increase market share (e.g., Lazada、Shopee, Taobao etc.).
For those who are seeking to better predict demand fluctuations, optimize inventory management, and improve delivery efficiency.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readxl)
data <- read_excel("./wqd7004_dataset.xlsx")
head(data)
## # A tibble: 6 × 14
## itemId name brandName category originalPrice priceShow itemSoldCntShow
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 342230105 Origina… Xiaomi Cellula… 3590000 3466863 72
## 2 464752465 Xiaomi … Xiaomi Cellula… 5200000 2850000 5
## 3 709428868 Xiaomi … Xiaomi Cellula… 2690000 1757000 288
## 4 895822931 Xiaomi … Xiaomi Cellula… 2690000 1849000 52
## 5 1205582653 Phone X… Xiaomi Cellula… 4890000 4890000 10
## 6 1277481768 Xiaomi … Xiaomi Tablet 5090000 3999000 17
## # ℹ 7 more variables: discount <dbl>, ratingScore <dbl>, review <dbl>,
## # location <chr>, sellerName <chr>, sellerId <dbl>, brandId <dbl>
#The number of rows and columns
nrow(data)
## [1] 3586
ncol(data)
## [1] 14
dim(data)
## [1] 3586 14
names(data)
## [1] "itemId" "name" "brandName" "category"
## [5] "originalPrice" "priceShow" "itemSoldCntShow" "discount"
## [9] "ratingScore" "review" "location" "sellerName"
## [13] "sellerId" "brandId"
#Type of data
sapply(data, class)
## itemId name brandName category originalPrice
## "numeric" "character" "character" "character" "numeric"
## priceShow itemSoldCntShow discount ratingScore review
## "numeric" "numeric" "numeric" "numeric" "numeric"
## location sellerName sellerId brandId
## "character" "character" "numeric" "numeric"
#Data distribution
summary(data)
## itemId name brandName category
## Min. :2.542e+08 Length:3586 Length:3586 Length:3586
## 1st Qu.:2.273e+09 Class :character Class :character Class :character
## Median :2.427e+09 Mode :character Mode :character Mode :character
## Mean :2.377e+09
## 3rd Qu.:2.593e+09
## Max. :2.671e+09
## originalPrice priceShow itemSoldCntShow discount
## Min. : 1399 Min. : 1399 Min. : 0.0 Min. : 0.00
## 1st Qu.: 1200000 1st Qu.: 837328 1st Qu.: 0.0 1st Qu.:15.00
## Median : 2150299 Median : 1131232 Median : 5.0 Median :43.00
## Mean : 3834455 Mean : 2813634 Mean : 262.8 Mean :33.45
## 3rd Qu.: 3388000 3rd Qu.: 2596000 3rd Qu.: 57.0 3rd Qu.:50.00
## Max. :44490000 Max. :39890000 Max. :128000.0 Max. :72.00
## ratingScore review location sellerName
## Min. :0.000 Min. : 0.00 Length:3586 Length:3586
## 1st Qu.:0.000 1st Qu.: 0.00 Class :character Class :character
## Median :4.622 Median : 1.00 Mode :character Mode :character
## Mean :2.686 Mean : 26.46
## 3rd Qu.:5.000 3rd Qu.: 41.00
## Max. :5.000 Max. :2886.00
## sellerId brandId
## Min. :1.276e+04 Min. : 667
## 1st Qu.:2.002e+11 1st Qu.: 65074
## Median :2.003e+11 Median : 134666
## Mean :1.800e+11 Mean : 59893744
## 3rd Qu.:2.003e+11 3rd Qu.:127167201
## Max. :2.007e+11 Max. :127221667
missing_val <- colSums(is.na(data))
print(missing_val)
## itemId name brandName category originalPrice
## 0 0 0 0 0
## priceShow itemSoldCntShow discount ratingScore review
## 0 0 0 0 0
## location sellerName sellerId brandId
## 0 0 0 0
#check itemId
itemId_positive_int <- all(data$itemId > 0 & data$itemId %% 1 == 0, na.rm = TRUE)
itemId_unique <- length(data$itemId) == length(unique(data$itemId))
if (itemId_positive_int & itemId_unique) {
cat("itemId:All values are positive integers and unique.\n")
} else {
cat("itemId:The value of the existence of positive integer or duplicate values.\n")
}
## itemId:All values are positive integers and unique.
##check priceShow&originalPrice
priceShow_positive<-all(data$priceShow > 0,na.rm = TRUE)
originalPrice_positive<-all(data$originalPrice > 0,na.rm = TRUE)
if (priceShow_positive & originalPrice_positive) {
cat("priceShow: All values are positive.\n")
cat("originalPrice: All values are positive.\n")
} else {
cat("Non-positive values exist.")
}
## priceShow: All values are positive.
## originalPrice: All values are positive.
##check discount
if(all(data$discount>=0 & data$discount<100)){
cat("discount:All values are between 0 and 100.\n")
}else{
cat("discount:An invalid value exists.\n")
}
## discount:All values are between 0 and 100.
##check ratingScore
if(all(data$ratingScore>=0 & data$ratingScore<=5)){
cat("ratingScore:All values are between 0 and 5.\n")
}else{
cat("ratingScore:An invalid value exists.\n")
}
## ratingScore:All values are between 0 and 5.
##check Review and itemSoldCntShow
review<-all(data$review >= 0 & data$review %% 1 == 0,na.rm = TRUE)
itemSoldCntShow<-all(data$itemSoldCntShow >= 0 & data$itemSoldCntShow %% 1 == 0,na.rm = TRUE)
if (review & itemSoldCntShow) {
cat("review: All values are non-negative integers.\n")
cat("itemSoldCntShow: All values are non-negative integers.\n")
} else {
cat("An invalid value exists.")
}
## review: All values are non-negative integers.
## itemSoldCntShow: All values are non-negative integers.
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(rlang)
barplot <- function(data, column_name) {
ggplot(data, aes(x = !!sym(column_name))) + # 使用 !!sym() 代替 aes_string()
geom_bar(fill = "skyblue", color = "red") +
labs(title = paste(column_name), x = column_name, y = "Frequency") +
theme_minimal()
}
p1 <- barplot(data, "originalPrice")
p2 <- barplot(data, "priceShow")
p3 <- barplot(data, "itemSoldCntShow")
p4 <- barplot(data, "discount")
p5 <- barplot(data, "review")
p6 <- barplot(data, "ratingScore")
grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 3)
library(ggplot2)
library(reshape2)
library(readxl)
head(data)
## # A tibble: 6 × 14
## itemId name brandName category originalPrice priceShow itemSoldCntShow
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 342230105 Origina… Xiaomi Cellula… 3590000 3466863 72
## 2 464752465 Xiaomi … Xiaomi Cellula… 5200000 2850000 5
## 3 709428868 Xiaomi … Xiaomi Cellula… 2690000 1757000 288
## 4 895822931 Xiaomi … Xiaomi Cellula… 2690000 1849000 52
## 5 1205582653 Phone X… Xiaomi Cellula… 4890000 4890000 10
## 6 1277481768 Xiaomi … Xiaomi Tablet 5090000 3999000 17
## # ℹ 7 more variables: discount <dbl>, ratingScore <dbl>, review <dbl>,
## # location <chr>, sellerName <chr>, sellerId <dbl>, brandId <dbl>
numeric <- data[, c("itemId","sellerId","brandId","originalPrice","itemSoldCntShow", "priceShow","ratingScore", "review","discount")]
matrix <- cor(numeric, use = "complete.obs") # "complete.obs"
melted <- melt(matrix)
ggplot(melted, aes(Var1, Var2, fill = value)) +
geom_tile() +
geom_text(aes(label = sprintf("%.2f", value)), color = "black", size = 3) +
scale_fill_gradient2(low = "purple", high = "red", mid = "white", midpoint = 0, limit = c(-1, 1)) +
labs(title = "Heat map among variable", x = "variable", y = "variable", fill = "Correlation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Load necessary libraries
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:reshape2':
##
## smiths
library(knitr)
library(stringr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(caret) # For data set partitioning
## Loading required package: lattice
# Step 1: Clear column names
names(data) <- gsub(" ", "_", names(data)) # Replace spaces with underscores
# Step 2: Convert columns to appropriate data types
data$category <- as.factor(data$category)
data$originalPrice <- as.numeric(data$originalPrice)
data$priceShow <- as.numeric(data$priceShow)
data$itemSoldCntShow <- as.numeric(data$itemSoldCntShow)
data$discount <- as.numeric(data$discount)
data$ratingScore <- as.numeric(data$ratingScore)
data$review <- as.numeric(data$review)
# Step 3: Handle missing values
data <- data %>%
filter(!is.na(originalPrice) & !is.na(priceShow) &
!is.na(itemSoldCntShow) & !is.na(discount))
# Step 4: Handle outliers
## Function to remove outliers based on IQR
remove_outliers <- function(df, var) {
Q1 <- quantile(df[[var]], 0.25)
Q3 <- quantile(df[[var]], 0.75)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
df <- df %>% filter(df[[var]] >= lower_bound & df[[var]] <= upper_bound)
return(df)
}
## Loop through numeric columns and remove outliers
numeric_columns <- sapply(data, is.numeric)
for (col in names(data)[numeric_columns]) {
data <- remove_outliers(data, col)
}
# Step 5: Create new feature variable (Sales Score) with normalization and weights
data <- data %>%
mutate(
normalized_sales = (itemSoldCntShow - min(itemSoldCntShow)) /
(max(itemSoldCntShow) - min(itemSoldCntShow)),
normalized_rating = (ratingScore - min(ratingScore)) /
(max(ratingScore) - min(ratingScore)),
sales_score = (normalized_sales * 0.7) + (normalized_rating * 0.3) # Setting weights
)
# Step 6: Clean 'sellerName' and handle 'name' column
data <- data %>%
mutate(
sellerName = str_replace_all(sellerName, "[\u4e00-\u9fa5]", ""), # Remove Chinese characters
sellerName = str_trim(sellerName), # Trim whitespace
name = ifelse(nchar(name) > 10, paste0(substr(name, 1, 10), "..."), name) # Truncate 'name' column
)
# Step 7: Display cleaned data
kable(head(data), format = "html", caption = "Prepared Dataset") %>%
kable_styling("striped", full_width = F)
| itemId | name | brandName | category | originalPrice | priceShow | itemSoldCntShow | discount | ratingScore | review | location | sellerName | sellerId | brandId | normalized_sales | normalized_rating | sales_score |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2033044000 | Xiaomi MiM… | Xiaomi | Cellular phone | 1600000 | 930468 | 45 | 42 | 4.652 | 23 | Ho Chi Minh | TAY NGUYEN STORE 47 | 200169939195 | 4348 | 0.2571429 | 0.9304 | 0.45912 |
| 2033070723 | Xiaomi Mi8… | Xiaomi | Cellular phone | 3150000 | 1674657 | 30 | 47 | 5.000 | 20 | Ho Chi Minh | TAY NGUYEN STORE 47 | 200169939195 | 4348 | 0.1714286 | 1.0000 | 0.42000 |
| 2033123603 | Xiaomi MiM… | Xiaomi | Cellular phone | 1500000 | 930468 | 37 | 38 | 4.611 | 18 | Ho Chi Minh | TAY NGUYEN STORE 47 | 200169939195 | 4348 | 0.2114286 | 0.9222 | 0.42466 |
| 2033161146 | Xiaomi Mi8… | Xiaomi | Cellular phone | 4050000 | 2141288 | 28 | 47 | 5.000 | 17 | Ho Chi Minh | TAY NGUYEN STORE 47 | 200169939195 | 4348 | 0.1600000 | 1.0000 | 0.41200 |
| 2033266624 | Genuine Xi… | Xiaomi | Cellular phone | 1999000 | 1116748 | 82 | 44 | 4.815 | 65 | Ho Chi Minh | TAY NGUYEN STORE 47 | 200169939195 | 4348 | 0.4685714 | 0.9630 | 0.61690 |
| 2033406937 | Genuine Xi… | Xiaomi | Cellular phone | 2850000 | 1535878 | 28 | 46 | 5.000 | 16 | Ho Chi Minh | TAY NGUYEN STORE 47 | 200169939195 | 4348 | 0.1600000 | 1.0000 | 0.41200 |
# Step 8: Save the cleaned dataset
write.csv(data, "prepared_dataset.csv", row.names = FALSE)
# Split the dataset into training and testing sets
set.seed(123) # For reproducibility
train_index <- createDataPartition(data$sales_score, p = .8, # 80% As a training set
list = FALSE,
times = 1) # List is FALSE Return index
train_data <- data[train_index, ] # Training Data
test_data <- data[-train_index, ] # Test Data
# Display the first few rows of the prepared training data
kable(head(train_data), format = "html", caption = "Training Dataset") %>%
kable_styling("striped", full_width = F)
| itemId | name | brandName | category | originalPrice | priceShow | itemSoldCntShow | discount | ratingScore | review | location | sellerName | sellerId | brandId | normalized_sales | normalized_rating | sales_score |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2033044000 | Xiaomi MiM… | Xiaomi | Cellular phone | 1600000 | 930468 | 45 | 42 | 4.652 | 23 | Ho Chi Minh | TAY NGUYEN STORE 47 | 200169939195 | 4348 | 0.2571429 | 0.9304 | 0.45912 |
| 2033123603 | Xiaomi MiM… | Xiaomi | Cellular phone | 1500000 | 930468 | 37 | 38 | 4.611 | 18 | Ho Chi Minh | TAY NGUYEN STORE 47 | 200169939195 | 4348 | 0.2114286 | 0.9222 | 0.42466 |
| 2033266624 | Genuine Xi… | Xiaomi | Cellular phone | 1999000 | 1116748 | 82 | 44 | 4.815 | 65 | Ho Chi Minh | TAY NGUYEN STORE 47 | 200169939195 | 4348 | 0.4685714 | 0.9630 | 0.61690 |
| 2033406937 | Genuine Xi… | Xiaomi | Cellular phone | 2850000 | 1535878 | 28 | 46 | 5.000 | 16 | Ho Chi Minh | TAY NGUYEN STORE 47 | 200169939195 | 4348 | 0.1600000 | 1.0000 | 0.41200 |
| 2034907442 | Genuine Xi… | Xiaomi | Cellular phone | 1599000 | 930468 | 114 | 42 | 5.000 | 96 | Ho Chi Minh | TAY NGUYEN STORE 47 | 200169939195 | 4348 | 0.6514286 | 1.0000 | 0.75600 |
| 2035034952 | Genuine Xi… | Xiaomi | Cellular phone | 1350000 | 837328 | 38 | 38 | 4.684 | 19 | Ho Chi Minh | TAY NGUYEN STORE 47 | 200169939195 | 4348 | 0.2171429 | 0.9368 | 0.43304 |
# Display the first few rows of the prepared testing data
kable(head(test_data), format = "html", caption = "Testing Dataset") %>%
kable_styling("striped", full_width = F)
| itemId | name | brandName | category | originalPrice | priceShow | itemSoldCntShow | discount | ratingScore | review | location | sellerName | sellerId | brandId | normalized_sales | normalized_rating | sales_score |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2033070723 | Xiaomi Mi8… | Xiaomi | Cellular phone | 3150000 | 1674657 | 30 | 47 | 5.000 | 20 | Ho Chi Minh | TAY NGUYEN STORE 47 | 200169939195 | 4348 | 0.1714286 | 1.0000 | 0.42000 |
| 2033161146 | Xiaomi Mi8… | Xiaomi | Cellular phone | 4050000 | 2141288 | 28 | 47 | 5.000 | 17 | Ho Chi Minh | TAY NGUYEN STORE 47 | 200169939195 | 4348 | 0.1600000 | 1.0000 | 0.41200 |
| 2035335256 | Xiaomi Red… | Xiaomi | Cellular phone | 2400000 | 1208957 | 75 | 50 | 4.862 | 65 | Ho Chi Minh | TAY NGUYEN STORE 47 | 200169939195 | 4348 | 0.4285714 | 0.9724 | 0.59172 |
| 2035948704 | Xiaomi Mim… | Xiaomi | Cellular phone | 1100000 | 604478 | 34 | 45 | 4.545 | 22 | Ho Chi Minh | TAY NGUYEN STORE 47 | 200169939195 | 4348 | 0.1942857 | 0.9090 | 0.40870 |
| 2036324094 | Genuine Xi… | Xiaomi | Cellular phone | 1950000 | 929537 | 35 | 52 | 5.000 | 18 | Ho Chi Minh | TAY NGUYEN STORE 47 | 200169939195 | 4348 | 0.2000000 | 1.0000 | 0.44000 |
| 2036369720 | Xiaomi Red… | Xiaomi | Cellular phone | 668000 | 361573 | 141 | 46 | 4.956 | 114 | Ho Chi Minh | TAY NGUYEN STORE 47 | 200169939195 | 4348 | 0.8057143 | 0.9912 | 0.86136 |
# Load necessary libraries
library(readr)
library(ggplot2)
library(dplyr)
library(corrplot)
# 1. Load the dataset
data <- read_csv("prepared_dataset.csv")
# 2. Explore Key Features
## 2.1 Distribution of Sales Score
ggplot(data, aes(x = sales_score)) +
geom_histogram(bins = 30, fill = "blue", color = "black") +
labs(title = "Distribution of Sales Score", x = "Sales Score", y = "Frequency") +
theme_minimal() # Ensure using theme_minimal() here
## 2.2 Sales Score vs Discount
ggplot(data, aes(x = discount, y = sales_score)) +
geom_point(alpha = 0.5, color = "red") +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Sales Score vs Discount", x = "Discount (%)", y = "Sales Score") +
theme_minimal()
## 2.3 Sales by Category
ggplot(data, aes(x = category, y = itemSoldCntShow)) +
geom_bar(stat = "identity", fill = "purple", color = "black") +
labs(title = "Sales by Category", x = "Category", y = "Units Sold") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme_minimal()
## 2.4 Price vs Sales Relationship
ggplot(data, aes(x = originalPrice, y = itemSoldCntShow)) +
geom_point(alpha = 0.5, color = "green") +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Price vs Sales Relationship", x = "Original Price", y = "Units Sold") +
theme_minimal()
## 2.5 Discount vs Sales Relationship
ggplot(data, aes(x = discount, y = itemSoldCntShow)) +
geom_point(alpha = 0.5, color = "red") +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Discount vs Sales Relationship", x = "Discount (%)", y = "Units Sold") +
theme_minimal()
## 2.6 Correlation Analysis
# Select numeric columns
numeric_data <- data %>% select(where(is.numeric))
# Calculate the correlation matrix
correlation_matrix <- cor(numeric_data, use = "complete.obs")
# Set up the graphical parameters
par(mar = c(5, 4, 5, 1)) # Top margin increased further to create space for the title
# Create the correlation plot without title
corrplot(correlation_matrix, method = "circle", type = "upper",
tl.col = "black", tl.srt = 45,
addgrid.col = NA) # Optionally remove grid lines for clarity
# Add the title manually
title(main = "Correlation Matrix of Numeric Features", cex.main = 1.5, line = 3)
library(caret)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
set.seed(123)
# Training and Testing Sets
train_index <- createDataPartition(data$itemSoldCntShow, p = 0.8, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]
# Build a Linear Regression Model
#lm_model <- lm(Sales ~ Discount + Price, data = train_data)
lm_model <- lm(itemSoldCntShow ~ discount + priceShow+ originalPrice+ ratingScore+ review, data = train_data)
# Evaluation: Assessing Model Performance
summary(lm_model)
##
## Call:
## lm(formula = itemSoldCntShow ~ discount + priceShow + originalPrice +
## ratingScore + review, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.544 -12.069 -1.299 1.462 109.309
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.579e-01 1.323e+01 0.035 0.972
## discount 1.731e-01 2.883e-01 0.601 0.548
## priceShow 7.915e-06 1.529e-05 0.518 0.605
## originalPrice -7.947e-06 8.179e-06 -0.972 0.331
## ratingScore 3.336e+00 3.229e-01 10.330 <2e-16 ***
## review 1.062e+00 2.429e-02 43.716 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.94 on 1509 degrees of freedom
## Multiple R-squared: 0.8228, Adjusted R-squared: 0.8222
## F-statistic: 1401 on 5 and 1509 DF, p-value: < 2.2e-16
# Prediction on the Test Set
lm_predictions <- predict(lm_model, newdata = test_data)
# Calculate Mean Squared Error (MSE)
lm_mse <- mean((test_data$itemSoldCntShow - lm_predictions)^2)
cat("Regression Model MSE:", lm_mse, "\n")
## Regression Model MSE: 319.625
# Custom Classification Thresholds(eg.defining high sales as greater than the average sales)
threshold <- mean(data$itemSoldCntShow, na.rm = TRUE)
data$Sales_Class <- ifelse(data$itemSoldCntShow > threshold, "High", "Low")
data$Sales_Class <- as.factor(data$Sales_Class) # Convert to Categorical Variable
library(caret)
library(randomForest)
library(ggplot2)
library(lattice)
# Training and Testing Sets
set.seed(123)
train_index_class <- createDataPartition(data$Sales_Class, p = 0.8, list = FALSE)
train_data_class <- data[train_index_class, ]
test_data_class <- data[-train_index_class, ]
# Random Forest Classification Model
rf_model <- randomForest(Sales_Class ~ discount + priceShow +originalPrice +ratingScore+ review, data = train_data_class, ntree = 100)
# Evaluation: Checking Classification Accuracy
rf_predictions <- predict(rf_model, newdata = test_data_class)
confusion_matrix <- confusionMatrix(rf_predictions, test_data_class$Sales_Class)
# Output Model Evaluation Results
print(confusion_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Low
## High 179 6
## Low 4 189
##
## Accuracy : 0.9735
## 95% CI : (0.9519, 0.9872)
## No Information Rate : 0.5159
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9471
##
## Mcnemar's Test P-Value : 0.7518
##
## Sensitivity : 0.9781
## Specificity : 0.9692
## Pos Pred Value : 0.9676
## Neg Pred Value : 0.9793
## Prevalence : 0.4841
## Detection Rate : 0.4735
## Detection Prevalence : 0.4894
## Balanced Accuracy : 0.9737
##
## 'Positive' Class : High
##
# Optimize the Model
#Tune the Parameters of the Random Forest Using Grid Search
library(caret)
tune_grid <- expand.grid(.mtry = c(2, 3), .splitrule = "gini", .min.node.size = c(1, 5, 10))
control <- trainControl(method = "cv", number = 5)
rf_tuned <- train(Sales_Class ~ discount + priceShow +originalPrice +ratingScore+ review, data = train_data_class, method = "ranger",
trControl = control, tuneGrid = tune_grid) # Output Best Parameters
print(rf_tuned$bestTune)
## mtry splitrule min.node.size
## 3 2 gini 10
print(rf_tuned)
## Random Forest
##
## 1514 samples
## 5 predictor
## 2 classes: 'High', 'Low'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1211, 1211, 1212, 1210, 1212
## Resampling results across tuning parameters:
##
## mtry min.node.size Accuracy Kappa
## 2 1 0.9590538 0.9180311
## 2 5 0.9590538 0.9180247
## 2 10 0.9597161 0.9193568
## 3 1 0.9590647 0.9180498
## 3 5 0.9583981 0.9167223
## 3 10 0.9570802 0.9140662
##
## Tuning parameter 'splitrule' was held constant at a value of gini
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 2, splitrule = gini
## and min.node.size = 10.
# Evaluate the Tuned Model on the Test Set
rf_tuned_predictions <- predict(rf_tuned, newdata = test_data_class)
confusion_matrix_tuned <- confusionMatrix(rf_tuned_predictions, test_data_class$Sales_Class)
print(confusion_matrix_tuned)
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Low
## High 178 8
## Low 5 187
##
## Accuracy : 0.9656
## 95% CI : (0.9419, 0.9816)
## No Information Rate : 0.5159
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9312
##
## Mcnemar's Test P-Value : 0.5791
##
## Sensitivity : 0.9727
## Specificity : 0.9590
## Pos Pred Value : 0.9570
## Neg Pred Value : 0.9740
## Prevalence : 0.4841
## Detection Rate : 0.4709
## Detection Prevalence : 0.4921
## Balanced Accuracy : 0.9658
##
## 'Positive' Class : High
##
# Visualize Important Features
importance <- importance(rf_model)
varImpPlot(rf_model)
# Create evaluation metric visualizations
library(ggplot2)
# Sales prediction model evaluation
predictions_df <- data.frame(
Actual = test_data$itemSoldCntShow,
Predicted = lm_predictions
)
ggplot(predictions_df, aes(x = Actual, y = Predicted)) +
geom_point(alpha = 0.5) +
geom_abline(color = "red") +
labs(title = "Sales Prediction Model: Actual vs Predicted",
x = "Actual Sales",
y = "Predicted Sales") +
theme_minimal()
# Classification model evaluation visualization
plot_confusion_matrix <- function(cm) {
cm_d <- as.data.frame(cm$table)
ggplot(cm_d, aes(Prediction, Reference, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq)) +
scale_fill_gradient(low = "white", high = "steelblue") +
labs(title = "Confusion Matrix Heatmap") +
theme_minimal()
}
plot_confusion_matrix(confusion_matrix_tuned)
# Model performance metrics summary
model_metrics <- data.frame(
Metric = c("Regression Model MSE", "Classification Accuracy", "Classification Precision", "Classification Recall"),
Value = c(
lm_mse,
confusion_matrix_tuned$overall["Accuracy"],
confusion_matrix_tuned$byClass["Precision"],
confusion_matrix_tuned$byClass["Recall"]
)
)
kable(model_metrics, format = "html", caption = "Model Performance Metrics Summary") %>%
kable_styling("striped", full_width = F)
| Metric | Value | |
|---|---|---|
| Regression Model MSE | 319.6249906 | |
| Accuracy | Classification Accuracy | 0.9656085 |
| Precision | Classification Precision | 0.9569892 |
| Recall | Classification Recall | 0.9726776 |
# Create prediction function for deployment
predict_sales <- function(discount, price, original_price, rating, reviews) {
new_data <- data.frame(
discount = discount,
priceShow = price,
originalPrice = original_price,
ratingScore = rating,
review = reviews
)
predicted_sales <- predict(lm_model, newdata = new_data)
return(predicted_sales)
}
# Test prediction function
test_prediction <- predict_sales(
discount = 20,
price = 100,
original_price = 120,
rating = 4.5,
reviews = 100
)
cat("Test Prediction Result:", test_prediction)
## Test Prediction Result: 125.1366
library(shiny)
library(shinydashboard)
##
## Attaching package: 'shinydashboard'
## The following object is masked from 'package:graphics':
##
## box
# UI
ui <- dashboardPage(
dashboardHeader(title = "Lazada Sales Analysis Dashboard"),
dashboardSidebar(
sidebarMenu(
menuItem("Sales Prediction", tabName = "prediction"),
menuItem("Sales Classification", tabName = "classification")
)
),
dashboardBody(
tabItems(
tabItem(tabName = "prediction",
fluidRow(
box(
title = "Input Parameters",
sliderInput("discount", "Discount Rate (%)", 0, 100, 20),
numericInput("price", "Sale Price", 100),
numericInput("original_price", "Original Price", 120),
sliderInput("rating", "Rating", 0, 5, 4.5),
numericInput("reviews", "Number of Reviews", 100)
),
box(
title = "Prediction Results",
verbatimTextOutput("sales_prediction")
)
)
),
tabItem(tabName = "classification",
fluidRow(
box(plotOutput("confusion_matrix_plot")),
box(plotOutput("feature_importance_plot"))
)
)
)
)
)
# Server section
server <- function(input, output) {
output$sales_prediction <- renderText({
pred <- predict_sales(
input$discount,
input$price,
input$original_price,
input$rating,
input$reviews
)
paste("Predicted Sales:", round(pred, 2))
})
output$confusion_matrix_plot <- renderPlot({
plot_confusion_matrix(confusion_matrix_tuned)
})
output$feature_importance_plot <- renderPlot({
varImpPlot(rf_model)
})
}
# Run Shiny Apps
shinyApp(ui, server)
The regression analysis aimed to predict product sales and examine the impact of features such as discount intensity and price on sales. Key conclusions are as follows:
The classification analysis aimed to categorize products based on sales volume and examine the factors that make products more likely to achieve high sales. Key findings include: