HW04

I: Data pre-processing

data = read.csv("C:\\Users\\ADMIN\\OneDrive\\Attachments\\online_shoppers_intention.csv")

head(data)

##   Administrative Administrative_Duration Informational Informational_Duration
## 1              0                       0             0                      0
## 2              0                       0             0                      0
## 3              0                       0             0                      0
## 4              0                       0             0                      0
## 5              0                       0             0                      0
## 6              0                       0             0                      0
##   ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues
## 1              1                0.000000  0.20000000 0.2000000          0
## 2              2               64.000000  0.00000000 0.1000000          0
## 3              1                0.000000  0.20000000 0.2000000          0
## 4              2                2.666667  0.05000000 0.1400000          0
## 5             10              627.500000  0.02000000 0.0500000          0
## 6             19              154.216667  0.01578947 0.0245614          0
##   SpecialDay Month OperatingSystems Browser Region TrafficType
## 1          0   Feb                1       1      1           1
## 2          0   Feb                2       2      1           2
## 3          0   Feb                4       1      9           3
## 4          0   Feb                3       2      2           4
## 5          0   Feb                3       3      1           4
## 6          0   Feb                2       2      1           3
##         VisitorType Weekend Revenue
## 1 Returning_Visitor   FALSE   FALSE
## 2 Returning_Visitor   FALSE   FALSE
## 3 Returning_Visitor   FALSE   FALSE
## 4 Returning_Visitor   FALSE   FALSE
## 5 Returning_Visitor    TRUE   FALSE
## 6 Returning_Visitor   FALSE   FALSE

Data cleaning

library(data.table)
df <- copy(data)

summary(df)

##  Administrative   Administrative_Duration Informational    
##  Min.   : 0.000   Min.   :   0.00         Min.   : 0.0000  
##  1st Qu.: 0.000   1st Qu.:   0.00         1st Qu.: 0.0000  
##  Median : 1.000   Median :   7.50         Median : 0.0000  
##  Mean   : 2.315   Mean   :  80.82         Mean   : 0.5036  
##  3rd Qu.: 4.000   3rd Qu.:  93.26         3rd Qu.: 0.0000  
##  Max.   :27.000   Max.   :3398.75         Max.   :24.0000  
##  Informational_Duration ProductRelated   ProductRelated_Duration
##  Min.   :   0.00        Min.   :  0.00   Min.   :    0.0        
##  1st Qu.:   0.00        1st Qu.:  7.00   1st Qu.:  184.1        
##  Median :   0.00        Median : 18.00   Median :  598.9        
##  Mean   :  34.47        Mean   : 31.73   Mean   : 1194.8        
##  3rd Qu.:   0.00        3rd Qu.: 38.00   3rd Qu.: 1464.2        
##  Max.   :2549.38        Max.   :705.00   Max.   :63973.5        
##   BounceRates         ExitRates         PageValues        SpecialDay     
##  Min.   :0.000000   Min.   :0.00000   Min.   :  0.000   Min.   :0.00000  
##  1st Qu.:0.000000   1st Qu.:0.01429   1st Qu.:  0.000   1st Qu.:0.00000  
##  Median :0.003112   Median :0.02516   Median :  0.000   Median :0.00000  
##  Mean   :0.022191   Mean   :0.04307   Mean   :  5.889   Mean   :0.06143  
##  3rd Qu.:0.016813   3rd Qu.:0.05000   3rd Qu.:  0.000   3rd Qu.:0.00000  
##  Max.   :0.200000   Max.   :0.20000   Max.   :361.764   Max.   :1.00000  
##     Month           OperatingSystems    Browser           Region     
##  Length:12330       Min.   :1.000    Min.   : 1.000   Min.   :1.000  
##  Class :character   1st Qu.:2.000    1st Qu.: 2.000   1st Qu.:1.000  
##  Mode  :character   Median :2.000    Median : 2.000   Median :3.000  
##                     Mean   :2.124    Mean   : 2.357   Mean   :3.147  
##                     3rd Qu.:3.000    3rd Qu.: 2.000   3rd Qu.:4.000  
##                     Max.   :8.000    Max.   :13.000   Max.   :9.000  
##   TrafficType    VisitorType         Weekend         Revenue       
##  Min.   : 1.00   Length:12330       Mode :logical   Mode :logical  
##  1st Qu.: 2.00   Class :character   FALSE:9462      FALSE:10422    
##  Median : 2.00   Mode  :character   TRUE :2868      TRUE :1908     
##  Mean   : 4.07                                                     
##  3rd Qu.: 4.00                                                     
##  Max.   :20.00

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:data.table':
## 
##     between, first, last

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Chuyển đổi kiểu dữ liệu của các cột
df <- df %>%
  mutate(
    OperatingSystems = as.character(OperatingSystems),
    Browser = as.character(Browser),
    Region = as.character(Region),
    TrafficType = as.character(TrafficType)
  )

# Xác định các cột số liệu, cột phân loại và cột thời gian
numericals <- c('Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 
                'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Revenue')
categoricals <- c('OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Weekend', 'Revenue')
timestamps <- c('Month')

# Hiển thị dataframe sau khi chuyển đổi kiểu dữ liệu

str(df)

## 'data.frame':    12330 obs. of  18 variables:
##  $ Administrative         : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ Administrative_Duration: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Informational          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Informational_Duration : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ProductRelated         : int  1 2 1 2 10 19 1 0 2 3 ...
##  $ ProductRelated_Duration: num  0 64 0 2.67 627.5 ...
##  $ BounceRates            : num  0.2 0 0.2 0.05 0.02 ...
##  $ ExitRates              : num  0.2 0.1 0.2 0.14 0.05 ...
##  $ PageValues             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SpecialDay             : num  0 0 0 0 0 0 0.4 0 0.8 0.4 ...
##  $ Month                  : chr  "Feb" "Feb" "Feb" "Feb" ...
##  $ OperatingSystems       : chr  "1" "2" "4" "3" ...
##  $ Browser                : chr  "1" "2" "1" "2" ...
##  $ Region                 : chr  "1" "1" "9" "2" ...
##  $ TrafficType            : chr  "1" "2" "3" "4" ...
##  $ VisitorType            : chr  "Returning_Visitor" "Returning_Visitor" "Returning_Visitor" "Returning_Visitor" ...
##  $ Weekend                : logi  FALSE FALSE FALSE FALSE TRUE FALSE ...
##  $ Revenue                : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...

Xử lí dữ liệu lặp

library(psych)
df <- data %>% distinct()
duplicate_drop_percentage <- round(((nrow(data) - nrow(df))/nrow(data)) * 100, 2)

cat("Duplicate data drop percentage:", duplicate_drop_percentage, "%\n")

## Duplicate data drop percentage: 1.01 %

Handling outliers

IQR_outliers <- function(x) {
  q1 <- quantile(x, 0.25)
  q3 <- quantile(x, 0.75)
  
  bb <- q1 - 1.5 * (q3 - q1)
  ba <- q3 + 1.5 * (q3 - q1)
  
  outlier_indices <- which(x < bb | x > ba)
  outlier_values <- x[outlier_indices]
  
  return(length(outlier_indices))
}

#IQR method
for (i in numericals) {
  cat("Number of IQR outliers", i, ":", IQR_outliers(df[[i]]), "\n")
}

## Number of IQR outliers Administrative : 404 
## Number of IQR outliers Administrative_Duration : 1149 
## Number of IQR outliers Informational : 2631 
## Number of IQR outliers Informational_Duration : 2405 
## Number of IQR outliers ProductRelated : 1007 
## Number of IQR outliers ProductRelated_Duration : 951 
## Number of IQR outliers BounceRates : 1428 
## Number of IQR outliers ExitRates : 1325 
## Number of IQR outliers PageValues : 2730 
## Number of IQR outliers SpecialDay : 1249 
## Number of IQR outliers Revenue : 1908

Encoder data

Label encoding for Revenue, Weekend (False: 0, True: 1)

# label encoding for Revenue, Weekend  
df$Revenue <- as.numeric(factor(df$Revenue))
df$Weekend <- as.numeric(factor(df$Weekend))

# label encoding for Month
mapping_month <- c('Jan' = 1, 'Feb' = 2, 'Mar' = 3, 'Apr' = 4, 'May' = 5, 'June' = 6,
                   'Jul' = 7, 'Aug' = 8, 'Sep' = 9, 'Oct' = 10, 'Nov' = 11, 'Dec' = 12)
df$Month <- unname(mapping_month[df$Month])

One Hot Encoding Feature Categorical

library(dplyr)
library(caret)

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

## Loading required package: lattice

library(lattice)

# One-Hot Encoding for VisitorType
dummy <- dummyVars(" ~ VisitorType", data = df)
onehot <- data.frame(predict(dummy, newdata = df))
colnames(onehot) <- paste0("VT_", colnames(onehot))
df <- cbind(df, onehot)

# Dropping the Original VisitorType Column
df <- df %>% select(-VisitorType)

# Lưu dữ liệu đã xử lý vào dataset df_cleaned
df_cleaned <- df

# Hiển thị thông tin về dataframe df_cleaned
str(df_cleaned)

## 'data.frame':    12205 obs. of  20 variables:
##  $ Administrative                 : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ Administrative_Duration        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Informational                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Informational_Duration         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ProductRelated                 : int  1 2 1 2 10 19 1 0 2 3 ...
##  $ ProductRelated_Duration        : num  0 64 0 2.67 627.5 ...
##  $ BounceRates                    : num  0.2 0 0.2 0.05 0.02 ...
##  $ ExitRates                      : num  0.2 0.1 0.2 0.14 0.05 ...
##  $ PageValues                     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SpecialDay                     : num  0 0 0 0 0 0 0.4 0 0.8 0.4 ...
##  $ Month                          : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ OperatingSystems               : int  1 2 4 3 3 2 2 1 2 2 ...
##  $ Browser                        : int  1 2 1 2 3 2 4 2 2 4 ...
##  $ Region                         : int  1 1 9 2 1 1 3 1 2 1 ...
##  $ TrafficType                    : int  1 2 3 4 4 3 3 5 3 2 ...
##  $ Weekend                        : num  1 1 1 1 2 1 1 2 1 1 ...
##  $ Revenue                        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ VT_VisitorTypeNew_Visitor      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ VT_VisitorTypeOther            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ VT_VisitorTypeReturning_Visitor: num  1 1 1 1 1 1 1 1 1 1 ...

Featuring Engineering

Numeric Correlation

# Chuyển đổi các cột không phải là số sang số (nếu cần)
df_cleaned <- df_cleaned %>%
  mutate_if(is.character, as.factor) %>%
  mutate_if(is.factor, as.numeric)

# Kiểm tra lại kiểu dữ liệu sau khi chuyển đổi
str(df_cleaned)

## 'data.frame':    12205 obs. of  20 variables:
##  $ Administrative                 : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ Administrative_Duration        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Informational                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Informational_Duration         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ProductRelated                 : int  1 2 1 2 10 19 1 0 2 3 ...
##  $ ProductRelated_Duration        : num  0 64 0 2.67 627.5 ...
##  $ BounceRates                    : num  0.2 0 0.2 0.05 0.02 ...
##  $ ExitRates                      : num  0.2 0.1 0.2 0.14 0.05 ...
##  $ PageValues                     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SpecialDay                     : num  0 0 0 0 0 0 0.4 0 0.8 0.4 ...
##  $ Month                          : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ OperatingSystems               : int  1 2 4 3 3 2 2 1 2 2 ...
##  $ Browser                        : int  1 2 1 2 3 2 4 2 2 4 ...
##  $ Region                         : int  1 1 9 2 1 1 3 1 2 1 ...
##  $ TrafficType                    : int  1 2 3 4 4 3 3 5 3 2 ...
##  $ Weekend                        : num  1 1 1 1 2 1 1 2 1 1 ...
##  $ Revenue                        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ VT_VisitorTypeNew_Visitor      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ VT_VisitorTypeOther            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ VT_VisitorTypeReturning_Visitor: num  1 1 1 1 1 1 1 1 1 1 ...

library(reshape2)

## 
## Attaching package: 'reshape2'

## The following objects are masked from 'package:data.table':
## 
##     dcast, melt

# Tính toán ma trận tương quan
corr_matrix <- cor(df_cleaned)

# Chuyển đổi ma trận tương quan thành định dạng dài
corr_melt <- melt(corr_matrix)

# Tạo heatmap cho ma trận tương quan
ggplot(data = corr_melt, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1, 1), space = "Lab", 
                       name="Correlation") +
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                   size = 12, hjust = 1)) +
  coord_fixed() +
  geom_text(aes(label = sprintf("%.2f", value)), color = "black", size = 4) +
  labs(title = "Data Correlation", x = "", y = "")

Features with a correlation value > 0.80:

ExitRates - BounceRates (0.90) redundant >> drop one VT_Returning_Visitor - VT_New_Visitor (0.97) >> drop one ProductRelated - ProductRelated_Duration (0.86) >> feature transformation

Feature selection

# Loại bỏ các cột không mong muốn
df_selected <- df_cleaned %>%
  select(-BounceRates, -Region)

# Xem trước dữ liệu
head(df_selected)

##   Administrative Administrative_Duration Informational Informational_Duration
## 1              0                       0             0                      0
## 2              0                       0             0                      0
## 3              0                       0             0                      0
## 4              0                       0             0                      0
## 5              0                       0             0                      0
## 6              0                       0             0                      0
##   ProductRelated ProductRelated_Duration ExitRates PageValues SpecialDay Month
## 1              1                0.000000 0.2000000          0          0     2
## 2              2               64.000000 0.1000000          0          0     2
## 3              1                0.000000 0.2000000          0          0     2
## 4              2                2.666667 0.1400000          0          0     2
## 5             10              627.500000 0.0500000          0          0     2
## 6             19              154.216667 0.0245614          0          0     2
##   OperatingSystems Browser TrafficType Weekend Revenue
## 1                1       1           1       1       1
## 2                2       2           2       1       1
## 3                4       1           3       1       1
## 4                3       2           4       1       1
## 5                3       3           4       2       1
## 6                2       2           3       1       1
##   VT_VisitorTypeNew_Visitor VT_VisitorTypeOther VT_VisitorTypeReturning_Visitor
## 1                         0                   0                               1
## 2                         0                   0                               1
## 3                         0                   0                               1
## 4                         0                   0                               1
## 5                         0                   0                               1
## 6                         0                   0                               1

##Feature Transformation Transformation of three data sets that have a high correlation related to page types into the average duration per page opened

#Tạo ra ba cột mới bằng cách chia thời gian cho số trang tương ứng trong các cột Administrative, Informational, và ProductRelated
df_selected$Administrative_Duration_Page <- df_selected$Administrative_Duration / df_selected$Administrative
df_selected$Informational_Duration_Page <- df_selected$Informational_Duration / df_selected$Informational
df_selected$ProductRelated_Duration_Page <- df_selected$ProductRelated_Duration / df_selected$ProductRelated

#Loại bỏ các cột gốc khỏi dataframe df_selected và lưu kết quả vào df_transformed.

df_transformed <- df_selected %>%
  select(-Administrative, -Informational, -ProductRelated, -Administrative_Duration, -Informational_Duration, -ProductRelated_Duration)

# Xử lý giá trị chia cho 0
df_transformed$Administrative_Duration_Page[is.na(df_transformed$Administrative_Duration_Page)] <- 0
df_transformed$Informational_Duration_Page[is.na(df_transformed$Informational_Duration_Page)] <- 0
df_transformed$ProductRelated_Duration_Page[is.na(df_transformed$ProductRelated_Duration_Page)] <- 0

# Hiển thị thông tin về dataframe đã xử lý
str(df_transformed)

## 'data.frame':    12205 obs. of  15 variables:
##  $ ExitRates                      : num  0.2 0.1 0.2 0.14 0.05 ...
##  $ PageValues                     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SpecialDay                     : num  0 0 0 0 0 0 0.4 0 0.8 0.4 ...
##  $ Month                          : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ OperatingSystems               : int  1 2 4 3 3 2 2 1 2 2 ...
##  $ Browser                        : int  1 2 1 2 3 2 4 2 2 4 ...
##  $ TrafficType                    : int  1 2 3 4 4 3 3 5 3 2 ...
##  $ Weekend                        : num  1 1 1 1 2 1 1 2 1 1 ...
##  $ Revenue                        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ VT_VisitorTypeNew_Visitor      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ VT_VisitorTypeOther            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ VT_VisitorTypeReturning_Visitor: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Administrative_Duration_Page   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Informational_Duration_Page    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ProductRelated_Duration_Page   : num  0 32 0 1.33 62.75 ...

library(ggplot2)
library(reshape2)
library(dplyr)

# Hiển thị ma trận tương quan
cor_matrix <- cor(df_transformed)
melted_cor_matrix <- melt(cor_matrix)

ggplot(data = melted_cor_matrix, aes(x=Var1, y=Var2, fill=value)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name="Correlation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
                                   size = 12, hjust = 1)) +
  coord_fixed() +
  ggtitle("Data Correlation")

# Lấy mẫu ngẫu nhiên 5 hàng từ dataframe
sample_n(df_transformed, 5)

##        ExitRates PageValues SpecialDay Month OperatingSystems Browser
## 7096 0.019859307   4.023324          0    10                2       2
## 6166 0.040000000   0.000000          0    10                1       1
## 5159 0.005882353   0.000000          0     5                2       2
## 5990 0.051351351   0.000000          0     8                2       2
## 1069 0.033777778   0.000000          0     3                2       2
##      TrafficType Weekend Revenue VT_VisitorTypeNew_Visitor VT_VisitorTypeOther
## 7096           2       1       1                         0                   0
## 6166           3       2       1                         0                   0
## 5159           4       2       1                         0                   0
## 5990           1       2       1                         0                   0
## 1069           2       1       1                         0                   0
##      VT_VisitorTypeReturning_Visitor Administrative_Duration_Page
## 7096                               1                       73.450
## 6166                               1                       26.700
## 5159                               1                       39.000
## 5990                               1                       16.375
## 1069                               1                       54.000
##      Informational_Duration_Page ProductRelated_Duration_Page
## 7096                           0                    149.05644
## 6166                           0                     22.13333
## 5159                           0                     17.49074
## 5990                           0                    166.58229
## 1069                         237                     46.53125

# Loại bỏ các cột phân loại
df_droped <- df_transformed %>%
  select(-OperatingSystems, -Browser, -TrafficType, -Month)
# Lưu kết quả xử lý đặc trưng vào dataset data
data_processed <- df_droped

# Hiển thị thông tin về dataframe đã xử lý
str(data_processed)

## 'data.frame':    12205 obs. of  11 variables:
##  $ ExitRates                      : num  0.2 0.1 0.2 0.14 0.05 ...
##  $ PageValues                     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SpecialDay                     : num  0 0 0 0 0 0 0.4 0 0.8 0.4 ...
##  $ Weekend                        : num  1 1 1 1 2 1 1 2 1 1 ...
##  $ Revenue                        : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ VT_VisitorTypeNew_Visitor      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ VT_VisitorTypeOther            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ VT_VisitorTypeReturning_Visitor: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Administrative_Duration_Page   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Informational_Duration_Page    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ProductRelated_Duration_Page   : num  0 32 0 1.33 62.75 ...

II: Model and Evaluation

Split data

library(dplyr)
# Tách các đặc trưng và mục tiêu
X <- data_processed %>% select(-Revenue)
y <- data_processed$Revenue

# Chia dữ liệu thành tập huấn luyện và tập kiểm tra
set.seed(42)
trainIndex <- createDataPartition(y, p = 0.7, list = FALSE)
X_train_raw <- X[trainIndex, ]
X_test_raw <- X[-trainIndex, ]
y_train <- y[trainIndex]
y_test <- y[-trainIndex]

# Hiển thị thông tin về các tập dữ liệu
str(X_train_raw)

## 'data.frame':    8544 obs. of  10 variables:
##  $ ExitRates                      : num  0.2 0.1 0.2 0.14 0.05 ...
##  $ PageValues                     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SpecialDay                     : num  0 0 0 0 0 0.4 0 0.8 0 0 ...
##  $ Weekend                        : num  1 1 1 1 2 1 2 1 1 1 ...
##  $ VT_VisitorTypeNew_Visitor      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ VT_VisitorTypeOther            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ VT_VisitorTypeReturning_Visitor: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Administrative_Duration_Page   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Informational_Duration_Page    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ProductRelated_Duration_Page   : num  0 32 0 1.33 62.75 ...

str(X_test_raw)

## 'data.frame':    3661 obs. of  10 variables:
##  $ ExitRates                      : num  0.0246 0.0222 0.0667 0.0258 0.2 ...
##  $ PageValues                     : num  0 0 0 0 0 ...
##  $ SpecialDay                     : num  0 0.4 0 0.4 0 0 1 0 0 0.4 ...
##  $ Weekend                        : num  1 1 1 1 1 1 2 1 1 1 ...
##  $ VT_VisitorTypeNew_Visitor      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ VT_VisitorTypeOther            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ VT_VisitorTypeReturning_Visitor: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Administrative_Duration_Page   : num  0 0 0 0 0 0 0 0 0 6 ...
##  $ Informational_Duration_Page    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ProductRelated_Duration_Page   : num  8.12 246 131.67 25.48 0 ...

str(y_train)

##  num [1:8544] 1 1 1 1 1 1 1 1 1 1 ...

str(y_test)

##  num [1:3661] 1 1 1 1 1 1 1 1 1 1 ...

Scaler transformation

Bộ chuẩn hóa này sẽ chuẩn hóa các đặc trưng bằng cách loại bỏ trung bình và chia cho độ lệch chuẩn để có giá trị trung bình là 0 và độ lệch chuẩn là 1.

library(caret)

# Khởi tạo bộ chuẩn hóa (Scaler) chuẩn hóa dữ liệu bằng cách loại bỏ trung bình và chia cho độ lệch chuẩn.
scaler <- preProcess(X_train_raw, method = c("center", "scale"))

# Chuẩn hóa dữ liệu huấn luyện (X_train)
X_train <- predict(scaler, X_train_raw)

# Chuẩn hóa dữ liệu kiểm tra (X_test)
X_test <- predict(scaler, X_test_raw)

# Chuyển đổi kết quả chuẩn hóa thành dataframe
X_train <- as.data.frame(X_train)
X_test <- as.data.frame(X_test)

# Đặt tên cột cho dataframe
colnames(X_train) <- colnames(X_train_raw)
colnames(X_test) <- colnames(X_test_raw)

# Hiển thị thông tin về các dataframe đã chuẩn hóa
str(X_train)

## 'data.frame':    8544 obs. of  10 variables:
##  $ ExitRates                      : num  3.431 1.268 3.431 2.133 0.186 ...
##  $ PageValues                     : num  -0.318 -0.318 -0.318 -0.318 -0.318 ...
##  $ SpecialDay                     : num  -0.307 -0.307 -0.307 -0.307 -0.307 ...
##  $ Weekend                        : num  -0.558 -0.558 -0.558 -0.558 1.792 ...
##  $ VT_VisitorTypeNew_Visitor      : num  -0.405 -0.405 -0.405 -0.405 -0.405 ...
##  $ VT_VisitorTypeOther            : num  -0.0827 -0.0827 -0.0827 -0.0827 -0.0827 ...
##  $ VT_VisitorTypeReturning_Visitor: num  0.417 0.417 0.417 0.417 0.417 ...
##  $ Administrative_Duration_Page   : num  -0.431 -0.431 -0.431 -0.431 -0.431 ...
##  $ Informational_Duration_Page    : num  -0.228 -0.228 -0.228 -0.228 -0.228 ...
##  $ ProductRelated_Duration_Page   : num  -0.831 -0.129 -0.831 -0.802 0.545 ...

str(X_test)

## 'data.frame':    3661 obs. of  10 variables:
##  $ ExitRates                      : num  -0.364 -0.414 0.547 -0.336 3.431 ...
##  $ PageValues                     : num  -0.318 -0.318 -0.318 -0.318 -0.318 ...
##  $ SpecialDay                     : num  -0.307 1.7 -0.307 1.7 -0.307 ...
##  $ Weekend                        : num  -0.558 -0.558 -0.558 -0.558 -0.558 ...
##  $ VT_VisitorTypeNew_Visitor      : num  -0.405 -0.405 -0.405 -0.405 -0.405 ...
##  $ VT_VisitorTypeOther            : num  -0.0827 -0.0827 -0.0827 -0.0827 -0.0827 ...
##  $ VT_VisitorTypeReturning_Visitor: num  0.417 0.417 0.417 0.417 0.417 ...
##  $ Administrative_Duration_Page   : num  -0.431 -0.431 -0.431 -0.431 -0.431 ...
##  $ Informational_Duration_Page    : num  -0.228 -0.228 -0.228 -0.228 -0.228 ...
##  $ ProductRelated_Duration_Page   : num  -0.653 4.565 2.057 -0.272 -0.831 ...

Handling imbalance

library(dplyr)
library(ROSE)

## Loaded ROSE 0.0-4

# Kiểm tra tỷ lệ phần trăm của giá trị mục tiêu
y_train_percent <- prop.table(table(y_train)) * 100
print(y_train_percent)

## y_train
##        1        2 
## 83.94195 16.05805

library(caret)
library(ROSE)

smote_data <- function(data, target, perc.over = 100, perc.under = 200) {
  # Tạo công thức
  formula <- as.formula(paste(target, "~ ."))
  
  # Áp dụng SMOTE
  data_smote <- ROSE(formula, data = data, seed = 42, N = perc.over * nrow(data) / 100)$data
  
  return(data_smote)
}

# Kết hợp dữ liệu huấn luyện thành một dataframe
data_train <- data.frame(X_train, Revenue = y_train)

# Áp dụng SMOTE
data_train <- data.frame(X_train, Revenue = y_train)
data_smote <- ROSE(Revenue ~ ., data = data_train, seed = 42, N = 7181 + 3590)$data
X_over_smote <- data_smote %>% select(-Revenue)
y_over_smote <- data_smote$Revenue

# Tách lại các đặc trưng và mục tiêu sau khi áp dụng SMOTE
X_over_smote <- data_smote %>% select(-Revenue)
y_over_smote <- data_smote$Revenue

print('AFTER oversampling (smote)')

## [1] "AFTER oversampling (smote)"

print(table(y_over_smote))

## y_over_smote
##    1    2 
## 5375 5396

Classification model

Logistic Regression

In the dataset, there are quite extreme outliers, so the logistic model produces poor performance.

# Cài đặt và tải các gói cần thiết
library(caret)
library(e1071)
library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

# Chuyển đổi biến mục tiêu thành yếu tố
y_train <- as.factor(y_train)
y_test <- as.factor(y_test)

# Đảm bảo y_over_smote là yếu tố với hai mức
y_over_smote <- as.factor(y_over_smote)

# Khởi tạo và huấn luyện mô hình Logistic Regression
model <- train(Revenue ~ ., data = data.frame(X_over_smote, Revenue = y_over_smote), method = "glm", family = binomial)

# Dự đoán trên tập dữ liệu kiểm tra
Y_pred <- predict(model, newdata = data.frame(X_test))

# Tính toán độ chính xác
accuracy <- sum(Y_pred == y_test) / length(y_test)
print(paste("Accuracy:", accuracy))

## [1] "Accuracy: 0.833378858235455"

# Tính toán ma trận nhầm lẫn
confusion_matrix <- confusionMatrix(as.factor(Y_pred), as.factor(y_test))
print("Confusion Matrix:")

## [1] "Confusion Matrix:"

print(confusion_matrix$table)

##           Reference
## Prediction    1    2
##          1 2676  161
##          2  449  375

# Tính toán báo cáo phân loại
classification_report <- confusionMatrix(as.factor(Y_pred), as.factor(y_test))
print("Classification Report:")

## [1] "Classification Report:"

print(classification_report)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2
##          1 2676  161
##          2  449  375
##                                           
##                Accuracy : 0.8334          
##                  95% CI : (0.8209, 0.8453)
##     No Information Rate : 0.8536          
##     P-Value [Acc > NIR] : 0.9997          
##                                           
##                   Kappa : 0.4547          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8563          
##             Specificity : 0.6996          
##          Pos Pred Value : 0.9432          
##          Neg Pred Value : 0.4551          
##              Prevalence : 0.8536          
##          Detection Rate : 0.7309          
##    Detection Prevalence : 0.7749          
##       Balanced Accuracy : 0.7780          
##                                           
##        'Positive' Class : 1               
##

# Tính toán điểm ROC AUC
roc_auc <- roc(y_test, as.numeric(predict(model, newdata = data.frame(X_test), type = "prob")[,2]))

## Setting levels: control = 1, case = 2

## Setting direction: controls < cases

print(paste("ROC AUC Score:", auc(roc_auc)))

## [1] "ROC AUC Score: 0.86296"

# Vẽ đường cong ROC
plot(roc_auc, main = "ROC Curve", col = "blue")
abline(a = 0, b = 1, lty = 2, col = "red")

Simple linear

library(caret)
library(pROC)

y_train <- as.factor(y_train)
y_test <- as.factor(y_test)

# Huấn luyện mô hình Simple Linear Regression
model <- lm(Revenue ~ ., data = data.frame(X_train, Revenue = y_train))

## Warning in model.response(mf, "numeric"): using type = "numeric" with a factor
## response will be ignored

## Warning in Ops.factor(y, z$residuals): '-' not meaningful for factors

# Dự đoán nhãn cho tập kiểm tra
Y_pred <- predict(model, X_test)

# Chuyển đổi dự đoán thành nhãn (0 hoặc 1) dựa trên ngưỡng 0.5
Y_pred_class <- ifelse(Y_pred > 0.5, 1, 0)

# Tính toán độ chính xác
accuracy <- mean(Y_pred_class == y_test)
cat("Accuracy:", accuracy, "\n")

## Accuracy: 0.8535919

# Tính toán ma trận nhầm lẫn
confusion_matrix <- confusionMatrix(as.factor(Y_pred_class), y_test)

## Warning in confusionMatrix.default(as.factor(Y_pred_class), y_test): Levels are
## not in the same order for reference and data. Refactoring data to match.

cat("Confusion Matrix:\n")

## Confusion Matrix:

print(confusion_matrix$table)

##           Reference
## Prediction    1    2
##          1 3125  536
##          2    0    0

# Tính toán báo cáo phân loại
cat("Classification Report:\n")

## Classification Report:

print(confusion_matrix$byClass)

##          Sensitivity          Specificity       Pos Pred Value 
##            1.0000000            0.0000000            0.8535919 
##       Neg Pred Value            Precision               Recall 
##                  NaN            0.8535919            1.0000000 
##                   F1           Prevalence       Detection Rate 
##            0.9210139            0.8535919            0.8535919 
## Detection Prevalence    Balanced Accuracy 
##            1.0000000            0.5000000

# Tính toán ROC AUC score
roc_auc <- auc(roc(y_test, Y_pred))

## Setting levels: control = 1, case = 2

## Setting direction: controls < cases

cat("ROC AUC Score:", roc_auc, "\n")

## ROC AUC Score: 0.8852597

# Vẽ đường cong ROC
roc_curve <- roc(y_test, Y_pred)

## Setting levels: control = 1, case = 2
## Setting direction: controls < cases

plot(roc_curve, main = "AUC plot", col = "blue")