Load and Prepare the Data

# Read dataset
# Load the raw web-analytics export; column types are guessed by read_csv().
web_data <- read_csv(file = "data/Web_Analytic_Dataset.csv")
## Rows: 250 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): Source / Medium, Bounce Rate, Conversion Rate (%)
## dbl  (2): Year, Month of the year
## num  (7): Users, New Users, Sessions, Pageviews, Transactions, Revenue, Quan...
## time (1): Avg. Session Duration
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert numeric fields
# Remove thousands separators from the count/amount columns and coerce them to
# numeric; every other column is left untouched.
numeric_cols <- c(
  "Users", "New Users", "Sessions", "Pageviews",
  "Transactions", "Revenue", "Quantity Sold"
)
web_data <- web_data %>%
  mutate(across(all_of(numeric_cols), ~ as.numeric(gsub(",", "", .x))))

# Convert Conversion Rate to numeric
# The column is read as text, so a bare as.numeric() turns any value that
# carries a percent sign, a thousands separator or stray whitespace into NA
# (with a coercion warning). Strip those characters first, then rescale from
# percent to a proportion.
# NOTE(review): the exact offending values are not visible here; anything that
# is still non-numeric after stripping will remain NA - confirm against the CSV.
conversion_raw <- gsub("[%,[:space:]]", "", web_data$`Conversion Rate (%)`)
web_data$`Conversion Rate (%)` <- as.numeric(conversion_raw) / 100
## Warning: NAs introduced by coercion
# Ensure Year column exists
# Fall back to 'Year of the Visit' only when 'Year' is absent and the
# alternative column is present; otherwise the data is left as it is.
column_names <- names(web_data)
if (!("Year" %in% column_names) && "Year of the Visit" %in% column_names) {
  web_data$Year <- web_data[["Year of the Visit"]]
}

# Use 'Source / Medium' as proxy for Device
# (copied into a separate Device column)
web_data$Device <- web_data[["Source / Medium"]]

Task a

# Total revenue per (Year, Source / Medium), then keep the three
# highest-revenue sources within each year.
top_sources <- web_data %>%
  group_by(Year, `Source / Medium`) %>%
  summarise(Total_Revenue = sum(Revenue, na.rm = TRUE), .groups = "drop") %>%
  group_by(Year) %>%
  # with_ties = FALSE caps each year at three rows even when revenues tie
  slice_max(order_by = Total_Revenue, n = 3, with_ties = FALSE) %>%
  # drop the grouping so later steps do not silently operate per year
  ungroup() %>%
  # explicit final ordering: by year, highest revenue first
  arrange(Year, desc(Total_Revenue))

DT::datatable(top_sources, caption = "Top 3 Traffic Sources by Revenue (per Year)")

This table summarizes the total revenue for each traffic source per year, and keeps only the top 3 highest revenue sources per year.

# Bar chart of the top sources, one facet per year. Bars are ordered by
# descending revenue and labelled with the rounded total.
ggplot(
  top_sources,
  aes(
    x = reorder(`Source / Medium`, -Total_Revenue),
    y = Total_Revenue,
    fill = `Source / Medium`
  )
) +
  geom_col() +
  geom_text(aes(label = round(Total_Revenue, 0)), vjust = -0.5, size = 3) +
  facet_wrap(~Year) +
  labs(
    title = "Top 3 Traffic Sources by Revenue (per Year)",
    x = "Source / Medium",
    y = "Total Revenue"
  ) +
  theme_minimal() +
  # angle was 1 degree, which leaves the long source names effectively
  # horizontal and overlapping; 45 degrees with hjust = 1 keeps them readable
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

This bar chart visually presents the top 3 sources per year with revenue labels to easily compare contributions.

Task b

Users and New Users by Device Proxy

# Total users and new users per traffic source (the "device" proxy).
# Reuses web_data, whose Users / New Users columns have already been loaded and
# converted to numeric, instead of reading and cleaning the CSV a second time.
# The grouping column keeps the name Source...Medium because the charts that
# follow refer to it by that name.
devices_users <- web_data %>%
  group_by(Source...Medium = `Source / Medium`) %>%
  summarise(
    total_users = sum(Users, na.rm = TRUE),
    total_newusers = sum(`New Users`, na.rm = TRUE),
    .groups = "drop"
  )

DT::datatable(devices_users, caption = "Users vs New Users by Device")
# User chart
library(ggplot2)

# Look shared by both per-source bar charts: minimal theme, no legend,
# small x labels rotated 45 degrees.
source_chart_theme <- theme_minimal() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8)
  )

# Total users per source, each bar labelled with its value
ggplot(
  devices_users,
  aes(x = Source...Medium, y = total_users, fill = Source...Medium)
) +
  geom_col(position = "dodge") +
  geom_text(aes(label = total_users), vjust = -0.3, size = 3) +
  labs(title = "Users by Device", x = "Device", y = "Total Users") +
  source_chart_theme

# New user chart
# Total new users per source, each bar labelled with its value
ggplot(
  devices_users,
  aes(x = Source...Medium, y = total_newusers, fill = Source...Medium)
) +
  geom_col(position = "dodge") +
  geom_text(aes(label = total_newusers), vjust = -0.3, size = 3) +
  labs(title = "New Users by Device", x = "Device", y = "Total New Users") +
  source_chart_theme

Conversion Rate Over Time by Device Proxy

# Mean conversion rate for every (Device, Year, Month of the year) combination;
# missing rates are ignored when averaging.
by_device_month <- group_by(web_data, Device, Year, `Month of the year`)
device_summary <- summarise(
  by_device_month,
  avg_conversion_rate = mean(`Conversion Rate (%)`, na.rm = TRUE),
  .groups = "drop"
)
DT::datatable(
  device_summary,
  caption = "Average Conversion Rate by Device, Month, and Year"
)

This table shows the average conversion rate for each source per month and year.

# Dodged bar chart of the average conversion rate per period, one bar per
# Device within each period.
ggplot(
  device_summary,
  aes(
    # Zero-pad the month so the discrete axis, which sorts its labels as text,
    # runs chronologically ("2019-02" before "2019-10"); an unpadded
    # paste(Year, Month) would place "2019-10" ahead of "2019-2".
    x = sprintf("%.0f-%02.0f", Year, `Month of the year`),
    y = avg_conversion_rate,
    fill = Device
  )
) +
  geom_col(
    position = position_dodge2(preserve = "single", padding = 0.05),
    width = 1.25
  ) +
  labs(
    title = "Conversion Rate by Device",
    x = "Month",
    y = "Average Conversion Rate"
  ) +
  theme_minimal(base_size = 40) +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 1),
    legend.position = "right",
    legend.key.size = unit(2, "cm"),
    legend.text = element_text(size = 30)
  )
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_col()`).

This bar chart compares average conversion rates per source across different months and years.


Task c – Relationship Analysis Between Bounce Rate, Conversion, Transaction, and Revenue

# Clean and convert columns
# Adds four numeric helper columns to web_data; Bounce Rate has its percent
# sign removed before conversion.
web_data <- web_data %>%
  mutate(
    bounce_clean = as.numeric(gsub("%", "", `Bounce Rate`)),
    conversion_clean = as.numeric(`Conversion Rate (%)`),
    transaction_clean = as.numeric(Transactions),
    revenue_clean = as.numeric(Revenue)
  )

# Filter complete cases
# Keep only the four analysis columns and drop any row with a missing value.
task_c_columns <- select(
  web_data,
  bounce_clean, conversion_clean, transaction_clean, revenue_clean
)
task_c_data <- drop_na(task_c_columns)

Correlation Heatmap

# Pairwise correlations of the four task c variables (cor() defaults),
# rounded to two decimals and reshaped to long form for plotting.
corr_matrix <- round(cor(task_c_data), 2)
melted_corr <- melt(corr_matrix)

# Tile heatmap with the coefficient printed in every cell; the colour scale is
# centred on 0 and fixed to the full [-1, 1] range.
ggplot(melted_corr, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(
    low = "blue",
    mid = "white",
    high = "yellow",
    midpoint = 0,
    # spelled out in full: `limit` only worked through partial argument matching
    limits = c(-1, 1),
    name = "Correlation"
  ) +
  geom_text(aes(label = value), color = "black", size = 4) +
  labs(title = "Correlation Heatmap") +
  theme_minimal()

Scatterplots for Each Pair

# Build a scatterplot of two columns with a straight-line (lm) trend.
#   data         : data frame holding both columns
#   x_var, y_var : column names, given as strings
#   point_colour : colour of the points
#   line_colour  : colour of the fitted line
#   plot_title   : title shown above the plot
# Returns the ggplot object; nothing is drawn until it is printed.
scatter_with_trend <- function(data, x_var, y_var,
                               point_colour, line_colour, plot_title) {
  ggplot(data, aes(x = .data[[x_var]], y = .data[[y_var]])) +
    geom_point(color = point_colour, alpha = 0.6) +
    geom_smooth(method = "lm", se = FALSE, color = line_colour) +
    # set the axis titles explicitly so they stay the plain column names
    labs(title = plot_title, x = x_var, y = y_var) +
    theme_minimal()
}

# Bounce vs Conversion
p1 <- scatter_with_trend(
  task_c_data, "bounce_clean", "conversion_clean",
  "blue", "blue", "Bounce Rate vs Conversion Rate"
)

# Bounce vs Transaction
p2 <- scatter_with_trend(
  task_c_data, "bounce_clean", "transaction_clean",
  "red", "red", "Bounce Rate vs Transactions"
)

# Bounce vs Revenue
p3 <- scatter_with_trend(
  task_c_data, "bounce_clean", "revenue_clean",
  "gold", "orange", "Bounce Rate vs Revenue"
)

# Conversion vs Revenue
p4 <- scatter_with_trend(
  task_c_data, "conversion_clean", "revenue_clean",
  "deeppink", "deeppink", "Conversion Rate vs Revenue"
)

# Transaction vs Revenue
p5 <- scatter_with_trend(
  task_c_data, "transaction_clean", "revenue_clean",
  "seagreen", "darkgreen", "Transaction vs Revenue"
)

# Print the five plots in order
p1; p2; p3; p4; p5
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

Scatterplot 1: Bounce Rate vs Conversion Rate

There is a negative correlation between Bounce Rate and Conversion Rate. This suggests that as Bounce Rate increases (users leaving quickly), the Conversion Rate tends to decrease. This is expected behavior—visitors who leave quickly are less likely to convert.

Scatterplot 2: Bounce Rate vs Transactions

This plot also shows a negative trend, meaning higher Bounce Rates are generally associated with fewer Transactions. It indicates that if users are bouncing more, they are less engaged, and thus less likely to complete transactions.

Scatterplot 3: Bounce Rate vs Revenue

There is a slight inverse relationship between Bounce Rate and Revenue, though it’s weaker than the previous ones. A high Bounce Rate may negatively impact revenue, but this relationship might also be influenced by other variables (e.g., marketing quality or landing page design).

Scatterplot 4: Conversion Rate vs Revenue

This plot shows a positive correlation—higher Conversion Rates usually lead to higher Revenue. This is an intuitive and critical relationship: improving conversion directly boosts revenue performance.

Scatterplot 5: Transaction vs Revenue

This scatterplot shows a strong positive linear relationship between the number of Transactions and the amount of Revenue. As the number of transactions increases, revenue increases correspondingly.
This makes intuitive sense, as each transaction typically contributes directly to total revenue.
The consistency of this relationship suggests that encouraging more transactions (e.g., through smoother checkout, promotions) will significantly boost revenue performance.

Overall Comment:

The heatmap and scatterplots suggest the following: Bounce Rate is negatively associated with both Conversion Rate and Transactions, which in turn relate to Revenue. The strongest positive correlation is between Conversion Rate and Revenue, highlighting its business importance. Because these are correlations rather than causal estimates, reducing bounce rate and improving user engagement are plausible (not proven) levers for increasing both conversions and revenue. This analysis supports strategic efforts to optimize landing pages and user experience in order to lower bounce and raise conversions.


Task 2

Task d – Descriptive Analysis of diabetes.csv

# Load the diabetes dataset; column types are guessed by read_csv().
diabetes <- read_csv(file = "data/diabetes.csv")
## Rows: 768 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, D...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Mean, median, standard deviation and variance of every column, reshaped to
# one row per variable.
desc_stats_before <- diabetes %>%
  summarise(across(
    everything(),
    list(
      Mean = ~ mean(.x, na.rm = TRUE),
      Median = ~ median(.x, na.rm = TRUE),
      SD = ~ sd(.x, na.rm = TRUE),
      Variance = ~ var(.x, na.rm = TRUE)
    )
  )) %>%
  # across() names its outputs "<column>_<statistic>". Split on the LAST
  # underscore so a column whose own name contains "_" is still parsed
  # correctly; names_sep = "_" would cut such a name into too many pieces.
  pivot_longer(
    cols = everything(),
    names_to = c("Variable", ".value"),
    names_pattern = "^(.*)_([^_]+)$"
  ) %>%
  arrange(Variable)

DT::datatable(desc_stats_before, caption = "Descriptive Statistics Before Cleaning")
# One histogram per column on a 3 x 3 grid.
# A for loop replaces lapply() here: lapply() returned the list of histogram
# objects, which was then auto-printed into the report as console output.
par(mfrow = c(3, 3))
for (var_name in names(diabetes)) {
  hist(
    diabetes[[var_name]],
    main = paste("Histogram of", var_name, "before cleaning"),
    xlab = var_name,
    col = "lightgray"
  )
}

## [[1]]
## $breaks
##  [1]  0  2  4  6  8 10 12 14 16 18
## 
## $counts
## [1] 349 143 107  83  52  20  12   1   1
## 
## $density
## [1] 0.2272135417 0.0930989583 0.0696614583 0.0540364583 0.0338541667
## [6] 0.0130208333 0.0078125000 0.0006510417 0.0006510417
## 
## $mids
## [1]  1  3  5  7  9 11 13 15 17
## 
## $xname
## [1] "diabetes[[col]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## [[2]]
## $breaks
##  [1]   0  20  40  60  80 100 120 140 160 180 200
## 
## $counts
##  [1]   5   0   4  38 167 205 157  91  60  41
## 
## $density
##  [1] 0.0003255208 0.0000000000 0.0002604167 0.0024739583 0.0108723958
##  [6] 0.0133463542 0.0102213542 0.0059244792 0.0039062500 0.0026692708
## 
## $mids
##  [1]  10  30  50  70  90 110 130 150 170 190
## 
## $xname
## [1] "diabetes[[col]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## [[3]]
## $breaks
##  [1]   0  10  20  30  40  50  60  70  80  90 100 110 120 130
## 
## $counts
##  [1]  35   0   3   2  24  94 217 228 127  25  11   1   1
## 
## $density
##  [1] 0.0045572917 0.0000000000 0.0003906250 0.0002604167 0.0031250000
##  [6] 0.0122395833 0.0282552083 0.0296875000 0.0165364583 0.0032552083
## [11] 0.0014322917 0.0001302083 0.0001302083
## 
## $mids
##  [1]   5  15  25  35  45  55  65  75  85  95 105 115 125
## 
## $xname
## [1] "diabetes[[col]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## [[4]]
## $breaks
##  [1]   0  10  20  30  40  50  60  70  80  90 100
## 
## $counts
##  [1] 236 115 179 164  65   7   1   0   0   1
## 
## $density
##  [1] 0.0307291667 0.0149739583 0.0233072917 0.0213541667 0.0084635417
##  [6] 0.0009114583 0.0001302083 0.0000000000 0.0000000000 0.0001302083
## 
## $mids
##  [1]  5 15 25 35 45 55 65 75 85 95
## 
## $xname
## [1] "diabetes[[col]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## [[5]]
## $breaks
##  [1]   0 100 200 300 400 500 600 700 800 900
## 
## $counts
## [1] 525 158  48  17  11   6   1   1   1
## 
## $density
## [1] 6.835938e-03 2.057292e-03 6.250000e-04 2.213542e-04 1.432292e-04
## [6] 7.812500e-05 1.302083e-05 1.302083e-05 1.302083e-05
## 
## $mids
## [1]  50 150 250 350 450 550 650 750 850
## 
## $xname
## [1] "diabetes[[col]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## [[6]]
## $breaks
##  [1]  0  5 10 15 20 25 30 35 40 45 50 55 60 65 70
## 
## $counts
##  [1]  11   0   0  14  98 180 221 148  61  27   5   2   0   1
## 
## $density
##  [1] 0.0028645833 0.0000000000 0.0000000000 0.0036458333 0.0255208333
##  [6] 0.0468750000 0.0575520833 0.0385416667 0.0158854167 0.0070312500
## [11] 0.0013020833 0.0005208333 0.0000000000 0.0002604167
## 
## $mids
##  [1]  2.5  7.5 12.5 17.5 22.5 27.5 32.5 37.5 42.5 47.5 52.5 57.5 62.5 67.5
## 
## $xname
## [1] "diabetes[[col]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## [[7]]
## $breaks
##  [1] 0.0 0.2 0.4 0.6 0.8 1.0 1.2 1.4 1.6 1.8 2.0 2.2 2.4 2.6
## 
## $counts
##  [1] 128 282 154  99  54  22  16   4   4   1   1   2   1
## 
## $density
##  [1] 0.833333333 1.835937500 1.002604167 0.644531250 0.351562500 0.143229167
##  [7] 0.104166667 0.026041667 0.026041667 0.006510417 0.006510417 0.013020833
## [13] 0.006510417
## 
## $mids
##  [1] 0.1 0.3 0.5 0.7 0.9 1.1 1.3 1.5 1.7 1.9 2.1 2.3 2.5
## 
## $xname
## [1] "diabetes[[col]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## [[8]]
## $breaks
##  [1] 20 25 30 35 40 45 50 55 60 65 70 75 80 85
## 
## $counts
##  [1] 267 150  81  76  76  37  31  23  14  11   1   0   1
## 
## $density
##  [1] 0.0695312500 0.0390625000 0.0210937500 0.0197916667 0.0197916667
##  [6] 0.0096354167 0.0080729167 0.0059895833 0.0036458333 0.0028645833
## [11] 0.0002604167 0.0000000000 0.0002604167
## 
## $mids
##  [1] 22.5 27.5 32.5 37.5 42.5 47.5 52.5 57.5 62.5 67.5 72.5 77.5 82.5
## 
## $xname
## [1] "diabetes[[col]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## [[9]]
## $breaks
##  [1] 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0
## 
## $counts
##  [1] 500   0   0   0   0   0   0   0   0 268
## 
## $density
##  [1] 6.510417 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
##  [9] 0.000000 3.489583
## 
## $mids
##  [1] 0.05 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95
## 
## $xname
## [1] "diabetes[[col]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
par(mfrow = c(1, 1))
# Boxplots before cleaning
boxplot_cols <- c(
  "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
  "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"
)

# One horizontal boxplot per listed column on a 3 x 3 grid.
# A for loop replaces lapply() so the statistics lists returned by boxplot()
# are not auto-printed into the report. The grid is not reset here.
par(mfrow = c(3, 3))
for (var_name in boxplot_cols) {
  boxplot(
    diabetes[[var_name]],
    main = paste("Boxplot of", var_name, "before cleaning"),
    col = "pink",
    horizontal = TRUE
  )
}
## [[1]]
## [[1]]$stats
##      [,1]
## [1,]    0
## [2,]    1
## [3,]    3
## [4,]    6
## [5,]   13
## 
## [[1]]$n
## [1] 768
## 
## [[1]]$conf
##          [,1]
## [1,] 2.714933
## [2,] 3.285067
## 
## [[1]]$out
## [1] 15 17 14 14
## 
## [[1]]$group
## [1] 1 1 1 1
## 
## [[1]]$names
## [1] ""
## 
## 
## [[2]]
## [[2]]$stats
##       [,1]
## [1,]  44.0
## [2,]  99.0
## [3,] 117.0
## [4,] 140.5
## [5,] 199.0
## 
## [[2]]$n
## [1] 768
## 
## [[2]]$conf
##          [,1]
## [1,] 114.6339
## [2,] 119.3661
## 
## [[2]]$out
## [1] 0 0 0 0 0
## 
## [[2]]$group
## [1] 1 1 1 1 1
## 
## [[2]]$names
## [1] ""
## 
## 
## [[3]]
## [[3]]$stats
##      [,1]
## [1,]   38
## [2,]   62
## [3,]   72
## [4,]   80
## [5,]  106
## 
## [[3]]$n
## [1] 768
## 
## [[3]]$conf
##          [,1]
## [1,] 70.97376
## [2,] 73.02624
## 
## [[3]]$out
##  [1]   0   0  30 110   0   0   0   0 108 122  30   0 110   0   0   0   0   0   0
## [20]   0   0   0   0 108   0   0   0   0   0   0   0   0   0   0 110   0  24   0
## [39]   0   0   0 114   0   0   0
## 
## [[3]]$group
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [39] 1 1 1 1 1 1 1
## 
## [[3]]$names
## [1] ""
## 
## 
## [[4]]
## [[4]]$stats
##      [,1]
## [1,]    0
## [2,]    0
## [3,]   23
## [4,]   32
## [5,]   63
## 
## [[4]]$n
## [1] 768
## 
## [[4]]$conf
##          [,1]
## [1,] 21.17557
## [2,] 24.82443
## 
## [[4]]$out
## [1] 99
## 
## [[4]]$group
## [1] 1
## 
## [[4]]$names
## [1] ""
## 
## 
## [[5]]
## [[5]]$stats
##       [,1]
## [1,]   0.0
## [2,]   0.0
## [3,]  30.5
## [4,] 127.5
## [5,] 318.0
## 
## [[5]]$n
## [1] 768
## 
## [[5]]$conf
##         [,1]
## [1,] 23.2308
## [2,] 37.7692
## 
## [[5]]$out
##  [1] 543 846 342 495 325 485 495 478 744 370 680 402 375 545 360 325 465 325 415
## [20] 579 474 328 480 326 330 600 321 440 540 480 335 387 392 510
## 
## [[5]]$group
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 
## [[5]]$names
## [1] ""
## 
## 
## [[6]]
## [[6]]$stats
##      [,1]
## [1,] 18.2
## [2,] 27.3
## [3,] 32.0
## [4,] 36.6
## [5,] 50.0
## 
## [[6]]$n
## [1] 768
## 
## [[6]]$conf
##          [,1]
## [1,] 31.46978
## [2,] 32.53022
## 
## [[6]]$out
##  [1]  0.0  0.0  0.0  0.0 53.2 55.0  0.0 67.1 52.3 52.3 52.9  0.0  0.0 59.4  0.0
## [16]  0.0 57.3  0.0  0.0
## 
## [[6]]$group
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 
## [[6]]$names
## [1] ""
## 
## 
## [[7]]
## [[7]]$stats
##        [,1]
## [1,] 0.0780
## [2,] 0.2435
## [3,] 0.3725
## [4,] 0.6265
## [5,] 1.1910
## 
## [[7]]$n
## [1] 768
## 
## [[7]]$conf
##           [,1]
## [1,] 0.3506639
## [2,] 0.3943361
## 
## [[7]]$out
##  [1] 2.288 1.441 1.390 1.893 1.781 1.222 1.400 1.321 1.224 2.329 1.318 1.213
## [13] 1.353 1.224 1.391 1.476 2.137 1.731 1.268 1.600 2.420 1.251 1.699 1.258
## [25] 1.282 1.698 1.461 1.292 1.394
## 
## [[7]]$group
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 
## [[7]]$names
## [1] ""
## 
## 
## [[8]]
## [[8]]$stats
##      [,1]
## [1,]   21
## [2,]   24
## [3,]   29
## [4,]   41
## [5,]   66
## 
## [[8]]$n
## [1] 768
## 
## [[8]]$conf
##          [,1]
## [1,] 28.03077
## [2,] 29.96923
## 
## [[8]]$out
## [1] 69 67 72 81 67 67 70 68 69
## 
## [[8]]$group
## [1] 1 1 1 1 1 1 1 1 1
## 
## [[8]]$names
## [1] ""
# Barplot Outcome
# Number of rows in each Outcome class.
# NOTE(review): the labels assume the table lists class 0 first and class 1
# second - confirm against the data.
outcome_counts <- table(diabetes[["Outcome"]])
barplot(
  height = outcome_counts,
  names.arg = c("No Diabetes", "Diabetes"),
  col = c("lightblue", "tomato"),
  main = "Count of Outcome before cleaning"
)

# Back to a single-panel layout
par(mfrow = c(1, 1))

Task e – Error Identification and Cleaning

# Columns that shouldn't contain 0s
cols_with_invalid_zeros <- c("Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI")

# Replace 0s with NA, then impute with median
# Each column is handled independently: zeros become NA, and every NA is then
# filled with the median of that column's remaining values.
diabetes[cols_with_invalid_zeros] <- lapply(
  diabetes[cols_with_invalid_zeros],
  function(values) {
    values[values == 0] <- NA
    values[is.na(values)] <- median(values, na.rm = TRUE)
    values
  }
)
# Mean, median, standard deviation and variance of every column after
# cleaning, reshaped to one row per variable.
desc_stats_after <- diabetes %>%
  summarise(across(
    everything(),
    list(
      Mean = ~ mean(.x, na.rm = TRUE),
      Median = ~ median(.x, na.rm = TRUE),
      SD = ~ sd(.x, na.rm = TRUE),
      Variance = ~ var(.x, na.rm = TRUE)
    )
  )) %>%
  # across() names its outputs "<column>_<statistic>". Split on the LAST
  # underscore so a column whose own name contains "_" is still parsed
  # correctly; names_sep = "_" would cut such a name into too many pieces.
  pivot_longer(
    cols = everything(),
    names_to = c("Variable", ".value"),
    names_pattern = "^(.*)_([^_]+)$"
  ) %>%
  arrange(Variable)

DT::datatable(desc_stats_after, caption = "Descriptive Statistics After Cleaning")
# One histogram per column of the cleaned data on a 3 x 3 grid.
# A for loop replaces lapply() here: lapply() returned the list of histogram
# objects, which was then auto-printed into the report as console output.
par(mfrow = c(3, 3))
for (var_name in names(diabetes)) {
  hist(
    diabetes[[var_name]],
    main = paste("Histogram of", var_name, "after cleaning"),
    xlab = var_name,
    col = "lightblue"
  )
}

## [[1]]
## $breaks
##  [1]  0  2  4  6  8 10 12 14 16 18
## 
## $counts
## [1] 349 143 107  83  52  20  12   1   1
## 
## $density
## [1] 0.2272135417 0.0930989583 0.0696614583 0.0540364583 0.0338541667
## [6] 0.0130208333 0.0078125000 0.0006510417 0.0006510417
## 
## $mids
## [1]  1  3  5  7  9 11 13 15 17
## 
## $xname
## [1] "diabetes[[col]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## [[2]]
## $breaks
## [1]  40  60  80 100 120 140 160 180 200
## 
## $counts
## [1]   4  38 167 210 157  91  60  41
## 
## $density
## [1] 0.0002604167 0.0024739583 0.0108723958 0.0136718750 0.0102213542
## [6] 0.0059244792 0.0039062500 0.0026692708
## 
## $mids
## [1]  50  70  90 110 130 150 170 190
## 
## $xname
## [1] "diabetes[[col]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## [[3]]
## $breaks
##  [1]  20  30  40  50  60  70  80  90 100 110 120 130
## 
## $counts
##  [1]   3   2  24  94 217 263 127  25  11   1   1
## 
## $density
##  [1] 0.0003906250 0.0002604167 0.0031250000 0.0122395833 0.0282552083
##  [6] 0.0342447917 0.0165364583 0.0032552083 0.0014322917 0.0001302083
## [11] 0.0001302083
## 
## $mids
##  [1]  25  35  45  55  65  75  85  95 105 115 125
## 
## $xname
## [1] "diabetes[[col]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## [[4]]
## $breaks
##  [1]   0  10  20  30  40  50  60  70  80  90 100
## 
## $counts
##  [1]   9 115 406 164  65   7   1   0   0   1
## 
## $density
##  [1] 0.0011718750 0.0149739583 0.0528645833 0.0213541667 0.0084635417
##  [6] 0.0009114583 0.0001302083 0.0000000000 0.0000000000 0.0001302083
## 
## $mids
##  [1]  5 15 25 35 45 55 65 75 85 95
## 
## $xname
## [1] "diabetes[[col]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## [[5]]
## $breaks
##  [1]   0 100 200 300 400 500 600 700 800 900
## 
## $counts
## [1] 151 532  48  17  11   6   1   1   1
## 
## $density
## [1] 1.966146e-03 6.927083e-03 6.250000e-04 2.213542e-04 1.432292e-04
## [6] 7.812500e-05 1.302083e-05 1.302083e-05 1.302083e-05
## 
## $mids
## [1]  50 150 250 350 450 550 650 750 850
## 
## $xname
## [1] "diabetes[[col]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## [[6]]
## $breaks
##  [1] 15 20 25 30 35 40 45 50 55 60 65 70
## 
## $counts
##  [1]  14  98 180 232 148  61  27   5   2   0   1
## 
## $density
##  [1] 0.0036458333 0.0255208333 0.0468750000 0.0604166667 0.0385416667
##  [6] 0.0158854167 0.0070312500 0.0013020833 0.0005208333 0.0000000000
## [11] 0.0002604167
## 
## $mids
##  [1] 17.5 22.5 27.5 32.5 37.5 42.5 47.5 52.5 57.5 62.5 67.5
## 
## $xname
## [1] "diabetes[[col]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## [[7]]
## $breaks
##  [1] 0.0 0.2 0.4 0.6 0.8 1.0 1.2 1.4 1.6 1.8 2.0 2.2 2.4 2.6
## 
## $counts
##  [1] 128 282 154  99  54  22  16   4   4   1   1   2   1
## 
## $density
##  [1] 0.833333333 1.835937500 1.002604167 0.644531250 0.351562500 0.143229167
##  [7] 0.104166667 0.026041667 0.026041667 0.006510417 0.006510417 0.013020833
## [13] 0.006510417
## 
## $mids
##  [1] 0.1 0.3 0.5 0.7 0.9 1.1 1.3 1.5 1.7 1.9 2.1 2.3 2.5
## 
## $xname
## [1] "diabetes[[col]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## [[8]]
## $breaks
##  [1] 20 25 30 35 40 45 50 55 60 65 70 75 80 85
## 
## $counts
##  [1] 267 150  81  76  76  37  31  23  14  11   1   0   1
## 
## $density
##  [1] 0.0695312500 0.0390625000 0.0210937500 0.0197916667 0.0197916667
##  [6] 0.0096354167 0.0080729167 0.0059895833 0.0036458333 0.0028645833
## [11] 0.0002604167 0.0000000000 0.0002604167
## 
## $mids
##  [1] 22.5 27.5 32.5 37.5 42.5 47.5 52.5 57.5 62.5 67.5 72.5 77.5 82.5
## 
## $xname
## [1] "diabetes[[col]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## 
## [[9]]
## $breaks
##  [1] 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0
## 
## $counts
##  [1] 500   0   0   0   0   0   0   0   0 268
## 
## $density
##  [1] 6.510417 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
##  [9] 0.000000 3.489583
## 
## $mids
##  [1] 0.05 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95
## 
## $xname
## [1] "diabetes[[col]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
par(mfrow = c(1, 1))
# Boxplots after cleaning
# One horizontal boxplot per column in boxplot_cols on a 3 x 3 grid.
# A for loop replaces lapply() so the statistics lists returned by boxplot()
# are not auto-printed into the report. The grid is not reset here.
par(mfrow = c(3, 3))
for (var_name in boxplot_cols) {
  boxplot(
    diabetes[[var_name]],
    main = paste("Boxplot of", var_name, "after cleaning"),
    col = "lightgreen",
    horizontal = TRUE
  )
}
## [[1]]
## [[1]]$stats
##      [,1]
## [1,]    0
## [2,]    1
## [3,]    3
## [4,]    6
## [5,]   13
## 
## [[1]]$n
## [1] 768
## 
## [[1]]$conf
##          [,1]
## [1,] 2.714933
## [2,] 3.285067
## 
## [[1]]$out
## [1] 15 17 14 14
## 
## [[1]]$group
## [1] 1 1 1 1
## 
## [[1]]$names
## [1] ""
## 
## 
## [[2]]
## [[2]]$stats
##       [,1]
## [1,]  44.0
## [2,]  99.5
## [3,] 117.0
## [4,] 140.5
## [5,] 199.0
## 
## [[2]]$n
## [1] 768
## 
## [[2]]$conf
##          [,1]
## [1,] 114.6625
## [2,] 119.3375
## 
## [[2]]$out
## numeric(0)
## 
## [[2]]$group
## numeric(0)
## 
## [[2]]$names
## [1] ""
## 
## 
## [[3]]
## [[3]]$stats
##      [,1]
## [1,]   40
## [2,]   64
## [3,]   72
## [4,]   80
## [5,]  104
## 
## [[3]]$n
## [1] 768
## 
## [[3]]$conf
##          [,1]
## [1,] 71.08779
## [2,] 72.91221
## 
## [[3]]$out
##  [1]  30 110 108 122  30 110 108 110  24  38 106 106 106 114
## 
## [[3]]$group
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 
## [[3]]$names
## [1] ""
## 
## 
## [[4]]
## [[4]]$stats
##      [,1]
## [1,]   15
## [2,]   25
## [3,]   29
## [4,]   32
## [5,]   42
## 
## [[4]]$n
## [1] 768
## 
## [[4]]$conf
##          [,1]
## [1,] 28.60091
## [2,] 29.39909
## 
## [[4]]$out
##  [1] 45 47 11 47 11 10 60 13 13 54 51 56 14 13 50 44 12 46 44 13 44 54 14  7 50
## [26] 52 10 44 43 45 14 10 11 12 43 13 12 48 43 43  8 13 14 12 49 46 46 11  8 12
## [51] 63 12 45 13 48 13 10 45  7 52 49 43 14 47 99 46 11 50 45 14 13 13 47 12 48
## [76] 43 46 46 45 10 46 49 11 13 46 44 48
## 
## [[4]]$group
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [39] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [77] 1 1 1 1 1 1 1 1 1 1 1
## 
## [[4]]$names
## [1] ""
## 
## 
## [[5]]
## [[5]]$stats
##       [,1]
## [1,] 112.0
## [2,] 121.0
## [3,] 125.0
## [4,] 127.5
## [5,] 135.0
## 
## [[5]]$n
## [1] 768
## 
## [[5]]$conf
##          [,1]
## [1,] 124.6294
## [2,] 125.3706
## 
## [[5]]$out
##   [1]  94 168  88 543 846 175 230  83  96 235 146 140 110 245  54 192 207  70
##  [19] 240  82  36  23 300 342 304 110 142  38 100  90 140 270  71  71 110 176
##  [37]  48  64 228  76  64 220  40 152 140  18  36 495  37 175  51 100 100  99
##  [55]  94 145 168 225  49 140  50  92 325  63 284 204 155 485  94  53 105 285
##  [73] 156  78  48  55  92  23 495  58 160  94 210  48  99 318  44 190 280  87
##  [91] 175 271 478 190  56  32 744  53 370  37  45 192  88 176 194 680 402  55
## [109] 258 375 150  67  56  45  57 278 155 545 220  49  75  40  74 182 194 360
## [127] 215 184  42 105 148 180 205 148  96  85  94  64 140 231  29 168 156  68
## [145]  52  58 255 171 105  73 108  83  74  43 167  54 249 325 293  83  66 140
## [163] 465  89  66  94 158 325  84  75  72  82 182  59 110  50 285  81 196 415
## [181]  87 275  88 165 579 176 310  61 167 474 170  76  78 210 277 180 145 180
## [199]  85  60  50  14  70  92  64  63  95 210 105  71 237  60  56  49 105  36
## [217] 100 140 191 110  75 328  49 250 480 265  66  76 145 193  71  79  90 170
## [235]  76 210  86 105 165 326  66  82 105 188 106  65  56 210 155 215 190  56
## [253]  76 225 207 166  67 106  44 215 274  77  54  88  18 165  44 330  63 600
## [271] 156 140 230 185  25 293  41 272 182 158 194 321 144  15 160  54  90 183
## [289]  66  91  46 105 152 440 144 159 100 106  77 540  90 200  70 231 190 100
## [307] 168  49 240 265  45 105 205 180 180  95 480 155 200 100 335 160 387  22
## [325] 291 392 185 178 200 105 180  79 165 160 150  94 140 105  57 200  74 510
## [343] 110  16 180
## 
## [[5]]$group
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [223] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [260] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [297] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [334] 1 1 1 1 1 1 1 1 1 1 1 1
## 
## [[5]]$names
## [1] ""
## 
## 
## [[6]]
## [[6]]$stats
##      [,1]
## [1,] 18.2
## [2,] 27.5
## [3,] 32.3
## [4,] 36.6
## [5,] 50.0
## 
## [[6]]$n
## [1] 768
## 
## [[6]]$conf
##          [,1]
## [1,] 31.78118
## [2,] 32.81882
## 
## [[6]]$out
## [1] 53.2 55.0 67.1 52.3 52.3 52.9 59.4 57.3
## 
## [[6]]$group
## [1] 1 1 1 1 1 1 1 1
## 
## [[6]]$names
## [1] ""
## 
## 
## [[7]]
## [[7]]$stats
##        [,1]
## [1,] 0.0780
## [2,] 0.2435
## [3,] 0.3725
## [4,] 0.6265
## [5,] 1.1910
## 
## [[7]]$n
## [1] 768
## 
## [[7]]$conf
##           [,1]
## [1,] 0.3506639
## [2,] 0.3943361
## 
## [[7]]$out
##  [1] 2.288 1.441 1.390 1.893 1.781 1.222 1.400 1.321 1.224 2.329 1.318 1.213
## [13] 1.353 1.224 1.391 1.476 2.137 1.731 1.268 1.600 2.420 1.251 1.699 1.258
## [25] 1.282 1.698 1.461 1.292 1.394
## 
## [[7]]$group
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 
## [[7]]$names
## [1] ""
## 
## 
## [[8]]
## [[8]]$stats
##      [,1]
## [1,]   21
## [2,]   24
## [3,]   29
## [4,]   41
## [5,]   66
## 
## [[8]]$n
## [1] 768
## 
## [[8]]$conf
##          [,1]
## [1,] 28.03077
## [2,] 29.96923
## 
## [[8]]$out
## [1] 69 67 72 81 67 67 70 68 69
## 
## [[8]]$group
## [1] 1 1 1 1 1 1 1 1 1
## 
## [[8]]$names
## [1] ""
# Barplot Outcome after cleaning
# Number of rows in each Outcome class.
# NOTE(review): the labels assume the table lists class 0 first and class 1
# second - confirm against the data.
outcome_counts_clean <- table(diabetes[["Outcome"]])
barplot(
  height = outcome_counts_clean,
  names.arg = c("No Diabetes", "Diabetes"),
  col = c("lightblue", "tomato"),
  main = "Count of Outcome after cleaning"
)

# Back to a single-panel layout
par(mfrow = c(1, 1))

Summary Table of Cleaned Dataset Columns

# The column summary below describes web_data in its current state, i.e.
# including whatever columns earlier steps have converted or added.
df_clean <- web_data

#' Preview the first few distinct values of a vector
#'
#' @param x A vector (for example, one data-frame column).
#' @return A single string: up to five distinct values of `x`, in order of
#'   first appearance, separated by " | ".
sample_values <- function(x) {
  preview <- head(unique(x), 5)
  paste(preview, collapse = " | ")
}

# One row per column of df_clean: its name, its first class, and a short
# preview of its distinct values produced by sample_values().
summary_table <- data.frame(
  Column = names(df_clean),
  # vapply() guarantees exactly one string per column, whereas the return
  # type of sapply() depends on its input
  Type = vapply(df_clean, function(x) class(x)[1], character(1)),
  Sample_Values = vapply(df_clean, sample_values, character(1)),
  stringsAsFactors = FALSE
)

# Namespace-qualified like the other tables in this report, so the call works
# even when DT has not been attached with library(DT).
DT::datatable(summary_table, caption = "Table X. Cleaned Dataset Column Information")