# Load the web analytics export; read_csv() guesses the column types
web_data <- read_csv(file = "data/Web_Analytic_Dataset.csv")
## Rows: 250 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Source / Medium, Bounce Rate, Conversion Rate (%)
## dbl (2): Year, Month of the year
## num (7): Users, New Users, Sessions, Pageviews, Transactions, Revenue, Quan...
## time (1): Avg. Session Duration
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert the count and revenue columns to numeric.
# Thousands separators are removed first so that a value such as "1,234"
# is not turned into NA by as.numeric().
numeric_cols <- c("Users", "New Users", "Sessions", "Pageviews",
                  "Transactions", "Revenue", "Quantity Sold")
web_data[numeric_cols] <- lapply(
  web_data[numeric_cols],
  function(x) as.numeric(gsub(",", "", x))
)
# Convert Conversion Rate from a percentage to a fraction.
# The column is read as character, so "%" signs and thousands separators
# are stripped before the conversion, in the same way as the columns above.
# Values that are still not numeric after that become NA (with a warning).
web_data$`Conversion Rate (%)` <-
  as.numeric(gsub("[%,]", "", web_data$`Conversion Rate (%)`)) / 100
## Warning: NAs introduced by coercion
# When the data has no 'Year' column, copy it from 'Year of the Visit'
# (if that column is present)
if (!("Year" %in% names(web_data)) && "Year of the Visit" %in% names(web_data)) {
  web_data[["Year"]] <- web_data[["Year of the Visit"]]
}
# 'Source / Medium' stands in for the device in the analyses below
web_data[["Device"]] <- web_data[["Source / Medium"]]
# Total revenue per year and traffic source, reduced to the three
# highest-earning sources of each year.
top_sources <- web_data %>%
  group_by(Year, `Source / Medium`) %>%
  summarise(Total_Revenue = sum(Revenue, na.rm = TRUE), .groups = "drop") %>%
  arrange(Year, desc(Total_Revenue)) %>%
  group_by(Year) %>%
  # with_ties = FALSE caps every year at exactly three rows; the default
  # (TRUE) returns extra rows whenever revenues tie at the cut-off.
  slice_max(order_by = Total_Revenue, n = 3, with_ties = FALSE) %>%
  # Drop the Year grouping so later steps work on a plain table
  ungroup()
DT::datatable(top_sources, caption = "Top 3 Traffic Sources by Revenue (per Year)")
This table summarizes the total revenue of each traffic source per year and keeps only the three highest-revenue sources in each year.
# Bar chart of the top sources, one facet per year, with the rounded
# revenue printed above each bar.
ggplot(
  top_sources,
  aes(
    x = reorder(`Source / Medium`, -Total_Revenue),
    y = Total_Revenue,
    fill = `Source / Medium`
  )
) +
  geom_col() +
  geom_text(aes(label = round(Total_Revenue, 0)), vjust = -0.5, size = 3) +
  facet_wrap(~Year) +
  labs(
    title = "Top 3 Traffic Sources by Revenue (per Year)",
    x = "Source / Medium",
    y = "Total Revenue"
  ) +
  theme_minimal() +
  # Rotate the source names by 45 degrees (as the other charts in this file
  # do) so the long labels do not overlap; the previous angle of 1 degree
  # left them effectively horizontal.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
This bar chart visually presents the top 3 sources per year with revenue labels to easily compare contributions.
# Second copy of the dataset, read with base R; make.names() yields
# syntactic column names (e.g. "New.Users", "Source...Medium")
df <- read.csv("data/Web_Analytic_Dataset.csv")
names(df) <- make.names(names(df))
# Remove thousands separators before converting the user counts
df[c("Users", "New.Users")] <- lapply(
  df[c("Users", "New.Users")],
  function(v) as.numeric(gsub(",", "", v))
)
# Total users and new users per source / medium
devices_users <- summarise(
  group_by(df, Source...Medium),
  total_users = sum(Users, na.rm = TRUE),
  total_newusers = sum(New.Users, na.rm = TRUE)
)
DT::datatable(devices_users, caption = "Users vs New Users by Device")
# Total users per source / medium, labelled with the count above each bar
library(ggplot2)
ggplot(
  devices_users,
  aes(x = Source...Medium, y = total_users, fill = Source...Medium)
) +
  geom_col(position = "dodge") +
  geom_text(aes(label = total_users), size = 3, vjust = -0.3) +
  labs(title = "Users by Device", x = "Device", y = "Total Users") +
  theme_minimal() +
  theme(
    # The x axis already names every bar, so the legend is hidden
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8)
  )
# Total new users per source / medium, labelled with the count above each bar
ggplot(
  devices_users,
  aes(x = Source...Medium, y = total_newusers, fill = Source...Medium)
) +
  geom_col(position = "dodge") +
  geom_text(aes(label = total_newusers), size = 3, vjust = -0.3) +
  labs(title = "New Users by Device", x = "Device", y = "Total New Users") +
  theme_minimal() +
  theme(
    # The x axis already names every bar, so the legend is hidden
    legend.position = "none",
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8)
  )
# Mean conversion rate for every device / year / month combination;
# missing rates are ignored when averaging
device_summary <- summarise(
  group_by(web_data, Device, Year, `Month of the year`),
  avg_conversion_rate = mean(`Conversion Rate (%)`, na.rm = TRUE),
  .groups = "drop"
)
DT::datatable(
  device_summary,
  caption = "Average Conversion Rate by Device, Month, and Year"
)
This table shows the average conversion rate of each source for every month and year.
# Dodged bar chart of the average conversion rate per device and month.
# The x value is "<year>-<two-digit month>": zero-padding the month makes
# the discrete axis sort chronologically, whereas a plain paste() would
# place e.g. "2019-10" before "2019-2".
ggplot(
  device_summary,
  aes(
    x = sprintf("%.0f-%02.0f", Year, `Month of the year`),
    y = avg_conversion_rate,
    fill = Device
  )
) +
  geom_col(position = position_dodge2(preserve = "single", padding = 0.05), width = 1.25) +
  labs(title = "Conversion Rate by Device", x = "Month", y = "Average Conversion Rate") +
  theme_minimal(base_size = 40) +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 1),
    legend.position = "right",
    legend.key.size = unit(2, "cm"),
    legend.text = element_text(size = 30)
  )
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_col()`).
This bar chart compares average conversion rates per source across different months and years.
# Numeric working copies of the four metrics used in the correlation study;
# Bounce Rate loses its "%" sign before the conversion
web_data[["bounce_clean"]] <-
  as.numeric(gsub("%", "", web_data[["Bounce Rate"]], fixed = TRUE))
web_data[["conversion_clean"]] <- as.numeric(web_data[["Conversion Rate (%)"]])
web_data[["transaction_clean"]] <- as.numeric(web_data[["Transactions"]])
web_data[["revenue_clean"]] <- as.numeric(web_data[["Revenue"]])
# Keep only rows where all four metrics are present
task_c_data <- drop_na(
  select(web_data, bounce_clean, conversion_clean, transaction_clean, revenue_clean)
)
# Correlations between the four metrics, rounded to two decimals,
# reshaped to long form (Var1, Var2, value) and drawn as a heatmap.
corr_matrix <- round(cor(task_c_data), 2)
melted_corr <- melt(corr_matrix)
ggplot(melted_corr, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  # `limits` is spelled out in full; the abbreviated `limit` only worked
  # through partial argument matching.
  scale_fill_gradient2(
    low = "blue", high = "yellow", mid = "white",
    midpoint = 0, limits = c(-1, 1), name = "Correlation"
  ) +
  geom_text(aes(label = value), color = "black", size = 4) +
  labs(title = "Correlation Heatmap") +
  theme_minimal()
# Scatterplot of two columns of `data` with a fitted straight line
# (method = "lm", no confidence band).
#   data          data frame holding both columns
#   x_var, y_var  column names, given as strings
#   point_colour  colour of the points
#   line_colour   colour of the fitted line
#   plot_title    title shown above the plot
# Returns the ggplot object.
scatter_with_lm <- function(data, x_var, y_var, point_colour, line_colour,
                            plot_title) {
  ggplot(data, aes(x = .data[[x_var]], y = .data[[y_var]])) +
    geom_point(color = point_colour, alpha = 0.6) +
    geom_smooth(method = "lm", se = FALSE, color = line_colour) +
    # Axis titles are set explicitly so they show the bare column names
    labs(title = plot_title, x = x_var, y = y_var) +
    theme_minimal()
}

# Bounce vs Conversion
p1 <- scatter_with_lm(task_c_data, "bounce_clean", "conversion_clean",
                      "blue", "blue", "Bounce Rate vs Conversion Rate")
# Bounce vs Transaction
p2 <- scatter_with_lm(task_c_data, "bounce_clean", "transaction_clean",
                      "red", "red", "Bounce Rate vs Transactions")
# Bounce vs Revenue
p3 <- scatter_with_lm(task_c_data, "bounce_clean", "revenue_clean",
                      "gold", "orange", "Bounce Rate vs Revenue")
# Conversion vs Revenue
p4 <- scatter_with_lm(task_c_data, "conversion_clean", "revenue_clean",
                      "deeppink", "deeppink", "Conversion Rate vs Revenue")
# Transaction vs Revenue
p5 <- scatter_with_lm(task_c_data, "transaction_clean", "revenue_clean",
                      "seagreen", "darkgreen", "Transaction vs Revenue")
p1; p2; p3; p4; p5
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Scatterplot 1: Bounce Rate vs Conversion Rate
There is a negative correlation between Bounce Rate and Conversion Rate. This suggests that as Bounce Rate increases (users leaving quickly), the Conversion Rate tends to decrease. This is expected behavior—visitors who leave quickly are less likely to convert.
Scatterplot 2: Bounce Rate vs Transactions
This plot also shows a negative trend, meaning higher Bounce Rates are generally associated with fewer Transactions. It indicates that if users are bouncing more, they are less engaged, and thus less likely to complete transactions.
Scatterplot 3: Bounce Rate vs Revenue
There is a slight inverse relationship between Bounce Rate and Revenue, though it’s weaker than the previous ones. A high Bounce Rate may negatively impact revenue, but this relationship might also be influenced by other variables (e.g., marketing quality or landing page design).
Scatterplot 4: Conversion Rate vs Revenue
This plot shows a positive correlation—higher Conversion Rates usually lead to higher Revenue. This is an intuitive and critical relationship: improving conversion directly boosts revenue performance.
Scatterplot 5: Transaction vs Revenue
This scatterplot shows a strong positive linear relationship between the number of Transactions and the amount of Revenue. As the number of transactions increases, revenue increases correspondingly.
This makes intuitive sense, as each transaction typically contributes directly to total revenue.
The consistency of this relationship suggests that encouraging more transactions (e.g., through smoother checkout, promotions) will significantly boost revenue performance.
Overall Comment:
The heatmap and scatterplots suggest the following: Bounce Rate negatively affects both Conversion Rate and Transactions, which in turn impact Revenue. The strongest positive correlation is between Conversion Rate and Revenue, highlighting its business importance. Reducing bounce rate and improving user engagement can be key levers for increasing both conversions and revenue. This analysis supports strategic efforts to optimize landing pages and user experience in order to lower bounce and raise conversions.
# Load the diabetes dataset; read_csv() guesses the column types
diabetes <- read_csv(file = "data/diabetes.csv")
## Rows: 768 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, D...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Mean, median, standard deviation and variance of every column,
# reshaped to one row per variable with one column per statistic
desc_stats_before <- diabetes %>%
  summarise(
    across(
      everything(),
      list(
        Mean = function(v) mean(v, na.rm = TRUE),
        Median = function(v) median(v, na.rm = TRUE),
        SD = function(v) sd(v, na.rm = TRUE),
        Variance = function(v) var(v, na.rm = TRUE)
      ),
      # Summary columns are named "<column>_<statistic>"
      .names = "{.col}_{.fn}"
    )
  ) %>%
  # Split "<column>_<statistic>" back into the variable and its statistics
  pivot_longer(
    everything(),
    names_to = c("Variable", ".value"),
    names_sep = "_"
  ) %>%
  arrange(Variable)
DT::datatable(
  desc_stats_before,
  caption = "Descriptive Statistics Before Cleaning"
)
# Draw one histogram per column on a 3 x 3 grid of panels.
par(mfrow = c(3, 3))
# lapply() is used only for its plotting side effect; invisible() stops the
# returned list of "histogram" objects from being printed to the console.
invisible(lapply(names(diabetes), function(col) {
  hist(diabetes[[col]], main = paste("Histogram of", col, "before cleaning"),
       xlab = col, col = "lightgray")
}))
## [[1]]
## $breaks
## [1] 0 2 4 6 8 10 12 14 16 18
##
## $counts
## [1] 349 143 107 83 52 20 12 1 1
##
## $density
## [1] 0.2272135417 0.0930989583 0.0696614583 0.0540364583 0.0338541667
## [6] 0.0130208333 0.0078125000 0.0006510417 0.0006510417
##
## $mids
## [1] 1 3 5 7 9 11 13 15 17
##
## $xname
## [1] "diabetes[[col]]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[2]]
## $breaks
## [1] 0 20 40 60 80 100 120 140 160 180 200
##
## $counts
## [1] 5 0 4 38 167 205 157 91 60 41
##
## $density
## [1] 0.0003255208 0.0000000000 0.0002604167 0.0024739583 0.0108723958
## [6] 0.0133463542 0.0102213542 0.0059244792 0.0039062500 0.0026692708
##
## $mids
## [1] 10 30 50 70 90 110 130 150 170 190
##
## $xname
## [1] "diabetes[[col]]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[3]]
## $breaks
## [1] 0 10 20 30 40 50 60 70 80 90 100 110 120 130
##
## $counts
## [1] 35 0 3 2 24 94 217 228 127 25 11 1 1
##
## $density
## [1] 0.0045572917 0.0000000000 0.0003906250 0.0002604167 0.0031250000
## [6] 0.0122395833 0.0282552083 0.0296875000 0.0165364583 0.0032552083
## [11] 0.0014322917 0.0001302083 0.0001302083
##
## $mids
## [1] 5 15 25 35 45 55 65 75 85 95 105 115 125
##
## $xname
## [1] "diabetes[[col]]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[4]]
## $breaks
## [1] 0 10 20 30 40 50 60 70 80 90 100
##
## $counts
## [1] 236 115 179 164 65 7 1 0 0 1
##
## $density
## [1] 0.0307291667 0.0149739583 0.0233072917 0.0213541667 0.0084635417
## [6] 0.0009114583 0.0001302083 0.0000000000 0.0000000000 0.0001302083
##
## $mids
## [1] 5 15 25 35 45 55 65 75 85 95
##
## $xname
## [1] "diabetes[[col]]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[5]]
## $breaks
## [1] 0 100 200 300 400 500 600 700 800 900
##
## $counts
## [1] 525 158 48 17 11 6 1 1 1
##
## $density
## [1] 6.835938e-03 2.057292e-03 6.250000e-04 2.213542e-04 1.432292e-04
## [6] 7.812500e-05 1.302083e-05 1.302083e-05 1.302083e-05
##
## $mids
## [1] 50 150 250 350 450 550 650 750 850
##
## $xname
## [1] "diabetes[[col]]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[6]]
## $breaks
## [1] 0 5 10 15 20 25 30 35 40 45 50 55 60 65 70
##
## $counts
## [1] 11 0 0 14 98 180 221 148 61 27 5 2 0 1
##
## $density
## [1] 0.0028645833 0.0000000000 0.0000000000 0.0036458333 0.0255208333
## [6] 0.0468750000 0.0575520833 0.0385416667 0.0158854167 0.0070312500
## [11] 0.0013020833 0.0005208333 0.0000000000 0.0002604167
##
## $mids
## [1] 2.5 7.5 12.5 17.5 22.5 27.5 32.5 37.5 42.5 47.5 52.5 57.5 62.5 67.5
##
## $xname
## [1] "diabetes[[col]]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[7]]
## $breaks
## [1] 0.0 0.2 0.4 0.6 0.8 1.0 1.2 1.4 1.6 1.8 2.0 2.2 2.4 2.6
##
## $counts
## [1] 128 282 154 99 54 22 16 4 4 1 1 2 1
##
## $density
## [1] 0.833333333 1.835937500 1.002604167 0.644531250 0.351562500 0.143229167
## [7] 0.104166667 0.026041667 0.026041667 0.006510417 0.006510417 0.013020833
## [13] 0.006510417
##
## $mids
## [1] 0.1 0.3 0.5 0.7 0.9 1.1 1.3 1.5 1.7 1.9 2.1 2.3 2.5
##
## $xname
## [1] "diabetes[[col]]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[8]]
## $breaks
## [1] 20 25 30 35 40 45 50 55 60 65 70 75 80 85
##
## $counts
## [1] 267 150 81 76 76 37 31 23 14 11 1 0 1
##
## $density
## [1] 0.0695312500 0.0390625000 0.0210937500 0.0197916667 0.0197916667
## [6] 0.0096354167 0.0080729167 0.0059895833 0.0036458333 0.0028645833
## [11] 0.0002604167 0.0000000000 0.0002604167
##
## $mids
## [1] 22.5 27.5 32.5 37.5 42.5 47.5 52.5 57.5 62.5 67.5 72.5 77.5 82.5
##
## $xname
## [1] "diabetes[[col]]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[9]]
## $breaks
## [1] 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0
##
## $counts
## [1] 500 0 0 0 0 0 0 0 0 268
##
## $density
## [1] 6.510417 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [9] 0.000000 3.489583
##
## $mids
## [1] 0.05 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95
##
## $xname
## [1] "diabetes[[col]]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
# Switch the plotting layout back to a single panel
par(mfrow = c(1, 1))
# Boxplots before cleaning: one horizontal boxplot per predictor on a
# 3 x 3 grid of panels.
boxplot_cols <- c("Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
                  "Insulin", "BMI", "DiabetesPedigreeFunction", "Age")
par(mfrow = c(3, 3))
# lapply() is used only for its plotting side effect; invisible() stops the
# returned list of boxplot statistics from being printed to the console.
invisible(lapply(boxplot_cols, function(col) {
  boxplot(diabetes[[col]], main = paste("Boxplot of", col, "before cleaning"),
          col = "pink", horizontal = TRUE)
}))
## [[1]]
## [[1]]$stats
## [,1]
## [1,] 0
## [2,] 1
## [3,] 3
## [4,] 6
## [5,] 13
##
## [[1]]$n
## [1] 768
##
## [[1]]$conf
## [,1]
## [1,] 2.714933
## [2,] 3.285067
##
## [[1]]$out
## [1] 15 17 14 14
##
## [[1]]$group
## [1] 1 1 1 1
##
## [[1]]$names
## [1] ""
##
##
## [[2]]
## [[2]]$stats
## [,1]
## [1,] 44.0
## [2,] 99.0
## [3,] 117.0
## [4,] 140.5
## [5,] 199.0
##
## [[2]]$n
## [1] 768
##
## [[2]]$conf
## [,1]
## [1,] 114.6339
## [2,] 119.3661
##
## [[2]]$out
## [1] 0 0 0 0 0
##
## [[2]]$group
## [1] 1 1 1 1 1
##
## [[2]]$names
## [1] ""
##
##
## [[3]]
## [[3]]$stats
## [,1]
## [1,] 38
## [2,] 62
## [3,] 72
## [4,] 80
## [5,] 106
##
## [[3]]$n
## [1] 768
##
## [[3]]$conf
## [,1]
## [1,] 70.97376
## [2,] 73.02624
##
## [[3]]$out
## [1] 0 0 30 110 0 0 0 0 108 122 30 0 110 0 0 0 0 0 0
## [20] 0 0 0 0 108 0 0 0 0 0 0 0 0 0 0 110 0 24 0
## [39] 0 0 0 114 0 0 0
##
## [[3]]$group
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [39] 1 1 1 1 1 1 1
##
## [[3]]$names
## [1] ""
##
##
## [[4]]
## [[4]]$stats
## [,1]
## [1,] 0
## [2,] 0
## [3,] 23
## [4,] 32
## [5,] 63
##
## [[4]]$n
## [1] 768
##
## [[4]]$conf
## [,1]
## [1,] 21.17557
## [2,] 24.82443
##
## [[4]]$out
## [1] 99
##
## [[4]]$group
## [1] 1
##
## [[4]]$names
## [1] ""
##
##
## [[5]]
## [[5]]$stats
## [,1]
## [1,] 0.0
## [2,] 0.0
## [3,] 30.5
## [4,] 127.5
## [5,] 318.0
##
## [[5]]$n
## [1] 768
##
## [[5]]$conf
## [,1]
## [1,] 23.2308
## [2,] 37.7692
##
## [[5]]$out
## [1] 543 846 342 495 325 485 495 478 744 370 680 402 375 545 360 325 465 325 415
## [20] 579 474 328 480 326 330 600 321 440 540 480 335 387 392 510
##
## [[5]]$group
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## [[5]]$names
## [1] ""
##
##
## [[6]]
## [[6]]$stats
## [,1]
## [1,] 18.2
## [2,] 27.3
## [3,] 32.0
## [4,] 36.6
## [5,] 50.0
##
## [[6]]$n
## [1] 768
##
## [[6]]$conf
## [,1]
## [1,] 31.46978
## [2,] 32.53022
##
## [[6]]$out
## [1] 0.0 0.0 0.0 0.0 53.2 55.0 0.0 67.1 52.3 52.3 52.9 0.0 0.0 59.4 0.0
## [16] 0.0 57.3 0.0 0.0
##
## [[6]]$group
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## [[6]]$names
## [1] ""
##
##
## [[7]]
## [[7]]$stats
## [,1]
## [1,] 0.0780
## [2,] 0.2435
## [3,] 0.3725
## [4,] 0.6265
## [5,] 1.1910
##
## [[7]]$n
## [1] 768
##
## [[7]]$conf
## [,1]
## [1,] 0.3506639
## [2,] 0.3943361
##
## [[7]]$out
## [1] 2.288 1.441 1.390 1.893 1.781 1.222 1.400 1.321 1.224 2.329 1.318 1.213
## [13] 1.353 1.224 1.391 1.476 2.137 1.731 1.268 1.600 2.420 1.251 1.699 1.258
## [25] 1.282 1.698 1.461 1.292 1.394
##
## [[7]]$group
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## [[7]]$names
## [1] ""
##
##
## [[8]]
## [[8]]$stats
## [,1]
## [1,] 21
## [2,] 24
## [3,] 29
## [4,] 41
## [5,] 66
##
## [[8]]$n
## [1] 768
##
## [[8]]$conf
## [,1]
## [1,] 28.03077
## [2,] 29.96923
##
## [[8]]$out
## [1] 69 67 72 81 67 67 70 68 69
##
## [[8]]$group
## [1] 1 1 1 1 1 1 1 1 1
##
## [[8]]$names
## [1] ""
# Bar chart of how many rows fall into each Outcome class
outcome_counts <- table(diabetes$Outcome)
barplot(
  height = outcome_counts,
  names.arg = c("No Diabetes", "Diabetes"),
  col = c("lightblue", "tomato"),
  main = "Count of Outcome before cleaning"
)
# Switch the plotting layout back to a single panel
par(mfrow = c(1, 1))
# Measurements for which a value of 0 is treated as "not recorded"
cols_with_invalid_zeros <- c("Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI")
# For each of those columns: mark the zeros as NA, then fill every NA with
# the median of the remaining (non-missing) values
diabetes[cols_with_invalid_zeros] <- lapply(
  diabetes[cols_with_invalid_zeros],
  function(values) {
    values[values == 0] <- NA
    values[is.na(values)] <- median(values, na.rm = TRUE)
    values
  }
)
# Mean, median, standard deviation and variance of every column,
# reshaped to one row per variable with one column per statistic
desc_stats_after <- diabetes %>%
  summarise(
    across(
      everything(),
      list(
        Mean = function(v) mean(v, na.rm = TRUE),
        Median = function(v) median(v, na.rm = TRUE),
        SD = function(v) sd(v, na.rm = TRUE),
        Variance = function(v) var(v, na.rm = TRUE)
      ),
      # Summary columns are named "<column>_<statistic>"
      .names = "{.col}_{.fn}"
    )
  ) %>%
  # Split "<column>_<statistic>" back into the variable and its statistics
  pivot_longer(
    everything(),
    names_to = c("Variable", ".value"),
    names_sep = "_"
  ) %>%
  arrange(Variable)
DT::datatable(
  desc_stats_after,
  caption = "Descriptive Statistics After Cleaning"
)
# Draw one histogram per column on a 3 x 3 grid of panels.
par(mfrow = c(3, 3))
# lapply() is used only for its plotting side effect; invisible() stops the
# returned list of "histogram" objects from being printed to the console.
invisible(lapply(names(diabetes), function(col) {
  hist(diabetes[[col]], main = paste("Histogram of", col, "after cleaning"),
       xlab = col, col = "lightblue")
}))
## [[1]]
## $breaks
## [1] 0 2 4 6 8 10 12 14 16 18
##
## $counts
## [1] 349 143 107 83 52 20 12 1 1
##
## $density
## [1] 0.2272135417 0.0930989583 0.0696614583 0.0540364583 0.0338541667
## [6] 0.0130208333 0.0078125000 0.0006510417 0.0006510417
##
## $mids
## [1] 1 3 5 7 9 11 13 15 17
##
## $xname
## [1] "diabetes[[col]]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[2]]
## $breaks
## [1] 40 60 80 100 120 140 160 180 200
##
## $counts
## [1] 4 38 167 210 157 91 60 41
##
## $density
## [1] 0.0002604167 0.0024739583 0.0108723958 0.0136718750 0.0102213542
## [6] 0.0059244792 0.0039062500 0.0026692708
##
## $mids
## [1] 50 70 90 110 130 150 170 190
##
## $xname
## [1] "diabetes[[col]]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[3]]
## $breaks
## [1] 20 30 40 50 60 70 80 90 100 110 120 130
##
## $counts
## [1] 3 2 24 94 217 263 127 25 11 1 1
##
## $density
## [1] 0.0003906250 0.0002604167 0.0031250000 0.0122395833 0.0282552083
## [6] 0.0342447917 0.0165364583 0.0032552083 0.0014322917 0.0001302083
## [11] 0.0001302083
##
## $mids
## [1] 25 35 45 55 65 75 85 95 105 115 125
##
## $xname
## [1] "diabetes[[col]]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[4]]
## $breaks
## [1] 0 10 20 30 40 50 60 70 80 90 100
##
## $counts
## [1] 9 115 406 164 65 7 1 0 0 1
##
## $density
## [1] 0.0011718750 0.0149739583 0.0528645833 0.0213541667 0.0084635417
## [6] 0.0009114583 0.0001302083 0.0000000000 0.0000000000 0.0001302083
##
## $mids
## [1] 5 15 25 35 45 55 65 75 85 95
##
## $xname
## [1] "diabetes[[col]]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[5]]
## $breaks
## [1] 0 100 200 300 400 500 600 700 800 900
##
## $counts
## [1] 151 532 48 17 11 6 1 1 1
##
## $density
## [1] 1.966146e-03 6.927083e-03 6.250000e-04 2.213542e-04 1.432292e-04
## [6] 7.812500e-05 1.302083e-05 1.302083e-05 1.302083e-05
##
## $mids
## [1] 50 150 250 350 450 550 650 750 850
##
## $xname
## [1] "diabetes[[col]]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[6]]
## $breaks
## [1] 15 20 25 30 35 40 45 50 55 60 65 70
##
## $counts
## [1] 14 98 180 232 148 61 27 5 2 0 1
##
## $density
## [1] 0.0036458333 0.0255208333 0.0468750000 0.0604166667 0.0385416667
## [6] 0.0158854167 0.0070312500 0.0013020833 0.0005208333 0.0000000000
## [11] 0.0002604167
##
## $mids
## [1] 17.5 22.5 27.5 32.5 37.5 42.5 47.5 52.5 57.5 62.5 67.5
##
## $xname
## [1] "diabetes[[col]]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[7]]
## $breaks
## [1] 0.0 0.2 0.4 0.6 0.8 1.0 1.2 1.4 1.6 1.8 2.0 2.2 2.4 2.6
##
## $counts
## [1] 128 282 154 99 54 22 16 4 4 1 1 2 1
##
## $density
## [1] 0.833333333 1.835937500 1.002604167 0.644531250 0.351562500 0.143229167
## [7] 0.104166667 0.026041667 0.026041667 0.006510417 0.006510417 0.013020833
## [13] 0.006510417
##
## $mids
## [1] 0.1 0.3 0.5 0.7 0.9 1.1 1.3 1.5 1.7 1.9 2.1 2.3 2.5
##
## $xname
## [1] "diabetes[[col]]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[8]]
## $breaks
## [1] 20 25 30 35 40 45 50 55 60 65 70 75 80 85
##
## $counts
## [1] 267 150 81 76 76 37 31 23 14 11 1 0 1
##
## $density
## [1] 0.0695312500 0.0390625000 0.0210937500 0.0197916667 0.0197916667
## [6] 0.0096354167 0.0080729167 0.0059895833 0.0036458333 0.0028645833
## [11] 0.0002604167 0.0000000000 0.0002604167
##
## $mids
## [1] 22.5 27.5 32.5 37.5 42.5 47.5 52.5 57.5 62.5 67.5 72.5 77.5 82.5
##
## $xname
## [1] "diabetes[[col]]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[9]]
## $breaks
## [1] 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0
##
## $counts
## [1] 500 0 0 0 0 0 0 0 0 268
##
## $density
## [1] 6.510417 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
## [9] 0.000000 3.489583
##
## $mids
## [1] 0.05 0.15 0.25 0.35 0.45 0.55 0.65 0.75 0.85 0.95
##
## $xname
## [1] "diabetes[[col]]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
# Switch the plotting layout back to a single panel
par(mfrow = c(1, 1))
# Boxplots after cleaning: one horizontal boxplot per predictor on a
# 3 x 3 grid of panels.
par(mfrow = c(3, 3))
# lapply() is used only for its plotting side effect; invisible() stops the
# returned list of boxplot statistics from being printed to the console.
invisible(lapply(boxplot_cols, function(col) {
  boxplot(diabetes[[col]], main = paste("Boxplot of", col, "after cleaning"),
          col = "lightgreen", horizontal = TRUE)
}))
## [[1]]
## [[1]]$stats
## [,1]
## [1,] 0
## [2,] 1
## [3,] 3
## [4,] 6
## [5,] 13
##
## [[1]]$n
## [1] 768
##
## [[1]]$conf
## [,1]
## [1,] 2.714933
## [2,] 3.285067
##
## [[1]]$out
## [1] 15 17 14 14
##
## [[1]]$group
## [1] 1 1 1 1
##
## [[1]]$names
## [1] ""
##
##
## [[2]]
## [[2]]$stats
## [,1]
## [1,] 44.0
## [2,] 99.5
## [3,] 117.0
## [4,] 140.5
## [5,] 199.0
##
## [[2]]$n
## [1] 768
##
## [[2]]$conf
## [,1]
## [1,] 114.6625
## [2,] 119.3375
##
## [[2]]$out
## numeric(0)
##
## [[2]]$group
## numeric(0)
##
## [[2]]$names
## [1] ""
##
##
## [[3]]
## [[3]]$stats
## [,1]
## [1,] 40
## [2,] 64
## [3,] 72
## [4,] 80
## [5,] 104
##
## [[3]]$n
## [1] 768
##
## [[3]]$conf
## [,1]
## [1,] 71.08779
## [2,] 72.91221
##
## [[3]]$out
## [1] 30 110 108 122 30 110 108 110 24 38 106 106 106 114
##
## [[3]]$group
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## [[3]]$names
## [1] ""
##
##
## [[4]]
## [[4]]$stats
## [,1]
## [1,] 15
## [2,] 25
## [3,] 29
## [4,] 32
## [5,] 42
##
## [[4]]$n
## [1] 768
##
## [[4]]$conf
## [,1]
## [1,] 28.60091
## [2,] 29.39909
##
## [[4]]$out
## [1] 45 47 11 47 11 10 60 13 13 54 51 56 14 13 50 44 12 46 44 13 44 54 14 7 50
## [26] 52 10 44 43 45 14 10 11 12 43 13 12 48 43 43 8 13 14 12 49 46 46 11 8 12
## [51] 63 12 45 13 48 13 10 45 7 52 49 43 14 47 99 46 11 50 45 14 13 13 47 12 48
## [76] 43 46 46 45 10 46 49 11 13 46 44 48
##
## [[4]]$group
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [39] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [77] 1 1 1 1 1 1 1 1 1 1 1
##
## [[4]]$names
## [1] ""
##
##
## [[5]]
## [[5]]$stats
## [,1]
## [1,] 112.0
## [2,] 121.0
## [3,] 125.0
## [4,] 127.5
## [5,] 135.0
##
## [[5]]$n
## [1] 768
##
## [[5]]$conf
## [,1]
## [1,] 124.6294
## [2,] 125.3706
##
## [[5]]$out
## [1] 94 168 88 543 846 175 230 83 96 235 146 140 110 245 54 192 207 70
## [19] 240 82 36 23 300 342 304 110 142 38 100 90 140 270 71 71 110 176
## [37] 48 64 228 76 64 220 40 152 140 18 36 495 37 175 51 100 100 99
## [55] 94 145 168 225 49 140 50 92 325 63 284 204 155 485 94 53 105 285
## [73] 156 78 48 55 92 23 495 58 160 94 210 48 99 318 44 190 280 87
## [91] 175 271 478 190 56 32 744 53 370 37 45 192 88 176 194 680 402 55
## [109] 258 375 150 67 56 45 57 278 155 545 220 49 75 40 74 182 194 360
## [127] 215 184 42 105 148 180 205 148 96 85 94 64 140 231 29 168 156 68
## [145] 52 58 255 171 105 73 108 83 74 43 167 54 249 325 293 83 66 140
## [163] 465 89 66 94 158 325 84 75 72 82 182 59 110 50 285 81 196 415
## [181] 87 275 88 165 579 176 310 61 167 474 170 76 78 210 277 180 145 180
## [199] 85 60 50 14 70 92 64 63 95 210 105 71 237 60 56 49 105 36
## [217] 100 140 191 110 75 328 49 250 480 265 66 76 145 193 71 79 90 170
## [235] 76 210 86 105 165 326 66 82 105 188 106 65 56 210 155 215 190 56
## [253] 76 225 207 166 67 106 44 215 274 77 54 88 18 165 44 330 63 600
## [271] 156 140 230 185 25 293 41 272 182 158 194 321 144 15 160 54 90 183
## [289] 66 91 46 105 152 440 144 159 100 106 77 540 90 200 70 231 190 100
## [307] 168 49 240 265 45 105 205 180 180 95 480 155 200 100 335 160 387 22
## [325] 291 392 185 178 200 105 180 79 165 160 150 94 140 105 57 200 74 510
## [343] 110 16 180
##
## [[5]]$group
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [223] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [260] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [297] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [334] 1 1 1 1 1 1 1 1 1 1 1 1
##
## [[5]]$names
## [1] ""
##
##
## [[6]]
## [[6]]$stats
## [,1]
## [1,] 18.2
## [2,] 27.5
## [3,] 32.3
## [4,] 36.6
## [5,] 50.0
##
## [[6]]$n
## [1] 768
##
## [[6]]$conf
## [,1]
## [1,] 31.78118
## [2,] 32.81882
##
## [[6]]$out
## [1] 53.2 55.0 67.1 52.3 52.3 52.9 59.4 57.3
##
## [[6]]$group
## [1] 1 1 1 1 1 1 1 1
##
## [[6]]$names
## [1] ""
##
##
## [[7]]
## [[7]]$stats
## [,1]
## [1,] 0.0780
## [2,] 0.2435
## [3,] 0.3725
## [4,] 0.6265
## [5,] 1.1910
##
## [[7]]$n
## [1] 768
##
## [[7]]$conf
## [,1]
## [1,] 0.3506639
## [2,] 0.3943361
##
## [[7]]$out
## [1] 2.288 1.441 1.390 1.893 1.781 1.222 1.400 1.321 1.224 2.329 1.318 1.213
## [13] 1.353 1.224 1.391 1.476 2.137 1.731 1.268 1.600 2.420 1.251 1.699 1.258
## [25] 1.282 1.698 1.461 1.292 1.394
##
## [[7]]$group
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## [[7]]$names
## [1] ""
##
##
## [[8]]
## [[8]]$stats
## [,1]
## [1,] 21
## [2,] 24
## [3,] 29
## [4,] 41
## [5,] 66
##
## [[8]]$n
## [1] 768
##
## [[8]]$conf
## [,1]
## [1,] 28.03077
## [2,] 29.96923
##
## [[8]]$out
## [1] 69 67 72 81 67 67 70 68 69
##
## [[8]]$group
## [1] 1 1 1 1 1 1 1 1 1
##
## [[8]]$names
## [1] ""
# Bar chart of the Outcome class counts after cleaning
outcome_counts_clean <- table(diabetes$Outcome)
barplot(
  height = outcome_counts_clean,
  names.arg = c("No Diabetes", "Diabetes"),
  col = c("lightblue", "tomato"),
  main = "Count of Outcome after cleaning"
)
# Switch the plotting layout back to a single panel
par(mfrow = c(1, 1))
# Work on the web analytics table under the name df_clean for the
# column overview below
df_clean <- web_data
# Return up to five distinct values of `x`, in order of first appearance,
# joined into a single string with " | " between them.
sample_values <- function(x) {
  paste(utils::head(unique(x), n = 5), collapse = " | ")
}
# One row per column of df_clean: its name, the first element of its class
# vector, and a few example values.
summary_table <- data.frame(
  Column = names(df_clean),
  # vapply() guarantees a character vector even when df_clean has no
  # columns, which sapply() does not
  Type = vapply(df_clean, function(x) class(x)[1], character(1)),
  Sample_Values = vapply(df_clean, sample_values, character(1)),
  stringsAsFactors = FALSE
)
# Namespace-qualified like every other table in this file, so the call does
# not depend on DT being attached
DT::datatable(summary_table, caption = "Table X. Cleaned Dataset Column Information")