# Setup environment
knitr::opts_chunk$set(echo=TRUE, warning=FALSE, message=FALSE)
library(tidyverse)
library(ggplot2)
library(reshape2)
library(gridExtra)
This project aims to analyze an advertisement dataset. We clean and visualize the data, then apply KMeans Clustering, a method not taught in class, to discover hidden structures in the data.
data <- read_csv("add.csv")
data <- data %>% select(-1)
# Rename columns
data <- data %>% rename(
height = `0`,
width = `1`,
`ratio(width/height)` = `2`,
local = `3`,
`ad status` = `1558`
)
# Preview data
head(data)
## # A tibble: 6 × 1,559
## height width `ratio(width/height)` local `4` `5` `6` `7` `8` `9`
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 125 125 1 1 0 0 0 0 0 0
## 2 57 468 8.2105 1 0 0 0 0 0 0
## 3 33 230 6.9696 1 0 0 0 0 0 0
## 4 60 468 7.8 1 0 0 0 0 0 0
## 5 60 468 7.8 1 0 0 0 0 0 0
## 6 60 468 7.8 1 0 0 0 0 0 0
## # ℹ 1,549 more variables: `10` <dbl>, `11` <dbl>, `12` <dbl>, `13` <dbl>,
## # `14` <dbl>, `15` <dbl>, `16` <dbl>, `17` <dbl>, `18` <dbl>, `19` <dbl>,
## # `20` <dbl>, `21` <dbl>, `22` <dbl>, `23` <dbl>, `24` <dbl>, `25` <dbl>,
## # `26` <dbl>, `27` <dbl>, `28` <dbl>, `29` <dbl>, `30` <dbl>, `31` <dbl>,
## # `32` <dbl>, `33` <dbl>, `34` <dbl>, `35` <dbl>, `36` <dbl>, `37` <dbl>,
## # `38` <dbl>, `39` <dbl>, `40` <dbl>, `41` <dbl>, `42` <dbl>, `43` <dbl>,
## # `44` <dbl>, `45` <dbl>, `46` <dbl>, `47` <dbl>, `48` <dbl>, `49` <dbl>, …
data <- data[, -c(5:1558)]
# Replace '?' with NA
data_new <- data %>% mutate(across(everything(), ~ ifelse(. == "?", NA, .)))
# Convert types to numeric
data_new <- data_new %>% mutate(across(c(height, width, `ratio(width/height)`), as.numeric))
# Encode 'ad.' as 1 and 'nonad.' as 0
data_encoded <- data_new
data_encoded[[4]] <- ifelse(data_encoded[[4]] == "ad.", 1, 0)
# Compute correlation matrix
num_data <- data_encoded %>% select(where(is.numeric))
num_data_clean <- num_data %>% select(where(~ sd(., na.rm=TRUE) != 0))
correlation_table <- cor(num_data_clean, use="complete.obs")
melted_corr <- melt(correlation_table)
# Plot heatmap
ggplot(data=melted_corr, aes(Var1, Var2, fill=value)) +
geom_tile() +
geom_text(aes(label=round(value, 2)), color="black") +
scale_fill_gradient2(low="blue", high="red", mid="white", midpoint=0) +
theme_minimal() +
ggtitle("Correlation Matrix")
# Impute with median
median_vals <- apply(data_encoded[, 1:3], 2, median, na.rm=TRUE)
datacopy <- data_encoded %>% mutate(
height = ifelse(is.na(height), median_vals[1], height),
width = ifelse(is.na(width), median_vals[2], width),
`ratio(width/height)` = ifelse(is.na(`ratio(width/height)`), median_vals[3], `ratio(width/height)`)
)
# Impute with mean
mean_vals <- colMeans(data_encoded[, 1:3], na.rm=TRUE)
data_encoded <- data_encoded %>% mutate(
height = ifelse(is.na(height), mean_vals[1], height),
width = ifelse(is.na(width), mean_vals[2], width),
`ratio(width/height)` = ifelse(is.na(`ratio(width/height)`), mean_vals[3], `ratio(width/height)`)
)
# Compare imputation methods via histogram
p1 <- ggplot() +
geom_histogram(aes(x=data_encoded$height), fill="blue", alpha=0.5, bins=20) +
geom_histogram(aes(x=datacopy$height), fill="red", alpha=0.5, bins=20) +
labs(title="Height Comparison", x="Height", y="Count") +
theme_minimal()
p2 <- ggplot() +
geom_histogram(aes(x=data_encoded$width), fill="red", alpha=0.5, bins=20) +
geom_histogram(aes(x=datacopy$width), fill="orange", alpha=0.5, bins=20) +
labs(title="Width Comparison", x="Width", y="Count") +
theme_minimal()
p3 <- ggplot() +
geom_histogram(aes(x=data_encoded$`ratio(width/height)`), fill="blue", alpha=0.5, bins=20) +
geom_histogram(aes(x=datacopy$`ratio(width/height)`), fill="pink", alpha=0.5, bins=20) +
labs(title="Ratio Comparison", x="Ratio", y="Count") +
theme_minimal()
grid.arrange(p1, p2, p3, ncol=3)
# Clean final dataset
df1 <- data_encoded %>%
na.omit() %>%
distinct()
# Summary stats
summary(df1)
## height width ratio(width/height) local
## Min. : 1.00 Min. : 1.0 Min. : 0.0015 Min. :0
## 1st Qu.: 35.00 1st Qu.: 80.5 1st Qu.: 1.0000 1st Qu.:0
## Median : 63.00 Median :120.0 Median : 1.5614 Median :0
## Mean : 81.58 Mean :145.0 Mean : 2.8893 Mean :0
## 3rd Qu.:108.00 3rd Qu.:187.5 3rd Qu.: 3.7554 3rd Qu.:0
## Max. :640.00 Max. :640.0 Max. :60.0000 Max. :0
## ad status
## Length:991
## Class :character
## Mode :character
##
##
##
# Plot histograms
p11 <- ggplot(df1, aes(x=height)) + geom_histogram(fill="blue", bins=20) + ggtitle("Height")
p22 <- ggplot(df1, aes(x=width)) + geom_histogram(fill="red", bins=20) + ggtitle("Width")
p33 <- ggplot(df1, aes(x=`ratio(width/height)`)) + geom_histogram(fill="green", bins=20) + ggtitle("Ratio")
grid.arrange(p11, p22, p33, ncol=3)
# Plot boxplots
p4 <- ggplot(df1, aes(y=height)) + geom_boxplot(fill="cyan") + ggtitle("Boxplot: Height")
p5 <- ggplot(df1, aes(y=width)) + geom_boxplot(fill="red") + ggtitle("Boxplot: Width")
p6 <- ggplot(df1, aes(y=`ratio(width/height)`)) + geom_boxplot(fill="green") + ggtitle("Boxplot: Ratio")
grid.arrange(p4, p5, p6, ncol=3)
# Apply KMeans clustering
set.seed(42)
df_kmeans <- df1 %>% select(height, width, `ratio(width/height)`)
kmeans_result <- kmeans(df_kmeans, centers=2)
# Add cluster labels
data_clustered <- df1 %>% mutate(cluster=as.factor(kmeans_result$cluster))
# Plot clusters
ggplot(data_clustered, aes(x=width, y=height, color=cluster)) +
geom_point(alpha=0.6, size=2) +
labs(title="KMeans Clustering (k=2)", x="Width", y="Height")