# Setup environment
knitr::opts_chunk$set(echo=TRUE, warning=FALSE, message=FALSE)
library(tidyverse)
library(ggplot2)
library(reshape2)
library(gridExtra)

1 Introduction

This project aims to analyze an advertisement dataset. We clean and visualize the data, then apply KMeans Clustering, a method not taught in class, to discover hidden structures in the data.

2 Data Description

2.1 Load and inspect the dataset

data <- read_csv("add.csv")
data <- data %>% select(-1)

# Rename columns
data <- data %>% rename(
  height = `0`,
  width = `1`,
  `ratio(width/height)` = `2`,
  local = `3`,
  `ad status` = `1558`
)

# Preview data
head(data)
## # A tibble: 6 × 1,559
##   height width `ratio(width/height)` local   `4`   `5`   `6`   `7`   `8`   `9`
##   <chr>  <chr> <chr>                 <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 125    125   1                     1         0     0     0     0     0     0
## 2 57     468   8.2105                1         0     0     0     0     0     0
## 3 33     230   6.9696                1         0     0     0     0     0     0
## 4 60     468   7.8                   1         0     0     0     0     0     0
## 5 60     468   7.8                   1         0     0     0     0     0     0
## 6 60     468   7.8                   1         0     0     0     0     0     0
## # ℹ 1,549 more variables: `10` <dbl>, `11` <dbl>, `12` <dbl>, `13` <dbl>,
## #   `14` <dbl>, `15` <dbl>, `16` <dbl>, `17` <dbl>, `18` <dbl>, `19` <dbl>,
## #   `20` <dbl>, `21` <dbl>, `22` <dbl>, `23` <dbl>, `24` <dbl>, `25` <dbl>,
## #   `26` <dbl>, `27` <dbl>, `28` <dbl>, `29` <dbl>, `30` <dbl>, `31` <dbl>,
## #   `32` <dbl>, `33` <dbl>, `34` <dbl>, `35` <dbl>, `36` <dbl>, `37` <dbl>,
## #   `38` <dbl>, `39` <dbl>, `40` <dbl>, `41` <dbl>, `42` <dbl>, `43` <dbl>,
## #   `44` <dbl>, `45` <dbl>, `46` <dbl>, `47` <dbl>, `48` <dbl>, `49` <dbl>, …

3 Data Cleaning and Preprocessing

3.1 Drop unused columns

data <- data[, -c(5:1558)]

# Replace '?' with NA
data_new <- data %>% mutate(across(everything(), ~ ifelse(. == "?", NA, .)))

# Convert types to numeric
data_new <- data_new %>% mutate(across(c(height, width, `ratio(width/height)`), as.numeric))

# Encode 'ad.' as 1 and 'nonad.' as 0
data_encoded <- data_new
data_encoded[[4]] <- ifelse(data_encoded[[4]] == "ad.", 1, 0)

3.2 Correlation Matrix

# Compute correlation matrix
num_data <- data_encoded %>% select(where(is.numeric))
num_data_clean <- num_data %>% select(where(~ sd(., na.rm=TRUE) != 0))
correlation_table <- cor(num_data_clean, use="complete.obs")
melted_corr <- melt(correlation_table)

# Plot heatmap
ggplot(data=melted_corr, aes(Var1, Var2, fill=value)) +
  geom_tile() +
  geom_text(aes(label=round(value, 2)), color="black") +
  scale_fill_gradient2(low="blue", high="red", mid="white", midpoint=0) +
  theme_minimal() +
  ggtitle("Correlation Matrix")

3.3 Handle Missing Values

# Impute with median
median_vals <- apply(data_encoded[, 1:3], 2, median, na.rm=TRUE)
datacopy <- data_encoded %>% mutate(
  height = ifelse(is.na(height), median_vals[1], height),
  width = ifelse(is.na(width), median_vals[2], width),
  `ratio(width/height)` = ifelse(is.na(`ratio(width/height)`), median_vals[3], `ratio(width/height)`)
)

# Impute with mean
mean_vals <- colMeans(data_encoded[, 1:3], na.rm=TRUE)
data_encoded <- data_encoded %>% mutate(
  height = ifelse(is.na(height), mean_vals[1], height),
  width = ifelse(is.na(width), mean_vals[2], width),
  `ratio(width/height)` = ifelse(is.na(`ratio(width/height)`), mean_vals[3], `ratio(width/height)`)
)

3.3.1 Compare Mean vs Median Imputation

# Compare imputation methods via histogram
p1 <- ggplot() +
  geom_histogram(aes(x=data_encoded$height), fill="blue", alpha=0.5, bins=20) +
  geom_histogram(aes(x=datacopy$height), fill="red", alpha=0.5, bins=20) +
  labs(title="Height Comparison", x="Height", y="Count") +
  theme_minimal()

p2 <- ggplot() +
  geom_histogram(aes(x=data_encoded$width), fill="red", alpha=0.5, bins=20) +
  geom_histogram(aes(x=datacopy$width), fill="orange", alpha=0.5, bins=20) +
  labs(title="Width Comparison", x="Width", y="Count") +
  theme_minimal()

p3 <- ggplot() +
  geom_histogram(aes(x=data_encoded$`ratio(width/height)`), fill="blue", alpha=0.5, bins=20) +
  geom_histogram(aes(x=datacopy$`ratio(width/height)`), fill="pink", alpha=0.5, bins=20) +
  labs(title="Ratio Comparison", x="Ratio", y="Count") +
  theme_minimal()

grid.arrange(p1, p2, p3, ncol=3)

3.4 Remove NA and Duplicates

# Clean final dataset
df1 <- data_encoded %>% 
  na.omit() %>% 
  distinct()

4 Descriptive Statistics

# Summary stats
summary(df1)
##      height           width       ratio(width/height)     local  
##  Min.   :  1.00   Min.   :  1.0   Min.   : 0.0015     Min.   :0  
##  1st Qu.: 35.00   1st Qu.: 80.5   1st Qu.: 1.0000     1st Qu.:0  
##  Median : 63.00   Median :120.0   Median : 1.5614     Median :0  
##  Mean   : 81.58   Mean   :145.0   Mean   : 2.8893     Mean   :0  
##  3rd Qu.:108.00   3rd Qu.:187.5   3rd Qu.: 3.7554     3rd Qu.:0  
##  Max.   :640.00   Max.   :640.0   Max.   :60.0000     Max.   :0  
##   ad status        
##  Length:991        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

4.1 Histograms

# Plot histograms
p11 <- ggplot(df1, aes(x=height)) + geom_histogram(fill="blue", bins=20) + ggtitle("Height")
p22 <- ggplot(df1, aes(x=width)) + geom_histogram(fill="red", bins=20) + ggtitle("Width")
p33 <- ggplot(df1, aes(x=`ratio(width/height)`)) + geom_histogram(fill="green", bins=20) + ggtitle("Ratio")

grid.arrange(p11, p22, p33, ncol=3)

4.2 Boxplots

# Plot boxplots
p4 <- ggplot(df1, aes(y=height)) + geom_boxplot(fill="cyan") + ggtitle("Boxplot: Height")
p5 <- ggplot(df1, aes(y=width)) + geom_boxplot(fill="red") + ggtitle("Boxplot: Width")
p6 <- ggplot(df1, aes(y=`ratio(width/height)`)) + geom_boxplot(fill="green") + ggtitle("Boxplot: Ratio")

grid.arrange(p4, p5, p6, ncol=3)

5 Statistical Method: KMeans Clustering

# Apply KMeans clustering
set.seed(42)
df_kmeans <- df1 %>% select(height, width, `ratio(width/height)`)
kmeans_result <- kmeans(df_kmeans, centers=2)

# Add cluster labels
data_clustered <- df1 %>% mutate(cluster=as.factor(kmeans_result$cluster))

# Plot clusters
ggplot(data_clustered, aes(x=width, y=height, color=cluster)) +
  geom_point(alpha=0.6, size=2) +
  labs(title="KMeans Clustering (k=2)", x="Width", y="Height")

5.1 KMeans Interpretation

  • We applied KMeans with k=2 to group the observations based on numerical features.
  • The clusters show distinct groups likely associated with ad vs non-ad, or size-based categorization.

6 Conclusion