1. Load Data

# Packages used throughout: tidyverse (readr, dplyr, ggplot2), lubridate, caret, pROC
library(tidyverse)
library(lubridate)
library(caret)
library(pROC)

events   <- read_csv("events.csv")   # user_id, item_id, event, timestamp
features <- read_csv("features.csv") # item_id, bedrooms, bathrooms, floorarea

2. Clean and Prepare Data

# IDs as character
events <- events %>%
  mutate(
    user_id = as.character(user_id),
    item_id = as.character(item_id)
  )

features <- features %>%
  mutate(
    item_id   = as.character(item_id),
    bedrooms  = as.numeric(bedrooms),
    bathrooms = as.numeric(bathrooms),
    floorarea = as.numeric(floorarea)
  ) %>%
  distinct(item_id, .keep_all = TRUE) # one row per listing

# Bedrooms: negative counts are impossible; recode them to NA
features <- features %>%
  mutate(bedrooms = ifelse(bedrooms < 0, NA, bedrooms))

# Impute remaining NAs in each numeric column with its median
for (col in c("bedrooms", "bathrooms", "floorarea")) {
  med <- median(features[[col]], na.rm = TRUE)
  features[[col]][is.na(features[[col]])] <- med
}
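
A quick sanity check (optional, not part of the original pipeline) confirms the imputation left no missing values:

# Stops with an error if any NA survived the median imputation
stopifnot(!anyNA(features[c("bedrooms", "bathrooms", "floorarea")]))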

# Time features; note as_datetime() assumes UTC by default, so pass
# tz = ... if the raw timestamps are in local time
events <- events %>%
  mutate(
    ts   = as_datetime(timestamp),
    hour = hour(ts) # 0–23
  )

3. Model 1 – Timing Analysis (When Users Engage)

hourly_views <- events %>%
  filter(event == "VIEW") %>%
  count(hour, name = "n_views") %>%
  arrange(hour)

p_hour <- ggplot(hourly_views, aes(x = hour, y = n_views)) +
  geom_col(fill = "orange", alpha = 0.8, width = 0.7) +
  labs(
    title = "Views by Hour of Day",
    subtitle = "When do users engage with PropertyGuru?",
    x = "Hour of Day",
    y = "Total Views"
  ) +
  scale_x_continuous(breaks = 0:23) +
  theme_minimal(base_size = 12)

p_hour
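
Raw view counts can be dominated by a handful of heavy browsers. As an optional alternative (an assumption, not part of the original analysis), counting distinct active users per hour answers the same question in terms of reach:

# Alternative: distinct users active per hour, so a few heavy
# viewers do not dominate the hourly profile
hourly_users <- events %>%
  filter(event == "VIEW") %>%
  distinct(user_id, hour) %>%
  count(hour, name = "n_users")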

4. Model 2 – User Clustering (Personas)

# Join views with property attributes
events_joined <- events %>%
  filter(event == "VIEW") %>%
  inner_join(
    features %>% select(item_id, bedrooms, floorarea),
    by = "item_id"
  )

# Aggregate to user level; keep users with at least 3 views so the
# per-user averages are reasonably stable
user_agg <- events_joined %>%
  group_by(user_id) %>%
  summarise(
    total_views          = n(),
    avg_bedrooms_viewed  = mean(bedrooms, na.rm = TRUE),
    avg_floorarea_viewed = mean(floorarea, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(total_views >= 3)

# Scale and k-means
user_scaled <- user_agg %>%
  select(total_views, avg_bedrooms_viewed, avg_floorarea_viewed) %>%
  scale()
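
The choice of k = 3 is assumed here; a quick elbow check (a sketch, not part of the original pipeline) can sanity-check it before the final fit:

# Elbow check: total within-cluster SS for k = 1..8; look for the
# "bend" where adding clusters stops paying off
set.seed(123)
wss <- sapply(1:8, function(k) kmeans(user_scaled, centers = k, nstart = 25)$tot.withinss)
plot(1:8, wss, type = "b", xlab = "k", ylab = "Total within-cluster SS")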

set.seed(123)
km3 <- kmeans(user_scaled, centers = 3, nstart = 25)

# Label clusters; k-means cluster IDs are arbitrary, so the A/B/C
# labels carry no inherent ordering
user_agg$persona <- factor(
  km3$cluster,
  labels = c("Persona A", "Persona B", "Persona C")
)

persona_summary <- user_agg %>%
  group_by(persona) %>%
  summarise(
    n_users      = n(),
    avg_views    = mean(total_views),
    avg_bedrooms = mean(avg_bedrooms_viewed),
    avg_floor    = mean(avg_floorarea_viewed),
    .groups = "drop"
  )

persona_summary
## # A tibble: 3 × 5
##   persona   n_users avg_views avg_bedrooms avg_floor
##   <fct>       <int>     <dbl>        <dbl>     <dbl>
## 1 Persona A       1      21           2.05   115995.
## 2 Persona B     685      27.2         2.54     1101.
## 3 Persona C     406      46.1         3.70     2877.
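
Persona A is a single user whose average floor area (~116,000) dwarfs the other clusters, i.e. one extreme listing has claimed its own cluster. A hedged fix (a sketch under that assumption, not the original pipeline) is to cap the floor-area feature before scaling:

# Hypothetical fix: winsorize avg_floorarea_viewed at its 99th percentile
# so a single extreme value cannot dominate the clustering
cap <- quantile(user_agg$avg_floorarea_viewed, 0.99)
user_scaled_capped <- user_agg %>%
  mutate(avg_floorarea_viewed = pmin(avg_floorarea_viewed, cap)) %>%
  select(total_views, avg_bedrooms_viewed, avg_floorarea_viewed) %>%
  scale()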

p_persona <- ggplot(user_agg,
                    aes(x = avg_bedrooms_viewed,
                        y = avg_floorarea_viewed,
                        color = persona)) +
  geom_point(alpha = 0.6) +
  labs(
    title = "User Personas from K-means Clustering",
    x = "Average Bedrooms Viewed",
    y = "Average Floor Area Viewed",
    color = "Persona"
  ) +
  theme_minimal(base_size = 12)

p_persona

5. Model 3 – Popularity Prediction (Logistic Regression)

# Views per listing
view_counts <- events %>%
  filter(event == "VIEW") %>%
  count(item_id, name = "viewcount")

# Join with features; listings with no events get viewcount = 0
features_model <- features %>%
  left_join(view_counts, by = "item_id") %>%
  mutate(viewcount = ifelse(is.na(viewcount), 0L, viewcount))

# Popular = at or above the 75th percentile of viewcount; because of
# ties at the threshold, the positive class can exceed 25% of listings
q75 <- quantile(features_model$viewcount, 0.75, na.rm = TRUE)

features_model <- features_model %>%
  mutate(
    popular = ifelse(viewcount >= q75, 1L, 0L),
    popular = factor(popular, levels = c(0, 1))
  )
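
An optional check of the resulting class balance (not in the original code) makes the effect of those ties visible:

# Share of listings in each class; expect > 25% positive when many
# listings tie at the q75 viewcount
prop.table(table(features_model$popular))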

prop_data <- features_model %>%
  select(bedrooms, bathrooms, floorarea, popular)

set.seed(123)
idx <- createDataPartition(prop_data$popular, p = 0.75, list = FALSE) # stratified by class
train_df <- prop_data[idx, ]
test_df  <- prop_data[-idx, ]

logit_fit <- glm(
  popular ~ bedrooms + bathrooms + floorarea,
  data = train_df,
  family = binomial(link = "logit")
)

summary(logit_fit)
## 
## Call:
## glm(formula = popular ~ bedrooms + bathrooms + floorarea, family = binomial(link = "logit"), 
##     data = train_df)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.661e-01  4.046e-02 -13.991   <2e-16 ***
## bedrooms    -1.641e-02  2.052e-02  -0.800    0.424    
## bathrooms   -2.659e-02  1.919e-02  -1.385    0.166    
## floorarea   -1.199e-06  1.277e-06  -0.939    0.348    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 22412  on 17566  degrees of freedom
## Residual deviance: 22398  on 17563  degrees of freedom
## AIC: 22406
## 
## Number of Fisher Scoring iterations: 4

odds_ratios <- exp(coef(logit_fit))
odds_ratios
## (Intercept)    bedrooms   bathrooms   floorarea 
##   0.5677246   0.9837219   0.9737628   0.9999988
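
None of the coefficients is statistically significant, and the odds ratios sit near 1. Expressed as approximate percent change in the odds of being popular per unit increase (a worked reading of the output above, not new results):

# e.g. each extra bedroom multiplies the odds of "popular" by ~0.98,
# i.e. roughly a -1.6% change in the odds
round((odds_ratios - 1) * 100, 2)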

test_df$prob_popular <- predict(
  logit_fit,
  newdata = test_df,
  type = "response"
)

threshold <- 0.5
test_df$pred_popular <- ifelse(test_df$prob_popular >= threshold, 1L, 0L)
test_df$pred_popular <- factor(test_df$pred_popular, levels = c(0, 1))

conf_mat <- table(
  Actual    = test_df$popular,
  Predicted = test_df$pred_popular
)
conf_mat
##       Predicted
## Actual    0    1
##      0 3892    0
##      1 1963    0

accuracy <- sum(diag(conf_mat)) / sum(conf_mat)
accuracy
## [1] 0.664731
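
The confusion matrix shows the model never predicts the positive class at the 0.5 threshold, so the 0.665 accuracy is exactly the majority-class share (3892 / 5855). A no-information baseline (optional check) makes this explicit:

# Baseline: always predict the majority class ("not popular");
# matches the model's accuracy because it makes the same predictions
max(table(test_df$popular)) / nrow(test_df)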

roc_obj <- roc(
  response  = test_df$popular,
  predictor = test_df$prob_popular
)
auc_val <- auc(roc_obj)
auc_val
## Area under the curve: 0.5141
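
An AUC of 0.514 is barely above chance (0.5), suggesting that bedrooms, bathrooms, and floor area alone carry little signal for listing popularity. Plotting the curve (optional) makes this visible:

# ROC curve hugs the diagonal at AUC ~ 0.51
plot(roc_obj, main = "ROC - Popularity Model")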