1. Load Data
library(tidyverse)   # readr, dplyr, ggplot2, %>%
library(lubridate)   # as_datetime(), hour()
library(caret)       # createDataPartition()
library(pROC)        # roc(), auc()

events   <- read_csv("events.csv")    # user_id, item_id, event, timestamp
features <- read_csv("features.csv")  # item_id, bedrooms, bathrooms, floorarea
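Before cleaning, a quick check that both files actually contain the expected columns; a minimal sketch, assuming the layouts noted in the comments above:

# Sketch: confirm the expected columns are present
stopifnot(
  all(c("user_id", "item_id", "event", "timestamp") %in% names(events)),
  all(c("item_id", "bedrooms", "bathrooms", "floorarea") %in% names(features))
)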
2. Clean and Prepare Data
# IDs as character (they are identifiers, not numbers)
events <- events %>%
  mutate(
    user_id = as.character(user_id),
    item_id = as.character(item_id)
  )

# Same for listings; coerce attributes to numeric and drop duplicate item_ids
features <- features %>%
  mutate(
    item_id   = as.character(item_id),
    bedrooms  = as.numeric(bedrooms),
    bathrooms = as.numeric(bathrooms),
    floorarea = as.numeric(floorarea)
  ) %>%
  distinct(item_id, .keep_all = TRUE)
# Fix impossible bedroom counts, then impute medians for all three numerics
features <- features %>%
  mutate(bedrooms = ifelse(bedrooms < 0, NA, bedrooms))

for (col in c("bedrooms", "bathrooms", "floorarea")) {
  med <- median(features[[col]], na.rm = TRUE)
  features[[col]][is.na(features[[col]])] <- med
}
# Time features: parse timestamps and extract the hour of day
events <- events %>%
  mutate(
    ts   = as_datetime(timestamp),
    hour = hour(ts)  # 0–23
  )
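It is worth confirming that as_datetime() parsed the raw timestamps sensibly before building anything on top of hour; a quick check:

# Sketch: plausible time range, plus any timestamps that failed to parse
range(events$ts, na.rm = TRUE)
sum(is.na(events$ts))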
3. Model 1 – Timing Analysis (When Users Engage)
hourly_views <- events %>%
  filter(event == "VIEW") %>%
  count(hour, name = "n_views") %>%
  arrange(hour)

p_hour <- ggplot(hourly_views, aes(x = hour, y = n_views)) +
  geom_col(fill = "orange", alpha = 0.8, width = 0.7) +
  labs(
    title    = "Views by Hour of Day",
    subtitle = "When do users engage with PropertyGuru?",
    x        = "Hour of Day",
    y        = "Total Views"
  ) +
  scale_x_continuous(breaks = 0:23) +
  theme_minimal(base_size = 12)
p_hour
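To read the peak engagement hours off programmatically rather than from the chart, a small sketch on the hourly_views table above:

# Sketch: top 3 hours by total views
hourly_views %>%
  slice_max(n_views, n = 3)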

4. Model 2 – User Clustering (Personas)
# Join views with property attributes
events_joined <- events %>%
  filter(event == "VIEW") %>%
  inner_join(
    features %>% select(item_id, bedrooms, floorarea),
    by = "item_id"
  )
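Note that inner_join() silently drops any VIEW event whose item_id has no row in features; a quick count of how much falls through:

# Sketch: VIEW events with no matching listing in features
events %>%
  filter(event == "VIEW") %>%
  anti_join(features, by = "item_id") %>%
  nrow()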
# Aggregate to user level; keep users with at least 3 views
user_agg <- events_joined %>%
  group_by(user_id) %>%
  summarise(
    total_views          = n(),
    avg_bedrooms_viewed  = mean(bedrooms, na.rm = TRUE),
    avg_floorarea_viewed = mean(floorarea, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(total_views >= 3)
# Scale features and run k-means with 3 clusters
user_scaled <- user_agg %>%
  select(total_views, avg_bedrooms_viewed, avg_floorarea_viewed) %>%
  scale()

set.seed(123)
km3 <- kmeans(user_scaled, centers = 3, nstart = 25)

user_agg$persona <- factor(
  km3$cluster,
  labels = c("Persona A", "Persona B", "Persona C")
)
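The choice of centers = 3 is fixed here; a common sanity check is an elbow plot of total within-cluster sum of squares over a range of k. A minimal sketch on the same scaled matrix:

# Sketch: elbow method over k = 1..8
set.seed(123)
wss <- sapply(1:8, function(k) {
  kmeans(user_scaled, centers = k, nstart = 25)$tot.withinss
})
plot(1:8, wss, type = "b", xlab = "k", ylab = "Total within-cluster SS")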
persona_summary <- user_agg %>%
  group_by(persona) %>%
  summarise(
    n_users      = n(),
    avg_views    = mean(total_views),
    avg_bedrooms = mean(avg_bedrooms_viewed),
    avg_floor    = mean(avg_floorarea_viewed),
    .groups = "drop"
  )
persona_summary
## # A tibble: 3 × 5
##   persona   n_users avg_views avg_bedrooms avg_floor
##   <fct>       <int>     <dbl>        <dbl>     <dbl>
## 1 Persona A       1      21           2.05   115995.
## 2 Persona B     685      27.2         2.54     1101.
## 3 Persona C     406      46.1         3.70     2877.
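Persona A is a single user whose average floor area (≈116,000) dwarfs everyone else's, so k-means has spent an entire cluster on one outlier. One option is to cap extreme floor areas before scaling and re-clustering; a sketch, with the 99th percentile as an assumed cutoff (the scatter below is still drawn on the uncapped data):

# Sketch: winsorize floor area at the 99th percentile, then redo scale()/kmeans()
cap <- quantile(user_agg$avg_floorarea_viewed, 0.99)
user_agg_capped <- user_agg %>%
  mutate(avg_floorarea_viewed = pmin(avg_floorarea_viewed, cap))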
p_persona <- ggplot(user_agg,
                    aes(x = avg_bedrooms_viewed,
                        y = avg_floorarea_viewed,
                        color = persona)) +
  geom_point(alpha = 0.6) +
  labs(
    title = "User Personas from K-means Clustering",
    x     = "Average Bedrooms Viewed",
    y     = "Average Floor Area Viewed",
    color = "Persona"
  ) +
  theme_minimal(base_size = 12)
p_persona

5. Model 3 – Popularity Prediction (Logistic Regression)
# Views per listing
view_counts <- events %>%
  filter(event == "VIEW") %>%
  count(item_id, name = "viewcount")

# Join with features; listings that were never viewed get a count of 0
features_model <- features %>%
  left_join(view_counts, by = "item_id") %>%
  mutate(viewcount = ifelse(is.na(viewcount), 0L, viewcount))

# Label a listing "popular" if it sits in the top 25% by view count
q75 <- quantile(features_model$viewcount, 0.75, na.rm = TRUE)
features_model <- features_model %>%
  mutate(
    popular = ifelse(viewcount >= q75, 1L, 0L),
    popular = factor(popular, levels = c(0, 1))
  )
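Because viewcount is discrete, many listings can tie exactly at the 75th-percentile cutoff, so the >= q75 rule can label well over 25% of listings as popular. Checking the realized positive rate is worthwhile before modeling:

# Sketch: realized share of 'popular' listings
mean(features_model$popular == "1")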
prop_data <- features_model %>%
  select(bedrooms, bathrooms, floorarea, popular)

set.seed(123)
idx      <- createDataPartition(prop_data$popular, p = 0.75, list = FALSE)
train_df <- prop_data[idx, ]
test_df  <- prop_data[-idx, ]

logit_fit <- glm(
  popular ~ bedrooms + bathrooms + floorarea,
  data   = train_df,
  family = binomial(link = "logit")
)
summary(logit_fit)
## 
## Call:
## glm(formula = popular ~ bedrooms + bathrooms + floorarea, family = binomial(link = "logit"), 
##     data = train_df)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.661e-01  4.046e-02 -13.991   <2e-16 ***
## bedrooms    -1.641e-02  2.052e-02  -0.800    0.424    
## bathrooms   -2.659e-02  1.919e-02  -1.385    0.166    
## floorarea   -1.199e-06  1.277e-06  -0.939    0.348    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 22412  on 17566  degrees of freedom
## Residual deviance: 22398  on 17563  degrees of freedom
## AIC: 22406
## 
## Number of Fisher Scoring iterations: 4
odds_ratios <- exp(coef(logit_fit))
odds_ratios
## (Intercept)    bedrooms   bathrooms   floorarea 
##   0.5677246   0.9837219   0.9737628   0.9999988
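Per-unit odds ratios this close to 1 are hard to read, particularly for floorarea, where one unit is a single square foot (assumed unit). Rescaling to a 100-unit step makes the (lack of) effect clearer:

# Sketch: odds ratio for a 100-unit increase in floor area
exp(coef(logit_fit)["floorarea"] * 100)
# ≈ exp(-1.199e-06 * 100) ≈ 0.99988, i.e. essentially no effect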
test_df$prob_popular <- predict(
  logit_fit,
  newdata = test_df,
  type    = "response"
)

threshold <- 0.5
test_df$pred_popular <- ifelse(test_df$prob_popular >= threshold, 1L, 0L)
test_df$pred_popular <- factor(test_df$pred_popular, levels = c(0, 1))

conf_mat <- table(
  Actual    = test_df$popular,
  Predicted = test_df$pred_popular
)
conf_mat
##       Predicted
## Actual    0    1
##      0 3892    0
##      1 1963    0
accuracy <- sum(diag(conf_mat)) / sum(conf_mat)
accuracy
## [1] 0.664731
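The confusion matrix shows the model never predicts the positive class at the 0.5 cutoff, so this 0.665 accuracy is simply the majority-class share (3892 / 5855). A baseline comparison makes that explicit:

# Sketch: accuracy of always predicting 'not popular'
mean(test_df$popular == "0")
# matches the reported accuracy, so the model adds nothing at this threshold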
roc_obj <- roc(
  response  = test_df$popular,
  predictor = test_df$prob_popular
)
auc_val <- auc(roc_obj)
auc_val
## Area under the curve: 0.5141
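An AUC of 0.514 is barely above chance (0.5): bedrooms, bathrooms, and floor area alone carry almost no signal about which listings become popular. The near-diagonal curve can be inspected with pROC's plot method:

# Sketch: visualize the ROC curve with the AUC printed on it
plot(roc_obj, print.auc = TRUE)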