# Load the raw customer data set and preview its first rows.
# Replace the path below with the location of the file on your machine.
data <- read.csv(file = "C:/Users/UsEr/Downloads/clv_data.csv")
print(head(data, n = 6L))
##   X id age gender income days_on_platform          city purchases
## 1 0  0  NA   Male 126895               14 San Francisco         0
## 2 1  1  NA   Male 161474               14         Tokyo         0
## 3 2  2  24   Male 104723               34        London         1
## 4 3  3  29   Male  43791               28        London         2
## 5 4  4  18 Female 132181               26        London         2
## 6 5  5  23   Male  12315               14 New York City         0
# Drop the `X` and `id` columns (both look like row identifiers in the
# preview above) so they are not carried into later processing.
#
# Columns are selected by name with `[` instead of subset(): subset()
# evaluates `select` non-standardly, so a column that is absent from the
# frame would be looked up in the calling environment and either fail or
# silently resolve to an unrelated variable. setdiff() simply keeps every
# column that is not listed, whether or not the listed ones are present.
data <- data[, setdiff(names(data), c("X", "id")), drop = FALSE]
print(head(data))
##   age gender income days_on_platform          city purchases
## 1  NA   Male 126895               14 San Francisco         0
## 2  NA   Male 161474               14         Tokyo         0
## 3  24   Male 104723               34        London         1
## 4  29   Male  43791               28        London         2
## 5  18 Female 132181               26        London         2
## 6  23   Male  12315               14 New York City         0
# Fill missing ages and platform tenures with the median of the observed
# values of the respective column.
data$age <- replace(
  data$age,
  is.na(data$age),
  median(data$age, na.rm = TRUE)
)
data$days_on_platform <- replace(
  data$days_on_platform,
  is.na(data$days_on_platform),
  median(data$days_on_platform, na.rm = TRUE)
)

# Store the categorical columns as factors.
data[c("gender", "city")] <- lapply(data[c("gender", "city")], as.factor)
# Min-max scale a numeric vector onto the [0, 1] interval.
#
# Args:
#   x: A numeric vector; may contain NA.
#
# Returns:
#   A vector of the same length as `x`. Missing values are ignored when the
#   range is computed and remain NA in the result. If all observed values
#   are equal, the observed entries map to 0 (instead of NaN from 0 / 0).
#   An empty or all-NA vector is returned unchanged.
normalize <- function(x) {
  observed <- x[!is.na(x)]
  if (length(observed) == 0L) {
    # Nothing to scale; also avoids the min()/max() warnings on empty input.
    return(x)
  }

  lo <- min(observed)
  span <- max(observed) - lo
  if (span == 0) {
    # Constant input: every observed value equals `lo`, so this yields 0
    # while keeping NA positions intact.
    return(x - lo)
  }

  (x - lo) / span
}
# Rescale the numeric columns with normalize() in a single pass, then
# preview the transformed frame.
data[c("income", "days_on_platform", "purchases")] <- lapply(
  data[c("income", "days_on_platform", "purchases")],
  normalize
)
print(head(data, n = 6L))
##   age gender    income days_on_platform          city purchases
## 1  30   Male 0.3265606        0.1181818 San Francisco 0.0000000
## 2  30   Male 0.4155515        0.1181818         Tokyo 0.0000000
## 3  24   Male 0.2694998        0.3000000        London 0.1666667
## 4  29   Male 0.1126881        0.2454545        London 0.3333333
## 5  18 Female 0.3401644        0.2272727        London 0.3333333
## 6  23   Male 0.0316830        0.1181818 New York City 0.0000000
# Persist the processed frame to the working directory (without row names)
# and report whether the output file now exists.
write.csv(x = data, file = "processed_clv_data.csv", row.names = FALSE)
print(file.exists("processed_clv_data.csv"))
## [1] TRUE