# Load the raw CLV dataset. Replace the path below with the location of
# your own copy of the file.
data <- utils::read.csv(file = "C:/Users/UsEr/Downloads/clv_data.csv")
# Preview the first six rows to confirm the columns were parsed as expected.
print(utils::head(data, n = 6L))
## X id age gender income days_on_platform city purchases
## 1 0 0 NA Male 126895 14 San Francisco 0
## 2 1 1 NA Male 161474 14 Tokyo 0
## 3 2 2 24 Male 104723 34 London 1
## 4 3 3 29 Male 43791 28 London 2
## 5 4 4 18 Female 132181 26 London 2
## 6 5 5 23 Male 12315 14 New York City 0
# Remove the `X` and `id` columns so only the remaining columns are kept.
# `drop = FALSE` guarantees the result stays a data frame.
data <- data[, setdiff(names(data), c("X", "id")), drop = FALSE]
print(utils::head(data, n = 6L))
## age gender income days_on_platform city purchases
## 1 NA Male 126895 14 San Francisco 0
## 2 NA Male 161474 14 Tokyo 0
## 3 24 Male 104723 34 London 1
## 4 29 Male 43791 28 London 2
## 5 18 Female 132181 26 London 2
## 6 23 Male 12315 14 New York City 0
# Fill missing values in `age` and `days_on_platform` with the median of
# the observed values in the same column.
data[c("age", "days_on_platform")] <- lapply(
  data[c("age", "days_on_platform")],
  function(column) {
    column[is.na(column)] <- median(column, na.rm = TRUE)
    column
  }
)
# Store the categorical columns as factors.
data[c("gender", "city")] <- lapply(data[c("gender", "city")], as.factor)
# Min-max rescale a numeric vector to the [0, 1] interval.
#
# Arguments:
#   x  A numeric vector; may contain NA.
#
# Returns:
#   A numeric vector the same length as `x`. The smallest observed value maps
#   to 0 and the largest to 1. NA entries stay NA instead of turning the whole
#   result into NA. A vector whose observed values are all identical maps to 0
#   (rather than NaN from a 0 / 0 division). Empty or all-NA input yields a
#   vector of NA of the same length.
normalize <- function(x) {
  observed <- x[!is.na(x)]
  if (length(observed) == 0L) {
    # Nothing to derive a range from; avoids the min()/max() warnings on
    # empty input.
    return(rep(NA_real_, length(x)))
  }

  lowest <- min(observed)
  span <- max(observed) - lowest

  # isTRUE() keeps the check safe if `span` is NaN (e.g. all values infinite).
  if (isTRUE(span == 0)) {
    # Constant input: every observed value becomes 0. Multiplying the shifted
    # vector by 0 keeps NA positions and names, and forces a double result.
    return((x - lowest) * 0)
  }

  (x - lowest) / span
}
# Rescale `income`, `days_on_platform` and `purchases` with normalize(),
# one column at a time, then preview the result.
data[c("income", "days_on_platform", "purchases")] <- lapply(
  data[c("income", "days_on_platform", "purchases")],
  normalize
)
print(utils::head(data, n = 6L))
## age gender income days_on_platform city purchases
## 1 30 Male 0.3265606 0.1181818 San Francisco 0.0000000
## 2 30 Male 0.4155515 0.1181818 Tokyo 0.0000000
## 3 24 Male 0.2694998 0.3000000 London 0.1666667
## 4 29 Male 0.1126881 0.2454545 London 0.3333333
## 5 18 Female 0.3401644 0.2272727 London 0.3333333
## 6 23 Male 0.0316830 0.1181818 New York City 0.0000000
# Save the processed data to the working directory without a row-name
# column, then confirm that the file is present on disk.
utils::write.csv(x = data, file = "processed_clv_data.csv", row.names = FALSE)
print(file.exists("processed_clv_data.csv"))
## [1] TRUE