1 Library Loading

library(readxl)
library(tidyverse)
library("readxl")
library(knitr)
library(data.table)
library(dplyr)
library(ggplot2)
library(tidyr)
library(writexl)
library(pastecs)
library(fpc)
library(magrittr)
data <- read_excel("/Users/admin/Documents/Linh-R Studio/Irisgo/IRISGO_data.xlsx")

2 Data Wrangling

2.1 Change format from long to wide

2.2 Calculate Usage Time

dfx <- data_wide %>% group_by(`User ID`) %>% summarise(Usage = sum(`Total Usage Time (minute)`,na.rm = T), Learn = sum(`Total Learning Time (minute)`,na.rm = T), Entertainment = sum(`Total Entertaining Time (minute)`,na.rm = T), Discovery = sum(`Total Discovering Time (minute)`,na.rm = T), Child = n()) # na.rm = replace the na = 0 

tail(dfx,10)%>%knitr::kable()
User ID Usage Learn Entertainment Discovery Child
ff8cd672-ac4f-45d8-a121-b0663a396cda 0.00 0.000000 0.00000 0.000000 1
ffa0023e-e58f-4787-87d7-9b0a7a8e3203 0.00 0.000000 0.00000 0.000000 1
ffa3d783-c5fb-414f-b6f2-07e8eac70a37 8.90 4.133333 2.20000 2.566667 1
ffab3bd4-9297-4fea-9574-d7cdee690465 0.00 0.000000 0.00000 0.000000 1
ffbaa2de-c2fd-4b75-80b1-92d6737e93eb 0.00 0.000000 0.00000 0.000000 1
ffc505c1-69d5-4763-aee5-4a226ed5660a 30.45 19.683333 10.56667 0.200000 1
ffc550f4-ac79-4020-bb22-20a6ec818765 0.00 0.000000 0.00000 0.000000 1
ffc7bb05-ee07-4b9f-859d-b1bd8d150943 0.00 0.000000 0.00000 0.000000 1
ffc8b9c3-3ae7-4373-9a59-ee360ade048f 0.00 0.000000 0.00000 0.000000 1
ffe6c04b-b2b0-4f90-8043-ba2c3fae1bf2 0.00 0.000000 0.00000 0.000000 1

2.3 Data preparation for cluster

df3 <- data_wide %>% distinct(`User ID`, .keep_all = TRUE)
df2 <- data_wide[!duplicated(data_wide$`User ID`), ]
df4 <- df2 %>% select(`User ID`,`Day of User Joined Date`,`Number of Active Days`)

df5 <- merge(dfx,df4)
df51 <-  df5[complete.cases(df5),] # remove null / na valaue 

2.4 Clustering

2.4.1 Calculate k

# Cluster 
df52 <- df51 %>% filter(Usage != "0")
df6 <- df52 %>% select(`User ID`, `Usage`, `Number of Active Days`)

df7 <-  df6[complete.cases(df6),]

df7 %>% mutate_if(is.numeric, function(x) {(x - min(x)) / (max(x) - min(x))}) %>% select(-`User ID`) -> final_df_scaled

set.seed(29)

wss <- sapply(1:10, 
              function(k){kmeans(final_df_scaled %>% sample_frac(0.2), 
                                 k, nstart = 30)$tot.withinss})

u <- data.frame(k = 1:10, WSS = wss)

u %>% 
  ggplot(aes(k, WSS)) + 
  geom_line() + 
  geom_point() + 
  geom_point(data = u %>% filter(k == 3), color = "red", size = 3) + 
  scale_x_continuous(breaks = seq(1, 10, by = 1)) + 
  labs(title = "Figure 7: The Optimal Number of Clusters, Elbow Method", x = "Number of Clusters (K)") + 
  theme(panel.grid.minor = element_blank())

2.4.2 Cluster with k = 3

set.seed(123)
km.res <- kmeans(final_df_scaled, 3, nstart = 30)

2.4.3 Mapping Clustered data with dataset

df52 %>% 
  mutate(Group = km.res$cluster) %>% 
  mutate(Group = paste("Group", Group)) -> final_df_clustered

final_df_clustered <- final_df_clustered[c(1,7,6,8,2,3,4,5,9)] # reorder columns

df5 <- df5[c(1,7,6,8,2,3,4,5)]

tail(df5,5)%>%knitr::kable()
User ID Day of User Joined Date Child Number of Active Days Usage Learn Entertainment Discovery
6865 ffc505c1-69d5-4763-aee5-4a226ed5660a 2021-07-17 1 1 30.45 19.68333 10.56667 0.2
6866 ffc550f4-ac79-4020-bb22-20a6ec818765 2021-07-03 1 1 0.00 0.00000 0.00000 0.0
6867 ffc7bb05-ee07-4b9f-859d-b1bd8d150943 2021-07-19 1 1 0.00 0.00000 0.00000 0.0
6868 ffc8b9c3-3ae7-4373-9a59-ee360ade048f 2021-07-15 1 1 0.00 0.00000 0.00000 0.0
6869 ffe6c04b-b2b0-4f90-8043-ba2c3fae1bf2 2021-07-15 1 1 0.00 0.00000 0.00000 0.0