Library Loading
library(readxl)
library(tidyverse)
library("readxl")
library(knitr)
library(data.table)
library(dplyr)
library(ggplot2)
library(tidyr)
library(writexl)
library(pastecs)
library(fpc)
library(magrittr)
data <- read_excel("/Users/admin/Documents/Linh-R Studio/Irisgo/IRISGO_data.xlsx")
Data Wrangling
Change format from long to wide
Calculate Usage Time
dfx <- data_wide %>% group_by(`User ID`) %>% summarise(Usage = sum(`Total Usage Time (minute)`,na.rm = T), Learn = sum(`Total Learning Time (minute)`,na.rm = T), Entertainment = sum(`Total Entertaining Time (minute)`,na.rm = T), Discovery = sum(`Total Discovering Time (minute)`,na.rm = T), Child = n()) # na.rm = replace the na = 0
tail(dfx,10)%>%knitr::kable()
| ff8cd672-ac4f-45d8-a121-b0663a396cda |
0.00 |
0.000000 |
0.00000 |
0.000000 |
1 |
| ffa0023e-e58f-4787-87d7-9b0a7a8e3203 |
0.00 |
0.000000 |
0.00000 |
0.000000 |
1 |
| ffa3d783-c5fb-414f-b6f2-07e8eac70a37 |
8.90 |
4.133333 |
2.20000 |
2.566667 |
1 |
| ffab3bd4-9297-4fea-9574-d7cdee690465 |
0.00 |
0.000000 |
0.00000 |
0.000000 |
1 |
| ffbaa2de-c2fd-4b75-80b1-92d6737e93eb |
0.00 |
0.000000 |
0.00000 |
0.000000 |
1 |
| ffc505c1-69d5-4763-aee5-4a226ed5660a |
30.45 |
19.683333 |
10.56667 |
0.200000 |
1 |
| ffc550f4-ac79-4020-bb22-20a6ec818765 |
0.00 |
0.000000 |
0.00000 |
0.000000 |
1 |
| ffc7bb05-ee07-4b9f-859d-b1bd8d150943 |
0.00 |
0.000000 |
0.00000 |
0.000000 |
1 |
| ffc8b9c3-3ae7-4373-9a59-ee360ade048f |
0.00 |
0.000000 |
0.00000 |
0.000000 |
1 |
| ffe6c04b-b2b0-4f90-8043-ba2c3fae1bf2 |
0.00 |
0.000000 |
0.00000 |
0.000000 |
1 |
Data preparation for cluster
df3 <- data_wide %>% distinct(`User ID`, .keep_all = TRUE)
df2 <- data_wide[!duplicated(data_wide$`User ID`), ]
df4 <- df2 %>% select(`User ID`,`Day of User Joined Date`,`Number of Active Days`)
df5 <- merge(dfx,df4)
df51 <- df5[complete.cases(df5),] # remove null / na valaue
Clustering
Calculate k
# Cluster
df52 <- df51 %>% filter(Usage != "0")
df6 <- df52 %>% select(`User ID`, `Usage`, `Number of Active Days`)
df7 <- df6[complete.cases(df6),]
df7 %>% mutate_if(is.numeric, function(x) {(x - min(x)) / (max(x) - min(x))}) %>% select(-`User ID`) -> final_df_scaled
set.seed(29)
wss <- sapply(1:10,
function(k){kmeans(final_df_scaled %>% sample_frac(0.2),
k, nstart = 30)$tot.withinss})
u <- data.frame(k = 1:10, WSS = wss)
u %>%
ggplot(aes(k, WSS)) +
geom_line() +
geom_point() +
geom_point(data = u %>% filter(k == 3), color = "red", size = 3) +
scale_x_continuous(breaks = seq(1, 10, by = 1)) +
labs(title = "Figure 7: The Optimal Number of Clusters, Elbow Method", x = "Number of Clusters (K)") +
theme(panel.grid.minor = element_blank())

Cluster with k = 3
set.seed(123)
km.res <- kmeans(final_df_scaled, 3, nstart = 30)
Mapping Clustered data with dataset
df52 %>%
mutate(Group = km.res$cluster) %>%
mutate(Group = paste("Group", Group)) -> final_df_clustered
final_df_clustered <- final_df_clustered[c(1,7,6,8,2,3,4,5,9)] # reorder columns
df5 <- df5[c(1,7,6,8,2,3,4,5)]
tail(df5,5)%>%knitr::kable()
| 6865 |
ffc505c1-69d5-4763-aee5-4a226ed5660a |
2021-07-17 |
1 |
1 |
30.45 |
19.68333 |
10.56667 |
0.2 |
| 6866 |
ffc550f4-ac79-4020-bb22-20a6ec818765 |
2021-07-03 |
1 |
1 |
0.00 |
0.00000 |
0.00000 |
0.0 |
| 6867 |
ffc7bb05-ee07-4b9f-859d-b1bd8d150943 |
2021-07-19 |
1 |
1 |
0.00 |
0.00000 |
0.00000 |
0.0 |
| 6868 |
ffc8b9c3-3ae7-4373-9a59-ee360ade048f |
2021-07-15 |
1 |
1 |
0.00 |
0.00000 |
0.00000 |
0.0 |
| 6869 |
ffe6c04b-b2b0-4f90-8043-ba2c3fae1bf2 |
2021-07-15 |
1 |
1 |
0.00 |
0.00000 |
0.00000 |
0.0 |