# Read the user-behavior dataset directly from the hosted CSV file.
# NOTE(review): filter(), str_detect() and regex() are not base R; this
# presumably relies on dplyr/stringr being attached earlier -- confirm.
url <- "https://raw.githubusercontent.com/stormwhale/data-mines/refs/heads/main/user_behavior_dataset.csv"
data <- read.csv(file = url)

# Keep only the rows whose device model mentions "Pixel" or "Galaxy",
# matching case-insensitively.
pi_gal <- filter(
  data,
  str_detect(Device.Model, regex("Pixel|Galaxy", ignore_case = TRUE))
)

# Preview the first rows of the filtered subset.
head(pi_gal)
##   User.ID       Device.Model Operating.System App.Usage.Time..min.day.
## 1       1     Google Pixel 5          Android                      393
## 2       4     Google Pixel 5          Android                      239
## 3       6     Google Pixel 5          Android                       99
## 4       7 Samsung Galaxy S21          Android                      350
## 5       9 Samsung Galaxy S21          Android                      340
## 6      11     Google Pixel 5          Android                       53
##   Screen.On.Time..hours.day. Battery.Drain..mAh.day. Number.of.Apps.Installed
## 1                        6.4                    1872                       67
## 2                        4.8                    1676                       56
## 3                        2.0                     940                       35
## 4                        7.3                    1802                       66
## 5                        7.7                    2138                       75
## 6                        1.4                     435                       17
##   Data.Usage..MB.day. Age Gender User.Behavior.Class
## 1                1122  40   Male                   4
## 2                 871  20   Male                   3
## 3                 564  31   Male                   2
## 4                1054  21 Female                   4
## 5                1053  42 Female                   4
## 6                 162  34 Female                   1
# Pair each device model with the first run of digits found in its name.
# The digits are kept as text in `Phone_generation`; models without any
# digits get NA there.
phone_gen <- data %>%
  select(Device.Model) %>%
  mutate(Phone_generation = str_extract(Device.Model, "\\d+"))

head(phone_gen)
##     Device.Model Phone_generation
## 1 Google Pixel 5                5
## 2      OnePlus 9                9
## 3   Xiaomi Mi 11               11
## 4 Google Pixel 5                5
## 5      iPhone 12               12
## 6 Google Pixel 5                5
# Count how many rows fall into each phone generation, most frequent first.
# count() is given the column directly instead of being fed a group_by()
# result: the grouped form leaves the summary grouped by Phone_generation,
# which silently affects any later mutate()/summarise()/slice() on it.
phone_gen_summary <- phone_gen %>%
  filter(!is.na(Phone_generation)) %>% # Exclude rows with no phone generation info
  count(Phone_generation, sort = TRUE) %>%
  as_tibble() # keep tibble printing; count() on a data.frame returns a data.frame

print(phone_gen_summary)
## # A tibble: 5 × 2
##   Phone_generation     n
##   <chr>            <int>
## 1 11                 146
## 2 12                 146
## 3 5                  142
## 4 21                 133
## 5 9                  133
# Horizontal bar chart of the number of rows per phone generation, with
# the bars ordered by their count.
ggplot(phone_gen_summary, aes(x = reorder(Phone_generation, n), y = n)) +
  geom_col() + # bar length is taken directly from `n`
  coord_flip() +
  theme_minimal() +
  ggtitle("Distribution of Phone Generations") +
  xlab("Phone Generation") +
  ylab("Frequency")

# Label every row by the keyword found in its device model (matched
# case-insensitively): "Pixel" is checked first, then "Galaxy", and
# anything else becomes "Other". Then count the rows per label.
keyword_summary <- data %>%
  group_by(
    keyword = case_when(
      str_detect(Device.Model, regex("Pixel", ignore_case = TRUE)) ~ "Pixel",
      str_detect(Device.Model, regex("Galaxy", ignore_case = TRUE)) ~ "Galaxy",
      TRUE ~ "Other"
    )
  ) %>%
  summarise(Count = n())

print(keyword_summary)
## # A tibble: 3 × 2
##   keyword Count
##   <chr>   <int>
## 1 Galaxy    133
## 2 Other     425
## 3 Pixel     142
# Pie chart of the keyword counts: one stacked full-width bar mapped onto
# polar coordinates, so each keyword's slice angle follows its Count.
ggplot(keyword_summary, aes(x = "", y = Count, fill = keyword)) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  theme_void() +
  labs(
    title = "Keyword Distribution in Device Models",
    fill = "Keyword"
  )