# Location of the raw user-behaviour CSV hosted on GitHub.
url <- "https://raw.githubusercontent.com/stormwhale/data-mines/refs/heads/main/user_behavior_dataset.csv"
data <- read.csv(file = url)

# Keep only the rows whose device model mentions "Pixel" or "Galaxy"
# (matched case-insensitively), then preview the first few rows.
pi_gal <- filter(
  data,
  str_detect(Device.Model, regex("Pixel|Galaxy", ignore_case = TRUE))
)
head(pi_gal)
## User.ID Device.Model Operating.System App.Usage.Time..min.day.
## 1 1 Google Pixel 5 Android 393
## 2 4 Google Pixel 5 Android 239
## 3 6 Google Pixel 5 Android 99
## 4 7 Samsung Galaxy S21 Android 350
## 5 9 Samsung Galaxy S21 Android 340
## 6 11 Google Pixel 5 Android 53
## Screen.On.Time..hours.day. Battery.Drain..mAh.day. Number.of.Apps.Installed
## 1 6.4 1872 67
## 2 4.8 1676 56
## 3 2.0 940 35
## 4 7.3 1802 66
## 5 7.7 2138 75
## 6 1.4 435 17
## Data.Usage..MB.day. Age Gender User.Behavior.Class
## 1 1122 40 Male 4
## 2 871 20 Male 3
## 3 564 31 Male 2
## 4 1054 21 Female 4
## 5 1053 42 Female 4
## 6 162 34 Female 1
# Pair each device model with the first run of digits found in its name
# (e.g. "Google Pixel 5" -> "5"). The extracted value stays a character
# string; str_extract() yields NA when a name contains no digits.
phone_gen <- data %>%
  select(Device.Model) %>%
  mutate(Phone_generation = str_extract(Device.Model, "\\d+"))
head(phone_gen)
## Device.Model Phone_generation
## 1 Google Pixel 5 5
## 2 OnePlus 9 9
## 3 Xiaomi Mi 11 11
## 4 Google Pixel 5 5
## 5 iPhone 12 12
## 6 Google Pixel 5 5
# Tally how many rows fall into each extracted phone generation, most
# frequent first. The column is handed to count() directly instead of going
# through a preceding group_by(), so the summary does not carry leftover
# grouping into later steps; as_tibble() keeps the tibble-style printout.
phone_gen_summary <- phone_gen %>%
  filter(!is.na(Phone_generation)) %>% # Exclude rows with no phone generation info
  count(Phone_generation, sort = TRUE) %>%
  as_tibble()
print(phone_gen_summary)
## # A tibble: 5 × 2
## Phone_generation n
## <chr> <int>
## 1 11 146
## 2 12 146
## 3 5 142
## 4 21 133
## 5 9 133
# Horizontal bar chart of the generation counts, with bars ordered by
# frequency via reorder().
ggplot(phone_gen_summary, aes(x = reorder(Phone_generation, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Distribution of Phone Generations",
    x = "Phone Generation",
    y = "Frequency"
  ) +
  theme_minimal()

# Label every row as "Pixel", "Galaxy" or "Other" from its device model name
# (case-insensitive match, Pixel checked first) and count rows per label.
# The label is computed inside group_by(), which adds it as the grouping
# column before summarising.
keyword_summary <- data %>%
  group_by(
    keyword = case_when(
      str_detect(Device.Model, regex("Pixel", ignore_case = TRUE)) ~ "Pixel",
      str_detect(Device.Model, regex("Galaxy", ignore_case = TRUE)) ~ "Galaxy",
      TRUE ~ "Other"
    )
  ) %>%
  summarise(Count = n())
print(keyword_summary)
## # A tibble: 3 × 2
## keyword Count
## <chr> <int>
## 1 Galaxy 133
## 2 Other 425
## 3 Pixel 142
# Pie chart of the keyword counts: a single stacked bar of full width,
# wrapped into polar coordinates along the y (count) axis.
ggplot(keyword_summary, aes(x = "", y = Count, fill = keyword)) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  labs(
    title = "Keyword Distribution in Device Models",
    fill = "Keyword"
  ) +
  theme_void()
