Libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
dataset
# Read the CSV into a data frame; text columns stay as character vectors
# rather than being converted to factors.
dataset <- read.csv(file = "dataset.csv", stringsAsFactors = FALSE)
# Show the first six rows as a quick check of the import.
head(dataset, n = 6L)
## transaction_id user_id age gender daily_screen_time_hours social_media_hours
## 1 TXN00001 U00001 21 Male 3.23 2.01
## 2 TXN00002 U00002 24 Other 5.09 3.81
## 3 TXN00003 U00003 31 Other 6.06 1.36
## 4 TXN00004 U00004 32 Other 7.83 5.85
## 5 TXN00005 U00005 25 Male 9.96 5.92
## 6 TXN00006 U00006 26 Male 9.32 4.26
## gaming_hours work_study_hours sleep_hours notifications_per_day
## 1 0.89 4.55 7.55 248
## 2 2.24 4.44 7.66 127
## 3 3.83 2.35 4.92 44
## 4 1.51 3.54 8.23 178
## 5 3.42 5.27 6.21 136
## 6 0.29 3.99 6.90 82
## app_opens_per_day weekend_screen_time stress_level academic_work_impact
## 1 154 3.95 Medium Yes
## 2 71 6.71 Medium Yes
## 3 106 8.68 High No
## 4 107 9.77 High Yes
## 5 177 12.55 Low No
## 6 56 10.98 Medium Yes
## addiction_level addicted_label
## 1 None 0
## 2 None 0
## 3 Mild 0
## 4 Moderate 1
## 5 Severe 1
## 6 Severe 1
Data cleaning
# Recode empty strings in the text columns as NA so that is.na() counts
# them as missing. Only character columns can hold "", so only they are
# touched.
dataset <- dataset %>%
  mutate(across(where(is.character), ~ na_if(.x, "")))
# Inspect the column types and a preview of each column.
str(dataset)
## 'data.frame': 7500 obs. of 16 variables:
## $ transaction_id : chr "TXN00001" "TXN00002" "TXN00003" "TXN00004" ...
## $ user_id : chr "U00001" "U00002" "U00003" "U00004" ...
## $ age : int 21 24 31 32 25 26 25 26 21 35 ...
## $ gender : chr "Male" "Other" "Other" "Other" ...
## $ daily_screen_time_hours: num 3.23 5.09 6.06 7.83 9.96 9.32 10.4 4.26 4.38 9.76 ...
## $ social_media_hours : num 2.01 3.81 1.36 5.85 5.92 4.26 4.93 4.6 1.38 4.73 ...
## $ gaming_hours : num 0.89 2.24 3.83 1.51 3.42 0.29 1.6 2.16 2.72 1.36 ...
## $ work_study_hours : num 4.55 4.44 2.35 3.54 5.27 3.99 0.86 4.61 3.78 2.11 ...
## $ sleep_hours : num 7.55 7.66 4.92 8.23 6.21 6.9 8.61 6.43 6.23 5.21 ...
## $ notifications_per_day : int 248 127 44 178 136 82 165 169 172 20 ...
## $ app_opens_per_day : int 154 71 106 107 177 56 95 117 134 82 ...
## $ weekend_screen_time : num 3.95 6.71 8.68 9.77 12.55 ...
## $ stress_level : chr "Medium" "Medium" "High" "High" ...
## $ academic_work_impact : chr "Yes" "Yes" "No" "Yes" ...
## $ addiction_level : chr "None" "None" "Mild" "Moderate" ...
## $ addicted_label : int 0 0 0 1 1 1 1 1 0 1 ...
# Per-column overview: quartiles, mean, min and max for numeric columns;
# length, class and mode for character columns.
summary(dataset)
## transaction_id user_id age gender
## Length:7500 Length:7500 Min. :18.00 Length:7500
## Class :character Class :character 1st Qu.:22.00 Class :character
## Mode :character Mode :character Median :27.00 Mode :character
## Mean :26.57
## 3rd Qu.:31.00
## Max. :35.00
## daily_screen_time_hours social_media_hours gaming_hours work_study_hours
## Min. : 3.000 Min. :0.500 Min. :0.000 Min. :0.500
## 1st Qu.: 5.220 1st Qu.:1.910 1st Qu.:1.020 1st Qu.:1.850
## Median : 7.525 Median :3.270 Median :2.040 Median :3.230
## Mean : 7.500 Mean :3.273 Mean :2.014 Mean :3.242
## 3rd Qu.: 9.810 3rd Qu.:4.630 3rd Qu.:2.990 3rd Qu.:4.640
## Max. :12.000 Max. :6.000 Max. :4.000 Max. :6.000
## sleep_hours notifications_per_day app_opens_per_day weekend_screen_time
## Min. :4.500 Min. : 20.0 Min. : 15.00 Min. : 3.580
## 1st Qu.:5.630 1st Qu.: 76.0 1st Qu.: 55.00 1st Qu.: 6.960
## Median :6.720 Median :134.0 Median : 98.00 Median : 9.260
## Mean :6.738 Mean :134.3 Mean : 97.83 Mean : 9.244
## 3rd Qu.:7.840 3rd Qu.:191.0 3rd Qu.:140.00 3rd Qu.:11.540
## Max. :9.000 Max. :250.0 Max. :180.00 Max. :14.880
## stress_level academic_work_impact addiction_level addicted_label
## Length:7500 Length:7500 Length:7500 Min. :0.0000
## Class :character Class :character Class :character 1st Qu.:0.0000
## Mode :character Mode :character Mode :character Median :1.0000
## Mean :0.7077
## 3rd Qu.:1.0000
## Max. :1.0000
# Number of rows and columns in the data frame.
dim(dataset)
## [1] 7500 16
# Count the missing (NA) values in every column.
colSums(is.na(dataset))
## transaction_id user_id age
## 0 0 0
## gender daily_screen_time_hours social_media_hours
## 0 0 0
## gaming_hours work_study_hours sleep_hours
## 0 0 0
## notifications_per_day app_opens_per_day weekend_screen_time
## 0 0 0
## stress_level academic_work_impact addiction_level
## 0 0 0
## addicted_label
## 0
Analysis Questions Q1-How many users belong to each gender category in the dataset?
# Tally the number of rows for each gender value.
count(dataset, gender)
## gender n
## 1 Female 2461
## 2 Male 2553
## 3 Other 2486
Interpretation: The dataset has a nearly equal distribution of users among Male, Female, and Other gender categories. This balanced representation ensures that behavioral patterns analyzed are not biased toward a specific gender.
Q2-What is the distribution of users across different age groups?
# Split users into two age buckets (under 30 vs. 30 and over) and count
# the rows in each bucket.
dataset %>%
  mutate(Age_Group = if_else(age >= 30, "Adult", "Young Adult")) %>%
  count(Age_Group)
## Age_Group n
## 1 Adult 2569
## 2 Young Adult 4931
Interpretation: The majority of users in the study are “Young Adults” (under 30). This suggests the findings on digital addiction are particularly relevant to the younger demographic, which typically has higher tech engagement.
Q3-What are the minimum, maximum, and average daily screen time hours?
# Minimum, maximum and mean of daily screen time in one summary row.
# The .names template yields Min_/Max_/Avg_Screen_Time column names.
dataset %>%
  summarise(
    across(
      daily_screen_time_hours,
      list(Min = min, Max = max, Avg = mean),
      .names = "{.fn}_Screen_Time"
    )
  )
## Min_Screen_Time Max_Screen_Time Avg_Screen_Time
## 1 3 12 7.499912
Interpretation: Screen time varies significantly, from a minimum of 3 hours to a maximum of 12 hours. The average user spends 7.5 hours daily on their devices, highlighting a high level of digital engagement across the board.
Q4-How many users are classified as addicted versus not addicted?
# Number of rows for each value of addicted_label (0 or 1).
count(dataset, addicted_label)
## addicted_label n
## 1 0 2192
## 2 1 5308
Interpretation: A significant majority (over 70%) of the participants are labeled as “Addicted” (1). This indicates a high prevalence of digital dependency within this specific survey group.
Q5-Does the average daily screen time differ between addicted and non-addicted users?
# Mean daily screen time within each addicted_label group.
summarise(
  group_by(dataset, addicted_label),
  Avg_Screen_Time = mean(daily_screen_time_hours)
)
## # A tibble: 2 × 2
## addicted_label Avg_Screen_Time
## <int> <dbl>
## 1 0 5.16
## 2 1 8.47
Interpretation: Addicted users spend substantially more time on screens (8.47 hours on average) than non-addicted users (5.16 hours). This is consistent with daily screen time being strongly associated with the addiction label, although a difference in group means alone does not establish it as the primary indicator.
Q6-Who are the top 5 users with the highest daily screen time?
# Keep the three columns of interest, sort by daily screen time from
# highest to lowest, and return the first five rows.
dataset %>%
  select(user_id, daily_screen_time_hours, addiction_level) %>%
  arrange(desc(daily_screen_time_hours)) %>%
  slice_head(n = 5)
## user_id daily_screen_time_hours addiction_level
## 1 U00694 12.00 Severe
## 2 U02237 12.00 Moderate
## 3 U05173 12.00 Severe
## 4 U05236 12.00 Moderate
## 5 U00585 11.99 Severe
Interpretation: The heaviest users reach the 12-hour maximum limit. Interestingly, some users with 12 hours of usage are categorized as “Moderate” while others are “Severe,” suggesting other behavioral factors contribute to the addiction level.
Q7-How many users of each gender are classified as having “Severe” addiction?
# Restrict to rows whose addiction_level is "Severe", then count them
# per gender.
count(
  filter(dataset, addiction_level == "Severe"),
  gender
)
## gender n
## 1 Female 762
## 2 Male 848
## 3 Other 824
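Interpretation: “Severe” addiction is fairly evenly distributed across all genders. The raw count is highest among Male users (848), but Male is also the largest group overall; as a share of each gender group, about 33% of Male and Other users and about 31% of Female users are classified as “Severe”.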
Q8-Is there a significant difference in age across the different addiction levels?
# Mean age within each addiction level, ignoring missing ages.
dataset %>%
  group_by(addiction_level) %>%
  summarise(across(age, ~ mean(.x, na.rm = TRUE), .names = "Avg_Age"))
## # A tibble: 4 × 2
## addiction_level Avg_Age
## <chr> <dbl>
## 1 Mild 26.6
## 2 Moderate 26.4
## 3 None 26.5
## 4 Severe 26.8
Interpretation: The average age remains remarkably consistent (around 26.5 years) across all addiction levels. This indicates that within this dataset, age is not a primary differentiator for the severity of digital addiction.
Q9-What percentage of “Mild” addiction users report a negative academic impact?
# Among "Mild" users, count each academic_work_impact answer and express
# the counts as a percentage of the "Mild" total.
dataset %>%
  filter(addiction_level == "Mild") %>%
  count(academic_work_impact) %>%
  mutate(Percent = prop.table(n) * 100)
## academic_work_impact n Percent
## 1 No 690 50.25492
## 2 Yes 683 49.74508
Interpretation: Even among those with only “Mild” addiction, nearly 50% feel that their digital habits are negatively impacting their work or academics. This shows that digital usage can be disruptive even before it reaches “Severe” levels.
Q10-Are high notification counts correlated with high stress levels?
# Mean number of notifications per day within each stress level.
summarise(
  group_by(dataset, stress_level),
  Avg_Notifications = mean(notifications_per_day)
)
## # A tibble: 3 × 2
## stress_level Avg_Notifications
## <chr> <dbl>
## 1 High 134.
## 2 Low 134.
## 3 Medium 135.
Interpretation: Notification counts are almost identical across all stress levels. This suggests that the volume of digital alerts alone is not the primary driver of reported stress for these users.
Q11-How many users are in the “Other” gender category?
# Count only the rows whose gender is "Other".
dataset %>%
  filter(gender == "Other") %>%
  count(gender)
## gender n
## 1 Other 2486
Interpretation: There are 2,486 users in the “Other” gender category, making it almost exactly one-third of the total population surveyed.
Q12-How do average sleep hours vary by stress level?
# Bar height is the mean of sleep_hours within each stress level.
# stress_level is stored as text, so it is converted to a factor with an
# explicit Low < Medium < High order; without this the axis is sorted
# alphabetically (High, Low, Medium). Values outside these three levels
# would appear as NA.
ggplot(
  dataset,
  aes(
    x = factor(stress_level, levels = c("Low", "Medium", "High")),
    y = sleep_hours
  )
) +
  stat_summary(fun = mean, geom = "bar", fill = "skyblue") +
  labs(
    title = "Average Sleep Hours by Stress Level",
    x = "Stress Level",
    y = "Average Sleep Hours"
  )
Interpretation:
The graph visualizes that sleep remains relatively stable across stress categories, though “High” stress users have a visible (if slight) reduction in sleep.
Q13-How are genders distributed across different stress levels?
# Count of users per gender, split by stress level. Mapping stress_level
# to the fill colour (with dodged bars) is what lets the chart answer the
# question; gender counts alone say nothing about stress.
# The factor levels put the legend in Low < Medium < High order instead
# of alphabetical order.
ggplot(
  dataset,
  aes(
    x = gender,
    fill = factor(stress_level, levels = c("Low", "Medium", "High"))
  )
) +
  geom_bar(position = "dodge") +
  labs(
    title = "Gender Distribution across Stress Levels",
    x = "Gender",
    y = "Count",
    fill = "Stress Level"
  )
Interpretation:
All gender groups show a fairly even distribution across High, Medium, and Low stress levels. No single gender appears significantly more stressed than others in this dataset.
Q14-How does daily screen time compare to weekend screen time across addiction status?
# Scatter plot of weekday vs. weekend screen time, coloured by
# addicted_label so the two addiction groups can be compared.
# addicted_label is an integer 0/1 flag; factor() makes ggplot treat it
# as two discrete groups rather than a continuous colour scale.
ggplot(
  dataset,
  aes(
    x = daily_screen_time_hours,
    y = weekend_screen_time,
    color = factor(addicted_label)
  )
) +
  # Partial transparency keeps dense regions readable with thousands of
  # overlapping points.
  geom_point(alpha = 0.4) +
  labs(
    title = "Daily vs Weekend Screen Time",
    x = "Daily Screen Time (Hours)",
    y = "Weekend Screen Time (Hours)",
    color = "Addicted (1 = yes)"
  )
Interpretation:
The scatter plot shows the relationship between daily and weekend screen time. It indicates that individuals who spend more time on screens daily also tend to spend more time on weekends.
Q15-How is addiction level distributed among users?
# Horizontal bar chart of the number of users at each addiction level.
# addiction_level is stored as text, so it is converted to a factor in
# severity order; without this the bars are sorted alphabetically
# (Mild, Moderate, None, Severe). Values outside these four levels would
# appear as NA.
ggplot(
  dataset,
  aes(
    x = factor(
      addiction_level,
      levels = c("None", "Mild", "Moderate", "Severe")
    )
  )
) +
  geom_bar(fill = "orange") +
  coord_flip() +
  labs(
    title = "Distribution of Addiction Levels",
    x = "Addiction Level",
    y = "Count"
  )
Interpretation:
The horizontal bar chart shows the number of users in each addiction category (None, Mild, Moderate, and Severe), making it easy to see which level is most common in the dataset.