Libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

dataset

# Read dataset.csv into a data frame, keeping text columns as character.
dataset <- read.csv(file = "dataset.csv", stringsAsFactors = FALSE)
# Preview the first six rows.
head(dataset, n = 6L)
##   transaction_id user_id age gender daily_screen_time_hours social_media_hours
## 1       TXN00001  U00001  21   Male                    3.23               2.01
## 2       TXN00002  U00002  24  Other                    5.09               3.81
## 3       TXN00003  U00003  31  Other                    6.06               1.36
## 4       TXN00004  U00004  32  Other                    7.83               5.85
## 5       TXN00005  U00005  25   Male                    9.96               5.92
## 6       TXN00006  U00006  26   Male                    9.32               4.26
##   gaming_hours work_study_hours sleep_hours notifications_per_day
## 1         0.89             4.55        7.55                   248
## 2         2.24             4.44        7.66                   127
## 3         3.83             2.35        4.92                    44
## 4         1.51             3.54        8.23                   178
## 5         3.42             5.27        6.21                   136
## 6         0.29             3.99        6.90                    82
##   app_opens_per_day weekend_screen_time stress_level academic_work_impact
## 1               154                3.95       Medium                  Yes
## 2                71                6.71       Medium                  Yes
## 3               106                8.68         High                   No
## 4               107                9.77         High                  Yes
## 5               177               12.55          Low                   No
## 6                56               10.98       Medium                  Yes
##   addiction_level addicted_label
## 1            None              0
## 2            None              0
## 3            Mild              0
## 4        Moderate              1
## 5          Severe              1
## 6          Severe              1

Data cleaning

# Recode empty strings as missing values, then inspect the column types.
dataset <- replace(dataset, dataset == "", NA)
str(dataset)
## 'data.frame':    7500 obs. of  16 variables:
##  $ transaction_id         : chr  "TXN00001" "TXN00002" "TXN00003" "TXN00004" ...
##  $ user_id                : chr  "U00001" "U00002" "U00003" "U00004" ...
##  $ age                    : int  21 24 31 32 25 26 25 26 21 35 ...
##  $ gender                 : chr  "Male" "Other" "Other" "Other" ...
##  $ daily_screen_time_hours: num  3.23 5.09 6.06 7.83 9.96 9.32 10.4 4.26 4.38 9.76 ...
##  $ social_media_hours     : num  2.01 3.81 1.36 5.85 5.92 4.26 4.93 4.6 1.38 4.73 ...
##  $ gaming_hours           : num  0.89 2.24 3.83 1.51 3.42 0.29 1.6 2.16 2.72 1.36 ...
##  $ work_study_hours       : num  4.55 4.44 2.35 3.54 5.27 3.99 0.86 4.61 3.78 2.11 ...
##  $ sleep_hours            : num  7.55 7.66 4.92 8.23 6.21 6.9 8.61 6.43 6.23 5.21 ...
##  $ notifications_per_day  : int  248 127 44 178 136 82 165 169 172 20 ...
##  $ app_opens_per_day      : int  154 71 106 107 177 56 95 117 134 82 ...
##  $ weekend_screen_time    : num  3.95 6.71 8.68 9.77 12.55 ...
##  $ stress_level           : chr  "Medium" "Medium" "High" "High" ...
##  $ academic_work_impact   : chr  "Yes" "Yes" "No" "Yes" ...
##  $ addiction_level        : chr  "None" "None" "Mild" "Moderate" ...
##  $ addicted_label         : int  0 0 0 1 1 1 1 1 0 1 ...
# Per-column summary statistics.
dataset %>% summary()
##  transaction_id       user_id               age           gender         
##  Length:7500        Length:7500        Min.   :18.00   Length:7500       
##  Class :character   Class :character   1st Qu.:22.00   Class :character  
##  Mode  :character   Mode  :character   Median :27.00   Mode  :character  
##                                        Mean   :26.57                     
##                                        3rd Qu.:31.00                     
##                                        Max.   :35.00                     
##  daily_screen_time_hours social_media_hours  gaming_hours   work_study_hours
##  Min.   : 3.000          Min.   :0.500      Min.   :0.000   Min.   :0.500   
##  1st Qu.: 5.220          1st Qu.:1.910      1st Qu.:1.020   1st Qu.:1.850   
##  Median : 7.525          Median :3.270      Median :2.040   Median :3.230   
##  Mean   : 7.500          Mean   :3.273      Mean   :2.014   Mean   :3.242   
##  3rd Qu.: 9.810          3rd Qu.:4.630      3rd Qu.:2.990   3rd Qu.:4.640   
##  Max.   :12.000          Max.   :6.000      Max.   :4.000   Max.   :6.000   
##   sleep_hours    notifications_per_day app_opens_per_day weekend_screen_time
##  Min.   :4.500   Min.   : 20.0         Min.   : 15.00    Min.   : 3.580     
##  1st Qu.:5.630   1st Qu.: 76.0         1st Qu.: 55.00    1st Qu.: 6.960     
##  Median :6.720   Median :134.0         Median : 98.00    Median : 9.260     
##  Mean   :6.738   Mean   :134.3         Mean   : 97.83    Mean   : 9.244     
##  3rd Qu.:7.840   3rd Qu.:191.0         3rd Qu.:140.00    3rd Qu.:11.540     
##  Max.   :9.000   Max.   :250.0         Max.   :180.00    Max.   :14.880     
##  stress_level       academic_work_impact addiction_level    addicted_label  
##  Length:7500        Length:7500          Length:7500        Min.   :0.0000  
##  Class :character   Class :character     Class :character   1st Qu.:0.0000  
##  Mode  :character   Mode  :character     Mode  :character   Median :1.0000  
##                                                             Mean   :0.7077  
##                                                             3rd Qu.:1.0000  
##                                                             Max.   :1.0000
# Number of rows and columns.
c(nrow(dataset), ncol(dataset))
## [1] 7500   16
# Count missing values in each column.
dataset %>%
  is.na() %>%
  colSums()
##          transaction_id                 user_id                     age 
##                       0                       0                       0 
##                  gender daily_screen_time_hours      social_media_hours 
##                       0                       0                       0 
##            gaming_hours        work_study_hours             sleep_hours 
##                       0                       0                       0 
##   notifications_per_day       app_opens_per_day     weekend_screen_time 
##                       0                       0                       0 
##            stress_level    academic_work_impact         addiction_level 
##                       0                       0                       0 
##          addicted_label 
##                       0

Analysis Questions Q1-How many users belong to each gender category in the dataset?

# Tally the number of rows per gender category.
count(dataset, gender)
##   gender    n
## 1 Female 2461
## 2   Male 2553
## 3  Other 2486

Interpretation: The dataset has a nearly equal distribution of users among Male, Female, and Other gender categories. This balanced representation ensures that behavioral patterns analyzed are not biased toward a specific gender.

Q2-What is the distribution of users across different age groups?

# Label users under 30 as "Young Adult" and everyone else as "Adult",
# then tally each band; count() creates the grouping column inline.
dataset %>%
  count(Age_Group = ifelse(age < 30, "Young Adult", "Adult"))
##     Age_Group    n
## 1       Adult 2569
## 2 Young Adult 4931

Interpretation: The majority of users in the study are “Young Adults” (under 30). This suggests the findings on digital addiction are particularly relevant to the younger demographic, which typically has higher tech engagement.

Q3-What are the minimum, maximum, and average daily screen time hours?

# One-row summary of daily screen time: minimum, maximum and mean.
# na.rm = TRUE keeps the summary defined if any screen-time values are
# missing, matching how the age summary by addiction level handles NAs.
dataset %>%
  summarise(
    Min_Screen_Time = min(daily_screen_time_hours, na.rm = TRUE),
    Max_Screen_Time = max(daily_screen_time_hours, na.rm = TRUE),
    Avg_Screen_Time = mean(daily_screen_time_hours, na.rm = TRUE)
  )
##   Min_Screen_Time Max_Screen_Time Avg_Screen_Time
## 1               3              12        7.499912

Interpretation: Screen time varies significantly, from a minimum of 3 hours to a maximum of 12 hours. The average user spends 7.5 hours daily on their devices, highlighting a high level of digital engagement across the board.

Q4-How many users are classified as addicted versus not addicted?

# Tally users by the binary addicted_label column.
count(dataset, addicted_label)
##   addicted_label    n
## 1              0 2192
## 2              1 5308

Interpretation: A significant majority (over 70%) of the participants are labeled as “Addicted” (1). This indicates a high prevalence of digital dependency within this specific survey group.

Q5-Does the average daily screen time differ between addicted and non-addicted users?

# Mean daily screen time within each addicted_label group.
# na.rm = TRUE prevents one missing value from turning a whole group's
# mean into NA, matching the age summary by addiction level.
dataset %>%
  group_by(addicted_label) %>%
  summarise(Avg_Screen_Time = mean(daily_screen_time_hours, na.rm = TRUE))
## # A tibble: 2 × 2
##   addicted_label Avg_Screen_Time
##            <int>           <dbl>
## 1              0            5.16
## 2              1            8.47

Interpretation: Addicted users spend considerably more time on screens (8.47 hours on average) than non-addicted users (5.16 hours). This is consistent with screen duration being a strong indicator of digital addiction, although a difference in group means alone does not establish it as the primary driver.

Q6-Who are the top 5 users with the highest daily screen time?

# Keep only the columns of interest, sort by screen time (highest first),
# and show the first five rows.
dataset %>%
  select(user_id, daily_screen_time_hours, addiction_level) %>%
  arrange(desc(daily_screen_time_hours)) %>%
  head(n = 5)
##   user_id daily_screen_time_hours addiction_level
## 1  U00694                   12.00          Severe
## 2  U02237                   12.00        Moderate
## 3  U05173                   12.00          Severe
## 4  U05236                   12.00        Moderate
## 5  U00585                   11.99          Severe

Interpretation: The heaviest users reach the 12-hour maximum limit. Interestingly, some users with 12 hours of usage are categorized as “Moderate” while others are “Severe,” suggesting other behavioral factors contribute to the addiction level.

Q7-How many users of each gender are classified as having “Severe” addiction?

# Restrict to rows whose addiction_level is "Severe", then tally by gender.
count(
  filter(dataset, addiction_level == "Severe"),
  gender
)
##   gender   n
## 1 Female 762
## 2   Male 848
## 3  Other 824

Interpretation: “Severe” addiction is fairly evenly distributed across all genders, though it is slightly higher among Male users in this dataset.

Q8-Is there a significant difference in age across the different addiction levels?

# Mean age within each addiction level, ignoring missing ages.
summarise(
  group_by(dataset, addiction_level),
  Avg_Age = mean(age, na.rm = TRUE)
)
## # A tibble: 4 × 2
##   addiction_level Avg_Age
##   <chr>             <dbl>
## 1 Mild               26.6
## 2 Moderate           26.4
## 3 None               26.5
## 4 Severe             26.8

Interpretation: The average age remains remarkably consistent (around 26.5 years) across all addiction levels. This indicates that within this dataset, age is not a primary differentiator for the severity of digital addiction.

Q9-What percentage of “Mild” addiction users report a negative academic impact?

# Among "Mild" users, tally each academic_work_impact answer and express
# the tallies as percentages of the "Mild" total.
dataset %>%
  filter(addiction_level == "Mild") %>%
  count(academic_work_impact) %>%
  mutate(Percent = prop.table(n) * 100)
##   academic_work_impact   n  Percent
## 1                   No 690 50.25492
## 2                  Yes 683 49.74508

Interpretation: Even among those with only “Mild” addiction, nearly 50% feel that their digital habits are negatively impacting their work or academics. This shows that digital usage can be disruptive even before it reaches “Severe” levels.

Q10-Are high notification counts correlated with high stress levels?

# Mean notifications per day within each stress level.
# na.rm = TRUE prevents one missing value from turning a whole group's
# mean into NA, matching the age summary by addiction level.
dataset %>%
  group_by(stress_level) %>%
  summarise(Avg_Notifications = mean(notifications_per_day, na.rm = TRUE))
## # A tibble: 3 × 2
##   stress_level Avg_Notifications
##   <chr>                    <dbl>
## 1 High                      134.
## 2 Low                       134.
## 3 Medium                    135.

Interpretation: Notification counts are almost identical across all stress levels. This suggests that the volume of digital alerts alone is not the primary driver of reported stress for these users.

Q11-How many users are in the “Other” gender category?

# Keep only the "Other" gender rows, then count them.
dataset %>%
  filter(gender == "Other") %>%
  count(gender)
##   gender    n
## 1  Other 2486

Interpretation: There are 2,486 users in the “Other” gender category, making it almost exactly one-third of the total population surveyed.

Q12-How do average sleep hours vary by stress level?

# Bar chart of mean sleep hours per stress level.
# stress_level is a character column, so ggplot2 would sort it
# alphabetically (High, Low, Medium); an explicit factor puts the bars in
# their natural Low -> Medium -> High order.
ggplot(
  dataset,
  aes(
    x = factor(stress_level, levels = c("Low", "Medium", "High")),
    y = sleep_hours
  )
) +
  stat_summary(fun = mean, geom = "bar", fill = "skyblue") +
  labs(title = "Average Sleep Hours by Stress Level",
       x = "Stress Level",
       y = "Average Sleep Hours")

Interpretation:

The graph visualizes that sleep remains relatively stable across stress categories, though “High” stress users have a visible (if slight) reduction in sleep.

Q13-How are genders distributed across different stress levels?

# Grouped bar chart: for each gender, one bar per stress level.
# The previous version mapped only gender, so stress level never appeared
# in the plot despite the title. Mapping stress_level to fill, with dodged
# bars, shows the gender-by-stress breakdown the question asks for.
# Stress levels are ordered Low -> Medium -> High rather than alphabetically.
ggplot(
  dataset,
  aes(
    x = gender,
    fill = factor(stress_level, levels = c("Low", "Medium", "High"))
  )
) +
  geom_bar(position = "dodge") +
  labs(title = "Gender Distribution across Stress Levels",
       x = "Gender",
       y = "Count",
       fill = "Stress Level")

Interpretation:

All gender groups show a fairly even distribution across High, Medium, and Low stress levels. No single gender appears significantly more stressed than others in this dataset.

Q14-How does daily screen time compare to weekend screen time across addiction status?

# Scatter plot of daily vs weekend screen time, coloured by addiction
# status. The previous version drew every point in one colour, so the
# "across addiction status" part of the question was not shown.
# addicted_label is an integer 0/1 column; factor() makes ggplot2 treat it
# as two discrete groups instead of a continuous colour scale.
ggplot(
  dataset,
  aes(
    x = daily_screen_time_hours,
    y = weekend_screen_time,
    color = factor(addicted_label)
  )
) +
  # Semi-transparent points reduce overplotting across the 7,500 rows.
  geom_point(alpha = 0.5) +
  labs(title = "Daily vs Weekend Screen Time",
       x = "Daily Screen Time (Hours)",
       y = "Weekend Screen Time (Hours)",
       color = "Addicted (0 = No, 1 = Yes)")

Interpretation:

The scatter plot shows the relationship between daily and weekend screen time. It indicates that individuals who spend more time on screens daily also tend to spend more time on weekends.

Q15-How is addiction level distributed among users?

# Horizontal bar chart of the number of users in each addiction level.
ggplot(data = dataset, mapping = aes(x = addiction_level)) +
  geom_bar(fill = "orange") +
  ggtitle("Distribution of Addiction Levels") +
  xlab("Addiction Level") +
  ylab("Count") +
  # Flip the axes so the categories run down the vertical axis.
  coord_flip()

Interpretation:

The horizontal bar chart shows the number of users in each addiction category. It helps identify which addiction level (None, Mild, Moderate, or Severe) is most common in the dataset.