library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)

# Load Dataset
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where the CSV exists at exactly this location.
df <- read.csv(
  "C:/college/3rd year/DCS 402/Smartphone_Usage_Productivity_Dataset_50000.csv"
)

# Show the structure of the loaded data: column names, types, sample values
str(df)
## 'data.frame':    50000 obs. of  13 variables:
##  $ User_ID                  : chr  "U1" "U2" "U3" "U4" ...
##  $ Age                      : int  58 25 19 35 33 32 26 24 52 23 ...
##  $ Gender                   : chr  "Male" "Male" "Male" "Female" ...
##  $ Occupation               : chr  "Professional" "Professional" "Student" "Business Owner" ...
##  $ Device_Type              : chr  "Android" "Android" "iOS" "iOS" ...
##  $ Daily_Phone_Hours        : num  1.3 1.2 5.3 5.8 7.9 10.9 5.6 8.5 9.4 2.8 ...
##  $ Social_Media_Hours       : num  6.7 1.5 5.7 2.5 1.3 4.2 6.1 1.9 2.5 1.1 ...
##  $ Work_Productivity_Score  : int  6 5 5 2 4 9 8 4 3 1 ...
##  $ Sleep_Hours              : num  8.8 6.4 9 5.7 5.7 6.3 6.7 7.8 6.4 4.5 ...
##  $ Stress_Level             : int  4 1 4 3 3 7 3 3 4 2 ...
##  $ App_Usage_Count          : int  42 51 14 36 37 34 5 6 17 7 ...
##  $ Caffeine_Intake_Cups     : int  1 3 5 6 5 5 3 6 3 0 ...
##  $ Weekend_Screen_Time_Hours: num  8.7 5.1 6.3 12.8 9.9 3.6 2.9 12.8 3.1 2.7 ...
# Preview the first rows of the data
head(df)
##   User_ID Age Gender     Occupation Device_Type Daily_Phone_Hours
## 1      U1  58   Male   Professional     Android               1.3
## 2      U2  25   Male   Professional     Android               1.2
## 3      U3  19   Male        Student         iOS               5.3
## 4      U4  35 Female Business Owner         iOS               5.8
## 5      U5  33   Male     Freelancer     Android               7.9
## 6      U6  32 Female        Student     Android              10.9
##   Social_Media_Hours Work_Productivity_Score Sleep_Hours Stress_Level
## 1                6.7                       6         8.8            4
## 2                1.5                       5         6.4            1
## 3                5.7                       5         9.0            4
## 4                2.5                       2         5.7            3
## 5                1.3                       4         5.7            3
## 6                4.2                       9         6.3            7
##   App_Usage_Count Caffeine_Intake_Cups Weekend_Screen_Time_Hours
## 1              42                    1                       8.7
## 2              51                    3                       5.1
## 3              14                    5                       6.3
## 4              36                    6                      12.8
## 5              37                    5                       9.9
## 6              34                    5                       3.6
# Raw data dimensions: row count, then column count
nrow(df)
## [1] 50000
ncol(df)
## [1] 13
# Missing Values Check
# Tally the NA entries in every column, then reshape the one-row summary into
# a long table with one row per column (Column, Missing_Count).
missing_df <- pivot_longer(
  summarise(df, across(everything(), function(col) sum(is.na(col)))),
  cols = everything(),
  names_to = "Column",
  values_to = "Missing_Count"
)
missing_df
## # A tibble: 13 × 2
##    Column                    Missing_Count
##    <chr>                             <int>
##  1 User_ID                               0
##  2 Age                                   0
##  3 Gender                                0
##  4 Occupation                            0
##  5 Device_Type                           0
##  6 Daily_Phone_Hours                     0
##  7 Social_Media_Hours                    0
##  8 Work_Productivity_Score               0
##  9 Sleep_Hours                           0
## 10 Stress_Level                          0
## 11 App_Usage_Count                       0
## 12 Caffeine_Intake_Cups                  0
## 13 Weekend_Screen_Time_Hours             0
# Bar plot of missing-value counts, columns ordered from most to fewest missing
ggplot(missing_df) +
  geom_col(aes(x = reorder(Column, -Missing_Count), y = Missing_Count)) +
  ggtitle("Missing Values per Column") +
  xlab("Variables") +
  ylab("Number of Missing Values") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Data Cleaning
# Strip surrounding whitespace from every character column
df <- mutate(df, across(where(is.character), function(txt) str_trim(txt)))
# Inspect the cleaned data with view()
view(df)


# Handle Missing Values
# Print the column names for reference before imputing
colnames(df)
##  [1] "User_ID"                   "Age"                      
##  [3] "Gender"                    "Occupation"               
##  [5] "Device_Type"               "Daily_Phone_Hours"        
##  [7] "Social_Media_Hours"        "Work_Productivity_Score"  
##  [9] "Sleep_Hours"               "Stress_Level"             
## [11] "App_Usage_Count"           "Caffeine_Intake_Cups"     
## [13] "Weekend_Screen_Time_Hours"
# Fill missing Sleep_Hours and Daily_Phone_Hours with that column's median
# (the median is computed over the non-missing values).
df <- df %>%
  mutate(across(
    c(Sleep_Hours, Daily_Phone_Hours),
    function(hours) replace(hours, is.na(hours), median(hours, na.rm = TRUE))
  ))

# Discard rows that have no Stress_Level
df <- drop_na(df, Stress_Level)

# Detect and Remove Outliers (IQR Method)
# Keep only rows whose Daily_Phone_Hours lies within 1.5 * IQR of the quartiles.
Q1 <- quantile(df[["Daily_Phone_Hours"]], probs = 0.25)
Q3 <- quantile(df[["Daily_Phone_Hours"]], probs = 0.75)
IQR_value <- Q3 - Q1

df <- filter(
  df,
  between(Daily_Phone_Hours, Q1 - 1.5 * IQR_value, Q3 + 1.5 * IQR_value)
)

# Same rule for Sleep_Hours; its quartiles come from the rows that survived
# the Daily_Phone_Hours filter above.
Q1_s <- quantile(df[["Sleep_Hours"]], probs = 0.25)
Q3_s <- quantile(df[["Sleep_Hours"]], probs = 0.75)
IQR_s <- Q3_s - Q1_s

df <- filter(
  df,
  between(Sleep_Hours, Q1_s - 1.5 * IQR_s, Q3_s + 1.5 * IQR_s)
)

# Normalise Stress_Level as text: trim surrounding whitespace, then title-case
# any text labels (e.g. "low" -> "Low").
# Note: the stringr calls return character vectors, so Stress_Level is a
# character column after this step even though it was read in as integer.
df <- mutate(df, Stress_Level = str_to_title(str_trim(Stress_Level)))

# Convert Qualitative into Quantitative
# Stress_Numeric: numeric version of Stress_Level.
# Gender_Numeric: 0 for "Male", 1 for "Female", NA for any other value.
df <- df %>%
  mutate(
    Stress_Numeric = as.numeric(Stress_Level),
    # match() gives position 1/2 for Male/Female and NA otherwise; indexing
    # c(0, 1) with it yields the numeric code
    Gender_Numeric = c(0, 1)[match(Gender, c("Male", "Female"))]
  )


# Spot-check the first 10 rows of the original and encoded columns side by side
head(
  select(df, User_ID, Gender, Stress_Level, Gender_Numeric, Stress_Numeric),
  n = 10
)
##    User_ID Gender Stress_Level Gender_Numeric Stress_Numeric
## 1       U1   Male            4              0              4
## 2       U2   Male            1              0              1
## 3       U3   Male            4              0              4
## 4       U4 Female            3              1              3
## 5       U5   Male            3              0              3
## 6       U6 Female            7              1              7
## 7       U7   Male            3              0              3
## 8       U8   Male            3              0              3
## 9       U9 Female            4              1              4
## 10     U10 Female            2              1              2
# Distinct Stress_Level values present after cleaning
unique(df[["Stress_Level"]])
##  [1] "4"  "1"  "3"  "7"  "2"  "9"  "10" "8"  "5"  "6"
# Bar Chart
# Number of rows at each stress level. Stress_Level is a character column at
# this point, so plotting it directly would order the bars as text
# ("1", "10", "2", ...); the numeric copy wrapped in factor() keeps the bars
# in numeric order.
ggplot(df, aes(x = factor(Stress_Numeric))) +
  geom_bar(fill = "lightblue") +
  labs(title = "Distribution of Stress Levels",
       x = "Stress Level",
       y = "Count") +
  theme_minimal()

# Grouped Bar Chart
# Mean Daily_Phone_Hours and Social_Media_Hours per stress level, reshaped to
# long form so the two averages are drawn as side-by-side bars.
df %>%
  group_by(Stress_Numeric) %>%
  summarise(
    Avg_Phone = mean(Daily_Phone_Hours, na.rm = TRUE),
    Avg_Social = mean(Social_Media_Hours, na.rm = TRUE)
  ) %>%
  pivot_longer(cols = c(Avg_Phone, Avg_Social),
               names_to = "Type",
               values_to = "Average_Hours") %>%
  ggplot(aes(x = factor(Stress_Numeric), y = Average_Hours, fill = Type)) +
  geom_col(position = "dodge") +
  # Named values pin each colour to its series regardless of level order
  scale_fill_manual(values = c(Avg_Phone = "lightblue", Avg_Social = "grey")) +
  labs(title = "Average Daily Phone and Social Media Hours by Stress Level",
       # Stress levels in this data run from 1 to 10, not 1 to 3
       x = "Stress Level (1-10)",
       y = "Average Hours") +
  theme_minimal()

# Histogram
# Distribution of Daily_Phone_Hours in bins one hour wide
ggplot(df) +
  geom_histogram(
    aes(x = Daily_Phone_Hours),
    binwidth = 1,
    colour = "white",
    fill = "steelblue"
  ) +
  ggtitle("Distribution of Daily Phone Hours") +
  xlab("Daily Phone Hours") +
  ylab("Number of Users") +
  theme_minimal()