library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
# Load Dataset
# NOTE(review): the path below is absolute and machine-specific, so the script
# only runs where the CSV sits at exactly this location.
df <- read.csv("C:/college/3rd year/DCS 402/Smartphone_Usage_Productivity_Dataset_50000.csv")
# Print the structure of the loaded data (column names, types, first values).
str(df)
## 'data.frame': 50000 obs. of 13 variables:
## $ User_ID : chr "U1" "U2" "U3" "U4" ...
## $ Age : int 58 25 19 35 33 32 26 24 52 23 ...
## $ Gender : chr "Male" "Male" "Male" "Female" ...
## $ Occupation : chr "Professional" "Professional" "Student" "Business Owner" ...
## $ Device_Type : chr "Android" "Android" "iOS" "iOS" ...
## $ Daily_Phone_Hours : num 1.3 1.2 5.3 5.8 7.9 10.9 5.6 8.5 9.4 2.8 ...
## $ Social_Media_Hours : num 6.7 1.5 5.7 2.5 1.3 4.2 6.1 1.9 2.5 1.1 ...
## $ Work_Productivity_Score : int 6 5 5 2 4 9 8 4 3 1 ...
## $ Sleep_Hours : num 8.8 6.4 9 5.7 5.7 6.3 6.7 7.8 6.4 4.5 ...
## $ Stress_Level : int 4 1 4 3 3 7 3 3 4 2 ...
## $ App_Usage_Count : int 42 51 14 36 37 34 5 6 17 7 ...
## $ Caffeine_Intake_Cups : int 1 3 5 6 5 5 3 6 3 0 ...
## $ Weekend_Screen_Time_Hours: num 8.7 5.1 6.3 12.8 9.9 3.6 2.9 12.8 3.1 2.7 ...
# Preview the first six rows of the raw data.
head(df, n = 6L)
## User_ID Age Gender Occupation Device_Type Daily_Phone_Hours
## 1 U1 58 Male Professional Android 1.3
## 2 U2 25 Male Professional Android 1.2
## 3 U3 19 Male Student iOS 5.3
## 4 U4 35 Female Business Owner iOS 5.8
## 5 U5 33 Male Freelancer Android 7.9
## 6 U6 32 Female Student Android 10.9
## Social_Media_Hours Work_Productivity_Score Sleep_Hours Stress_Level
## 1 6.7 6 8.8 4
## 2 1.5 5 6.4 1
## 3 5.7 5 9.0 4
## 4 2.5 2 5.7 3
## 5 1.3 4 5.7 3
## 6 4.2 9 6.3 7
## App_Usage_Count Caffeine_Intake_Cups Weekend_Screen_Time_Hours
## 1 42 1 8.7
## 2 51 3 5.1
## 3 14 5 6.3
## 4 36 6 12.8
## 5 37 5 9.9
## 6 34 5 3.6
# Raw data dimensions: number of rows, then number of columns
dim(df)[1L]
## [1] 50000
dim(df)[2L]
## [1] 13
# Missing Values Check
# Count the NA entries in every column, then reshape the one-row summary into
# a long table with one row per column name.
missing_df <- df %>%
  summarise(across(everything(), \(column) sum(is.na(column)))) %>%
  pivot_longer(
    everything(),
    names_to = "Column",
    values_to = "Missing_Count"
  )
missing_df
## # A tibble: 13 × 2
## Column Missing_Count
## <chr> <int>
## 1 User_ID 0
## 2 Age 0
## 3 Gender 0
## 4 Occupation 0
## 5 Device_Type 0
## 6 Daily_Phone_Hours 0
## 7 Social_Media_Hours 0
## 8 Work_Productivity_Score 0
## 9 Sleep_Hours 0
## 10 Stress_Level 0
## 11 App_Usage_Count 0
## 12 Caffeine_Intake_Cups 0
## 13 Weekend_Screen_Time_Hours 0
# Bar Plot - Missing data
# Columns are placed along the x axis in order of descending missing count;
# the x labels are rotated 45 degrees so the long column names stay readable.
missing_df %>%
  ggplot(aes(x = reorder(Column, -Missing_Count), y = Missing_Count)) +
  geom_col() +
  labs(
    x = "Variables",
    y = "Number of Missing Values",
    title = "Missing Values per Column"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

# Data Cleaning
# Strip leading and trailing whitespace from every character column.
df <- mutate(df, across(where(is.character), \(text) str_trim(text)))
# Inspect the cleaned data frame with view().
view(df)
# Handle Missing Values
names(df)
## [1] "User_ID" "Age"
## [3] "Gender" "Occupation"
## [5] "Device_Type" "Daily_Phone_Hours"
## [7] "Social_Media_Hours" "Work_Productivity_Score"
## [9] "Sleep_Hours" "Stress_Level"
## [11] "App_Usage_Count" "Caffeine_Intake_Cups"
## [13] "Weekend_Screen_Time_Hours"
# Fill missing Sleep_Hours and Daily_Phone_Hours with that column's own median
# (computed over the non-missing values), then discard any row whose
# Stress_Level is missing.
df <- df %>%
  mutate(
    across(
      c(Sleep_Hours, Daily_Phone_Hours),
      \(hours) replace(hours, is.na(hours), median(hours, na.rm = TRUE))
    )
  ) %>%
  drop_na(Stress_Level)
# Detect and Remove Outliers (IQR Method)
# Keep only rows whose value lies within 1.5 * IQR of the quartiles.

# Pass 1: Daily_Phone_Hours.
Q1 <- quantile(df[["Daily_Phone_Hours"]], probs = 0.25)
Q3 <- quantile(df[["Daily_Phone_Hours"]], probs = 0.75)
IQR_value <- Q3 - Q1
df <- df %>%
  filter(
    Daily_Phone_Hours >= Q1 - 1.5 * IQR_value,
    Daily_Phone_Hours <= Q3 + 1.5 * IQR_value
  )

# Pass 2: Sleep_Hours. The quartiles are taken from the rows that survived
# pass 1, so the two filters are applied sequentially, not independently.
Q1_s <- quantile(df[["Sleep_Hours"]], probs = 0.25)
Q3_s <- quantile(df[["Sleep_Hours"]], probs = 0.75)
IQR_s <- Q3_s - Q1_s
df <- df %>%
  filter(
    Sleep_Hours >= Q1_s - 1.5 * IQR_s,
    Sleep_Hours <= Q3_s + 1.5 * IQR_s
  )
# Normalise Stress_Level text: trim surrounding whitespace, then title-case.
# NOTE(review): Stress_Level is read as an integer column, and str_trim()
# returns character, so after this step the column holds strings such as "4".
df <- df %>%
  mutate(Stress_Level = str_to_title(str_trim(Stress_Level)))
# Convert Qualitative into Quantitative
# Stress_Numeric is the numeric form of Stress_Level. Gender_Numeric encodes
# "Male" as 0 and "Female" as 1; every other value becomes NA.
df <- df %>%
  mutate(
    Stress_Numeric = as.numeric(Stress_Level),
    Gender_Numeric = case_when(
      Gender == "Female" ~ 1,
      Gender == "Male" ~ 0,
      .default = NA_real_
    )
  )
# Show the original and encoded columns side by side for the first 10 rows.
head(
  select(df, User_ID, Gender, Stress_Level, Gender_Numeric, Stress_Numeric),
  n = 10
)
## User_ID Gender Stress_Level Gender_Numeric Stress_Numeric
## 1 U1 Male 4 0 4
## 2 U2 Male 1 0 1
## 3 U3 Male 4 0 4
## 4 U4 Female 3 1 3
## 5 U5 Male 3 0 3
## 6 U6 Female 7 1 7
## 7 U7 Male 3 0 3
## 8 U8 Male 3 0 3
## 9 U9 Female 4 1 4
## 10 U10 Female 2 1 2
# List the distinct Stress_Level values in order of first appearance.
unique(df[["Stress_Level"]])
## [1] "4" "1" "3" "7" "2" "9" "10" "8" "5" "6"
# Bar Chart
# Stress_Level holds character values at this point, so mapping it directly
# orders the bars alphabetically ("1", "10", "2", ...). Plot the numeric copy
# as a factor instead so the bars run in numeric order, matching the grouped
# chart, which also uses factor(Stress_Numeric).
ggplot(df, aes(x = factor(Stress_Numeric))) +
  geom_bar(fill = "lightblue") +
  labs(
    title = "Distribution of Stress Levels",
    x = "Stress Level",
    y = "Count"
  ) +
  theme_minimal()

# Grouped Bar Chart
# For each stress level, average the daily phone hours and the social media
# hours, reshape to one row per (stress level, measure) pair, and draw the two
# measures as side-by-side bars.
df %>%
  group_by(Stress_Numeric) %>%
  summarise(
    Avg_Phone = mean(Daily_Phone_Hours, na.rm = TRUE),
    Avg_Social = mean(Social_Media_Hours, na.rm = TRUE)
  ) %>%
  pivot_longer(
    cols = c(Avg_Phone, Avg_Social),
    names_to = "Type",
    values_to = "Average_Hours"
  ) %>%
  ggplot(aes(x = factor(Stress_Numeric), y = Average_Hours, fill = Type)) +
  geom_col(position = "dodge") +
  # Name the colours so each measure keeps its colour regardless of the order
  # in which the Type levels are sorted.
  scale_fill_manual(values = c(Avg_Phone = "lightblue", Avg_Social = "grey")) +
  labs(
    title = "Average Daily Phone and Social Media Hours by Stress Level",
    # The data contains stress levels 1 through 10, not a 3-level scale.
    x = "Stress Level (1-10)",
    y = "Average Hours"
  ) +
  theme_minimal()

# Histogram
# Distribution of Daily_Phone_Hours in one-hour-wide bins.
df %>%
  ggplot(aes(x = Daily_Phone_Hours)) +
  geom_histogram(color = "white", fill = "steelblue", binwidth = 1) +
  labs(
    x = "Daily Phone Hours",
    y = "Number of Users",
    title = "Distribution of Daily Phone Hours"
  ) +
  theme_minimal()
