This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk, or by placing your cursor inside it and pressing Cmd+Shift+Enter (Ctrl+Shift+Enter on Windows and Linux).

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the burnout survey extract. readr guesses the column types here
# (one character column and nine doubles, per the specification printed below).
burnout <- read_csv("student_mental_health_burnout_1M-selected-columns-3.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 1000000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): gender
## dbl (9): age, academic_year, study_hours_per_day, exam_pressure, academic_pe...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# The read above reported parsing issues. List the affected rows and columns
# rather than ignoring the warning, so any value that could not be parsed is
# identified before the analysis continues.
problems(burnout)
# Size of the full data set as (rows, columns).
burnout |> dim()
## [1] 1000000      10
# Peek at the first and last six rows, then show the column-by-column structure.
burnout |> head(n = 6)
burnout |> tail(n = 6)
burnout |> str()
## spc_tbl_ [1,000,000 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ age                 : num [1:1000000] 23 20 29 27 24 29 21 23 26 19 ...
##  $ gender              : chr [1:1000000] "Male" "Male" "Male" "Male" ...
##  $ academic_year       : num [1:1000000] 2 3 2 4 4 3 3 2 4 3 ...
##  $ study_hours_per_day : num [1:1000000] 5.6 5.6 2.58 4.61 2.19 ...
##  $ exam_pressure       : num [1:1000000] 6.49 5.63 6.02 6.68 4.01 ...
##  $ academic_performance: num [1:1000000] 68.4 67.7 58.4 68.9 69.1 ...
##  $ stress_level        : num [1:1000000] 4.117 0.349 3.476 6.779 1.855 ...
##  $ anxiety_score       : num [1:1000000] 2.28 0 2.43 4.51 1.1 ...
##  $ depression_score    : num [1:1000000] 1.987 0 0.852 4.286 0 ...
##  $ sleep_hours         : num [1:1000000] 6.88 7.46 8.95 4.57 5.99 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   age = col_double(),
##   ..   gender = col_character(),
##   ..   academic_year = col_double(),
##   ..   study_hours_per_day = col_double(),
##   ..   exam_pressure = col_double(),
##   ..   academic_performance = col_double(),
##   ..   stress_level = col_double(),
##   ..   anxiety_score = col_double(),
##   ..   depression_score = col_double(),
##   ..   sleep_hours = col_double()
##   .. )
##  - attr(*, "problems")=<pointer: 0x7fadcf87b760>
# Keep only the two columns used in the study-vs-sleep comparison. The
# selection is computed once, stored, and then printed, instead of running the
# same select() twice (once to display and once to assign).
studysleep <- burnout |>
  select(study_hours_per_day, sleep_hours)
studysleep
dim(studysleep)
## [1] 1000000       2
# Per-column summary statistics (quartiles, mean, and NA count) for both variables.
studysleep |> summary()
##  study_hours_per_day  sleep_hours    
##  Min.   : 0.000      Min.   : 3.000  
##  1st Qu.: 3.651      1st Qu.: 5.491  
##  Median : 4.998      Median : 6.502  
##  Mean   : 5.002      Mean   : 6.502  
##  3rd Qu.: 6.346      3rd Qu.: 7.515  
##  Max.   :14.000      Max.   :10.000  
##                      NAs    :1
# summary(studysleep) reports one missing sleep_hours value. Drop rows with a
# missing value in either column up front, so both plots are drawn from the
# same complete cases and ggplot2 no longer has to discard a row with a
# "Removed 1 row containing missing values" warning.
studysleep_complete <- studysleep |>
  drop_na(study_hours_per_day, sleep_hours)

# Quick base-graphics scatter plot of the two columns.
plot(studysleep_complete)

# The same relationship drawn with ggplot2; points are semi-transparent
# (alpha = 0.3) so that overlapping observations remain distinguishable.
ggplot(studysleep_complete, aes(x = study_hours_per_day, y = sleep_hours)) +
  geom_point(alpha = 0.3) +
  labs(
    title = "Study Hours vs. Sleep Hours",
    x = "Study", y = "Sleep"
  ) +
  theme_minimal()