end project

library(kableExtra)

## Warning: package 'kableExtra' was built under R version 4.5.3

library(readr)
library(psych)

## Warning: package 'psych' was built under R version 4.5.3

# Read the survey export as plain text so the header line can be cleaned
# before the file is parsed.
raw <- readLines(
  "C:\\Users\\20253667\\OneDrive - TU Eindhoven\\Documents\\applied data skills\\end project\\df_Sleep_Fatigue_AlcoholUse.csv"
)

# Remove every double quote from the header line only; data rows are untouched.
raw[1] <- gsub("\"", "", raw[1], fixed = TRUE)

# Write the cleaned text to a temporary .csv file and parse that with readr.
tmp <- tempfile(fileext = ".csv")
writeLines(text = raw, con = tmp)
data <- read_csv(file = tmp)

## New names:
## Rows: 172 Columns: 20
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (20): ...1, X, ID, TSC, FSS, CIS_Fatigue_severity, CIS_Concentration, CI...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

# Columns of `data` that are summarised in the descriptive table.
variables <- c(
  "TSC",
  "CIS_Concentration",
  "PSQI_GlobalScore",
  "MCTQ_MSFsc",
  "AUDIT_Score"
)

subset_data <- data[variables]

# Summary statistics for each selected column.
desc <- describe(subset_data)

# Statistics kept from the describe() result, in display order.
var <- c("n", "mean", "sd", "min", "max")

# Human-readable row labels, listed in the same order as `variables`.
labels <- c(
  "Trait self-control",
  "Concentration (CIS)",
  "Disturbances in sleep quality (PSQI)",
  "Chronotype (MCTQ)",
  "Alcohol usage (AUDIT)"
)

table1 <- data.frame(Variable = labels, desc[, var])

# Render the styled table; row names are suppressed because the `Variable`
# column already carries the labels.
kable_styling(
  kbl(table1, caption = "table 1, descriptive", row.names = FALSE)
)

table 1, descriptive
Variable	n	mean	sd	min	max
Trait self-control	162	2.831358	0.5224661	1.85	4.31
Concentration (CIS)	162	3.925926	1.1227493	1.60	6.80
Disturbances in sleep quality (PSQI)	149	5.906040	2.4282746	1.00	15.00
Chronotype (MCTQ)	140	5.113357	1.0773600	2.44	8.08
Alcohol usage (AUDIT)	149	8.885906	5.2253672	0.00	25.00

library(kableExtra)
library(readr)
library(psych)
library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

library(patchwork)

## Warning: package 'patchwork' was built under R version 4.5.3

library(plotly)

## Warning: package 'plotly' was built under R version 4.5.3

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(ggcorrplot)

## Warning: package 'ggcorrplot' was built under R version 4.5.3

# Read the survey export as plain text so the header line can be cleaned
# before the file is parsed.
raw <- readLines(
  "C:\\Users\\20253667\\OneDrive - TU Eindhoven\\Documents\\applied data skills\\end project\\df_Sleep_Fatigue_AlcoholUse.csv"
)

# Remove every double quote from the header line only; data rows are untouched.
raw[1] <- gsub("\"", "", raw[1], fixed = TRUE)

# Write the cleaned text to a temporary .csv file and parse that with readr.
tmp <- tempfile(fileext = ".csv")
writeLines(text = raw, con = tmp)
data <- read_csv(file = tmp)

## New names:
## • `` -> `...1`

## Rows: 172 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (20): ...1, X, ID, TSC, FSS, CIS_Fatigue_severity, CIS_Concentration, CI...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Columns of `data` used for every figure below.
var <- c(
  "TSC",
  "CIS_Concentration",
  "PSQI_GlobalScore",
  "MCTQ_MSFsc",
  "AUDIT_Score"
)

subset_data <- data[var]

# Axis labels, listed in the same order as `var`.
labels <- c(
  "Trait self-control",
  "Concentration (CIS)",
  "Disturbances in sleep quality (PSQI)",
  "Chronotype (MCTQ)",
  "Alcohol usage (AUDIT)"
)

# Draw a histogram of one column of a data frame.
#
# Arguments:
#   .data      Data frame that contains the column to plot.
#   variable   Name of the column, given as a string.
#   labels     Text used as the x-axis label.
#   min_value  Lower end of the visible x-axis range.
#   max_value  Upper end of the visible x-axis range.
#   bins       Number of histogram bins (default 30, the value ggplot2 falls
#              back to when none is given).
#
# Returns a ggplot object.
#
# NOTE(review): the parameter name `.data` coincides with the `.data` pronoun
# used inside aes(); it is kept as-is so existing callers are unaffected.
plot_histogram <- function(.data, variable, labels, min_value, max_value,
                           bins = 30) {
  ggplot(.data, aes(x = .data[[variable]])) +
    geom_histogram(bins = bins) +
    xlab(labels) +
    # Zoom the view instead of setting scale limits with xlim(): scale limits
    # discard any bar that extends past them, which silently removed the
    # outermost bins (the ones holding the minimum and maximum values).
    coord_cartesian(xlim = c(min_value, max_value))
}

# Build one histogram per analysed variable. Iterating over seq_along(var)
# keeps the number of plots in step with `var` instead of a hard-coded 1:5,
# and lapply() builds the list in one go rather than growing it in a loop.
plots <- lapply(seq_along(var), function(i) {
  # range() returns c(min, max) of the observed (non-missing) values.
  value_range <- range(subset_data[[i]], na.rm = TRUE)
  plot_histogram(
    subset_data,
    var[i],
    labels[i],
    value_range[1],
    value_range[2]
  )
})

# Combine all histograms into a single patchwork figure.
figure1 <- wrap_plots(plots)

figure1

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

## Warning: Removed 10 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

## Warning: Removed 10 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

## Warning: Removed 23 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

## Warning: Removed 32 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

## Warning: Removed 23 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

# Mean of the fifth analysed variable (AUDIT_Score), ignoring missing values.
audit_mean <- mean(subset_data[[5]], na.rm = TRUE)

# Mark that mean on the fifth histogram with a thick red vertical line.
figure2 <- plots[[5]] +
  geom_vline(xintercept = audit_mean, color = "red", linewidth = 3)

figure2

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

## Warning: Removed 23 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

# Text label showing the mean rounded to two decimals.
mean_label <- paste(
  "mean = ",
  round(mean(subset_data[[5]], na.rm = TRUE), 2)
)

# Place the label at fixed plot coordinates (x = 6, y = 16).
figure3 <- figure2 +
  annotate("text", x = 6, y = 16, label = mean_label)

figure3

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

## Warning: Removed 23 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

# Flag scores of the fifth variable (AUDIT_Score) that lie more than two
# standard deviations away from its mean.
audit_scores <- subset_data[[5]]
audit_mean <- mean(audit_scores, na.rm = TRUE)
audit_sd <- sd(audit_scores, na.rm = TRUE)

lower_bound <- audit_mean - 2 * audit_sd
upper_bound <- audit_mean + 2 * audit_sd

# Logical vector; NA where the score itself is missing.
outliers <- audit_scores > upper_bound | audit_scores < lower_bound

# Outlying score values with the missing entries dropped.
# NOTE(review): `show` is not used again in the visible code.
show <- na.omit(audit_scores[outliers])

# which() ignores the NA entries, so only true outliers are listed.
data[which(outliers), c("ID", "AUDIT_Score")]

## # A tibble: 3 × 2
##      ID AUDIT_Score
##   <dbl>       <dbl>
## 1   145          25
## 2   160          21
## 3   164          21

# Add dashed blue lines at mean - 2 SD and mean + 2 SD. The bounds computed
# for the outlier check hold exactly these values, so they are reused here.
figure4 <- figure3 +
  geom_vline(xintercept = lower_bound, color = "blue", linetype = "dashed") +
  geom_vline(xintercept = upper_bound, color = "blue", linetype = "dashed")

figure4

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

## Warning: Removed 23 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_vline()`).

# Convert the annotated histogram into an interactive plotly figure.
figure5 <- ggplotly(p = figure4)

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

## Warning: Removed 23 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Display the interactive figure created from figure4.
figure5

# Draw a single horizontal boxplot of one column of a data frame.
#
# Arguments:
#   .data      Data frame that contains the column to plot.
#   variable   Name of the column, given as a string.
#   labels     Text used as the x-axis label.
#   min_value  Lower x-axis limit.
#   max_value  Upper x-axis limit.
#
# Returns a ggplot object. The y aesthetic is the constant 0, so the y-axis
# title, tick labels and tick marks are blanked out.
plot_boxplot <- function(.data, variable, labels, min_value, max_value) {
  hide_y_axis <- theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )

  base_plot <- ggplot(.data, aes(x = .data[[variable]], y = 0))

  base_plot +
    geom_boxplot() +
    labs(x = labels, y = "") +
    xlim(min_value, max_value) +
    hide_y_axis
}

# Build one boxplot per analysed variable. Iterating over seq_along(var)
# keeps the number of plots in step with `var` instead of a hard-coded 1:5,
# and lapply() builds the list in one go rather than growing it in a loop.
plots_2 <- lapply(seq_along(var), function(i) {
  # range() returns c(min, max) of the observed (non-missing) values.
  value_range <- range(subset_data[[i]], na.rm = TRUE)
  plot_boxplot(
    subset_data,
    var[i],
    labels[i],
    value_range[1],
    value_range[2]
  )
})

# Only the boxplot of the fifth variable (AUDIT_Score) is shown.
figure6 <- plots_2[[5]]
figure6

## Warning: Removed 23 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Correlations between the analysed variables. use = "complete.obs" restricts
# the calculation to rows without any missing value.
cor_matrix <- cor(subset_data, use = "complete.obs")

# Replace the column names with the readable labels on both dimensions.
dimnames(cor_matrix) <- list(labels, labels)

# Lower-triangle correlation plot with the coefficients printed in the cells.
# The stray trailing comma of the original call (an empty argument) is removed.
figure7 <- ggcorrplot(
  cor_matrix,
  lab = TRUE,
  type = "lower"
)

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggcorrplot package.
##   Please report the issue at <https://github.com/kassambara/ggcorrplot/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Display the correlation plot.
figure7

1.4: 4a. The outlier scores and their IDs are shown in the table. 4b. The value two standard deviations below the mean does not fit on the graph because it lies below the lowest observed score, where there are no values; this is why only the dashed line on the right, marking two standard deviations above the mean, is visible. 4d. The boxplot shows one outlier at 25, while the histogram shows two outliers at 20.68…

end project

2026-04-01