library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.5.3
library(readr)
library(psych)
## Warning: package 'psych' was built under R version 4.5.3
# Read the CSV as plain text first so the header row can be repaired before
# readr parses it.
raw <- readLines(
  con = "C:\\Users\\20253667\\OneDrive - TU Eindhoven\\Documents\\applied data skills\\end project\\df_Sleep_Fatigue_AlcoholUse.csv"
)
# Strip every double quote from the header line only; data rows are untouched.
raw[1] <- gsub(pattern = '"', replacement = '', x = raw[1])
# Round-trip the cleaned text through a temporary file for read_csv().
tmp <- tempfile(fileext = ".csv")
writeLines(text = raw, con = tmp)
data <- read_csv(file = tmp)
## New names:
## Rows: 172 Columns: 20
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (20): ...1, X, ID, TSC, FSS, CIS_Fatigue_severity, CIS_Concentration, CI...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Columns summarised in the descriptive table.
variables <- c(
  "TSC",
  "CIS_Concentration",
  "PSQI_GlobalScore",
  "MCTQ_MSFsc",
  "AUDIT_Score"
)
subset_data <- data[, variables]

# Summary statistics for the selected columns.
desc <- describe(subset_data)

# Statistics kept for the table, and the display label for each row
# (labels are in the same order as `variables`).
var <- c("n", "mean", "sd", "min", "max")
labels <- c(
  "Trait self-control",
  "Concentration (CIS)",
  "Disturbances in sleep quality (PSQI)",
  "Chronotype (MCTQ)",
  "Alcohol usage (AUDIT)"
)
table1 <- data.frame(Variable = labels, desc[, var])

# Render the table; row names are suppressed because the labels are already
# in the `Variable` column.
kbl(table1, caption = "table 1, descriptive", row.names = FALSE) %>%
  kable_styling()
| Variable | n | mean | sd | min | max |
|---|---|---|---|---|---|
| Trait self-control | 162 | 2.831358 | 0.5224661 | 1.85 | 4.31 |
| Concentration (CIS) | 162 | 3.925926 | 1.1227493 | 1.60 | 6.80 |
| Disturbances in sleep quality (PSQI) | 149 | 5.906040 | 2.4282746 | 1.00 | 15.00 |
| Chronotype (MCTQ) | 140 | 5.113357 | 1.0773600 | 2.44 | 8.08 |
| Alcohol usage (AUDIT) | 149 | 8.885906 | 5.2253672 | 0.00 | 25.00 |
library(kableExtra)
library(readr)
library(psych)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.5.3
library(plotly)
## Warning: package 'plotly' was built under R version 4.5.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.5.3
# Load the survey CSV as text so the quoted header line can be cleaned up
# before it is parsed.
raw <- readLines(
  con = "C:\\Users\\20253667\\OneDrive - TU Eindhoven\\Documents\\applied data skills\\end project\\df_Sleep_Fatigue_AlcoholUse.csv"
)
# Only the first (header) line is modified: all double quotes are removed.
raw[1] <- gsub(pattern = '"', replacement = '', x = raw[1])
# Write the repaired lines to a temporary .csv and parse that with readr.
tmp <- tempfile(fileext = ".csv")
writeLines(text = raw, con = tmp)
data <- read_csv(file = tmp)
## New names:
## • `` -> `...1`
## Rows: 172 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (20): ...1, X, ID, TSC, FSS, CIS_Fatigue_severity, CIS_Concentration, CI...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Columns analysed in the figures below.
var <- c(
  "TSC",
  "CIS_Concentration",
  "PSQI_GlobalScore",
  "MCTQ_MSFsc",
  "AUDIT_Score"
)
subset_data <- data[, var]

# Axis labels, in the same order as `var`.
labels <- c(
  "Trait self-control",
  "Concentration (CIS)",
  "Disturbances in sleep quality (PSQI)",
  "Chronotype (MCTQ)",
  "Alcohol usage (AUDIT)"
)
#' Draw a histogram of one column of a data frame.
#'
#' @param .data Data frame that contains the column to plot.
#' @param variable Name of the column to plot, given as a string.
#' @param labels Text used as the x-axis label.
#' @param min_value,max_value Lower and upper end of the visible x-range.
#' @return A ggplot object.
plot_histogram <- function(.data, variable, labels, min_value, max_value) {
  ggplot(.data, aes(x = .data[[variable]])) +
    # Spell out the bin count (30 is what stat_bin() falls back to) so the
    # "Pick better value `binwidth`" message is not printed for every plot.
    geom_histogram(bins = 30) +
    xlab(labels) +
    # Zoom with coord_cartesian() rather than xlim(). xlim() sets scale limits,
    # and the outermost bars reach past min_value/max_value, so they were
    # removed ("Removed 2 rows ... (`geom_bar()`)"), hiding the smallest and
    # largest observations. coord_cartesian() only changes the visible window.
    coord_cartesian(xlim = c(min_value, max_value))
}
# Build one histogram per analysed variable. The list is preallocated and the
# loop runs over `var` itself, so adding or removing a variable does not
# require touching a hard-coded count.
plots <- vector("list", length(var))
for (i in seq_along(var)) {
  value_range <- range(subset_data[[i]], na.rm = TRUE)
  plots[[i]] <- plot_histogram(
    subset_data,
    var[i],
    labels[i],
    value_range[1],
    value_range[2]
  )
}

# Combine all histograms into one figure by folding the `+` operator over the
# list (same result as writing plots[[1]] + plots[[2]] + ...).
figure1 <- Reduce(`+`, plots)
figure1
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 10 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 10 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 23 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 32 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 23 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Histogram of the fifth variable with a thick red vertical line at its mean
# (missing values are ignored when computing the mean).
figure2 <- plots[[5]] +
  geom_vline(
    xintercept = mean(subset_data[[5]], na.rm = TRUE),
    color = "red",
    linewidth = 3
  )
figure2
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 23 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Add a text label with the mean (rounded to two decimals) to figure2.
# paste0() is used because paste() inserts its own separator after "mean = ",
# which produced a double space in the label.
figure3 <- figure2 +
  annotate(
    "text",
    x = 6,
    y = 16,
    label = paste0("mean = ", round(mean(subset_data[[5]], na.rm = TRUE), 2))
  )
figure3
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 23 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
# Outlier rule: a score is an outlier when it lies more than two standard
# deviations away from the mean of the fifth variable.
audit_scores <- subset_data[[5]]
audit_mean <- mean(audit_scores, na.rm = TRUE)
audit_sd <- sd(audit_scores, na.rm = TRUE)
lower_bound <- audit_mean - 2 * audit_sd
upper_bound <- audit_mean + 2 * audit_sd

# Logical flag per row; rows with a missing score stay NA here.
outliers <- audit_scores < lower_bound | audit_scores > upper_bound
show <- na.omit(audit_scores[outliers])

# which() drops the NA flags, so only rows that are truly flagged are listed.
data[which(outliers), c("ID", "AUDIT_Score")]
## # A tibble: 3 × 2
## ID AUDIT_Score
## <dbl> <dbl>
## 1 145 25
## 2 160 21
## 3 164 21
# Mark the outlier thresholds (mean -/+ two standard deviations, computed
# earlier as lower_bound and upper_bound) with dashed blue lines.
figure4 <- figure3 +
  geom_vline(xintercept = lower_bound, color = "blue", linetype = "dashed") +
  geom_vline(xintercept = upper_bound, color = "blue", linetype = "dashed")
figure4
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 23 rows containing non-finite outside the scale range (`stat_bin()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_vline()`).
# Convert the annotated histogram (figure4) into an interactive plotly widget.
figure5 <- ggplotly(figure4)
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 23 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Display the interactive figure.
figure5
#' Draw a horizontal boxplot of one column of a data frame.
#'
#' @param .data Data frame that contains the column to plot.
#' @param variable Name of the column to plot, given as a string.
#' @param labels Text used as the x-axis label.
#' @param min_value,max_value Lower and upper x-axis limits.
#' @return A ggplot object.
plot_boxplot <- function(.data, variable, labels, min_value, max_value) {
  # The y position is a constant 0, so its text and ticks are hidden.
  hidden_y_axis <- theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )

  ggplot(.data, aes(x = .data[[variable]], y = 0)) +
    geom_boxplot() +
    labs(x = labels, y = "") +
    xlim(min_value, max_value) +
    hidden_y_axis
}
# Build one boxplot per analysed variable. The list is preallocated and the
# loop runs over `var` itself instead of a hard-coded 1:5.
plots_2 <- vector("list", length(var))
for (i in seq_along(var)) {
  value_range <- range(subset_data[[i]], na.rm = TRUE)
  plots_2[[i]] <- plot_boxplot(
    subset_data,
    var[i],
    labels[i],
    value_range[1],
    value_range[2]
  )
}

# Only the boxplot of the fifth variable is shown.
figure6 <- plots_2[[5]]
figure6
## Warning: Removed 23 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Correlations between the analysed variables, using only rows that have no
# missing value in any of them.
cor_matrix <- cor(subset_data, use = "complete.obs")

# Replace the column names with readable labels on both axes.
dimnames(cor_matrix) <- list(labels, labels)

# Lower-triangle heatmap with the coefficients printed in the cells. The
# stray trailing comma, which passed an empty extra argument, is removed.
figure7 <- ggcorrplot(cor_matrix, lab = TRUE, type = "lower")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggcorrplot package.
## Please report the issue at <https://github.com/kassambara/ggcorrplot/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Display the correlation heatmap.
figure7
1.4: 4a. The outlier scores and their IDs are shown in the table. 4b.
The lower bound (two standard deviations below the mean) does not appear
on the graph because it lies below the smallest observed score, outside
the plotted range; this is why only the dashed line on the right, two
standard deviations above the mean, is visible. 4d. The
boxplot shows one outlier at 25, while the histogram shows two outliers
at 20.68…