We’re looking at information from 477 people, each represented as one row in the dataset. Every column is a different piece of information we collected about them. Some columns capture demographics (like age, gender, and ethnicity), while others are responses to questionnaires.
There are two main groups of questionnaire items here, plus a single attention-check item (AttnChk) that sits between them in the file:
- Recovery questions (recovery1 through recovery16): these ask about how well someone bounces back from stress or fatigue.
- PANAS questions (PANAS1 through PANAS20): this stands for Positive and Negative Affect Schedule, a common survey that measures how positive or negative a person’s emotions are.
Think of the dataset as a giant spreadsheet where each row is a person and each column is a question they answered.
# Load the survey export from the same folder as this .Rmd; empty cells and
# the literal string "NA" are both read in as missing values.
stress <- read.csv(file = "Stress.csv", na.strings = c("", "NA"))
# Quick structural checks on what was loaded
c(nrow(stress), ncol(stress)) # rows x columns
## [1] 477 40
colnames(stress) # column names
## [1] "age" "gender" "ethnicity" "recovery1" "recovery2"
## [6] "recovery3" "recovery4" "recovery5" "recovery6" "recovery7"
## [11] "recovery8" "recovery9" "recovery10" "recovery11" "recovery12"
## [16] "recovery13" "recovery14" "recovery15" "recovery16" "AttnChk"
## [21] "PANAS1" "PANAS2" "PANAS3" "PANAS4" "PANAS5"
## [26] "PANAS6" "PANAS7" "PANAS8" "PANAS9" "PANAS10"
## [31] "PANAS11" "PANAS12" "PANAS13" "PANAS14" "PANAS15"
## [36] "PANAS16" "PANAS17" "PANAS18" "PANAS19" "PANAS20"
head(stress, n = 5L) # first 5 rows
## age gender ethnicity recovery1 recovery2 recovery3 recovery4 recovery5
## 1 34 2 1 NA NA NA NA NA
## 2 22 1 1 NA NA NA NA NA
## 3 29 2 1 NA NA NA NA NA
## 4 34 2 1 NA NA NA NA NA
## 5 34 1 1 NA NA NA NA NA
## recovery6 recovery7 recovery8 recovery9 recovery10 recovery11 recovery12
## 1 NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA
## recovery13 recovery14 recovery15 recovery16 AttnChk PANAS1 PANAS2 PANAS3
## 1 NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA
## PANAS4 PANAS5 PANAS6 PANAS7 PANAS8 PANAS9 PANAS10 PANAS11 PANAS12 PANAS13
## 1 NA NA NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA NA NA
## PANAS14 PANAS15 PANAS16 PANAS17 PANAS18 PANAS19 PANAS20
## 1 NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA
# Add a sequential respondent identifier (1, 2, ..., number of rows)
stress[["ID"]] <- seq_len(nrow(stress))
# Show the new ID beside the first five original columns
head(stress[, c("ID", names(stress)[seq_len(5)])], n = 5)
## ID age gender ethnicity recovery1 recovery2
## 1 1 34 2 1 NA NA
## 2 2 22 1 1 NA NA
## 3 3 29 2 1 NA NA
## 4 4 34 2 1 NA NA
## 5 5 34 1 1 NA NA
# Questionnaire item columns: names of the form recovery<digits> or
# PANAS<digits>
num_cols <- grep("^(recovery\\d+|PANAS\\d+)$", names(stress), value = TRUE)
# Convert only the item columns that are not numeric yet. Coercion warnings
# are deliberately silenced; values that cannot be parsed end up as NA.
not_numeric <- num_cols[!vapply(stress[num_cols], is.numeric, logical(1))]
for (item in not_numeric) {
  stress[[item]] <- suppressWarnings(as.numeric(stress[[item]]))
}
# Inspect the resulting column types
str(stress[num_cols])
## 'data.frame': 477 obs. of 36 variables:
## $ recovery1 : int NA NA NA NA NA NA NA NA NA NA ...
## $ recovery2 : int NA NA NA NA NA NA NA NA NA NA ...
## $ recovery3 : int NA NA NA NA NA NA NA NA NA NA ...
## $ recovery4 : int NA NA NA NA NA NA NA NA NA NA ...
## $ recovery5 : int NA NA NA NA NA NA NA NA NA NA ...
## $ recovery6 : int NA NA NA NA NA NA NA NA NA NA ...
## $ recovery7 : int NA NA NA NA NA NA NA NA NA NA ...
## $ recovery8 : int NA NA NA NA NA NA NA NA NA NA ...
## $ recovery9 : int NA NA NA NA NA NA NA NA NA NA ...
## $ recovery10: int NA NA NA NA NA NA NA NA NA NA ...
## $ recovery11: int NA NA NA NA NA NA NA NA NA NA ...
## $ recovery12: int NA NA NA NA NA NA NA NA NA NA ...
## $ recovery13: int NA NA NA NA NA NA NA NA NA NA ...
## $ recovery14: int NA NA NA NA NA NA NA NA NA NA ...
## $ recovery15: int NA NA NA NA NA NA NA NA NA NA ...
## $ recovery16: int NA NA NA NA NA NA NA NA NA NA ...
## $ PANAS1 : int NA NA NA NA NA NA NA NA NA NA ...
## $ PANAS2 : int NA NA NA NA NA NA NA NA NA NA ...
## $ PANAS3 : int NA NA NA NA NA NA NA NA NA NA ...
## $ PANAS4 : int NA NA NA NA NA NA NA NA NA NA ...
## $ PANAS5 : int NA NA NA NA NA NA NA NA NA NA ...
## $ PANAS6 : int NA NA NA NA NA NA NA NA NA NA ...
## $ PANAS7 : int NA NA NA NA NA NA NA NA NA NA ...
## $ PANAS8 : int NA NA NA NA NA NA NA NA NA NA ...
## $ PANAS9 : int NA NA NA NA NA NA NA NA NA NA ...
## $ PANAS10 : int NA NA NA NA NA NA NA NA NA NA ...
## $ PANAS11 : int NA NA NA NA NA NA NA NA NA NA ...
## $ PANAS12 : int NA NA NA NA NA NA NA NA NA NA ...
## $ PANAS13 : int NA NA NA NA NA NA NA NA NA NA ...
## $ PANAS14 : int NA NA NA NA NA NA NA NA NA NA ...
## $ PANAS15 : int NA NA NA NA NA NA NA NA NA NA ...
## $ PANAS16 : int NA NA NA NA NA NA NA NA NA NA ...
## $ PANAS17 : int NA NA NA NA NA NA NA NA NA NA ...
## $ PANAS18 : int NA NA NA NA NA NA NA NA NA NA ...
## $ PANAS19 : int NA NA NA NA NA NA NA NA NA NA ...
## $ PANAS20 : int NA NA NA NA NA NA NA NA NA NA ...
# Safe, codebook-agnostic factorization: turn whichever of the demographic
# code columns are present into factors, keeping the raw codes as levels.
demo_cols <- intersect(c("gender", "ethnicity"), names(stress))
for (demo in demo_cols) {
  stress[[demo]] <- factor(stress[[demo]])
}
# Example labeled version (use only if you know your coding). Any code not
# listed in `levels` becomes NA, and this data also contains gender code 3 and
# ethnicity code 6, so extend the levels/labels before using it:
# stress$gender <- factor(stress$gender, levels = c(1, 2), labels = c("Female", "Male"))
# stress$ethnicity <- factor(stress$ethnicity,
# levels = c(1, 2, 3, 4, 5),
# labels = c("Group1","Group2","Group3","Group4","Group5"))
# Level counts (including NA) for the demographic factors
summary(stress[, demo_cols])
## gender ethnicity
## 1 :275 1 :377
## 2 :197 2 : 44
## 3 : 2 3 : 5
## NA's: 3 4 : 28
## 5 : 1
## 6 : 19
## NA's: 3
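Rather than dropping incomplete rows, we first count the missing values and then fill them in with explicit rules (the column median for numeric columns, a separate “Missing” category for gender and ethnicity). This way, we’re not throwing away data, but we’re also not pretending the blanks don’t exist.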
# Missing-value count for every column; only columns with gaps are printed
na_by_col <- vapply(stress, function(col) sum(is.na(col)), integer(1))
na_by_col[na_by_col > 0]
## age gender ethnicity recovery1 recovery2 recovery3 recovery4
## 3 3 3 25 25 25 25
## recovery5 recovery6 recovery7 recovery8 recovery9 recovery10 recovery11
## 25 25 25 25 25 25 25
## recovery12 recovery13 recovery14 recovery15 recovery16 AttnChk PANAS1
## 25 25 25 25 25 25 51
## PANAS2 PANAS3 PANAS4 PANAS5 PANAS6 PANAS7 PANAS8
## 52 51 52 51 51 53 51
## PANAS9 PANAS10 PANAS11 PANAS12 PANAS13 PANAS14 PANAS15
## 51 52 51 52 53 51 52
## PANAS16 PANAS17 PANAS18 PANAS19 PANAS20
## 51 52 52 52 52
# Grand total of missing cells across the whole table
sum(na_by_col)
## [1] 1467
# Number of respondents (rows) with at least one missing value
rows_with_any_na <- nrow(stress) - sum(complete.cases(stress))
rows_with_any_na
## [1] 66
# Load dplyr for the assignment's "include some dplyr" requirement
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Identify numeric and factor columns
num_cols <- names(Filter(is.numeric, stress))
factor_cols <- names(Filter(is.factor, stress))
# ID is a row identifier rather than a measurement, so keep it out of the
# median fill; `num_cols` itself is left unchanged.
impute_cols <- setdiff(num_cols, "ID")
# NOTE(review): impute_cols still includes AttnChk and every questionnaire
# item, so respondents who left a whole scale blank receive the column median
# for each item (which can be a non-integer such as 3.5). Confirm this is
# intended before interpreting item-level results.
# Create a cleaned copy with explicit, reproducible rules:
#   * numeric columns: NA -> column median of the observed values
#   * factor columns:  NA -> an explicit "Missing" level
stress_clean <-
  stress %>%
  mutate(
    across(
      all_of(impute_cols),
      ~ ifelse(is.na(.x), median(.x, na.rm = TRUE), .x)
    ),
    across(all_of(factor_cols), ~ {
      # Go through character so "Missing" can be added as a new level
      x <- as.character(.x)
      x[is.na(x)] <- "Missing"
      factor(x)
    })
  )
# Confirm the cleaned copy has no remaining NAs (an empty result is expected)
na_by_col_after <- vapply(
  stress_clean,
  function(col) sum(is.na(col)),
  integer(1)
)
na_by_col_after[na_by_col_after > 0]
## named integer(0)
# First ten respondents (by ID) with just the identifier and demographics
demo_snapshot <- select(stress_clean, ID, age, gender, ethnicity)
head(arrange(demo_snapshot, ID), n = 10)
## ID age gender ethnicity
## 1 1 34 2 1
## 2 2 22 1 1
## 3 3 29 2 1
## 4 4 34 2 1
## 5 5 34 1 1
## 6 6 29 1 2
## 7 7 51 2 1
## 8 8 23 1 1
## 9 9 31 2 2
## 10 10 32 1 2
# Example grouped summary (swap metrics as needed): respondent count and
# mean age per gender level, returned ungrouped
age_by_gender <- summarise(
  group_by(stress_clean, gender),
  n = n(),
  mean_age = mean(age, na.rm = TRUE),
  .groups = "drop"
)
age_by_gender
## # A tibble: 4 × 3
## gender n mean_age
## <fct> <int> <dbl>
## 1 1 275 32.1
## 2 2 197 36.0
## 3 3 2 22.5
## 4 Missing 3 31
After all these steps, we end up with a polished version of the dataset called stress_v1. It’s the same people and questions as before, but now:
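- Each person has an ID.
- Gender and ethnicity are proper categories.
- Missing values are treated consistently: blank numeric answers are filled with that column’s median, and blank gender or ethnicity entries get their own “Missing” category.
- It’s easy to summarize, plot, and analyze.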
# Publish the cleaned copy under its final name
stress_v1 <- stress_clean
# Brief summary: size, then the identifier/demographic columns present
c(nrow(stress_v1), ncol(stress_v1))
## [1] 477 41
overview_cols <- intersect(
  c("ID", "age", "gender", "ethnicity"),
  names(stress_v1)
)
summary(stress_v1[, overview_cols])
## ID age gender ethnicity
## Min. : 1 Min. :19.00 1 :275 1 :377
## 1st Qu.:120 1st Qu.:27.00 2 :197 2 : 44
## Median :239 Median :31.00 3 : 2 3 : 5
## Mean :239 Mean :33.66 Missing: 3 4 : 28
## 3rd Qu.:358 3rd Qu.:38.00 5 : 1
## Max. :477 Max. :70.00 6 : 19
## Missing: 3
# First eight rows of the full final table
head(stress_v1, n = 8L)
## age gender ethnicity recovery1 recovery2 recovery3 recovery4 recovery5
## 1 34 2 1 3 2 4 4 4
## 2 22 1 1 3 2 4 4 4
## 3 29 2 1 3 2 4 4 4
## 4 34 2 1 3 2 4 4 4
## 5 34 1 1 3 2 4 4 4
## 6 29 1 2 3 2 4 4 4
## 7 51 2 1 3 2 4 4 4
## 8 23 1 1 3 2 4 4 4
## recovery6 recovery7 recovery8 recovery9 recovery10 recovery11 recovery12
## 1 4 4 4 4 4 4 4
## 2 4 4 4 4 4 4 4
## 3 4 4 4 4 4 4 4
## 4 4 4 4 4 4 4 4
## 5 4 4 4 4 4 4 4
## 6 4 4 4 4 4 4 4
## 7 4 4 4 4 4 4 4
## 8 4 4 4 4 4 4 4
## recovery13 recovery14 recovery15 recovery16 AttnChk PANAS1 PANAS2 PANAS3
## 1 4 4 4 4 3 4 4 2
## 2 4 4 4 4 3 4 4 2
## 3 4 4 4 4 3 4 4 2
## 4 4 4 4 4 3 4 4 2
## 5 4 4 4 4 3 4 4 2
## 6 4 4 4 4 3 4 4 2
## 7 4 4 4 4 3 4 4 2
## 8 4 4 4 4 3 4 4 2
## PANAS4 PANAS5 PANAS6 PANAS7 PANAS8 PANAS9 PANAS10 PANAS11 PANAS12 PANAS13
## 1 1 3.5 2 1 3 2 4 4 2 1
## 2 1 3.5 2 1 3 2 4 4 2 1
## 3 1 3.5 2 1 3 2 4 4 2 1
## 4 1 3.5 2 1 3 2 4 4 2 1
## 5 1 3.5 2 1 3 2 4 4 2 1
## 6 1 3.5 2 1 3 2 4 4 2 1
## 7 1 3.5 2 1 3 2 4 4 2 1
## 8 1 3.5 2 1 3 2 4 4 2 1
## PANAS14 PANAS15 PANAS16 PANAS17 PANAS18 PANAS19 PANAS20 ID
## 1 2 3 4 2 2 3 4 1
## 2 2 3 4 2 2 3 4 2
## 3 2 3 4 2 2 3 4 3
## 4 2 3 4 2 2 3 4 4
## 5 2 3 4 2 2 3 4 5
## 6 2 3 4 2 2 3 4 6
## 7 2 3 4 2 2 3 4 7
## 8 2 3 4 2 2 3 4 8
# Fail fast if the final dataset has not been created yet
stopifnot(exists("stress_v1"))
# Structure of the identifier and demographic columns that are present
id_demo_cols <- intersect(
  c("ID", "age", "gender", "ethnicity"),
  names(stress_v1)
)
str(stress_v1[, id_demo_cols])
## 'data.frame': 477 obs. of 4 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ age : num 34 22 29 34 34 29 51 23 31 32 ...
## $ gender : Factor w/ 4 levels "1","2","3","Missing": 2 1 2 2 1 1 2 1 2 1 ...
## $ ethnicity: Factor w/ 7 levels "1","2","3","4",..: 1 1 1 1 1 2 1 1 2 2 ...
# Columns to report from the final dataset. The three *_mean names are listed
# in case those columns exist; any name not present in stress_v1 is dropped.
keep_cols <- intersect(
  c("ID", "age", "gender", "ethnicity",
    "recovery_mean", "PANAS_pos_mean", "PANAS_neg_mean"),
  names(stress_v1)
)
# Summarise the cleaned data (stress_v1) rather than the raw `stress` frame,
# so these figures describe the same final dataset as the table below.
summary(stress_v1[keep_cols])
## ID age gender ethnicity
## Min. : 1 Min. :19.00 1 :275 1 :377
## 1st Qu.:120 1st Qu.:27.00 2 :197 2 : 44
## Median :239 Median :31.00 3 : 2 3 : 5
## Mean :239 Mean :33.66 Missing: 3 4 : 28
## 3rd Qu.:358 3rd Qu.:38.00 5 : 1
## Max. :477 Max. :70.00 6 : 19
## Missing: 3
# If you want a small sample display:
head(stress_v1[keep_cols], 10)
## ID age gender ethnicity
## 1 1 34 2 1
## 2 2 22 1 1
## 3 3 29 2 1
## 4 4 34 2 1
## 5 5 34 1 1
## 6 6 29 1 2
## 7 7 51 2 1
## 8 8 23 1 1
## 9 9 31 2 2
## 10 10 32 1 2
# ---- table_polished -------------------------------------------------------
# Build the preview in named steps, then hand it to kable() for rendering.
preview_cols <- intersect(
  c("ID", "age", "gender", "ethnicity",
    "recovery_mean", "PANAS_pos_mean", "PANAS_neg_mean"),
  names(stress_v1)
)
preview <- stress_v1[preview_cols]
# Round numeric columns to two decimals for readability
is_num <- vapply(preview, is.numeric, logical(1))
preview[is_num] <- lapply(preview[is_num], round, digits = 2)
# Sort by respondent ID when that column is available
if ("ID" %in% names(preview)) {
  preview <- preview[order(preview$ID), ]
}
knitr::kable(
  head(preview, 25),
  caption = "Final dataset (clean preview: first 25 rows, rounded)"
)
ID | age | gender | ethnicity |
---|---|---|---|
1 | 34 | 2 | 1 |
2 | 22 | 1 | 1 |
3 | 29 | 2 | 1 |
4 | 34 | 2 | 1 |
5 | 34 | 1 | 1 |
6 | 29 | 1 | 2 |
7 | 51 | 2 | 1 |
8 | 23 | 1 | 1 |
9 | 31 | 2 | 2 |
10 | 32 | 1 | 2 |
11 | 19 | 2 | 2 |
12 | 25 | 1 | 1 |
13 | 50 | 2 | 1 |
14 | 49 | 2 | 1 |
15 | 48 | 1 | 1 |
16 | 31 | 2 | 2 |
17 | 30 | 1 | 1 |
18 | 42 | 2 | 1 |
19 | 29 | 1 | 1 |
20 | 32 | 2 | 6 |
21 | 23 | 1 | 1 |
22 | 49 | 2 | 4 |
23 | 29 | 1 | 1 |
24 | 32 | 1 | 2 |
25 | 26 | 2 | 1 |