# if you haven't run this code before, you'll need to download the below packages first
# instructions on how to do this are included in the video
# but as a reminder, you use the packages tab to the right
library(tidyverse) # for the map() command
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(psych) # for the describe() command
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(naniar) # for the gg_miss_upset() command
library(expss) # for the cross_cases() command
## Loading required package: maditr
##
## To select rows from data: rows(mtcars, am==0)
##
##
## Attaching package: 'maditr'
##
## The following objects are masked from 'package:dplyr':
##
## between, coalesce, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
##
## The following object is masked from 'package:readr':
##
## cols
##
##
## Use 'expss_output_viewer()' to display tables in the RStudio Viewer.
## To return to the console output, use 'expss_output_default()'.
##
##
## Attaching package: 'expss'
##
## The following object is masked from 'package:naniar':
##
## is_na
##
## The following objects are masked from 'package:stringr':
##
## fixed, regex
##
## The following objects are masked from 'package:dplyr':
##
## compute, contains, na_if, recode, vars, where
##
## The following objects are masked from 'package:purrr':
##
## keep, modify, modify_if, when
##
## The following objects are masked from 'package:tidyr':
##
## contains, nest
##
## The following object is masked from 'package:ggplot2':
##
## vars
# for the lab, you'll import the CSV file you downloaded along with the current file we're working in (an RMD file)
# for the homework, you'll download the CSV file from your chosen README page (should be titled arc_data_final.csv or eammi2_data_final.csv)
# header = TRUE (spelled out, since T can be reassigned) tells read.csv()
# to take the column names from the first row of the file
df <- read.csv(file = "Data/arc_data_final.csv", header = TRUE)
# quick ways to inspect a dataframe from the console
# (clicking the object in the environment tab opens it in its own window instead)
colnames(df)
## [1] "X" "gender" "trans"
## [4] "sexual_orientation" "ethnicity" "relationship_status"
## [7] "age" "urban_rural" "income"
## [10] "education" "employment" "treatment"
## [13] "health" "mhealth" "sleep_hours"
## [16] "exercise" "pet" "covid_pos"
## [19] "covid_neg" "big5_open" "big5_con"
## [22] "big5_agr" "big5_neu" "big5_ext"
## [25] "pswq" "iou" "mfq_26"
## [28] "mfq_state" "rse" "school_covid_support"
## [31] "school_att" "pas_covid" "pss"
## [34] "phq" "gad" "edeq12"
## [37] "brs" "swemws" "isolation_a"
## [40] "isolation_c" "support"
head(df, n = 6L)
## X gender trans sexual_orientation ethnicity
## 1 1 female no Heterosexual/Straight White - British, Irish, other
## 2 20 male no Heterosexual/Straight White - British, Irish, other
## 3 30 female no Heterosexual/Straight White - British, Irish, other
## 4 31 female no Heterosexual/Straight White - British, Irish, other
## 5 32 <NA> <NA> <NA> <NA>
## 6 33 female no Heterosexual/Straight White - British, Irish, other
## relationship_status age urban_rural
## 1 In a relationship/married and cohabiting <NA> city
## 2 Prefer not to say 1 under 18 city
## 3 Prefer not to say 1 under 18 city
## 4 In a relationship/married and cohabiting 4 between 36 and 45 town
## 5 <NA> <NA> <NA>
## 6 In a relationship/married and cohabiting 4 between 36 and 45 city
## income education employment
## 1 3 high 6 graduate degree or higher 3 employed
## 2 <NA> prefer not to say 1 high school equivalent
## 3 <NA> 2 equivalent to high school completion 1 high school equivalent
## 4 2 middle 5 undergraduate degree 3 employed
## 5 <NA> <NA> <NA>
## 6 2 middle 6 graduate degree or higher 3 employed
## treatment health mhealth
## 1 no psychological disorders something else or not applicable none or NA
## 2 in treatment something else or not applicable anxiety disorder
## 3 not in treatment something else or not applicable none or NA
## 4 no psychological disorders two conditions none or NA
## 5 <NA> <NA> none or NA
## 6 not in treatment something else or not applicable none or NA
## sleep_hours exercise pet covid_pos covid_neg big5_open
## 1 3 7-8 hours 0.0 cat 0 0 5.333333
## 2 2 5-6 hours 2.0 cat 0 0 5.333333
## 3 3 7-8 hours 3.0 dog 0 0 5.000000
## 4 2 5-6 hours 1.5 no pets 0 0 6.000000
## 5 <NA> NA <NA> 0 0 NA
## 6 3 7-8 hours 1.0 multiple types of pet 0 0 5.000000
## big5_con big5_agr big5_neu big5_ext pswq iou mfq_26 mfq_state rse
## 1 6.000000 4.333333 6.000000 2.000000 4.937500 3.185185 4.20 3.625 2.3
## 2 3.333333 4.333333 6.666667 1.666667 3.357143 4.000000 3.35 3.000 1.6
## 3 5.333333 6.666667 4.000000 6.000000 1.857143 1.592593 4.65 5.875 3.9
## 4 5.666667 4.666667 4.000000 5.000000 3.937500 3.370370 4.65 4.000 1.7
## 5 NA NA NA NA NA NA NA NA NA
## 6 6.000000 6.333333 2.666667 NA 2.625000 1.703704 4.50 4.625 3.9
## school_covid_support school_att pas_covid pss phq gad edeq12 brs
## 1 NA NA 3.222222 3.25 1.333333 1.857143 1.583333 NA
## 2 NA NA 4.555556 3.75 3.333333 3.857143 1.833333 NA
## 3 NA NA 3.333333 1.00 1.000000 1.142857 1.000000 NA
## 4 NA NA 4.222222 3.25 2.333333 2.000000 1.666667 NA
## 5 NA NA NA NA NA NA NA NA
## 6 NA NA 3.222222 2.00 1.111111 1.428571 1.416667 NA
## swemws isolation_a isolation_c support
## 1 2.857143 2.25 NA 2.500000
## 2 2.285714 NA 3.5 2.166667
## 3 4.285714 NA 1.0 5.000000
## 4 3.285714 2.50 NA 2.500000
## 5 NA NA NA NA
## 6 4.000000 1.75 NA 3.666667
utils::str(object = df)
## 'data.frame': 2073 obs. of 41 variables:
## $ X : int 1 20 30 31 32 33 48 49 57 58 ...
## $ gender : chr "female" "male" "female" "female" ...
## $ trans : chr "no" "no" "no" "no" ...
## $ sexual_orientation : chr "Heterosexual/Straight" "Heterosexual/Straight" "Heterosexual/Straight" "Heterosexual/Straight" ...
## $ ethnicity : chr "White - British, Irish, other" "White - British, Irish, other" "White - British, Irish, other" "White - British, Irish, other" ...
## $ relationship_status : chr "In a relationship/married and cohabiting" "Prefer not to say" "Prefer not to say" "In a relationship/married and cohabiting" ...
## $ age : chr NA "1 under 18" "1 under 18" "4 between 36 and 45" ...
## $ urban_rural : chr "city" "city" "city" "town" ...
## $ income : chr "3 high" NA NA "2 middle" ...
## $ education : chr "6 graduate degree or higher" "prefer not to say" "2 equivalent to high school completion" "5 undergraduate degree" ...
## $ employment : chr "3 employed" "1 high school equivalent" "1 high school equivalent" "3 employed" ...
## $ treatment : chr "no psychological disorders" "in treatment" "not in treatment" "no psychological disorders" ...
## $ health : chr "something else or not applicable" "something else or not applicable" "something else or not applicable" "two conditions" ...
## $ mhealth : chr "none or NA" "anxiety disorder" "none or NA" "none or NA" ...
## $ sleep_hours : chr "3 7-8 hours" "2 5-6 hours" "3 7-8 hours" "2 5-6 hours" ...
## $ exercise : num 0 2 3 1.5 NA 1 NA 2 2 1.7 ...
## $ pet : chr "cat" "cat" "dog" "no pets" ...
## $ covid_pos : int 0 0 0 0 0 0 0 0 0 0 ...
## $ covid_neg : int 0 0 0 0 0 0 0 0 0 0 ...
## $ big5_open : num 5.33 5.33 5 6 NA ...
## $ big5_con : num 6 3.33 5.33 5.67 NA ...
## $ big5_agr : num 4.33 4.33 6.67 4.67 NA ...
## $ big5_neu : num 6 6.67 4 4 NA ...
## $ big5_ext : num 2 1.67 6 5 NA ...
## $ pswq : num 4.94 3.36 1.86 3.94 NA ...
## $ iou : num 3.19 4 1.59 3.37 NA ...
## $ mfq_26 : num 4.2 3.35 4.65 4.65 NA 4.5 NA 4.3 5.25 4.45 ...
## $ mfq_state : num 3.62 3 5.88 4 NA ...
## $ rse : num 2.3 1.6 3.9 1.7 NA 3.9 NA 2.4 1.8 NA ...
## $ school_covid_support: num NA NA NA NA NA NA NA NA NA NA ...
## $ school_att : num NA NA NA NA NA NA NA NA NA NA ...
## $ pas_covid : num 3.22 4.56 3.33 4.22 NA ...
## $ pss : num 3.25 3.75 1 3.25 NA 2 NA 2 4 1.25 ...
## $ phq : num 1.33 3.33 1 2.33 NA ...
## $ gad : num 1.86 3.86 1.14 2 NA ...
## $ edeq12 : num 1.58 1.83 1 1.67 NA ...
## $ brs : num NA NA NA NA NA NA NA NA NA NA ...
## $ swemws : num 2.86 2.29 4.29 3.29 NA ...
## $ isolation_a : num 2.25 NA NA 2.5 NA 1.75 NA 2 1.25 NA ...
## $ isolation_c : num NA 3.5 1 NA NA NA NA NA NA 1 ...
## $ support : num 2.5 2.17 5 2.5 NA ...
# for the HW: use the codebook you created in the codebook activity to get the names of your variables (first column)
# enter this list of names in the select=c() argument to subset those columns from the dataframe
# columns kept for this analysis: sleep_hours, mhealth, iou, mfq_26, pas_covid, pss
# (note that no id column is included in this subset)
d <- df[
  ,
  c("sleep_hours", "mhealth", "iou", "mfq_26", "pas_covid", "pss"),
  drop = FALSE
]
# the two categorical columns are converted to factors in one pass
# the stored values do not change; R simply treats factors differently in some commands
d[c("sleep_hours", "mhealth")] <- lapply(
  d[c("sleep_hours", "mhealth")],
  as.factor
)
# confirm that both columns now show up as Factor
str(d)
## 'data.frame': 2073 obs. of 6 variables:
## $ sleep_hours: Factor w/ 5 levels "1 < 5 hours",..: 3 2 3 2 NA 3 NA 3 3 4 ...
## $ mhealth : Factor w/ 8 levels "anxiety disorder",..: 5 1 5 5 5 5 3 5 1 5 ...
## $ iou : num 3.19 4 1.59 3.37 NA ...
## $ mfq_26 : num 4.2 3.35 4.65 4.65 NA 4.5 NA 4.3 5.25 4.45 ...
## $ pas_covid : num 3.22 4.56 3.33 4.22 NA ...
## $ pss : num 3.25 3.75 1 3.25 NA 2 NA 2 4 1.25 ...
# use the gg_miss_upset() command for a visualization of your missing data
# pass the whole dataframe: d has no id column, so dropping the first column
# with d[-1] would leave sleep_hours (which has NAs) out of the plot
gg_miss_upset(d, nsets = 6)
# use the na.omit() command to create a new dataframe in which any participants with missing data are dropped from the dataframe
d2 <- na.omit(d)
# last step is to export the data after you've dropped NAs
# for the HW, the file you're exporting here is what you'll use for all future HW assignments (labs will use the files I provide you)
# make sure you give it a name that is memorable!
# and make sure you save it to your Data folder!
# row.names = FALSE (spelled out, since F can be reassigned) keeps R's row
# labels from being written as an extra first column in the CSV
write.csv(d2, file = "Data/arc_data_finalclean.csv", row.names = FALSE)
# from here on, the lab/HW works with the cleaned dataframe d2
# frequency count for each sleep_hours category
table(d2[["sleep_hours"]])
##
## 1 < 5 hours 2 5-6 hours 3 7-8 hours 4 8-10 hours 5 > 10 hours
## 83 320 455 276 48
# frequency count for each mhealth category
table(d2[["mhealth"]])
##
## anxiety disorder bipolar
## 123 5
## depression eating disorders
## 28 28
## none or NA obsessive compulsive disorder
## 916 27
## other ptsd
## 35 20
# one histogram per continuous variable in d2, to look at the shape of each distribution
# NOTE(review): the default plot title is presumably built from the expression
# passed to hist(), so the calls are left written exactly as d2$<name>
hist(d2$iou)
hist(d2$mfq_26)
hist(d2$pas_covid)
hist(d2$pss)
The cutoffs for acceptable skew and kurtosis are -2 to +2. iou: skew (0.50) and kurtosis (-0.59) are okay. mfq_26: skew (-0.32) and kurtosis (0.14) are okay. pas_covid: skew (-0.19) and kurtosis (-0.04) are okay. pss: skew (0.06) and kurtosis (-0.76) are okay.
# descriptive statistics (including skew and kurtosis) for every column of d2
psych::describe(d2)
## vars n mean sd median trimmed mad min max range skew
## sleep_hours* 1 1182 2.90 0.97 3.00 2.92 1.48 1.0 5 4.0 0.00
## mhealth* 2 1182 4.63 1.41 5.00 4.87 0.00 1.0 8 7.0 -1.45
## iou 3 1182 2.56 0.90 2.41 2.50 0.99 1.0 5 4.0 0.50
## mfq_26 4 1182 4.31 0.67 4.35 4.33 0.67 1.8 6 4.2 -0.32
## pas_covid 5 1182 3.23 0.68 3.22 3.24 0.66 1.0 5 4.0 -0.19
## pss 6 1182 2.94 0.95 3.00 2.93 1.11 1.0 5 4.0 0.06
## kurtosis se
## sleep_hours* -0.48 0.03
## mhealth* 2.52 0.04
## iou -0.59 0.03
## mfq_26 0.14 0.02
## pas_covid -0.04 0.02
## pss -0.76 0.03
# cross-tabulation of the two categorical variables:
# sleep_hours down the rows, mhealth across the columns
expss::cross_cases(data = d2, cell_vars = sleep_hours, col_vars = mhealth)
|  | mhealth |  |  |  |  |  |  |  |
|---|---|---|---|---|---|---|---|---|
|  | anxiety disorder | bipolar | depression | eating disorders | none or NA | obsessive compulsive disorder | other | ptsd |
| sleep_hours |  |  |  |  |  |  |  |  |
|    1 < 5 hours | 19 | 2 | 1 | 3 | 45 | 2 | 5 | 6 |
|    2 5-6 hours | 43 | 1 | 9 | 8 | 232 | 8 | 10 | 9 |
|    3 7-8 hours | 36 |  | 10 | 10 | 369 | 12 | 15 | 3 |
|    4 8-10 hours | 20 | 2 | 7 | 6 | 233 | 2 | 4 | 2 |
|    5 > 10 hours | 5 |  | 1 | 1 | 37 | 3 | 1 |  |
|    #Total cases | 123 | 5 | 28 | 28 | 916 | 27 | 35 | 20 |
# scatterplot: iou on the x axis against mfq_26 on the y axis
with(
  d2,
  plot(
    x = iou,
    y = mfq_26,
    main = "Scatterplot of iou and mfq_26",
    xlab = "iou",
    ylab = "mfq_26"
  )
)
# scatterplot: pas_covid on the x axis against pss on the y axis
with(
  d2,
  plot(
    x = pas_covid,
    y = pss,
    main = "Scatterplot of pas_covid and pss",
    xlab = "pas_covid",
    ylab = "pss"
  )
)
# boxplot of the continuous variable iou, one box per sleep_hours category
boxplot(
  iou ~ sleep_hours,
  data = d2,
  main = "Boxplot of sleep_hours and iou",
  xlab = "x sleep_hours",
  ylab = "Y iou"
)
# boxplot of the continuous variable mfq_26, one box per mhealth category
# the formula is y ~ x, so mhealth is on the x axis and mfq_26 on the y axis;
# the axis labels are written to match
boxplot(
  mfq_26 ~ mhealth,
  data = d2,
  main = "Boxplot of mhealth and mfq_26",
  xlab = "x mhealth",
  ylab = "Y mfq_26"
)