# if you haven't run this code before, you'll need to download the below packages first
# instructions on how to do this are included in the video
# but as a reminder, you use the packages tab to the right
library(tidyverse) # for the map() command
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(psych) # for the describe() command
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(naniar) # for the gg_miss_upset() command
library(expss) # for the cross_cases() command
## Loading required package: maditr
##
## To get total summary skip 'by' argument: take_all(mtcars, mean)
##
##
## Attaching package: 'maditr'
##
## The following objects are masked from 'package:dplyr':
##
## between, coalesce, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
##
## The following object is masked from 'package:readr':
##
## cols
##
##
## Attaching package: 'expss'
##
## The following object is masked from 'package:naniar':
##
## is_na
##
## The following objects are masked from 'package:stringr':
##
## fixed, regex
##
## The following objects are masked from 'package:dplyr':
##
## compute, contains, na_if, recode, vars, where
##
## The following objects are masked from 'package:purrr':
##
## keep, modify, modify_if, when
##
## The following objects are masked from 'package:tidyr':
##
## contains, nest
##
## The following object is masked from 'package:ggplot2':
##
## vars
# for the lab, import the CSV file you downloaded along with the current RMD file
# for the homework, download the CSV file from your chosen README page
# (it should be titled arc_data_final.csv or eammi2_data_final.csv)
# header = TRUE treats the first row of the file as the column names;
# TRUE is spelled out because the shorthand T is an ordinary variable that can be reassigned
df <- read.csv(file = "Data/arc_data_final.csv", header = TRUE)
# quick ways to inspect a dataframe from the console
# (clicking the object in the environment tab opens it in a new window instead)
# list the column names
colnames(df)
## [1] "X" "gender" "trans"
## [4] "sexual_orientation" "ethnicity" "relationship_status"
## [7] "age" "urban_rural" "income"
## [10] "education" "employment" "treatment"
## [13] "health" "mhealth" "sleep_hours"
## [16] "exercise" "pet" "covid_pos"
## [19] "covid_neg" "big5_open" "big5_con"
## [22] "big5_agr" "big5_neu" "big5_ext"
## [25] "pswq" "iou" "mfq_26"
## [28] "mfq_state" "rse" "school_covid_support"
## [31] "school_att" "pas_covid" "pss"
## [34] "phq" "gad" "edeq12"
## [37] "brs" "swemws" "isolation_a"
## [40] "isolation_c" "support"
# preview the first six rows
head(df, n = 6L)
## X gender trans sexual_orientation ethnicity
## 1 1 female no Heterosexual/Straight White - British, Irish, other
## 2 20 male no Heterosexual/Straight White - British, Irish, other
## 3 30 female no Heterosexual/Straight White - British, Irish, other
## 4 31 female no Heterosexual/Straight White - British, Irish, other
## 5 32 <NA> <NA> <NA> <NA>
## 6 33 female no Heterosexual/Straight White - British, Irish, other
## relationship_status age urban_rural
## 1 In a relationship/married and cohabiting <NA> city
## 2 Prefer not to say 1 under 18 city
## 3 Prefer not to say 1 under 18 city
## 4 In a relationship/married and cohabiting 4 between 36 and 45 town
## 5 <NA> <NA> <NA>
## 6 In a relationship/married and cohabiting 4 between 36 and 45 city
## income education employment
## 1 3 high 6 graduate degree or higher 3 employed
## 2 <NA> prefer not to say 1 high school equivalent
## 3 <NA> 2 equivalent to high school completion 1 high school equivalent
## 4 2 middle 5 undergraduate degree 3 employed
## 5 <NA> <NA> <NA>
## 6 2 middle 6 graduate degree or higher 3 employed
## treatment health mhealth
## 1 no psychological disorders something else or not applicable none or NA
## 2 in treatment something else or not applicable anxiety disorder
## 3 not in treatment something else or not applicable none or NA
## 4 no psychological disorders two conditions none or NA
## 5 <NA> <NA> none or NA
## 6 not in treatment something else or not applicable none or NA
## sleep_hours exercise pet covid_pos covid_neg big5_open
## 1 3 7-8 hours 0.0 cat 0 0 5.333333
## 2 2 5-6 hours 2.0 cat 0 0 5.333333
## 3 3 7-8 hours 3.0 dog 0 0 5.000000
## 4 2 5-6 hours 1.5 no pets 0 0 6.000000
## 5 <NA> NA <NA> 0 0 NA
## 6 3 7-8 hours 1.0 multiple types of pet 0 0 5.000000
## big5_con big5_agr big5_neu big5_ext pswq iou mfq_26 mfq_state rse
## 1 6.000000 4.333333 6.000000 2.000000 4.937500 3.185185 4.20 3.625 2.3
## 2 3.333333 4.333333 6.666667 1.666667 3.357143 4.000000 3.35 3.000 1.6
## 3 5.333333 6.666667 4.000000 6.000000 1.857143 1.592593 4.65 5.875 3.9
## 4 5.666667 4.666667 4.000000 5.000000 3.937500 3.370370 4.65 4.000 1.7
## 5 NA NA NA NA NA NA NA NA NA
## 6 6.000000 6.333333 2.666667 NA 2.625000 1.703704 4.50 4.625 3.9
## school_covid_support school_att pas_covid pss phq gad edeq12 brs
## 1 NA NA 3.222222 3.25 1.333333 1.857143 1.583333 NA
## 2 NA NA 4.555556 3.75 3.333333 3.857143 1.833333 NA
## 3 NA NA 3.333333 1.00 1.000000 1.142857 1.000000 NA
## 4 NA NA 4.222222 3.25 2.333333 2.000000 1.666667 NA
## 5 NA NA NA NA NA NA NA NA
## 6 NA NA 3.222222 2.00 1.111111 1.428571 1.416667 NA
## swemws isolation_a isolation_c support
## 1 2.857143 2.25 NA 2.500000
## 2 2.285714 NA 3.5 2.166667
## 3 4.285714 NA 1.0 5.000000
## 4 3.285714 2.50 NA 2.500000
## 5 NA NA NA NA
## 6 4.000000 1.75 NA 3.666667
# compact overview: the type and first few values of every column
str(object = df)
## 'data.frame': 2073 obs. of 41 variables:
## $ X : int 1 20 30 31 32 33 48 49 57 58 ...
## $ gender : chr "female" "male" "female" "female" ...
## $ trans : chr "no" "no" "no" "no" ...
## $ sexual_orientation : chr "Heterosexual/Straight" "Heterosexual/Straight" "Heterosexual/Straight" "Heterosexual/Straight" ...
## $ ethnicity : chr "White - British, Irish, other" "White - British, Irish, other" "White - British, Irish, other" "White - British, Irish, other" ...
## $ relationship_status : chr "In a relationship/married and cohabiting" "Prefer not to say" "Prefer not to say" "In a relationship/married and cohabiting" ...
## $ age : chr NA "1 under 18" "1 under 18" "4 between 36 and 45" ...
## $ urban_rural : chr "city" "city" "city" "town" ...
## $ income : chr "3 high" NA NA "2 middle" ...
## $ education : chr "6 graduate degree or higher" "prefer not to say" "2 equivalent to high school completion" "5 undergraduate degree" ...
## $ employment : chr "3 employed" "1 high school equivalent" "1 high school equivalent" "3 employed" ...
## $ treatment : chr "no psychological disorders" "in treatment" "not in treatment" "no psychological disorders" ...
## $ health : chr "something else or not applicable" "something else or not applicable" "something else or not applicable" "two conditions" ...
## $ mhealth : chr "none or NA" "anxiety disorder" "none or NA" "none or NA" ...
## $ sleep_hours : chr "3 7-8 hours" "2 5-6 hours" "3 7-8 hours" "2 5-6 hours" ...
## $ exercise : num 0 2 3 1.5 NA 1 NA 2 2 1.7 ...
## $ pet : chr "cat" "cat" "dog" "no pets" ...
## $ covid_pos : int 0 0 0 0 0 0 0 0 0 0 ...
## $ covid_neg : int 0 0 0 0 0 0 0 0 0 0 ...
## $ big5_open : num 5.33 5.33 5 6 NA ...
## $ big5_con : num 6 3.33 5.33 5.67 NA ...
## $ big5_agr : num 4.33 4.33 6.67 4.67 NA ...
## $ big5_neu : num 6 6.67 4 4 NA ...
## $ big5_ext : num 2 1.67 6 5 NA ...
## $ pswq : num 4.94 3.36 1.86 3.94 NA ...
## $ iou : num 3.19 4 1.59 3.37 NA ...
## $ mfq_26 : num 4.2 3.35 4.65 4.65 NA 4.5 NA 4.3 5.25 4.45 ...
## $ mfq_state : num 3.62 3 5.88 4 NA ...
## $ rse : num 2.3 1.6 3.9 1.7 NA 3.9 NA 2.4 1.8 NA ...
## $ school_covid_support: num NA NA NA NA NA NA NA NA NA NA ...
## $ school_att : num NA NA NA NA NA NA NA NA NA NA ...
## $ pas_covid : num 3.22 4.56 3.33 4.22 NA ...
## $ pss : num 3.25 3.75 1 3.25 NA 2 NA 2 4 1.25 ...
## $ phq : num 1.33 3.33 1 2.33 NA ...
## $ gad : num 1.86 3.86 1.14 2 NA ...
## $ edeq12 : num 1.58 1.83 1 1.67 NA ...
## $ brs : num NA NA NA NA NA NA NA NA NA NA ...
## $ swemws : num 2.86 2.29 4.29 3.29 NA ...
## $ isolation_a : num 2.25 NA NA 2.5 NA 1.75 NA 2 1.25 NA ...
## $ isolation_c : num NA 3.5 1 NA NA NA NA NA NA 1 ...
## $ support : num 2.5 2.17 5 2.5 NA ...
# for the HW: take your variable names from the first column of the codebook
# you created in the codebook activity and list them, in quotes, in the c() below
# variables for the lab: id, variable2, variable3, variable5, variable8, variable10, variable11
# keep only the listed columns (all rows are retained)
d <- df[, c("X", "age", "urban_rural", "pss", "support", "swemws", "phq")]
# categorical variables should be stored as factors:
# the values themselves do not change, but R handles factors differently at times
d[["X"]] <- as.factor(d[["X"]])
# confirm the new column types
str(object = d)
## 'data.frame': 2073 obs. of 7 variables:
## $ X : Factor w/ 2073 levels "1","20","30",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ age : chr NA "1 under 18" "1 under 18" "4 between 36 and 45" ...
## $ urban_rural: chr "city" "city" "city" "town" ...
## $ pss : num 3.25 3.75 1 3.25 NA 2 NA 2 4 1.25 ...
## $ support : num 2.5 2.17 5 2.5 NA ...
## $ swemws : num 2.86 2.29 4.29 3.29 NA ...
## $ phq : num 1.33 3.33 1 2.33 NA ...
We looked at our missing data and found a number of skipped responses (n = 1152). We did not notice any meaningful patterns in the skipped items and decided to drop participants who skipped any of the items used in our analyses. However, the decision to drop participants is not ideal and can lead to the erasure of vulnerable groups; this is a limitation of our dataset and our analyses that we will need to acknowledge in our manuscript.
# use the gg_miss_upset() command for a visualization of your missing data
# d[-1] leaves out the first column (the X identifier) so only the study variables are plotted
# one call is enough: running it a second time just draws the same plot again
gg_miss_upset(d[-1], nsets = 6)
# use the na.omit() command to create a new dataframe in which any participant
# with missing data is dropped; d itself is left unchanged
d2 <- na.omit(d)
# number of participants dropped
nrow(d) - nrow(d2)
## [1] 1152
# the same check by hand: 2073 rows before dropping, 921 rows after
2073 - 921
## [1] 1152
# last step is to export the data after you've dropped NAs
# for the HW, the file you're exporting here is what you'll use for all future HW assignments
# (labs will use the files I provide you)
# make sure you give it a name that is memorable, and save it to your Data folder
# the folder is written "Data/" to match the path used by read.csv(); on a
# case-sensitive file system "data/" would point to a different folder
# row.names = FALSE keeps R's row numbers out of the exported file
write.csv(d2, file = "Data/fakedata_clean_final.csv", row.names = FALSE)
# d2 is the cleaned dataframe, so it is used for the rest of the lab/HW
# frequency table for age
table(d2[["age"]])
##
## 1 under 18 2 between 18 and 25 3 between 26 and 35 4 between 36 and 45
## 602 53 6 87
## 5 over 45
## 173
# frequency table for urban_rural
table(d2[["urban_rural"]])
##
## city isolated dwelling town village
## 204 20 406 291
# histograms of the four numeric variables, one plot per call
# hist() takes its default title and x-axis label from the expression it is
# given (e.g. d2$pss), so changing how the column is written changes the labels
hist(d2$pss)
hist(d2$support)
hist(d2$swemws)
hist(d2$phq)
# descriptive statistics for every column of the cleaned data
# psych:: makes explicit which package describe() comes from
psych::describe(d2)
## vars n mean sd median trimmed mad min max range skew
## X* 1 921 964.59 592.74 916.00 946.88 726.47 2 2070 2068 0.23
## age* 2 921 2.11 1.65 1.00 1.88 0.00 1 5 4 0.96
## urban_rural* 3 921 2.85 1.10 3.00 2.94 1.48 1 4 3 -0.71
## pss 4 921 2.91 0.96 2.75 2.90 1.11 1 5 4 0.12
## support 5 921 3.60 0.94 3.67 3.66 0.99 1 5 4 -0.44
## swemws 6 921 3.18 0.84 3.29 3.19 0.85 1 5 4 -0.19
## phq 7 921 2.05 0.85 1.89 1.97 0.99 1 4 3 0.67
## kurtosis se
## X* -1.11 19.53
## age* -0.92 0.05
## urban_rural* -0.81 0.04
## pss -0.79 0.03
## support -0.57 0.03
## swemws -0.38 0.03
## phq -0.61 0.03
# cross-tabulation of counts: age categories in rows, urban_rural in columns
cross_cases(d2, cell_vars = age, col_vars = urban_rural)
|  urban_rural | ||||
|---|---|---|---|---|
|  city |  isolated dwelling |  town |  village | |
|  age | ||||
| Â Â Â 1 under 18Â | 138 | 11 | 267 | 186 |
| Â Â Â 2 between 18 and 25Â | 19 | 4 | 16 | 14 |
| Â Â Â 3 between 26 and 35Â | 1 | 2 | 3 | |
| Â Â Â 4 between 36 and 45Â | 12 | 4 | 45 | 26 |
| Â Â Â 5 over 45Â | 34 | 1 | 76 | 62 |
|    #Total cases | 204 | 20 | 406 | 291 |
# scatterplot: pss on the x-axis, support on the y-axis
plot(
  x = d2[["pss"]],
  y = d2[["support"]],
  main = "Scatterplot of pss and support",
  xlab = "pss",
  ylab = "support"
)
# scatterplot: swemws on the x-axis, phq on the y-axis
plot(
  x = d2[["swemws"]],
  y = d2[["phq"]],
  main = "Scatterplot of swemws and phq",
  xlab = "swemws",
  ylab = "phq"
)
# boxplot of support within each urban_rural category
# the formula reads outcome ~ grouping variable; both columns are looked up in d2
boxplot(support ~ urban_rural,
  data = d2,
  main = "Boxplot of support and urban_rural",
  xlab = "x (urban_rural)",
  ylab = "Y (support)"
)
# boxplot of phq within each age category
boxplot(phq ~ age,
  data = d2,
  main = "Boxplot of phq and age",
  xlab = "x (age)",
  ylab = "Y (phq)"
)