# if you haven't run this code before, you'll need to download the below packages first
# you should see a prompt near the top of the page (in a yellow bar within the RStudio window)
# you can also use the packages tab to the right
library(naniar) # for the gg_miss_upset() command
Data Prep
Load Libraries
Import Data
# for the HW, you'll import the CSV file of your chosen dataset
# read the full dataset into `df`; header = TRUE treats the first row of the
# file as the column names
df <- read.csv(file = "Data/arc_data_final.csv", header = TRUE)
Viewing Data
# these are commands useful for viewing a dataframe
# you can also click the object in the environment tab to view it in a new window
# names() lists the column names of the dataframe
names(df)
[1] "X" "gender" "trans"
[4] "sexual_orientation" "ethnicity" "relationship_status"
[7] "age" "urban_rural" "income"
[10] "education" "employment" "treatment"
[13] "health" "mhealth" "sleep_hours"
[16] "exercise_cat" "pet" "covid_pos"
[19] "covid_neg" "big5_open" "big5_con"
[22] "big5_agr" "big5_neu" "big5_ext"
[25] "pswq" "iou" "mfq_26"
[28] "mfq_state" "rse" "school_covid_support"
[31] "school_att" "pas_covid" "pss"
[34] "phq" "gad" "edeq12"
[37] "brs" "swemws" "isolation"
[40] "support"
# head() prints the first six rows of the dataframe
head(df)
X gender trans sexual_orientation ethnicity
1 1 female no Heterosexual/Straight White - British, Irish, other
2 20 male no Heterosexual/Straight White - British, Irish, other
3 30 female no Heterosexual/Straight White - British, Irish, other
4 31 female no Heterosexual/Straight White - British, Irish, other
5 32 <NA> <NA> <NA> <NA>
6 33 female no Heterosexual/Straight White - British, Irish, other
relationship_status age urban_rural
1 In a relationship/married and cohabiting <NA> city
2 Prefer not to say 1 under 18 city
3 Prefer not to say 1 under 18 city
4 In a relationship/married and cohabiting 4 between 36 and 45 town
5 <NA> <NA> <NA>
6 In a relationship/married and cohabiting 4 between 36 and 45 city
income education employment
1 3 high 6 graduate degree or higher 3 employed
2 <NA> prefer not to say 1 high school equivalent
3 <NA> 2 equivalent to high school completion 1 high school equivalent
4 2 middle 5 undergraduate degree 3 employed
5 <NA> <NA> <NA>
6 2 middle 6 graduate degree or higher 3 employed
treatment health mhealth
1 no psychological disorders something else or not applicable none or NA
2 in treatment something else or not applicable anxiety disorder
3 not in treatment something else or not applicable none or NA
4 no psychological disorders two conditions none or NA
5 <NA> <NA> none or NA
6 not in treatment something else or not applicable none or NA
sleep_hours exercise_cat pet covid_pos covid_neg
1 3 7-8 hours 1 less than 1 hour cat 0 0
2 2 5-6 hours 2 1-2 hours cat 0 0
3 3 7-8 hours 3 2-5 hours dog 0 0
4 2 5-6 hours 2 1-2 hours no pets 0 0
5 <NA> <NA> <NA> 0 0
6 3 7-8 hours 2 1-2 hours multiple types of pet 0 0
big5_open big5_con big5_agr big5_neu big5_ext pswq iou mfq_26
1 5.333333 6.000000 4.333333 6.000000 2.000000 2.3094514 3.185185 4.20
2 5.333333 3.333333 4.333333 6.666667 1.666667 0.8509744 4.000000 3.35
3 5.000000 5.333333 6.666667 4.000000 6.000000 -1.1235082 1.592593 4.65
4 6.000000 5.666667 4.666667 4.000000 5.000000 1.1626810 3.370370 4.65
5 NA NA NA NA NA NA NA NA
6 5.000000 6.000000 6.333333 2.666667 NA -0.3424552 1.703704 4.50
mfq_state rse school_covid_support school_att pas_covid pss phq
1 3.625 2.3 NA NA 3.222222 3.25 1.333333
2 3.000 1.6 NA NA 4.555556 3.75 3.333333
3 5.875 3.9 NA NA 3.333333 1.00 1.000000
4 4.000 1.7 NA NA 4.222222 3.25 2.333333
5 NA NA NA NA NA NA NA
6 4.625 3.9 NA NA 3.222222 2.00 1.111111
gad edeq12 brs swemws isolation support
1 1.857143 1.583333 NA 2.857143 2.25 2.500000
2 3.857143 1.833333 NA 2.285714 3.50 2.166667
3 1.142857 1.000000 NA 4.285714 1.00 5.000000
4 2.000000 1.666667 NA 3.285714 2.50 2.500000
5 NA NA NA NA NA NA
6 1.428571 1.416667 NA 4.000000 1.75 3.666667
# str() shows the dataframe's dimensions plus each column's type and first few values
str(df)
'data.frame': 2073 obs. of 40 variables:
$ X : int 1 20 30 31 32 33 48 49 57 58 ...
$ gender : chr "female" "male" "female" "female" ...
$ trans : chr "no" "no" "no" "no" ...
$ sexual_orientation : chr "Heterosexual/Straight" "Heterosexual/Straight" "Heterosexual/Straight" "Heterosexual/Straight" ...
$ ethnicity : chr "White - British, Irish, other" "White - British, Irish, other" "White - British, Irish, other" "White - British, Irish, other" ...
$ relationship_status : chr "In a relationship/married and cohabiting" "Prefer not to say" "Prefer not to say" "In a relationship/married and cohabiting" ...
$ age : chr NA "1 under 18" "1 under 18" "4 between 36 and 45" ...
$ urban_rural : chr "city" "city" "city" "town" ...
$ income : chr "3 high" NA NA "2 middle" ...
$ education : chr "6 graduate degree or higher" "prefer not to say" "2 equivalent to high school completion" "5 undergraduate degree" ...
$ employment : chr "3 employed" "1 high school equivalent" "1 high school equivalent" "3 employed" ...
$ treatment : chr "no psychological disorders" "in treatment" "not in treatment" "no psychological disorders" ...
$ health : chr "something else or not applicable" "something else or not applicable" "something else or not applicable" "two conditions" ...
$ mhealth : chr "none or NA" "anxiety disorder" "none or NA" "none or NA" ...
$ sleep_hours : chr "3 7-8 hours" "2 5-6 hours" "3 7-8 hours" "2 5-6 hours" ...
$ exercise_cat : chr "1 less than 1 hour" "2 1-2 hours" "3 2-5 hours" "2 1-2 hours" ...
$ pet : chr "cat" "cat" "dog" "no pets" ...
$ covid_pos : int 0 0 0 0 0 0 0 0 0 0 ...
$ covid_neg : int 0 0 0 0 0 0 0 0 0 0 ...
$ big5_open : num 5.33 5.33 5 6 NA ...
$ big5_con : num 6 3.33 5.33 5.67 NA ...
$ big5_agr : num 4.33 4.33 6.67 4.67 NA ...
$ big5_neu : num 6 6.67 4 4 NA ...
$ big5_ext : num 2 1.67 6 5 NA ...
$ pswq : num 2.309 0.851 -1.124 1.163 NA ...
$ iou : num 3.19 4 1.59 3.37 NA ...
$ mfq_26 : num 4.2 3.35 4.65 4.65 NA 4.5 NA 4.3 5.25 4.45 ...
$ mfq_state : num 3.62 3 5.88 4 NA ...
$ rse : num 2.3 1.6 3.9 1.7 NA 3.9 NA 2.4 1.8 NA ...
$ school_covid_support: num NA NA NA NA NA NA NA NA NA NA ...
$ school_att : num NA NA NA NA NA NA NA NA NA NA ...
$ pas_covid : num 3.22 4.56 3.33 4.22 NA ...
$ pss : num 3.25 3.75 1 3.25 NA 2 NA 2 4 1.25 ...
$ phq : num 1.33 3.33 1 2.33 NA ...
$ gad : num 1.86 3.86 1.14 2 NA ...
$ edeq12 : num 1.58 1.83 1 1.67 NA ...
$ brs : num NA NA NA NA NA NA NA NA NA NA ...
$ swemws : num 2.86 2.29 4.29 3.29 NA ...
$ isolation : num 2.25 3.5 1 2.5 NA 1.75 NA 2 1.25 1 ...
$ support : num 2.5 2.17 5 2.5 NA ...
Subsetting Data
# use the codebook you created in the codebook activity to get the names of your variables (first column)
# enter this list of names in the select=c() argument to subset those columns from the dataframe
# `d` keeps only the six listed columns; all rows of `df` are retained
d <- subset(df, select = c(gender, ethnicity, age, big5_open, pswq, big5_ext))
Missing Data
# use the gg_miss_upset() command (from the naniar package) for a visualization of your missing data
# nsets = 6 equals the number of columns kept in d, so presumably all six
# variables are included in the plot -- adjust if you subset a different number
gg_miss_upset(d, nsets = 6)
# use the na.omit() command to create a new dataframe in which any participants with missing data are dropped from the dataframe
# `d2` contains only the rows of `d` that have no NA in any column
d2 <- na.omit(d)
# use a bit of math to see what percentage of participants had missing data
# 2073 is the number of rows in the full dataset (see the str(df) output);
# 1259 is the number of rows left after dropping participants with missing data
2073-1259
[1] 814
# number of dropped participants divided by the original total = proportion missing
814/2073
[1] 0.3926676
## Exporting Data
# last step is to export the data after you've dropped NAs
# row.names = FALSE keeps R's row labels from being written as an extra column
write.csv(d2, file="Data/mydata.csv", row.names = FALSE)
# MAKE SURE TO RENAME TO MYDATA FOR THE HOMEWORK
# DONT FORGET!!
Write-Up
We selected six variables from the ARC dataset to focus on in our analysis: gender, age, ethnicity, the Penn State Worry Questionnaire (PSWQ), the Big Five openness subscale, and the Big Five extraversion subscale. Participants with missing data on any of these six variables (39%) were dropped from our analysis, leaving us with a final sample of n = 1259.