Data Prep

Load Libraries

# if you haven't run this code before, you'll need to install the package below first
# you should see a prompt near the top of the page (in a yellow bar within the RStudio window)
# you can also use the Packages tab to the right

library(naniar) # for the gg_miss_upset() command

Import Data

# for the HW, you'll import the CSV file of your chosen dataset
# header = TRUE tells read.csv() that the first row holds the column names
# (spelled out as TRUE because the shorthand T is an ordinary variable that can be overwritten)
df <- read.csv(file = "Data/arc_data_final.csv", header = TRUE)

Viewing Data

# the next few commands are handy ways to inspect a dataframe
# (clicking the object in the Environment tab opens it in a viewer window instead)
# list the column (variable) names
colnames(df)

 [1] "X"                    "gender"               "trans"               
 [4] "sexual_orientation"   "ethnicity"            "relationship_status" 
 [7] "age"                  "urban_rural"          "income"              
[10] "education"            "employment"           "treatment"           
[13] "health"               "mhealth"              "sleep_hours"         
[16] "exercise_cat"         "pet"                  "covid_pos"           
[19] "covid_neg"            "big5_open"            "big5_con"            
[22] "big5_agr"             "big5_neu"             "big5_ext"            
[25] "pswq"                 "iou"                  "mfq_26"              
[28] "mfq_state"            "rse"                  "school_covid_support"
[31] "school_att"           "pas_covid"            "pss"                 
[34] "phq"                  "gad"                  "edeq12"              
[37] "brs"                  "swemws"               "isolation"           
[40] "support"

# print the first six rows of the dataframe
utils::head(df, n = 6L)

   X gender trans    sexual_orientation                     ethnicity
1  1 female    no Heterosexual/Straight White - British, Irish, other
2 20   male    no Heterosexual/Straight White - British, Irish, other
3 30 female    no Heterosexual/Straight White - British, Irish, other
4 31 female    no Heterosexual/Straight White - British, Irish, other
5 32   <NA>  <NA>                  <NA>                          <NA>
6 33 female    no Heterosexual/Straight White - British, Irish, other
                       relationship_status                 age urban_rural
1 In a relationship/married and cohabiting                <NA>        city
2                        Prefer not to say          1 under 18        city
3                        Prefer not to say          1 under 18        city
4 In a relationship/married and cohabiting 4 between 36 and 45        town
5                                     <NA>                <NA>        <NA>
6 In a relationship/married and cohabiting 4 between 36 and 45        city
    income                              education               employment
1   3 high            6 graduate degree or higher               3 employed
2     <NA>                      prefer not to say 1 high school equivalent
3     <NA> 2 equivalent to high school completion 1 high school equivalent
4 2 middle                 5 undergraduate degree               3 employed
5     <NA>                                   <NA>                     <NA>
6 2 middle            6 graduate degree or higher               3 employed
                   treatment                           health          mhealth
1 no psychological disorders something else or not applicable       none or NA
2               in treatment something else or not applicable anxiety disorder
3           not in treatment something else or not applicable       none or NA
4 no psychological disorders                   two conditions       none or NA
5                       <NA>                             <NA>       none or NA
6           not in treatment something else or not applicable       none or NA
  sleep_hours       exercise_cat                   pet covid_pos covid_neg
1 3 7-8 hours 1 less than 1 hour                   cat         0         0
2 2 5-6 hours        2 1-2 hours                   cat         0         0
3 3 7-8 hours        3 2-5 hours                   dog         0         0
4 2 5-6 hours        2 1-2 hours               no pets         0         0
5        <NA>               <NA>                  <NA>         0         0
6 3 7-8 hours        2 1-2 hours multiple types of pet         0         0
  big5_open big5_con big5_agr big5_neu big5_ext       pswq      iou mfq_26
1  5.333333 6.000000 4.333333 6.000000 2.000000  2.3094514 3.185185   4.20
2  5.333333 3.333333 4.333333 6.666667 1.666667  0.8509744 4.000000   3.35
3  5.000000 5.333333 6.666667 4.000000 6.000000 -1.1235082 1.592593   4.65
4  6.000000 5.666667 4.666667 4.000000 5.000000  1.1626810 3.370370   4.65
5        NA       NA       NA       NA       NA         NA       NA     NA
6  5.000000 6.000000 6.333333 2.666667       NA -0.3424552 1.703704   4.50
  mfq_state rse school_covid_support school_att pas_covid  pss      phq
1     3.625 2.3                   NA         NA  3.222222 3.25 1.333333
2     3.000 1.6                   NA         NA  4.555556 3.75 3.333333
3     5.875 3.9                   NA         NA  3.333333 1.00 1.000000
4     4.000 1.7                   NA         NA  4.222222 3.25 2.333333
5        NA  NA                   NA         NA        NA   NA       NA
6     4.625 3.9                   NA         NA  3.222222 2.00 1.111111
       gad   edeq12 brs   swemws isolation  support
1 1.857143 1.583333  NA 2.857143      2.25 2.500000
2 3.857143 1.833333  NA 2.285714      3.50 2.166667
3 1.142857 1.000000  NA 4.285714      1.00 5.000000
4 2.000000 1.666667  NA 3.285714      2.50 2.500000
5       NA       NA  NA       NA        NA       NA
6 1.428571 1.416667  NA 4.000000      1.75 3.666667

# compact summary of the dataframe: dimensions plus each column's type and first values
utils::str(df)

'data.frame':   2073 obs. of  40 variables:
 $ X                   : int  1 20 30 31 32 33 48 49 57 58 ...
 $ gender              : chr  "female" "male" "female" "female" ...
 $ trans               : chr  "no" "no" "no" "no" ...
 $ sexual_orientation  : chr  "Heterosexual/Straight" "Heterosexual/Straight" "Heterosexual/Straight" "Heterosexual/Straight" ...
 $ ethnicity           : chr  "White - British, Irish, other" "White - British, Irish, other" "White - British, Irish, other" "White - British, Irish, other" ...
 $ relationship_status : chr  "In a relationship/married and cohabiting" "Prefer not to say" "Prefer not to say" "In a relationship/married and cohabiting" ...
 $ age                 : chr  NA "1 under 18" "1 under 18" "4 between 36 and 45" ...
 $ urban_rural         : chr  "city" "city" "city" "town" ...
 $ income              : chr  "3 high" NA NA "2 middle" ...
 $ education           : chr  "6 graduate degree or higher" "prefer not to say" "2 equivalent to high school completion" "5 undergraduate degree" ...
 $ employment          : chr  "3 employed" "1 high school equivalent" "1 high school equivalent" "3 employed" ...
 $ treatment           : chr  "no psychological disorders" "in treatment" "not in treatment" "no psychological disorders" ...
 $ health              : chr  "something else or not applicable" "something else or not applicable" "something else or not applicable" "two conditions" ...
 $ mhealth             : chr  "none or NA" "anxiety disorder" "none or NA" "none or NA" ...
 $ sleep_hours         : chr  "3 7-8 hours" "2 5-6 hours" "3 7-8 hours" "2 5-6 hours" ...
 $ exercise_cat        : chr  "1 less than 1 hour" "2 1-2 hours" "3 2-5 hours" "2 1-2 hours" ...
 $ pet                 : chr  "cat" "cat" "dog" "no pets" ...
 $ covid_pos           : int  0 0 0 0 0 0 0 0 0 0 ...
 $ covid_neg           : int  0 0 0 0 0 0 0 0 0 0 ...
 $ big5_open           : num  5.33 5.33 5 6 NA ...
 $ big5_con            : num  6 3.33 5.33 5.67 NA ...
 $ big5_agr            : num  4.33 4.33 6.67 4.67 NA ...
 $ big5_neu            : num  6 6.67 4 4 NA ...
 $ big5_ext            : num  2 1.67 6 5 NA ...
 $ pswq                : num  2.309 0.851 -1.124 1.163 NA ...
 $ iou                 : num  3.19 4 1.59 3.37 NA ...
 $ mfq_26              : num  4.2 3.35 4.65 4.65 NA 4.5 NA 4.3 5.25 4.45 ...
 $ mfq_state           : num  3.62 3 5.88 4 NA ...
 $ rse                 : num  2.3 1.6 3.9 1.7 NA 3.9 NA 2.4 1.8 NA ...
 $ school_covid_support: num  NA NA NA NA NA NA NA NA NA NA ...
 $ school_att          : num  NA NA NA NA NA NA NA NA NA NA ...
 $ pas_covid           : num  3.22 4.56 3.33 4.22 NA ...
 $ pss                 : num  3.25 3.75 1 3.25 NA 2 NA 2 4 1.25 ...
 $ phq                 : num  1.33 3.33 1 2.33 NA ...
 $ gad                 : num  1.86 3.86 1.14 2 NA ...
 $ edeq12              : num  1.58 1.83 1 1.67 NA ...
 $ brs                 : num  NA NA NA NA NA NA NA NA NA NA ...
 $ swemws              : num  2.86 2.29 4.29 3.29 NA ...
 $ isolation           : num  2.25 3.5 1 2.5 NA 1.75 NA 2 1.25 1 ...
 $ support             : num  2.5 2.17 5 2.5 NA ...

Subsetting Data

# take the variable names from the codebook you built in the codebook activity (first column)
# and list them here to keep only those columns of the full dataframe
keep_cols <- c("gender", "ethnicity", "age", "big5_open", "pswq", "big5_ext")
d <- df[, keep_cols, drop = FALSE]

Missing Data

# visualize the patterns of missing data across the selected variables
# nsets is the number of variables to include in the upset plot
naniar::gg_miss_upset(data = d, nsets = 6)

# build a second dataframe that keeps only participants with complete data;
# na.omit() drops every row containing at least one NA
d2 <- stats::na.omit(object = d)

# use a bit of math to see how many participants had missing data
# computed from the dataframes (rows before minus rows after na.omit) so the
# number stays correct if the dataset or the selected variables change
nrow(d) - nrow(d2)

[1] 814

# proportion of participants dropped for missing data, computed from the
# dataframes rather than typed-in counts so it cannot go stale
(nrow(d) - nrow(d2)) / nrow(d)

[1] 0.3926676

Exporting Data

# last step: export the complete-case data (d2) after the NAs have been dropped
# row.names = FALSE keeps R's row index from being written as an extra column
# (spelled out as FALSE because the shorthand F is an ordinary variable that can be overwritten)
write.csv(d2, file = "Data/mydata.csv", row.names = FALSE)
# NOTE: for the homework, make sure the exported file is named mydata.csv -- don't forget!

Write-Up

We selected six variables from the ARC dataset to focus on in our analysis: gender, age, ethnicity, the Penn State Worry Questionnaire (PSWQ), the Big Five openness subscale, and the Big Five extraversion subscale. Participants with missing data on any of these six variables (39%) were dropped from our analysis, leaving a final sample of n = 1259.