library(naniar)
Import the full project data into a data frame called “df”. Replace ‘DOWNLOADED FILE NAME’ with the actual file name of your dataset (either the ARC or the EAMMi2 file).
Note: If you named your folder something else, you will also need to replace ‘Data’ with the name of the folder where you saved the dataset.
# Load the raw survey export; the first row of the CSV holds the column names.
df <- read.csv(
  file = "arc_data.csv",
  header = TRUE,
  fill = TRUE
)
These commands are useful for viewing a data frame.
# List every column name in the full data frame.
colnames(df)
## [1] "X" "gender" "trans"
## [4] "sexual_orientation" "ethnicity" "relationship_status"
## [7] "age" "urban_rural" "income"
## [10] "education" "employment" "treatment"
## [13] "health" "mhealth" "sleep_hours"
## [16] "exercise" "pet" "covid_pos"
## [19] "covid_neg" "big5_open" "big5_con"
## [22] "big5_agr" "big5_neu" "big5_ext"
## [25] "pswq" "iou" "mfq_26"
## [28] "mfq_state" "rse" "school_covid_support"
## [31] "school_att" "pas_covid" "pss"
## [34] "phq" "gad" "edeq12"
## [37] "brs" "swemws" "isolation_c"
## [40] "support"
# Preview the first six rows of the full data frame.
head(df, n = 6L)
## X gender trans sexual_orientation
## 1 520 female no Prefer not to say
## 2 2814 male no Heterosexual/Straight
## 3 3146 female no Heterosexual/Straight
## 4 3295 male no Heterosexual/Straight
## 5 717 female no Asexual
## 6 6056 female no Prefer not to say
## ethnicity
## 1 Prefer not to say
## 2 White - British, Irish, other
## 3 Asian/Asian British - Indian, Pakistani, Bangladeshi, other
## 4 Asian/Asian British - Indian, Pakistani, Bangladeshi, other
## 5 White - British, Irish, other
## 6 Asian/Asian British - Indian, Pakistani, Bangladeshi, other
## relationship_status age urban_rural income
## 1 Single, never married 1 under 18 town <NA>
## 2 Single, never married 1 under 18 town <NA>
## 3 Prefer not to say 1 under 18 town <NA>
## 4 Single, never married 1 under 18 town <NA>
## 5 Single, never married 1 under 18 village <NA>
## 6 Prefer not to say 1 under 18 city <NA>
## education employment
## 1 1 equivalent to not completing high school 1 high school equivalent
## 2 prefer not to say 1 high school equivalent
## 3 2 equivalent to high school completion 1 high school equivalent
## 4 prefer not to say 1 high school equivalent
## 5 1 equivalent to not completing high school 1 high school equivalent
## 6 1 equivalent to not completing high school 1 high school equivalent
## treatment health mhealth
## 1 <NA> something else or not applicable none or NA
## 2 not in treatment something else or not applicable none or NA
## 3 <NA> prefer not to say none or NA
## 4 no psychological disorders lung disease none or NA
## 5 not in treatment something else or not applicable none or NA
## 6 not in treatment something else or not applicable none or NA
## sleep_hours exercise pet covid_pos covid_neg big5_open
## 1 2 5-6 hours 1 less than 1 hour cat 0 0 3.666667
## 2 3 7-8 hours 1 less than 1 hour other 0 0 4.333333
## 3 2 5-6 hours 1 less than 1 hour no pets 0 0 5.666667
## 4 4 8-10 hours 1 less than 1 hour no pets 0 0 6.000000
## 5 3 7-8 hours 1 less than 1 hour no pets 0 0 5.666667
## 6 4 8-10 hours 1 less than 1 hour no pets 0 0 4.666667
## big5_con big5_agr big5_neu big5_ext pswq iou mfq_26 mfq_state rse
## 1 3.000000 4.333333 5.333333 2.000000 2.714286 2.222222 2.70 3.000 2.6
## 2 4.000000 2.666667 2.666667 2.666667 1.428571 1.518519 4.55 4.375 3.1
## 3 6.000000 5.666667 1.000000 4.666667 1.857143 1.777778 4.80 4.875 3.7
## 4 4.000000 5.666667 3.666667 4.333333 1.785714 1.851852 3.80 4.875 3.0
## 5 3.333333 5.000000 4.333333 1.666667 2.357143 2.222222 4.50 4.875 3.0
## 6 4.333333 4.333333 5.000000 2.333333 2.500000 2.444444 4.00 3.750 3.0
## school_covid_support school_att pas_covid pss phq gad edeq12 brs
## 1 NA NA 3.000000 2.75 1.555556 1.142857 1.333333 NA
## 2 NA NA 3.444444 2.25 1.444444 1.285714 1.083333 NA
## 3 NA NA 4.666667 3.00 1.111111 1.000000 1.000000 NA
## 4 NA NA 2.444444 2.00 1.333333 1.000000 1.000000 NA
## 5 NA NA 1.555556 1.75 1.444444 1.142857 1.166667 NA
## 6 NA NA 3.111111 2.00 1.000000 1.142857 1.416667 NA
## swemws isolation_c support
## 1 3.000000 1 2.833333
## 2 2.857143 1 3.000000
## 3 4.000000 1 4.000000
## 4 3.571429 1 4.000000
## 5 3.857143 1 3.666667
## 6 3.571429 1 3.666667
# Show the type and first few values of every column.
str(object = df)
## 'data.frame': 996 obs. of 40 variables:
## $ X : int 520 2814 3146 3295 717 6056 4753 5365 2044 1965 ...
## $ gender : chr "female" "male" "female" "male" ...
## $ trans : chr "no" "no" "no" "no" ...
## $ sexual_orientation : chr "Prefer not to say" "Heterosexual/Straight" "Heterosexual/Straight" "Heterosexual/Straight" ...
## $ ethnicity : chr "Prefer not to say" "White - British, Irish, other" "Asian/Asian British - Indian, Pakistani, Bangladeshi, other" "Asian/Asian British - Indian, Pakistani, Bangladeshi, other" ...
## $ relationship_status : chr "Single, never married" "Single, never married" "Prefer not to say" "Single, never married" ...
## $ age : chr "1 under 18" "1 under 18" "1 under 18" "1 under 18" ...
## $ urban_rural : chr "town" "town" "town" "town" ...
## $ income : chr NA NA NA NA ...
## $ education : chr "1 equivalent to not completing high school" "prefer not to say" "2 equivalent to high school completion" "prefer not to say" ...
## $ employment : chr "1 high school equivalent" "1 high school equivalent" "1 high school equivalent" "1 high school equivalent" ...
## $ treatment : chr NA "not in treatment" NA "no psychological disorders" ...
## $ health : chr "something else or not applicable" "something else or not applicable" "prefer not to say" "lung disease" ...
## $ mhealth : chr "none or NA" "none or NA" "none or NA" "none or NA" ...
## $ sleep_hours : chr "2 5-6 hours" "3 7-8 hours" "2 5-6 hours" "4 8-10 hours" ...
## $ exercise : chr "1 less than 1 hour" "1 less than 1 hour" "1 less than 1 hour" "1 less than 1 hour" ...
## $ pet : chr "cat" "other" "no pets" "no pets" ...
## $ covid_pos : int 0 0 0 0 0 0 0 0 0 0 ...
## $ covid_neg : int 0 0 0 0 0 0 0 0 0 0 ...
## $ big5_open : num 3.67 4.33 5.67 6 5.67 ...
## $ big5_con : num 3 4 6 4 3.33 ...
## $ big5_agr : num 4.33 2.67 5.67 5.67 5 ...
## $ big5_neu : num 5.33 2.67 1 3.67 4.33 ...
## $ big5_ext : num 2 2.67 4.67 4.33 1.67 ...
## $ pswq : num 2.71 1.43 1.86 1.79 2.36 ...
## $ iou : num 2.22 1.52 1.78 1.85 2.22 ...
## $ mfq_26 : num 2.7 4.55 4.8 3.8 4.5 4 5.8 4.2 4.5 5.25 ...
## $ mfq_state : num 3 4.38 4.88 4.88 4.88 ...
## $ rse : num 2.6 3.1 3.7 3 3 3 4 3.8 2.5 4 ...
## $ school_covid_support: num NA NA NA NA NA NA NA NA NA NA ...
## $ school_att : num NA NA NA NA NA NA NA NA NA NA ...
## $ pas_covid : num 3 3.44 4.67 2.44 1.56 ...
## $ pss : num 2.75 2.25 3 2 1.75 2 1 1.25 3 1.25 ...
## $ phq : num 1.56 1.44 1.11 1.33 1.44 ...
## $ gad : num 1.14 1.29 1 1 1.14 ...
## $ edeq12 : num 1.33 1.08 1 1 1.17 ...
## $ brs : num NA NA NA NA NA NA NA NA NA NA ...
## $ swemws : num 3 2.86 4 3.57 3.86 ...
## $ isolation_c : num 1 1 1 1 1 1 1 1 1 1 ...
## $ support : num 2.83 3 4 4 3.67 ...
Open your mini codebook and get the names of your variables (first column). Then enter this list of names, each in quotation marks, inside “c()” to subset those columns from the data frame “df” into a new one, “d”.
Replace “variable1, variable2,…” with your variable names.
# Keep only the ID column and the variables used in this project.
d <- df[c(
  "X",
  "gender",
  "pet",
  "pss",
  "phq",
  "gad",
  "swemws"
)]
# Confirm that the subset contains the expected columns.
colnames(d)
## [1] "X" "gender" "pet" "pss" "phq" "gad" "swemws"
# Preview the first six rows of the subset.
head(d, n = 6L)
## X gender pet pss phq gad swemws
## 1 520 female cat 2.75 1.555556 1.142857 3.000000
## 2 2814 male other 2.25 1.444444 1.285714 2.857143
## 3 3146 female no pets 3.00 1.111111 1.000000 4.000000
## 4 3295 male no pets 2.00 1.333333 1.000000 3.571429
## 5 717 female no pets 1.75 1.444444 1.142857 3.857143
## 6 6056 female no pets 2.00 1.000000 1.142857 3.571429
# Show the type and first few values of each retained column.
str(object = d)
## 'data.frame': 996 obs. of 7 variables:
## $ X : int 520 2814 3146 3295 717 6056 4753 5365 2044 1965 ...
## $ gender: chr "female" "male" "female" "male" ...
## $ pet : chr "cat" "other" "no pets" "no pets" ...
## $ pss : num 2.75 2.25 3 2 1.75 2 1 1.25 3 1.25 ...
## $ phq : num 1.56 1.44 1.11 1.33 1.44 ...
## $ gad : num 1.14 1.29 1 1 1.14 ...
## $ swemws: num 3 2.86 4 3.57 3.86 ...
# Plot the patterns of missing values, leaving out the first column (the
# participant ID "X").
gg_miss_upset(d[, -1, drop = FALSE], nsets = 6)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the UpSetR package.
## Please report the issue to the authors.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the UpSetR package.
## Please report the issue to the authors.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the UpSetR package.
## Please report the issue to the authors.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Listwise deletion: keep only the rows that have no missing values.
d2 <- na.omit(d)

# Count the rows before and after deletion, then derive how many were lost.
n_before <- nrow(d)
n_after <- nrow(d2)
n_dropped <- n_before - n_after
percent_dropped <- 100 * (n_dropped / n_before)

# Report the figures quoted in the write-up.
print(n_dropped)
## [1] 347
print(percent_dropped)
## [1] 34.83936
print(n_after)
## [1] 649
We looked at the missing data in our dataset, and found that 347, or about 34.8%, of the participants in our sample skipped at least one item. We dropped these participants from our analysis, which is not advisable and runs the risk of dropping vulnerable groups or skewing results. However, we will proceed for the sake of this class using the reduced dataset, N = 649.
# Save the complete-case data set without the row-name column.
write.csv(
  x = d2,
  file = "projectdata.csv",
  row.names = FALSE
)