# if you haven't run this code before, you'll need to download the below packages first
# you should see a prompt near the top of the page (in a yellow bar within the RStudio window)
# you can also use the packages tab to the right
library(naniar) # for the gg_miss_upset() command
Data Prep
Load Libraries
Import Data
# For the HW, import the CSV file of your chosen dataset.
# The first row of the file holds the column names (header = TRUE).
df <- read.csv(file = "Data/eammi2_data_final.csv", header = TRUE)
Viewing Data
# These commands are useful for inspecting a data frame.
# You can also click the object in the Environment tab to view it in a new window.
# names() lists the column names of df.
names(df)
[1] "ResponseId" "gender" "race_rc" "age"
[5] "income" "edu" "sibling" "party_rc"
[9] "disability" "marriage5" "phys_sym" "pipwd"
[13] "moa_independence" "moa_role" "moa_safety" "moa_maturity"
[17] "idea" "swb" "mindful" "belong"
[21] "efficacy" "support" "socmeduse" "usdream"
[25] "npi" "exploit" "stress"
# Print the first few rows of df to check how the values look.
head(df)
ResponseId gender race_rc age income
1 R_BJN3bQqi1zUMid3 f white 1 between 18 and 25 1 low
2 R_2TGbiBXmAtxywsD m white 1 between 18 and 25 1 low
3 R_12G7bIqN2wB2N65 m white 1 between 18 and 25 rather not say
4 R_39pldNoon8CePfP f other 1 between 18 and 25 rather not say
5 R_1QiKb2LdJo1Bhvv m white 1 between 18 and 25 2 middle
6 R_pmwDTZyCyCycXwB f white 1 between 18 and 25 rather not say
edu sibling party_rc disability
1 2 Currently in college at least one sibling democrat <NA>
2 5 Completed Bachelors Degree at least one sibling independent <NA>
3 2 Currently in college at least one sibling apolitical psychiatric
4 2 Currently in college at least one sibling apolitical <NA>
5 2 Currently in college at least one sibling apolitical <NA>
6 2 Currently in college at least one sibling apolitical <NA>
marriage5 phys_sym pipwd
1 are currently divorced from one another high number of symptoms NA
2 are currently married to one another high number of symptoms NA
3 are currently married to one another high number of symptoms 2.333333
4 are currently married to one another high number of symptoms NA
5 are currently married to one another low number of symptoms NA
6 are currently married to one another high number of symptoms NA
moa_independence moa_role moa_safety moa_maturity idea swb mindful
1 3.666667 3.000000 2.75 3.666667 3.750 4.333333 2.4
2 3.666667 2.666667 3.25 3.333333 3.875 4.166667 1.8
3 3.500000 2.500000 3.00 3.666667 3.750 1.833333 2.2
4 3.000000 2.000000 1.25 3.000000 3.750 5.166667 2.2
5 3.833333 2.666667 2.25 3.666667 3.500 3.666667 3.2
6 3.500000 3.333333 2.50 4.000000 3.250 4.000000 3.4
belong efficacy support socmeduse
1 2.8 3.4 6.000000 47
2 4.2 3.4 6.750000 23
3 3.6 2.2 5.166667 34
4 4.0 2.8 5.583333 35
5 3.4 3.0 6.000000 37
6 4.2 2.4 4.500000 13
usdream npi
1 american dream is important and achievable for me 0.69230769
2 american dream is important and achievable for me 0.15384615
3 american dream is not important and maybe not achievable for me 0.07692308
4 american dream is not important and maybe not achievable for me 0.07692308
5 not sure if american dream important 0.76923077
6 american dream is not important and maybe not achievable for me 0.23076923
exploit stress
1 2.000000 3.3
2 3.666667 3.3
3 4.333333 4.0
4 1.666667 3.2
5 4.000000 3.1
6 1.333333 3.5
# Show the structure of df: dimensions plus each column's type and first values.
str(df)
'data.frame': 3182 obs. of 27 variables:
$ ResponseId : chr "R_BJN3bQqi1zUMid3" "R_2TGbiBXmAtxywsD" "R_12G7bIqN2wB2N65" "R_39pldNoon8CePfP" ...
$ gender : chr "f" "m" "m" "f" ...
$ race_rc : chr "white" "white" "white" "other" ...
$ age : chr "1 between 18 and 25" "1 between 18 and 25" "1 between 18 and 25" "1 between 18 and 25" ...
$ income : chr "1 low" "1 low" "rather not say" "rather not say" ...
$ edu : chr "2 Currently in college" "5 Completed Bachelors Degree" "2 Currently in college" "2 Currently in college" ...
$ sibling : chr "at least one sibling" "at least one sibling" "at least one sibling" "at least one sibling" ...
$ party_rc : chr "democrat" "independent" "apolitical" "apolitical" ...
$ disability : chr NA NA "psychiatric" NA ...
$ marriage5 : chr "are currently divorced from one another" "are currently married to one another" "are currently married to one another" "are currently married to one another" ...
$ phys_sym : chr "high number of symptoms" "high number of symptoms" "high number of symptoms" "high number of symptoms" ...
$ pipwd : num NA NA 2.33 NA NA ...
$ moa_independence: num 3.67 3.67 3.5 3 3.83 ...
$ moa_role : num 3 2.67 2.5 2 2.67 ...
$ moa_safety : num 2.75 3.25 3 1.25 2.25 2.5 4 3.25 2.75 3.5 ...
$ moa_maturity : num 3.67 3.33 3.67 3 3.67 ...
$ idea : num 3.75 3.88 3.75 3.75 3.5 ...
$ swb : num 4.33 4.17 1.83 5.17 3.67 ...
$ mindful : num 2.4 1.8 2.2 2.2 3.2 ...
$ belong : num 2.8 4.2 3.6 4 3.4 4.2 3.9 3.6 2.9 2.5 ...
$ efficacy : num 3.4 3.4 2.2 2.8 3 2.4 2.3 3 3 3.7 ...
$ support : num 6 6.75 5.17 5.58 6 ...
$ socmeduse : int 47 23 34 35 37 13 37 43 37 29 ...
$ usdream : chr "american dream is important and achievable for me" "american dream is important and achievable for me" "american dream is not important and maybe not achievable for me" "american dream is not important and maybe not achievable for me" ...
$ npi : num 0.6923 0.1538 0.0769 0.0769 0.7692 ...
$ exploit : num 2 3.67 4.33 1.67 4 ...
$ stress : num 3.3 3.3 4 3.2 3.1 3.5 3.3 2.4 2.9 2.7 ...
Subsetting Data
# Use the codebook you created in the codebook activity to get the names of
# your variables (first column), then list them in select = c() to keep only
# those columns of df. All rows are retained.
d <- subset(df, select = c(edu, income, party_rc, npi, mindful, swb))
Missing Data
# Use gg_miss_upset() (from naniar) to visualize the missing data in d.
# nsets = 6 matches the six columns kept in d -- presumably so every variable
# can appear in the plot; confirm against the naniar/UpSetR documentation.
gg_miss_upset(d, nsets = 6)
#
# Use na.omit() to create a new data frame in which any participant (row) with
# missing data in at least one of the selected columns is dropped. d itself is
# left unchanged so the two row counts can be compared afterwards.
d2 <- na.omit(d)
#
# Use a bit of math to see how many participants had missing data: compare
# the row counts before (d) and after (d2) na.omit(). Reading the counts from
# the data frames keeps the result correct if the dataset or the selected
# variables change.
nrow(d) - nrow(d2)
[1] 56
# Proportion of participants dropped for missing data (multiply by 100 for %).
(nrow(d) - nrow(d2)) / nrow(d)
[1] 0.01759899
Exporting Data
# Last step: export the data after the NAs have been dropped.
# row.names = FALSE keeps R's row labels out of the file so it re-imports
# without an extra index column.
write.csv(d2, file = "Data/mydata.csv", row.names = FALSE)
Write-Up
We selected six variables from the [eammi2] dataset to focus on in our analysis: [edu, income, party_rc, npi, mindful, swb]. Participants with missing data (1.76%) in these six variables were dropped from our analysis, leaving us a final sample of n = [3126].