#load libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.4 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(forcats)
library(ggplot2)
library(lubridate)
library(purrr)
library(stringr)
library(tibble)
library(tidyr)
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(mice)
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(sjstats)
##
## Attaching package: 'sjstats'
##
## The following object is masked from 'package:psych':
##
## phi
library(formattable)
options(scipen = 999)
#import data
# Set the working directory (absolute path; adjust to your own machine) and read the raw export
setwd("/Users/melissalagunas/Desktop/Lab/DISSERTATION")
Dissertation_Main_Study_August_30 <- read_csv("Dissertation_Main_Study_August 30, 2024_08.47.csv")
## New names:
## Rows: 323 Columns: 90
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (90): StartDate, EndDate, Status, IPAddress, Progress, Duration (in seco...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `Q29` -> `Q29...18`
## • `Q1` -> `Q1...22`
## • `Q2` -> `Q2...23`
## • `Q1` -> `Q1...74`
## • `Q2` -> `Q2...75`
## • `Q29` -> `Q29...90`
# rename dataset
dat <- Dissertation_Main_Study_August_30
#rename consent
dat <- dplyr::rename(dat, consent = Q29...18)
# keep only respondents who granted consent (consent == 4); see the sketch below for an opened-vs-consented tally
dat <- dplyr::filter(dat, consent == "4")
nrow(dat)
## [1] 319
Of the 323 rows in the raw export, 319 respondents granted consent (consent = 4) and were retained.
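Because the filter above retains only consenting cases, a separate tally from the raw import is needed to compare opened versus consented. A minimal sketch (note that a Qualtrics CSV export often includes extra label rows, so the raw row count can slightly overstate the number of respondents):
raw <- Dissertation_Main_Study_August_30
c(opened = nrow(raw),
  consented = sum(raw$Q29...18 == "4", na.rm = TRUE))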
# Identify duplicate responses based on key columns (here: Q1...22 and IPAddress)
duplicates <- dat[duplicated(dat[, c("Q1...22", "IPAddress")]), ]
# Print the number of duplicate rows
cat("Number of duplicate rows based on key columns: ", nrow(duplicates), "\n")
## Number of duplicate rows based on key columns: 7
# Remove duplicates based on the key columns, keeping only the first occurrence
dat <- dat[!duplicated(dat[, c("Q1...22", "IPAddress")]), ]
# Check the row counts (note: duplicates were already dropped above, so both counts below reflect the post-removal data; the pre-removal count was 319, see the reordered sketch below)
cat("Number of rows before removing duplicates: ", nrow(dat), "\n")
## Number of rows before removing duplicates: 312
cat("Number of rows after removing duplicates: ", nrow(dat), "\n")
## Number of rows after removing duplicates: 312
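A minimal sketch of how this chunk could be ordered so that the two messages report different stages (the pre-removal count here was 319):
n_before <- nrow(dat)  # capture the count before dropping duplicates
dat <- dat[!duplicated(dat[, c("Q1...22", "IPAddress")]), ]
cat("Number of rows before removing duplicates: ", n_before, "\n")
cat("Number of rows after removing duplicates: ", nrow(dat), "\n")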
# Analyze the survey completion time after removing duplicates
# (Duration is still stored as character here, so summary() below only reports length/class/mode;
#  a quick numeric check is sketched below and the full type conversion happens later)
summary(dat$`Duration (in seconds)`)
## Length Class Mode
## 312 character character
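Because the column is still character, a quick numeric check can be had by coercing on the fly (a sketch; the full conversion with hablar::convert() happens further below):
summary(as.numeric(dat$`Duration (in seconds)`))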
# delete variables
dat2 = select(dat,-c(StartDate, EndDate, Status, IPAddress, Finished, RecordedDate, ResponseId, RecipientLastName, RecipientFirstName, RecipientEmail, ExternalReference, LocationLatitude, LocationLongitude, DistributionChannel, UserLanguage))
# change variable names
dat2 <- dat2 %>%
rename(
US_located_age = Q35,
PD = Q1...22,
OC_1 = Q2...23,
OC_2 = Q31_1,
SS_1 = Q30_1,
SS_2 = Q30_2,
SS_3 = Q30_3,
SS_4 = Q30_4,
SS_5 = Q30_5,
SS_6 = Q30_6,
SS_7 = Q30_7,
SS_8 = Q30_8,
SS_9 = Q30_9,
SS_10 = Q30_10,
SS_11 = Q30_11,
SS_12 = Q30_12,
SS_13_TEXT = Q32,
PSNQ_1 = Q27_1,
PSNQ_2 = Q27_2,
PSNQ_3 = Q27_3,
PSNQ_4 = Q27_4,
PSNQ_5 = Q27_5,
PSNQ_6 = Q27_6,
PSNQ_7 = Q27_7,
PSNQ_8 = Q27_8,
PSNQ_9 = Q27_9,
PSNQ_10 = Q27_10,
SBS_1 = Q4_1,
SBS_2 = Q5_1,
SBS_3 = Q6_1,
SBS_4 = Q7_1,
SBS_5 = Q8_1,
SBS_6 = Q9_1,
SBS_7 = Q10_1,
PF_1 = Q11_1,
PF_2 = Q11_2,
PF_3 = Q11_3,
PF_4 = Q11_4,
PF_5 = Q11_5,
PF_6 = Q11_6,
PF_7 = Q11_7,
PF_8 = Q11_8,
CS_1 = Q26_1,
CS_2 = Q26_2,
CS_3 = Q26_3,
CS_4 = Q26_4,
CS_5 = Q26_5,
BRS_1 = Q12_1,
BRS_2 = Q12_2,
BRS_3 = Q12_3,
BRS_4 = Q12_4,
BRS_5 = Q12_5,
BRS_6 = Q12_6,
age = Q1...74,
gender = Q2...75,
gender_8_TEXT = Q2_8_TEXT,
sexual_orientation = Q3,
sexual_orientation_9_TEXT = Q3_9_TEXT,
employment_status = Q13,
employment_status_TEXT = Q13_3_TEXT,
US_born = Q4,
race = Q5,
race_10_TEXT = Q5_10_TEXT,
income = Q6,
fam_income = Q7,
religion = Q8,
religion_15_TEXT = Q8_15_TEXT,
education = Q11,
education_9_TEXT = Q11_9_TEXT,
year_education = Q12)
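For a rename block this long, an alternative is to keep the mapping in a named lookup vector (new_name = "old_name") and splice it into rename(). A sketch for illustration only, since the rename() call above has already been applied; rename_map shows just the first few pairs:
rename_map <- c(SS_1 = "Q30_1", SS_2 = "Q30_2", PSNQ_1 = "Q27_1")  # extend to the full mapping
# dat2 <- dplyr::rename(dat2, !!!rename_map)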
# check structure
str(dat2)
## tibble [312 × 75] (S3: tbl_df/tbl/data.frame)
## $ Progress : chr [1:312] "100" "100" "100" "100" ...
## $ Duration (in seconds) : chr [1:312] "803" "1034" "533" "1547" ...
## $ consent : chr [1:312] "4" "4" "4" "4" ...
## $ US_located_age : chr [1:312] NA NA NA NA ...
## $ employment_status : chr [1:312] "1" "2" "1" "2" ...
## $ employment_status_TEXT : chr [1:312] NA NA NA NA ...
## $ PD : chr [1:312] "Associate Attorney at large international law firm." "The Information Technology (IT) profession involves designing, developing, managing, and maintaining computer s"| __truncated__ "Health education" "Education - I work as a college advisor for high school students and support them throughout the college application process." ...
## $ OC_1 : chr [1:312] "3" "4" "3" "1" ...
## $ OC_2 : chr [1:312] "2" "3" "5" "2" ...
## $ SS_1 : chr [1:312] "6" "7" "1" "5" ...
## $ SS_2 : chr [1:312] "7" "7" "1" "5" ...
## $ SS_3 : chr [1:312] "6" "6" "1" "4" ...
## $ SS_4 : chr [1:312] "6" "5" "1" "4" ...
## $ SS_5 : chr [1:312] "7" "7" "1" "5" ...
## $ SS_6 : chr [1:312] "6" "5" "1" "7" ...
## $ SS_7 : chr [1:312] "6" "5" "1" "7" ...
## $ SS_8 : chr [1:312] "6" "6" "5" "4" ...
## $ SS_9 : chr [1:312] "6" "5" "5" "7" ...
## $ SS_10 : chr [1:312] "7" "7" "3" "6" ...
## $ SS_11 : chr [1:312] "7" "6" "5" "4" ...
## $ SS_12 : chr [1:312] "7" "5" "5" "7" ...
## $ SS_13_TEXT : chr [1:312] "Mother and Friend" "Uncle(Family Member)" "Friend" "A friend who also is a mother-figure to me." ...
## $ PSNQ_1 : chr [1:312] "6" "6" "6" "4" ...
## $ PSNQ_2 : chr [1:312] "5" "6" "6" "6" ...
## $ PSNQ_3 : chr [1:312] "5" "6" "7" "7" ...
## $ PSNQ_4 : chr [1:312] "5" "6" "7" "6" ...
## $ PSNQ_5 : chr [1:312] "4" "7" "7" "7" ...
## $ PSNQ_6 : chr [1:312] "3" "7" "7" "7" ...
## $ PSNQ_7 : chr [1:312] "4" "7" "6" "7" ...
## $ PSNQ_8 : chr [1:312] "5" "7" "6" "7" ...
## $ PSNQ_9 : chr [1:312] "3" "7" "6" "5" ...
## $ PSNQ_10 : chr [1:312] "3" "7" "6" "6" ...
## $ SBS_1 : chr [1:312] "2" "5" "5" "2" ...
## $ SBS_2 : chr [1:312] "2" "4" "5" "2" ...
## $ SBS_3 : chr [1:312] "3" "4" "5" "5" ...
## $ SBS_4 : chr [1:312] "4" "5" "3" "5" ...
## $ SBS_5 : chr [1:312] "3" "5" "3" "5" ...
## $ SBS_6 : chr [1:312] "3" "4" "5" "3" ...
## $ SBS_7 : chr [1:312] "3" "5" "5" "5" ...
## $ PF_1 : chr [1:312] "5" "7" "6" "7" ...
## $ PF_2 : chr [1:312] "4" "7" "7" "4" ...
## $ PF_3 : chr [1:312] "5" "7" "7" "6" ...
## $ PF_4 : chr [1:312] "6" "7" "7" "4" ...
## $ PF_5 : chr [1:312] "6" "7" "7" "7" ...
## $ PF_6 : chr [1:312] "7" "7" "7" "5" ...
## $ PF_7 : chr [1:312] "6" "7" "7" "6" ...
## $ PF_8 : chr [1:312] "5" "7" "7" "6" ...
## $ CS_1 : chr [1:312] "4" "4" "5" "5" ...
## $ CS_2 : chr [1:312] "4" "5" "5" "5" ...
## $ CS_3 : chr [1:312] "5" "5" "4" "5" ...
## $ CS_4 : chr [1:312] "4" "5" "5" "5" ...
## $ CS_5 : chr [1:312] "5" "5" "5" "5" ...
## $ BRS_1 : chr [1:312] "3" "5" "5" "5" ...
## $ BRS_2 : chr [1:312] "1" "4" "5" "1" ...
## $ BRS_3 : chr [1:312] "4" "2" "5" "4" ...
## $ BRS_4 : chr [1:312] "4" "3" "5" "1" ...
## $ BRS_5 : chr [1:312] "3" "4" "5" "4" ...
## $ BRS_6 : chr [1:312] "1" "5" "5" "1" ...
## $ age : chr [1:312] NA "21" "25" "29" ...
## $ gender : chr [1:312] "1" "2" "2" "1" ...
## $ gender_8_TEXT : chr [1:312] NA NA NA NA ...
## $ sexual_orientation : chr [1:312] "5" "5" "5" "2" ...
## $ sexual_orientation_9_TEXT: chr [1:312] NA NA NA NA ...
## $ US_born : chr [1:312] "1" "1" "1" "1" ...
## $ race : chr [1:312] "1" "2" "2" "1" ...
## $ race_10_TEXT : chr [1:312] NA NA NA NA ...
## $ income : chr [1:312] "9" "1" "1" "2" ...
## $ fam_income : chr [1:312] "7" "2" "2" "2" ...
## $ religion : chr [1:312] "4" "5" "4,5" "4" ...
## $ religion_15_TEXT : chr [1:312] NA NA NA NA ...
## $ education : chr [1:312] "6" "4" "4" "4" ...
## $ education_9_TEXT : chr [1:312] NA NA NA NA ...
## $ year_education : chr [1:312] "2017" "2022" "2022" "2022" ...
## $ Q28 : chr [1:312] NA NA NA NA ...
## $ Q29...90 : chr [1:312] NA NA NA NA ...
# Convert character columns that store numeric response codes to integers
library(hablar)
##
## Attaching package: 'hablar'
## The following object is masked from 'package:mice':
##
## squeeze
## The following object is masked from 'package:forcats':
##
## fct
## The following object is masked from 'package:tibble':
##
## num
## The following object is masked from 'package:dplyr':
##
## na_if
dat2 <- dat2 %>% # convert the selected columns to integer
convert(int(Progress:employment_status, OC_1:SS_12, PSNQ_1:gender, sexual_orientation, US_born, race, income:religion, education))
## Warning: There were 3 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `age = (structure(function (..., .x = ..1, .y = ..2, . = ..1)
## ...`.
## Caused by warning in `as_reliable_int()`:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
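If it is unclear which values triggered the coercion warning for age, a sketch like the following lists the raw entries that cannot be parsed as integers (dat still holds the pre-conversion character columns under their Qualtrics names):
dat %>%
  dplyr::filter(!is.na(Q1...74) & is.na(suppressWarnings(as.integer(Q1...74)))) %>%
  dplyr::pull(Q1...74)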
str(dat2)
## tibble [312 × 75] (S3: tbl_df/tbl/data.frame)
## $ Progress : int [1:312] 100 100 100 100 100 100 100 100 100 100 ...
## $ Duration (in seconds) : int [1:312] 803 1034 533 1547 1352 800 281 866 495 617 ...
## $ consent : int [1:312] 4 4 4 4 4 4 4 4 4 4 ...
## $ US_located_age : int [1:312] NA NA NA NA NA NA NA NA NA NA ...
## $ employment_status : int [1:312] 1 2 1 2 1 1 1 1 1 1 ...
## $ employment_status_TEXT : chr [1:312] NA NA NA NA ...
## $ PD : chr [1:312] "Associate Attorney at large international law firm." "The Information Technology (IT) profession involves designing, developing, managing, and maintaining computer s"| __truncated__ "Health education" "Education - I work as a college advisor for high school students and support them throughout the college application process." ...
## $ OC_1 : int [1:312] 3 4 3 1 5 6 5 5 3 4 ...
## $ OC_2 : int [1:312] 2 3 5 2 1 4 4 3 3 4 ...
## $ SS_1 : int [1:312] 6 7 1 5 7 6 6 5 6 6 ...
## $ SS_2 : int [1:312] 7 7 1 5 7 6 6 6 7 6 ...
## $ SS_3 : int [1:312] 6 6 1 4 6 6 6 5 6 6 ...
## $ SS_4 : int [1:312] 6 5 1 4 7 6 6 4 6 6 ...
## $ SS_5 : int [1:312] 7 7 1 5 7 7 6 4 5 7 ...
## $ SS_6 : int [1:312] 6 5 1 7 5 6 6 5 6 6 ...
## $ SS_7 : int [1:312] 6 5 1 7 6 5 6 4 6 6 ...
## $ SS_8 : int [1:312] 6 6 5 4 5 6 6 5 6 6 ...
## $ SS_9 : int [1:312] 6 5 5 7 5 6 6 5 6 6 ...
## $ SS_10 : int [1:312] 7 7 3 6 7 6 6 5 6 6 ...
## $ SS_11 : int [1:312] 7 6 5 4 6 5 6 5 6 6 ...
## $ SS_12 : int [1:312] 7 5 5 7 5 6 6 5 6 6 ...
## $ SS_13_TEXT : chr [1:312] "Mother and Friend" "Uncle(Family Member)" "Friend" "A friend who also is a mother-figure to me." ...
## $ PSNQ_1 : int [1:312] 6 6 6 4 6 6 6 6 2 6 ...
## $ PSNQ_2 : int [1:312] 5 6 6 6 6 6 6 6 3 7 ...
## $ PSNQ_3 : int [1:312] 5 6 7 7 6 6 6 6 4 6 ...
## $ PSNQ_4 : int [1:312] 5 6 7 6 NA 6 6 6 3 6 ...
## $ PSNQ_5 : int [1:312] 4 7 7 7 6 6 6 5 2 6 ...
## $ PSNQ_6 : int [1:312] 3 7 7 7 6 6 6 5 1 7 ...
## $ PSNQ_7 : int [1:312] 4 7 6 7 6 5 6 6 2 6 ...
## $ PSNQ_8 : int [1:312] 5 7 6 7 5 5 6 6 5 6 ...
## $ PSNQ_9 : int [1:312] 3 7 6 5 3 6 6 6 2 7 ...
## $ PSNQ_10 : int [1:312] 3 7 6 6 3 5 6 6 2 6 ...
## $ SBS_1 : int [1:312] 2 5 5 2 4 4 4 4 2 4 ...
## $ SBS_2 : int [1:312] 2 4 5 2 3 3 4 4 2 4 ...
## $ SBS_3 : int [1:312] 3 4 5 5 4 4 4 3 2 4 ...
## $ SBS_4 : int [1:312] 4 5 3 5 4 4 4 4 2 4 ...
## $ SBS_5 : int [1:312] 3 5 3 5 4 4 4 3 2 5 ...
## $ SBS_6 : int [1:312] 3 4 5 3 4 4 4 4 2 4 ...
## $ SBS_7 : int [1:312] 3 5 5 5 4 5 4 4 1 4 ...
## $ PF_1 : int [1:312] 5 7 6 7 6 6 6 6 2 6 ...
## $ PF_2 : int [1:312] 4 7 7 4 6 6 6 6 2 6 ...
## $ PF_3 : int [1:312] 5 7 7 6 6 6 6 5 5 6 ...
## $ PF_4 : int [1:312] 6 7 7 4 4 6 6 6 4 7 ...
## $ PF_5 : int [1:312] 6 7 7 7 6 6 6 5 6 6 ...
## $ PF_6 : int [1:312] 7 7 7 5 6 6 6 6 6 6 ...
## $ PF_7 : int [1:312] 6 7 7 6 5 6 6 6 2 6 ...
## $ PF_8 : int [1:312] 5 7 7 6 6 6 6 6 4 6 ...
## $ CS_1 : int [1:312] 4 4 5 5 5 5 4 4 1 4 ...
## $ CS_2 : int [1:312] 4 5 5 5 5 5 5 3 3 4 ...
## $ CS_3 : int [1:312] 5 5 4 5 2 5 5 3 1 4 ...
## $ CS_4 : int [1:312] 4 5 5 5 5 5 5 4 2 4 ...
## $ CS_5 : int [1:312] 5 5 5 5 4 4 5 4 2 4 ...
## $ BRS_1 : int [1:312] 3 5 5 5 4 5 5 4 4 4 ...
## $ BRS_2 : int [1:312] 1 4 5 1 4 1 1 2 2 4 ...
## $ BRS_3 : int [1:312] 4 2 5 4 4 5 5 4 4 4 ...
## $ BRS_4 : int [1:312] 4 3 5 1 4 1 1 2 2 4 ...
## $ BRS_5 : int [1:312] 3 4 5 4 3 5 5 4 4 4 ...
## $ BRS_6 : int [1:312] 1 5 5 1 2 1 1 2 2 4 ...
## $ age : int [1:312] NA 21 25 29 NA NA 40 27 60 40 ...
## $ gender : int [1:312] 1 2 2 1 1 2 1 2 1 5 ...
## $ gender_8_TEXT : chr [1:312] NA NA NA NA ...
## $ sexual_orientation : int [1:312] 5 5 5 2 5 9 5 5 5 5 ...
## $ sexual_orientation_9_TEXT: chr [1:312] NA NA NA NA ...
## $ US_born : int [1:312] 1 1 1 1 1 1 1 1 1 1 ...
## $ race : int [1:312] 1 2 2 1 1 6 6 9 1 5 ...
## $ race_10_TEXT : chr [1:312] NA NA NA NA ...
## $ income : int [1:312] 9 1 1 2 8 5 5 9 8 2 ...
## $ fam_income : int [1:312] 7 2 2 2 8 3 3 6 3 2 ...
## $ religion : int [1:312] 4 5 NA 4 10 5 5 10 4 4 ...
## $ religion_15_TEXT : chr [1:312] NA NA NA NA ...
## $ education : int [1:312] 6 4 4 4 6 5 1 3 6 3 ...
## $ education_9_TEXT : chr [1:312] NA NA NA NA ...
## $ year_education : chr [1:312] "2017" "2022" "2022" "2022" ...
## $ Q28 : chr [1:312] NA NA NA NA ...
## $ Q29...90 : chr [1:312] NA NA NA NA ...
# Add a sequential ID variable to dat2
dat2$ID <- 1:nrow(dat2)
# View the dataset with the new ID variable
# (the original chunk called head(df); no object named df exists, so that call printed
#  the body of the stats::df density function instead of the data; head(dat2) is the intended call)
head(dat2)
# duration
#*calculate mean, SD, and min/max of Duration (in seconds)
duration_stats <- psych::describe(dat2$`Duration (in seconds)`, na.rm = TRUE, ranges = TRUE)
# View the results
print(duration_stats)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 312 924.68 1397.29 543 690.7 467.76 6 17152 17146 6.66 63.75
## se
## X1 79.11
Duration: M = 924.68 seconds (roughly 15.4 minutes), SD = 1397.29, range = 6 to 17,152 seconds.
dat2_missingness <- (dplyr::select(dat2, employment_status, OC_1:SS_12, PSNQ_1:gender, sexual_orientation, US_born, race, income:religion, education))
# NOTE: excluded demographics textboxes
# analyze item level data missingness and patterns
## typically want >80% of data
# Calculate the number and proportion of item-level missingness
dat2_missingness$nmiss <- dat2_missingness %>%
dplyr::select(employment_status:education) %>% # Select all variables between employment_status and education
is.na %>%
rowSums
# Dynamically calculate the number of selected variables
num_vars <- dat2_missingness %>%
dplyr::select(employment_status:education) %>%
ncol()
# Calculate the proportion of missingness
dat2_missingness <- dat2_missingness %>%
dplyr::mutate(prop_miss = (nmiss/num_vars)*100)
# Describe the missingness proportion
psych::describe(dat2_missingness$prop_miss)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 312 16.76 33.97 1.67 8.67 2.47 0 100 100 1.82 1.47 1.92
Across cases deemed eligible under the inclusion/exclusion criteria, item-level missingness ranged from 0% to 100%.
dat2_missingness <- dplyr::filter(dat2_missingness, prop_miss <= 90)
print(dat2_missingness)
## # A tibble: 271 × 62
## employment_status OC_1 OC_2 SS_1 SS_2 SS_3 SS_4 SS_5 SS_6 SS_7 SS_8
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 1 3 2 6 7 6 6 7 6 6 6
## 2 2 4 3 7 7 6 5 7 5 5 6
## 3 1 3 5 1 1 1 1 1 1 1 5
## 4 2 1 2 5 5 4 4 5 7 7 4
## 5 1 5 1 7 7 6 7 7 5 6 5
## 6 1 6 4 6 6 6 6 7 6 5 6
## 7 1 5 4 6 6 6 6 6 6 6 6
## 8 1 5 3 5 6 5 4 4 5 4 5
## 9 1 3 3 6 7 6 6 5 6 6 6
## 10 1 4 4 6 6 6 6 7 6 6 6
## # ℹ 261 more rows
## # ℹ 51 more variables: SS_9 <int>, SS_10 <int>, SS_11 <int>, SS_12 <int>,
## # PSNQ_1 <int>, PSNQ_2 <int>, PSNQ_3 <int>, PSNQ_4 <int>, PSNQ_5 <int>,
## # PSNQ_6 <int>, PSNQ_7 <int>, PSNQ_8 <int>, PSNQ_9 <int>, PSNQ_10 <int>,
## # SBS_1 <int>, SBS_2 <int>, SBS_3 <int>, SBS_4 <int>, SBS_5 <int>,
## # SBS_6 <int>, SBS_7 <int>, PF_1 <int>, PF_2 <int>, PF_3 <int>, PF_4 <int>,
## # PF_5 <int>, PF_6 <int>, PF_7 <int>, PF_8 <int>, CS_1 <int>, CS_2 <int>, …
The dataframe is filtered to retain rows with no more than 90% missing data (prop_miss <= 90), leaving 271 valid cases.
#further update to exclude the n_miss and prop_miss variables since these will mess with our analyses
dat2_missingness <- dat2_missingness %>%
dplyr::select (-c(nmiss, prop_miss))
#what proportion of cells missing across entire dataset
formattable::percent(mean(is.na(dat2_missingness)))
## [1] 4.50%
#what proportion of cases (rows) are complete (nonmissing)
formattable::percent(mean(complete.cases(dat2_missingness)))
## [1] 47.60%
Among the retained cases, 4.50% of cells were missing overall, and 47.60% of cases had complete data.
missing_data_patterns <- mice::md.pattern(dat2_missingness, plot = TRUE, rotate.names = TRUE)
missing_data_patterns
## SS_2 SS_4 SS_9 SS_12 OC_1 SS_1 SS_3 SS_6 SS_5 SS_7 SS_10 employment_status
## 129 1 1 1 1 1 1 1 1 1 1 1 1
## 42 1 1 1 1 1 1 1 1 1 1 1 1
## 11 1 1 1 1 1 1 1 1 1 1 1 1
## 9 1 1 1 1 1 1 1 1 1 1 1 1
## 4 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 6 1 1 1 1 1 1 1 1 1 1 1 1
## 5 1 1 1 1 1 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 4 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 6 1 1 1 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 0
## 1 1 1 1 1 1 1 1 1 1 1 1 0
## 1 1 1 1 1 1 1 1 1 1 1 1 0
## 1 1 1 1 1 1 1 1 1 1 1 0 1
## 1 1 1 1 1 1 1 1 1 1 1 0 1
## 1 1 1 1 1 1 1 1 1 1 0 1 1
## 2 1 1 1 1 1 1 1 1 0 1 1 1
## 1 1 1 1 1 1 1 1 0 1 1 1 1
## 1 1 1 1 1 1 1 0 1 1 0 1 1
## 1 1 1 1 1 1 0 1 1 1 1 1 1
## 1 1 1 1 1 0 1 1 1 1 1 1 1
## 0 0 0 0 1 1 1 1 2 2 2 3
## OC_2 SS_8 SS_11 PSNQ_1 PSNQ_2 PSNQ_5 PSNQ_6 PSNQ_10 PSNQ_3 PSNQ_4 PSNQ_7
## 129 1 1 1 1 1 1 1 1 1 1 1
## 42 1 1 1 1 1 1 1 1 1 1 1
## 11 1 1 1 1 1 1 1 1 1 1 1
## 9 1 1 1 1 1 1 1 1 1 1 1
## 4 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 6 1 1 1 1 1 1 1 1 1 1 1
## 5 1 1 1 1 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 4 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 6 1 1 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 0 1
## 1 1 1 1 1 1 1 1 1 0 1 1
## 1 1 1 1 0 0 0 0 0 0 0 0
## 3 1 1 1 0 0 0 0 0 0 0 0
## 2 1 1 0 1 1 1 1 1 1 1 1
## 1 1 1 0 1 1 1 1 1 1 1 1
## 1 1 0 1 1 1 1 1 1 1 1 1
## 1 1 0 1 1 1 1 1 1 1 1 1
## 1 0 1 1 1 1 1 1 1 1 1 1
## 1 0 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 0 0 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 0 0 0 0 0 0 0 0
## 1 1 1 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 0
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 3 3 3 5 5 5 5 5 6 6 6
## PSNQ_9 PSNQ_8 SBS_2 SBS_6 SBS_7 SBS_3 SBS_4 SBS_1 SBS_5 PF_1 PF_4 PF_5 PF_6
## 129 1 1 1 1 1 1 1 1 1 1 1 1 1
## 42 1 1 1 1 1 1 1 1 1 1 1 1 1
## 11 1 1 1 1 1 1 1 1 1 1 1 1 1
## 9 1 1 1 1 1 1 1 1 1 1 1 1 1
## 4 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 6 1 1 1 1 1 1 1 1 1 1 1 1 1
## 5 1 1 1 1 1 1 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 4 1 1 1 1 1 1 1 1 1 0 0 0 0
## 1 1 1 1 1 1 1 1 1 0 1 1 1 1
## 1 1 1 1 1 1 1 1 1 0 1 1 1 1
## 1 1 1 1 1 1 1 1 1 0 1 1 1 1
## 2 1 1 1 1 1 1 1 0 1 1 1 1 1
## 1 1 1 1 1 1 1 0 1 1 1 1 1 1
## 1 1 1 1 1 1 0 1 1 1 1 1 1 1
## 6 1 1 0 0 0 0 0 0 0 0 0 0 0
## 2 1 0 1 1 1 1 1 1 1 1 1 1 1
## 1 0 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 0 1 1 1 1
## 1 0 0 1 1 1 1 1 1 1 1 1 1 1
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 0 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 0 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 0
## 1 1 1 1 1 1 1 1 1 1 1 1 0 1
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 7 8 10 10 10 11 11 12 14 14 14 15 15
## PF_8 CS_2 BRS_1 gender income fam_income PF_3 CS_1 BRS_2 BRS_5 BRS_6
## 129 1 1 1 1 1 1 1 1 1 1 1
## 42 1 1 1 1 1 1 1 1 1 1 1
## 11 1 1 1 1 1 1 1 1 1 1 1
## 9 1 1 1 1 1 1 1 1 1 1 1
## 4 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 6 1 1 1 1 1 1 1 1 1 1 1
## 5 1 1 1 1 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 0 1 1 1
## 1 1 1 1 1 1 1 0 1 1 1 1
## 1 1 1 0 0 0 0 1 1 0 0 0
## 1 1 0 1 1 1 1 1 0 1 1 1
## 1 0 1 1 1 1 1 1 1 1 1 1
## 4 0 0 0 0 0 0 0 0 0 0 0
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 0 1 1 1 1
## 6 0 0 0 0 0 0 0 0 0 0 0
## 2 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 3 0 0 0 0 0 0 0 0 0 0 0
## 2 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 0 0 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 0
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 0 0 0 0 0 0 0 0 0 0 0
## 1 1 1 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 15 15 15 15 15 15 16 16 16 16 16
## sexual_orientation PF_7 CS_4 CS_5 BRS_3 PF_2 BRS_4 education CS_3 US_born
## 129 1 1 1 1 1 1 1 1 1 1
## 42 1 1 1 1 1 1 1 1 1 1
## 11 1 1 1 1 1 1 1 1 1 1
## 9 1 1 1 1 1 1 1 1 1 1
## 4 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 6 1 1 1 1 1 1 1 1 1 1
## 5 1 1 1 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1 1 1 0
## 1 1 1 1 1 1 1 1 1 0 1
## 1 1 1 1 1 1 1 1 0 1 1
## 1 1 1 1 1 1 1 1 0 1 1
## 1 1 1 1 1 1 1 1 0 1 0
## 1 1 1 1 1 1 0 1 1 1 1
## 1 1 1 1 1 1 0 1 1 1 1
## 1 1 1 1 1 0 1 1 1 0 0
## 1 1 1 1 0 1 1 1 1 1 1
## 1 1 1 0 1 1 1 0 1 1 1
## 1 1 0 1 1 1 1 1 1 1 1
## 1 1 0 1 1 1 1 1 1 0 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 0 1 1 1 0 1 0 0 1 0
## 1 1 1 0 0 1 1 1 1 0 1
## 1 1 1 1 1 1 1 1 1 1 1
## 4 0 0 0 0 0 0 0 0 0 0
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 0 1 1 1 1 1 1
## 1 0 1 1 1 1 1 0 1 1 1
## 2 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 0 1 1 1 1
## 6 0 0 0 0 0 0 0 0 0 0
## 2 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 0 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 0 1 1 1 1 1 1 1
## 3 0 0 0 0 0 0 0 0 0 0
## 2 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 0 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 0 1 1 1 1 0 1 0 1
## 1 0 0 0 0 0 0 0 0 0 0
## 1 1 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 16 17 17 17 17 18 18 18 19 20
## race religion age
## 129 1 1 1 0
## 42 1 1 0 1
## 11 1 0 1 1
## 9 1 0 0 2
## 4 0 1 1 1
## 1 0 1 0 2
## 6 0 0 1 2
## 5 0 0 0 3
## 3 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 1 1 1 0 2
## 1 1 1 0 3
## 1 1 1 1 1
## 1 1 1 0 2
## 1 1 1 0 4
## 1 1 1 1 1
## 1 1 1 0 3
## 1 1 1 0 2
## 1 1 1 0 3
## 1 1 1 1 1
## 1 1 1 1 1
## 1 0 0 0 15
## 1 1 1 1 5
## 1 1 1 0 2
## 4 0 0 0 28
## 1 1 1 0 2
## 1 0 1 0 4
## 1 1 1 1 3
## 2 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 3
## 6 0 0 0 35
## 2 1 1 1 1
## 1 1 1 1 2
## 1 1 1 0 2
## 1 1 1 1 2
## 1 1 1 1 11
## 3 0 0 0 45
## 2 1 1 1 1
## 1 1 1 0 4
## 1 1 1 1 1
## 1 0 1 1 2
## 1 1 1 1 1
## 1 1 1 1 2
## 1 1 1 1 1
## 1 1 1 1 3
## 1 1 0 0 7
## 1 1 0 0 7
## 1 0 0 0 46
## 1 1 1 1 1
## 2 1 1 1 1
## 1 1 1 1 1
## 1 1 0 1 4
## 1 1 1 1 1
## 1 1 0 1 2
## 33 50 86 732
#BRS (5-point scale, range 1-5): reverse score items 2, 4, and 6
dat2 <- dat2 %>%
dplyr::mutate(BRS_2r = 6 - BRS_2) %>%
dplyr::mutate(BRS_4r = 6 - BRS_4) %>%
dplyr::mutate(BRS_6r = 6 - BRS_6) # corrected: the original code computed 6 - BRS_5 here; note that the BRS reliability output below was generated with that (incorrect) coding
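A quick sanity check on the reverse scoring (a sketch): each reversed item should correlate exactly -1 with its raw counterpart.
cor(dat2$BRS_2, dat2$BRS_2r, use = "pairwise.complete.obs")
cor(dat2$BRS_4, dat2$BRS_4r, use = "pairwise.complete.obs")
cor(dat2$BRS_6, dat2$BRS_6r, use = "pairwise.complete.obs")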
# Aggregate items into average subscales (STANDARDIZED)
#social support
SS <- c('SS_1','SS_2','SS_3','SS_4','SS_5','SS_6','SS_7','SS_8','SS_9','SS_10','SS_11','SS_12')
#perceived support network quality
PSNQ <- c('PSNQ_1', 'PSNQ_2', 'PSNQ_3', 'PSNQ_4', 'PSNQ_5', 'PSNQ_6', 'PSNQ_7', 'PSNQ_8', 'PSNQ_9', 'PSNQ_10')
#resilience
BRS <- c('BRS_1', 'BRS_2r', 'BRS_3', 'BRS_4r', 'BRS_5', 'BRS_6r')
#sense of belonging
SBS <- c('SBS_1', 'SBS_2', 'SBS_3', 'SBS_4', 'SBS_5', 'SBS_6', 'SBS_7')
#professional flourishing
PF <- c('PF_1','PF_2','PF_3','PF_4','PF_5','PF_6','PF_7','PF_8')
#career satisfaction
CS <- c('CS_1', 'CS_2','CS_3','CS_4','CS_5')
#organizational climate
OC <- c('OC_1', 'OC_2')
#create new variables #80% of items should be present to get the mean
dat2$SS_AVG <- mean_n(dat2[,SS], .80)
dat2$PSNQ_AVG <- mean_n(dat2[,PSNQ], .80)
dat2$BRS_AVG <- mean_n(dat2[,BRS], .80)
dat2$SBS_AVG <- mean_n(dat2[,SBS], .80)
dat2$PF_AVG <- mean_n(dat2[,PF], .80)
dat2$CS_AVG <- mean_n(dat2[,CS], .80)
dat2$OC_AVG <- mean_n(dat2[,OC], .80)
# Aggregate items into total sum scores (UNSTANDARDIZED)
# Note: rowSums() has no minimum-coverage argument; the .80 passed below is silently treated
# as na.rm = TRUE, so a total is returned even when items are missing (this is why the
# total-score missingness check further down reports 0% missing). A coverage-respecting
# alternative is sketched after this block.
dat2$SS_total <- rowSums(dat2[,SS], .80)
dat2$PSNQ_total <- rowSums(dat2[,PSNQ], .80)
dat2$BRS_total <- rowSums(dat2[,BRS], .80)
dat2$SBS_total <- rowSums(dat2[,SBS], .80)
dat2$PF_total <- rowSums(dat2[,PF], .80)
dat2$CS_total <- rowSums(dat2[,CS], .80)
dat2$OC_total <- rowSums(dat2[,OC], .80)
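As a hedged alternative, the sketch below defines a hypothetical helper, sum_with_coverage(), that applies the same 80% rule used for the mean scores to the sum scores: it sums the available items, prorates to the full item count, and returns NA when fewer than 80% of items were answered. It is an illustration, not the scoring rule used in this document.
sum_with_coverage <- function(items, min_prop = 0.80) {
  answered <- rowSums(!is.na(items))          # items answered per respondent
  totals   <- rowSums(items, na.rm = TRUE)    # sum of the answered items
  ifelse(answered / ncol(items) >= min_prop,
         totals * ncol(items) / answered,     # prorate for the few skipped items
         NA_real_)                            # too much missing: no total
}
# e.g., dat2$SS_total <- sum_with_coverage(dat2[, SS])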
#social support
psych::alpha(dat2[c('SS_1','SS_2','SS_3','SS_4','SS_5','SS_6','SS_7','SS_8','SS_9','SS_10','SS_11','SS_12')])
##
## Reliability analysis
## Call: psych::alpha(x = dat2[c("SS_1", "SS_2", "SS_3", "SS_4", "SS_5",
## "SS_6", "SS_7", "SS_8", "SS_9", "SS_10", "SS_11", "SS_12")])
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.92 0.92 0.94 0.49 11 0.0071 5.3 0.84 0.47
##
## 95% confidence boundaries
## lower alpha upper
## Feldt 0.9 0.92 0.93
## Duhachek 0.9 0.92 0.93
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## SS_1 0.91 0.91 0.94 0.49 11 0.0075 0.018 0.48
## SS_2 0.91 0.91 0.94 0.48 10 0.0078 0.018 0.48
## SS_3 0.91 0.91 0.94 0.49 10 0.0078 0.018 0.48
## SS_4 0.91 0.91 0.94 0.49 11 0.0076 0.016 0.48
## SS_5 0.91 0.91 0.94 0.48 10 0.0080 0.019 0.46
## SS_6 0.91 0.91 0.94 0.48 10 0.0079 0.020 0.46
## SS_7 0.91 0.91 0.94 0.48 10 0.0079 0.019 0.46
## SS_8 0.91 0.92 0.94 0.50 11 0.0073 0.015 0.48
## SS_9 0.91 0.91 0.94 0.48 10 0.0078 0.018 0.46
## SS_10 0.91 0.91 0.94 0.48 10 0.0078 0.018 0.46
## SS_11 0.91 0.91 0.94 0.48 10 0.0080 0.020 0.45
## SS_12 0.91 0.91 0.94 0.49 10 0.0077 0.017 0.46
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## SS_1 271 0.70 0.69 0.66 0.62 5.2 1.36
## SS_2 271 0.74 0.74 0.72 0.68 5.4 1.14
## SS_3 270 0.74 0.72 0.70 0.67 5.3 1.33
## SS_4 272 0.71 0.69 0.67 0.63 5.3 1.27
## SS_5 269 0.78 0.78 0.76 0.73 5.5 1.08
## SS_6 270 0.75 0.76 0.74 0.70 5.3 1.05
## SS_7 269 0.75 0.76 0.74 0.69 5.1 1.14
## SS_8 268 0.63 0.61 0.58 0.54 5.2 1.20
## SS_9 271 0.72 0.74 0.72 0.67 5.3 1.02
## SS_10 269 0.73 0.74 0.71 0.67 5.7 1.09
## SS_11 268 0.77 0.77 0.75 0.72 5.2 1.20
## SS_12 271 0.71 0.73 0.71 0.65 5.4 0.99
##
## Non missing response frequency for each item
## 1 2 3 4 5 6 7 miss
## SS_1 0.01 0.05 0.04 0.13 0.39 0.20 0.19 0.13
## SS_2 0.01 0.00 0.05 0.08 0.38 0.29 0.19 0.13
## SS_3 0.01 0.03 0.06 0.14 0.26 0.34 0.17 0.13
## SS_4 0.01 0.02 0.06 0.12 0.32 0.32 0.15 0.13
## SS_5 0.00 0.00 0.04 0.09 0.37 0.28 0.22 0.14
## SS_6 0.00 0.00 0.05 0.11 0.38 0.33 0.11 0.13
## SS_7 0.01 0.02 0.04 0.15 0.45 0.20 0.13 0.14
## SS_8 0.01 0.03 0.05 0.10 0.40 0.27 0.14 0.14
## SS_9 0.00 0.01 0.02 0.15 0.42 0.24 0.15 0.13
## SS_10 0.00 0.01 0.04 0.04 0.34 0.32 0.25 0.14
## SS_11 0.00 0.03 0.05 0.14 0.40 0.22 0.16 0.14
## SS_12 0.00 0.01 0.01 0.12 0.44 0.26 0.16 0.13
#perceived social network quality
psych::alpha(dat2[c('PSNQ_1', 'PSNQ_2', 'PSNQ_3', 'PSNQ_4', 'PSNQ_5', 'PSNQ_6', 'PSNQ_7', 'PSNQ_8', 'PSNQ_9', 'PSNQ_10')])
##
## Reliability analysis
## Call: psych::alpha(x = dat2[c("PSNQ_1", "PSNQ_2", "PSNQ_3", "PSNQ_4",
## "PSNQ_5", "PSNQ_6", "PSNQ_7", "PSNQ_8", "PSNQ_9", "PSNQ_10")])
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.92 0.92 0.93 0.54 12 0.007 5.6 0.84 0.55
##
## 95% confidence boundaries
## lower alpha upper
## Feldt 0.9 0.92 0.93
## Duhachek 0.9 0.92 0.93
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## PSNQ_1 0.91 0.92 0.92 0.55 10.9 0.0073 0.0120 0.56
## PSNQ_2 0.92 0.92 0.92 0.56 11.3 0.0070 0.0082 0.56
## PSNQ_3 0.91 0.91 0.93 0.53 10.3 0.0077 0.0151 0.56
## PSNQ_4 0.91 0.91 0.93 0.54 10.4 0.0077 0.0147 0.54
## PSNQ_5 0.91 0.91 0.92 0.52 9.8 0.0081 0.0131 0.54
## PSNQ_6 0.91 0.91 0.92 0.53 10.3 0.0078 0.0122 0.55
## PSNQ_7 0.91 0.91 0.92 0.53 10.0 0.0079 0.0130 0.55
## PSNQ_8 0.91 0.91 0.93 0.54 10.5 0.0076 0.0127 0.56
## PSNQ_9 0.91 0.91 0.92 0.53 10.2 0.0079 0.0131 0.55
## PSNQ_10 0.91 0.91 0.92 0.53 10.0 0.0080 0.0127 0.55
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## PSNQ_1 266 0.71 0.71 0.68 0.63 5.5 1.15
## PSNQ_2 266 0.66 0.66 0.63 0.57 5.7 1.09
## PSNQ_3 265 0.77 0.77 0.74 0.71 5.6 1.04
## PSNQ_4 265 0.76 0.76 0.71 0.69 5.6 1.23
## PSNQ_5 266 0.83 0.83 0.81 0.78 5.5 1.04
## PSNQ_6 266 0.78 0.77 0.75 0.71 5.4 1.21
## PSNQ_7 265 0.80 0.81 0.79 0.75 5.7 0.97
## PSNQ_8 263 0.74 0.74 0.71 0.68 5.7 0.98
## PSNQ_9 264 0.79 0.78 0.76 0.72 5.4 1.26
## PSNQ_10 266 0.80 0.80 0.78 0.75 5.6 1.04
##
## Non missing response frequency for each item
## 1 2 3 4 5 6 7 miss
## PSNQ_1 0.00 0.03 0.03 0.13 0.16 0.52 0.13 0.15
## PSNQ_2 0.00 0.02 0.03 0.05 0.27 0.41 0.22 0.15
## PSNQ_3 0.00 0.02 0.03 0.09 0.22 0.49 0.16 0.15
## PSNQ_4 0.00 0.06 0.01 0.06 0.19 0.48 0.20 0.15
## PSNQ_5 0.00 0.02 0.02 0.11 0.28 0.42 0.15 0.15
## PSNQ_6 0.01 0.02 0.06 0.07 0.26 0.43 0.15 0.15
## PSNQ_7 0.00 0.02 0.00 0.09 0.23 0.49 0.17 0.15
## PSNQ_8 0.01 0.00 0.02 0.07 0.22 0.50 0.18 0.16
## PSNQ_9 0.00 0.02 0.08 0.11 0.19 0.42 0.18 0.15
## PSNQ_10 0.00 0.01 0.04 0.09 0.18 0.53 0.15 0.15
#resilience
psych::alpha(dat2[c('BRS_1', 'BRS_2r', 'BRS_3', 'BRS_4r', 'BRS_5', 'BRS_6r')])
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in psych::alpha(dat2[c("BRS_1", "BRS_2r", "BRS_3", "BRS_4r", "BRS_5", : Some items were negatively correlated with the first principal component and probably
## should be reversed.
## To do this, run the function again with the 'check.keys=TRUE' option
## Some items ( BRS_2r BRS_4r BRS_6r ) were negatively correlated with the first principal component and
## probably should be reversed.
## To do this, run the function again with the 'check.keys=TRUE' option
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
##
## Reliability analysis
## Call: psych::alpha(x = dat2[c("BRS_1", "BRS_2r", "BRS_3", "BRS_4r",
## "BRS_5", "BRS_6r")])
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.19 0.031 0.28 0.0053 0.032 0.064 3.2 0.45 0.03
##
## 95% confidence boundaries
## lower alpha upper
## Feldt 0.04 0.19 0.32
## Duhachek 0.06 0.19 0.31
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## BRS_1 0.0026 -0.318 0.026 -0.051 -0.24 0.075 0.208 -0.014
## BRS_2r -0.2448 -0.394 -0.125 -0.060 -0.28 0.098 0.241 0.000
## BRS_3 0.1498 -0.115 0.154 -0.021 -0.10 0.062 0.216 0.035
## BRS_4r -0.1810 -0.330 -0.115 -0.052 -0.25 0.091 0.237 0.020
## BRS_5 0.2306 0.095 0.434 0.021 0.11 0.062 0.125 0.035
## BRS_6r 0.5282 0.548 0.656 0.195 1.21 0.044 0.083 0.075
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## BRS_1 256 0.57 0.64 0.711 0.267 3.9 0.90
## BRS_2r 255 0.74 0.68 0.840 0.412 2.7 1.15
## BRS_3 254 0.45 0.52 0.471 0.089 3.6 0.98
## BRS_4r 253 0.72 0.65 0.813 0.346 2.9 1.22
## BRS_5 255 0.30 0.35 -0.096 -0.021 3.7 0.85
## BRS_6r 255 -0.30 -0.35 -1.416 -0.541 2.3 0.85
##
## Non missing response frequency for each item
## 1 2 3 4 5 miss
## BRS_1 0.00 0.09 0.13 0.50 0.27 0.18
## BRS_2r 0.11 0.44 0.17 0.20 0.08 0.18
## BRS_3 0.02 0.13 0.26 0.41 0.18 0.19
## BRS_4r 0.14 0.29 0.21 0.26 0.10 0.19
## BRS_5 0.01 0.08 0.24 0.52 0.15 0.18
## BRS_6r 0.15 0.52 0.24 0.08 0.01 0.18
#sense of belonging
psych::alpha(dat2[c('SBS_1', 'SBS_2', 'SBS_3', 'SBS_4', 'SBS_5', 'SBS_6', 'SBS_7')])
##
## Reliability analysis
## Call: psych::alpha(x = dat2[c("SBS_1", "SBS_2", "SBS_3", "SBS_4", "SBS_5",
## "SBS_6", "SBS_7")])
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.88 0.88 0.88 0.51 7.2 0.011 3.7 0.68 0.49
##
## 95% confidence boundaries
## lower alpha upper
## Feldt 0.85 0.88 0.9
## Duhachek 0.86 0.88 0.9
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## SBS_1 0.86 0.86 0.85 0.50 6.0 0.013 0.0054 0.49
## SBS_2 0.85 0.86 0.85 0.50 5.9 0.013 0.0061 0.51
## SBS_3 0.86 0.86 0.86 0.51 6.3 0.012 0.0092 0.49
## SBS_4 0.86 0.86 0.85 0.50 6.1 0.012 0.0087 0.47
## SBS_5 0.86 0.86 0.86 0.51 6.3 0.012 0.0073 0.49
## SBS_6 0.86 0.86 0.86 0.51 6.2 0.012 0.0075 0.51
## SBS_7 0.86 0.86 0.86 0.51 6.2 0.012 0.0100 0.47
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## SBS_1 259 0.78 0.77 0.74 0.69 3.5 0.97
## SBS_2 261 0.79 0.78 0.75 0.70 3.4 0.94
## SBS_3 260 0.74 0.74 0.67 0.63 3.8 0.92
## SBS_4 260 0.76 0.77 0.72 0.67 3.9 0.81
## SBS_5 257 0.73 0.74 0.69 0.63 3.8 0.86
## SBS_6 261 0.76 0.75 0.70 0.65 3.6 0.92
## SBS_7 261 0.75 0.76 0.70 0.66 3.9 0.86
##
## Non missing response frequency for each item
## 1 2 3 4 5 miss
## SBS_1 0.01 0.16 0.31 0.38 0.14 0.17
## SBS_2 0.01 0.16 0.34 0.36 0.12 0.16
## SBS_3 0.00 0.07 0.30 0.37 0.26 0.17
## SBS_4 0.01 0.02 0.29 0.45 0.23 0.17
## SBS_5 0.00 0.07 0.30 0.42 0.21 0.18
## SBS_6 0.01 0.08 0.35 0.36 0.19 0.16
## SBS_7 0.00 0.08 0.15 0.54 0.22 0.16
#professional flourishing
psych::alpha(dat2[c('PF_1','PF_2','PF_3','PF_4','PF_5','PF_6','PF_7','PF_8')])
##
## Reliability analysis
## Call: psych::alpha(x = dat2[c("PF_1", "PF_2", "PF_3", "PF_4", "PF_5",
## "PF_6", "PF_7", "PF_8")])
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.94 0.94 0.93 0.65 15 0.0052 5.8 0.9 0.65
##
## 95% confidence boundaries
## lower alpha upper
## Feldt 0.93 0.94 0.95
## Duhachek 0.93 0.94 0.95
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## PF_1 0.92 0.92 0.92 0.63 12 0.0066 0.0037 0.62
## PF_2 0.93 0.93 0.92 0.64 13 0.0062 0.0044 0.64
## PF_3 0.93 0.93 0.93 0.67 14 0.0056 0.0032 0.67
## PF_4 0.93 0.93 0.92 0.65 13 0.0060 0.0053 0.64
## PF_5 0.93 0.93 0.93 0.66 14 0.0057 0.0041 0.66
## PF_6 0.93 0.93 0.93 0.66 13 0.0058 0.0041 0.64
## PF_7 0.92 0.92 0.92 0.64 12 0.0063 0.0040 0.64
## PF_8 0.93 0.93 0.93 0.66 14 0.0057 0.0049 0.67
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## PF_1 257 0.90 0.89 0.89 0.86 5.6 1.24
## PF_2 253 0.87 0.86 0.84 0.82 5.6 1.17
## PF_3 255 0.79 0.78 0.74 0.72 5.8 1.04
## PF_4 257 0.84 0.84 0.81 0.78 5.8 1.09
## PF_5 256 0.79 0.79 0.76 0.73 5.9 0.94
## PF_6 256 0.81 0.81 0.78 0.75 5.9 1.01
## PF_7 254 0.87 0.88 0.86 0.83 5.8 1.14
## PF_8 256 0.80 0.81 0.77 0.74 5.8 0.93
##
## Non missing response frequency for each item
## 1 2 3 4 5 6 7 miss
## PF_1 0 0.03 0.05 0.10 0.18 0.41 0.23 0.18
## PF_2 0 0.02 0.05 0.09 0.21 0.42 0.21 0.19
## PF_3 0 0.02 0.01 0.08 0.21 0.42 0.26 0.18
## PF_4 0 0.01 0.04 0.09 0.15 0.46 0.25 0.18
## PF_5 0 0.00 0.01 0.08 0.20 0.43 0.28 0.18
## PF_6 0 0.01 0.02 0.09 0.16 0.46 0.27 0.18
## PF_7 0 0.01 0.04 0.08 0.18 0.41 0.27 0.19
## PF_8 0 0.00 0.02 0.05 0.22 0.48 0.23 0.18
#career satisfaction
psych::alpha(dat2[c('CS_1', 'CS_2','CS_3','CS_4','CS_5')])
##
## Reliability analysis
## Call: psych::alpha(x = dat2[c("CS_1", "CS_2", "CS_3", "CS_4", "CS_5")])
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.85 0.85 0.83 0.54 5.8 0.013 4 0.72 0.54
##
## 95% confidence boundaries
## lower alpha upper
## Feldt 0.83 0.85 0.88
## Duhachek 0.83 0.85 0.88
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## CS_1 0.85 0.85 0.81 0.58 5.5 0.014 0.0020 0.56
## CS_2 0.81 0.81 0.77 0.52 4.3 0.017 0.0058 0.52
## CS_3 0.81 0.81 0.77 0.52 4.3 0.017 0.0022 0.54
## CS_4 0.82 0.82 0.78 0.53 4.5 0.017 0.0055 0.54
## CS_5 0.83 0.83 0.79 0.55 4.8 0.016 0.0065 0.54
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## CS_1 255 0.71 0.73 0.62 0.57 4.0 0.81
## CS_2 256 0.83 0.82 0.77 0.71 4.0 0.93
## CS_3 252 0.84 0.82 0.78 0.71 4.0 1.07
## CS_4 254 0.81 0.81 0.75 0.70 4.0 0.89
## CS_5 254 0.78 0.78 0.70 0.65 4.1 0.83
##
## Non missing response frequency for each item
## 1 2 3 4 5 miss
## CS_1 0.01 0.05 0.08 0.60 0.26 0.18
## CS_2 0.00 0.09 0.13 0.42 0.36 0.18
## CS_3 0.03 0.10 0.09 0.41 0.37 0.19
## CS_4 0.02 0.04 0.17 0.47 0.31 0.19
## CS_5 0.00 0.06 0.13 0.50 0.31 0.19
#organizational climate
psych::alpha(dat2[c('OC_1', 'OC_2')])
##
## Reliability analysis
## Call: psych::alpha(x = dat2[c("OC_1", "OC_2")])
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.54 0.54 0.37 0.37 1.2 0.051 3.6 0.96 0.37
##
## 95% confidence boundaries
## lower alpha upper
## Feldt 0.42 0.54 0.63
## Duhachek 0.44 0.54 0.64
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## OC_1 0.45 0.37 0.14 0.37 0.6 NA 0 0.37
## OC_2 0.31 0.37 0.14 0.37 0.6 NA 0 0.37
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## OC_1 280 0.86 0.83 0.51 0.37 3.9 1.3
## OC_2 278 0.79 0.83 0.51 0.37 3.3 1.0
##
## Non missing response frequency for each item
## 1 2 3 4 5 6 miss
## OC_1 0.07 0.00 0.26 0.40 0.12 0.14 0.10
## OC_2 0.06 0.15 0.28 0.41 0.10 0.00 0.11
dat2_missingness2 <-(dplyr::select (dat2, OC_AVG, SS_AVG:CS_AVG))
#In the script below we create a variable that counts the number of missing variables and then creates a proportion by dividing it by the number of total variables.
#Create a variable (n_miss) that counts the number missing
dat2_missingness2$n_miss <- dat2_missingness2%>%
dplyr::select(OC_AVG, SS_AVG:CS_AVG) %>%
is.na %>%
rowSums
#Create a proportion missing by dividing n_miss by the number of scale scores (the code divides by 6, although seven scale scores are selected; see the note and sketch after this chunk)
#Pipe to sort in order of descending frequency to get a sense of the missingness
dat2_missingness2<- dat2_missingness2%>%
dplyr::mutate(prop_miss = (n_miss/6)*100)%>%
arrange(desc(n_miss))
psych::describe(dat2_missingness2$prop_miss)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 312 18.22 39.49 0 8.27 0 0 116.67 116.67 1.85 1.61 2.24
Across the 312 cases scored, scale-level missingness ranged from 0% to 116.67%; values above 100% are an artifact of dividing by 6 when seven scale scores are counted (7/6 = 116.67%). A corrected denominator is sketched below.
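A sketch with the denominator derived from the data (seven scale scores), which keeps prop_miss within 0-100%; the reassignment is commented out because the summary above was produced with the hard-coded 6:
n_scales <- ncol(dplyr::select(dat2_missingness2, OC_AVG:CS_AVG))  # 7
# dat2_missingness2 <- dat2_missingness2 %>%
#   dplyr::mutate(prop_miss = (n_miss / n_scales) * 100)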
# Retain cases with no more than 20% missing scale scores (with either denominator, this keeps cases missing at most one of the seven scales)
fgpdata_scored <- dplyr::filter(dat2_missingness2, prop_miss <= 20)
# Select only the relevant columns (if necessary)
fgpdata_scored <- dplyr::select(fgpdata_scored, OC_AVG, SS_AVG:CS_AVG)
# Save the cleaned data to a CSV file
#write.csv(fgpdata_scored, file = "fgpdata_average_scores.csv", row.names = FALSE)
# Optionally, check the number of cases retained
num_cases_retained <- nrow(fgpdata_scored)
print(num_cases_retained)
## [1] 257
# Add a sequential ID variable to fgpdata_scored (used as the merge key later)
fgpdata_scored$ID <- 1:nrow(fgpdata_scored)
# View the dataset with the new ID variable
head(fgpdata_scored)
## # A tibble: 6 × 8
## OC_AVG SS_AVG PSNQ_AVG BRS_AVG SBS_AVG PF_AVG CS_AVG ID
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 3.5 4 NA 3 3.14 3.88 3.25 1
## 2 4 5.5 5.9 3.17 4.43 6 NA 2
## 3 NA 5.75 5.33 3 3.57 5.12 4.2 3
## 4 3 3.08 2.9 NA 2.57 3 2.4 4
## 5 NA 4 4.8 2.33 3.14 3.62 3.2 5
## 6 NA 4.82 5.6 3.33 3.86 6 4 6
After eliminating cases with greater than 20% missing, the dataset analyzed included 257 cases.
#percent missing across df
formattable::percent(mean(is.na(fgpdata_scored)))
## [1] 0.39%
#percent of rows with nonmissing data
formattable::percent(mean(complete.cases(fgpdata_scored)))
## [1] 96.89%
In this dataset, missing data at the scale level were minimal, with only 0.39% of values missing overall. Additionally, 96.89% of the rows had complete data at the scale level.
mice_ScaleLvl_fgp <- mice::md.pattern(fgpdata_scored, plot = TRUE, rotate.names=TRUE)
mice_ScaleLvl_fgp
## SS_AVG SBS_AVG PF_AVG ID PSNQ_AVG CS_AVG BRS_AVG OC_AVG
## 249 1 1 1 1 1 1 1 1 0
## 4 1 1 1 1 1 1 1 0 1
## 2 1 1 1 1 1 1 0 1 1
## 1 1 1 1 1 1 0 1 1 1
## 1 1 1 1 1 0 1 1 1 1
## 0 0 0 0 1 1 2 4 8
dat2_missingness3 <-(dplyr::select (dat2, OC_total, SS_total:CS_total))
#In the script below we create a variable that counts the number of missing variables and then creates a proportion by dividing it by the number of total variables.
#Create a variable (n_miss) that counts the number missing
dat2_missingness3$n_miss <- dat2_missingness3%>%
dplyr::select(OC_total, SS_total:CS_total) %>%
is.na %>%
rowSums
#Create a proportion missing by dividing n_miss by the number of total scores (again divided by 6, although seven totals are selected)
#Pipe to sort in order of descending frequency to get a sense of the missingness
dat2_missingness3<- dat2_missingness3%>%
dplyr::mutate(prop_miss = (n_miss/6)*100)%>%
arrange(desc(n_miss))
psych::describe(dat2_missingness3$prop_miss)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 312 0 0 0 0 0 0 0 0 NaN NaN 0
Across the 312 cases scored, the total scores show no missingness at all; as noted earlier, this is because rowSums() summed over whatever items were available (na.rm), returning a value even when items are missing.
# Retain cases with no more than 20% missing (all 312 cases pass, since no missingness was detected in the total scores)
fgpdata_scored2 <- dplyr::filter(dat2_missingness3, prop_miss <= 20)
# Select only the relevant columns (if necessary)
fgpdata_scored2 <- dplyr::select(fgpdata_scored2, OC_total, SS_total:CS_total)
# Save the cleaned data to a CSV file
#write.csv(fgpdata_scored2, file = "fgpdata_total_scores.csv", row.names = FALSE)
# Optionally, check the number of cases retained
num_cases_retained2 <- nrow(fgpdata_scored2)
print(num_cases_retained2)
## [1] 312
After applying the 20% missingness screen, the dataset was unchanged at 312 cases.
#percent missing across df
formattable::percent(mean(is.na(fgpdata_scored2)))
## [1] 0.00%
#percent of rows with nonmissing data
formattable::percent(mean(complete.cases(fgpdata_scored2)))
## [1] 100.00%
mice_ScaleLvl_fgp2 <- mice::md.pattern(fgpdata_scored2, plot = TRUE, rotate.names=TRUE)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
mice_ScaleLvl_fgp2
## OC_total SS_total PSNQ_total BRS_total SBS_total PF_total CS_total
## 312 1 1 1 1 1 1 1 0
## 0 0 0 0 0 0 0 0
fgpdata_Subscales <-(dplyr::select (fgpdata_scored, OC_AVG, SS_AVG:CS_AVG))
#skew and kurtosis
descriptives <- psych::describe(fgpdata_Subscales, type = 1)
descriptives
## vars n mean sd median trimmed mad min max range skew kurtosis
## OC_AVG 1 253 3.67 0.94 4.00 3.72 0.74 1.00 5.50 4.50 -0.59 0.41
## SS_AVG 2 257 5.36 0.84 5.27 5.36 0.83 2.50 7.00 4.50 -0.16 0.07
## PSNQ_AVG 3 256 5.60 0.84 5.70 5.64 0.65 2.50 7.00 4.50 -0.76 0.99
## BRS_AVG 4 255 3.19 0.44 3.17 3.17 0.25 1.67 4.33 2.66 0.45 0.82
## SBS_AVG 5 257 3.71 0.67 3.57 3.69 0.64 1.86 5.00 3.14 0.14 -0.55
## PF_AVG 6 257 5.76 0.90 6.00 5.83 0.74 2.88 7.00 4.12 -0.79 0.32
## CS_AVG 7 256 4.02 0.72 4.20 4.08 0.59 1.40 5.00 3.60 -0.81 0.32
## se
## OC_AVG 0.06
## SS_AVG 0.05
## PSNQ_AVG 0.05
## BRS_AVG 0.03
## SBS_AVG 0.04
## PF_AVG 0.06
## CS_AVG 0.05
All skew values fall below |3.0| and all kurtosis values fall below 10.0, within the thresholds recommended by Kline (2016).
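A quick programmatic check of those cutoffs against the describe() output above (a sketch using the descriptives object created above):
any(abs(descriptives$skew) >= 3, na.rm = TRUE)      # expect FALSE
any(abs(descriptives$kurtosis) >= 10, na.rm = TRUE) # expect FALSE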
#normality
shapiro.test(fgpdata_Subscales$SS_AVG)
##
## Shapiro-Wilk normality test
##
## data: fgpdata_Subscales$SS_AVG
## W = 0.98729, p-value = 0.0226
shapiro.test(fgpdata_Subscales$PSNQ_AVG)
##
## Shapiro-Wilk normality test
##
## data: fgpdata_Subscales$PSNQ_AVG
## W = 0.95562, p-value = 0.0000004639
shapiro.test(fgpdata_Subscales$BRS_AVG)
##
## Shapiro-Wilk normality test
##
## data: fgpdata_Subscales$BRS_AVG
## W = 0.93818, p-value = 0.000000007157
shapiro.test(fgpdata_Subscales$SBS_AVG)
##
## Shapiro-Wilk normality test
##
## data: fgpdata_Subscales$SBS_AVG
## W = 0.97696, p-value = 0.0003519
shapiro.test(fgpdata_Subscales$PF_AVG)
##
## Shapiro-Wilk normality test
##
## data: fgpdata_Subscales$PF_AVG
## W = 0.93998, p-value = 0.000000009607
shapiro.test(fgpdata_Subscales$CS_AVG)
##
## Shapiro-Wilk normality test
##
## data: fgpdata_Subscales$CS_AVG
## W = 0.93674, p-value = 0.000000004962
shapiro.test(fgpdata_Subscales$OC_AVG)
##
## Shapiro-Wilk normality test
##
## data: fgpdata_Subscales$OC_AVG
## W = 0.9488, p-value = 0.00000009337
A Shapiro-Wilk p-value below .05 indicates that a variable's distribution deviates significantly from a normal distribution. By that criterion, every scale score here departs from normality.
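The same tests can be run in one pass with purrr (already loaded with the tidyverse); a compact sketch that returns just the p-values:
fgpdata_Subscales %>%
  purrr::map(shapiro.test) %>%
  purrr::map_dbl("p.value")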
#histograms and scatterplot matrix (pairs.panels)
psych::pairs.panels(fgpdata_Subscales[c("OC_AVG", "SS_AVG", "PSNQ_AVG", "BRS_AVG", "SBS_AVG", "PF_AVG", "CS_AVG" )], stars = TRUE, lm = TRUE)
#Mahalanobis distance test
fgpdata_Subscales$Mahal <- psych::outlier(fgpdata_Subscales[c("OC_AVG", "SS_AVG", "PSNQ_AVG", "BRS_AVG", "SBS_AVG", "PF_AVG", "CS_AVG" )])
psych::describe(fgpdata_Subscales$Mahal)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 257 6.96 6.74 4.82 5.67 3.44 0.69 51.33 50.64 3 11.83 0.42
# create a variable indicating TRUE or FALSE for whether a case is a
# multivariate outlier (Mahalanobis distance more than 3 SD above the median)
fgpdata_scored$MOutlier <- dplyr::if_else(fgpdata_Subscales$Mahal > (median(fgpdata_Subscales$Mahal) +
(3 * sd(fgpdata_Subscales$Mahal))), TRUE, FALSE)
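For comparison, a common alternative cutoff (not the criterion used here) flags cases whose squared Mahalanobis distance exceeds the .999 quantile of a chi-square distribution with degrees of freedom equal to the number of variables; a sketch:
chisq_cutoff <- qchisq(0.999, df = 7)  # 7 scale scores
sum(fgpdata_Subscales$Mahal > chisq_cutoff, na.rm = TRUE)  # how many cases would be flagged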
# show the first 6 rows of the data so we can see the new
# MOutlier variable (the Mahal distances themselves live in fgpdata_Subscales)
head(fgpdata_scored)
## # A tibble: 6 × 9
## OC_AVG SS_AVG PSNQ_AVG BRS_AVG SBS_AVG PF_AVG CS_AVG ID MOutlier
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <lgl>
## 1 3.5 4 NA 3 3.14 3.88 3.25 1 FALSE
## 2 4 5.5 5.9 3.17 4.43 6 NA 2 FALSE
## 3 NA 5.75 5.33 3 3.57 5.12 4.2 3 FALSE
## 4 3 3.08 2.9 NA 2.57 3 2.4 4 FALSE
## 5 NA 4 4.8 2.33 3.14 3.62 3.2 5 FALSE
## 6 NA 4.82 5.6 3.33 3.86 6 4 6 FALSE
# Count the number of outliers and non-outliers
OutlierCount <- fgpdata_scored %>%
dplyr::count(MOutlier)
# Number of outliers
num_outliers <- OutlierCount %>% filter(MOutlier == TRUE) %>% pull(n)
# Number of non-outliers
num_non_outliers <- OutlierCount %>% filter(MOutlier == FALSE) %>% pull(n)
# Alternatively, calculate directly
num_outliers <- sum(fgpdata_scored$MOutlier == TRUE)
num_non_outliers <- sum(fgpdata_scored$MOutlier == FALSE)
# Print the results
cat("Number of outliers:", num_outliers, "\n")
## Number of outliers: 8
cat("Number of non-outliers:", num_non_outliers, "\n")
## Number of non-outliers: 249
At this stage a visual inspection of the data was also conducted. Removing the 8 cases flagged as multivariate outliers leaves 249 observations, as shown below.
# Create a new dataframe with only the non-outliers (assuming non-outliers are marked as FALSE in the MOutlier column)
non_outliers_df <- fgpdata_scored %>%
dplyr::filter(MOutlier == FALSE)
# View the first few rows of the non-outliers dataframe
head(non_outliers_df)
## # A tibble: 6 × 9
## OC_AVG SS_AVG PSNQ_AVG BRS_AVG SBS_AVG PF_AVG CS_AVG ID MOutlier
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <lgl>
## 1 3.5 4 NA 3 3.14 3.88 3.25 1 FALSE
## 2 4 5.5 5.9 3.17 4.43 6 NA 2 FALSE
## 3 NA 5.75 5.33 3 3.57 5.12 4.2 3 FALSE
## 4 3 3.08 2.9 NA 2.57 3 2.4 4 FALSE
## 5 NA 4 4.8 2.33 3.14 3.62 3.2 5 FALSE
## 6 NA 4.82 5.6 3.33 3.86 6 4 6 FALSE
# Optionally, check the number of rows to confirm it has 249 observations
cat("Number of non-outliers:", nrow(non_outliers_df), "\n")
## Number of non-outliers: 249
# Count the number of outliers and non-outliers
OutlierCount <- non_outliers_df %>%
dplyr::count(MOutlier)
# Number of outliers
num_outliers <- OutlierCount %>% filter(MOutlier == TRUE) %>% pull(n)
# Number of non-outliers
num_non_outliers <- OutlierCount %>% filter(MOutlier == FALSE) %>% pull(n)
# Alternatively, calculate directly
num_outliers <- sum(non_outliers_df$MOutlier == TRUE)
num_non_outliers <- sum(non_outliers_df$MOutlier == FALSE)
# Print the results
cat("Number of outliers:", num_outliers, "\n")
## Number of outliers: 0
cat("Number of non-outliers:", num_non_outliers, "\n")
## Number of non-outliers: 249
# Assuming both data frames have a common key, such as "ID"
merged_df <- merge(non_outliers_df, dat2, by = "ID")
# Select only the desired columns
fgpdata_dem_avg <- merged_df[, c(
"OC_AVG.x", "SBS_AVG.x", "SS_AVG.x", "BRS_AVG.x", "PSNQ_AVG.x", "PF_AVG.x", "CS_AVG.x",
"employment_status", "employment_status_TEXT", "PD", "SS_13_TEXT", "age", "year_education",
"education", "education_9_TEXT", "religion", "religion_15_TEXT", "income",
"fam_income", "race", "race_10_TEXT", "US_born", "sexual_orientation",
"sexual_orientation_9_TEXT", "gender", "gender_8_TEXT"
)]
names(fgpdata_dem_avg) <- c(
"OC_AVG", "SBS_AVG", "SS_AVG", "BRS_AVG", "PSNQ_AVG", "PF_AVG", "CS_AVG",
"employment_status", "employment_status_TEXT", "PD", "SS_13_TEXT", "age",
"year_education", "education", "education_9_TEXT", "religion",
"religion_15_TEXT", "income", "fam_income", "race", "race_10_TEXT",
"US_born", "sexual_orientation", "sexual_orientation_9_TEXT",
"gender", "gender_8_TEXT"
)
write.csv(fgpdata_dem_avg, "fgpdata_dem_avg.csv")