# Clearing the global environment so the analysis starts from a clean slate
rm(list = ls())
# Forcing R to leave character data as plain text
options(stringsAsFactors = FALSE)
# Installing pacman only when it is not already available.
# The earlier tryCatch(..., finally = ...) form re-installed pacman on every
# run, because a `finally` expression is evaluated whether or not an error
# occurred. requireNamespace() checks for the package without attaching it.
if (!requireNamespace("pacman", quietly = TRUE)) {
  utils::install.packages(pkgs = "pacman", repos = "https://cran.r-project.org")
}
## Loading required package: pacman
##
## The downloaded binary packages are in
## /var/folders/5_/ktgmwhhn3t5bc9m5z20plsww0000gn/T//RtmpXyLePH/downloaded_packages
# Attach pacman; library() stops with an error if the package is missing,
# whereas require() would only return FALSE and let the script carry on.
library(pacman)
# Loading all libraries & toolboxes (p_load installs any that are missing)
pacman::p_load(
  Hmisc,
  checkmate,
  corrr,
  conflicted,
  readxl,
  dplyr,
  tidyr,
  ggplot2,
  knitr,
  evaluate,
  iopsych,
  psych,
  quantreg,
  lavaan,
  xtable,
  reshape2,
  GPArotation,
  Amelia,
  expss,
  multilevel,
  janitor,
  mice,
  lmtest,
  naniar,
  tidylog,
  MVN,
  haven,
  openxlsx
)
# Settle function-name conflicts up front: every function exported by
# 'tidylog' is preferred over same-named functions from other packages ...
for (f in getNamespaceExports("tidylog")) {
  conflicted::conflict_prefer(name = f, winner = "tidylog", quiet = TRUE)
}
# ... except filter() and select(), where the 'dplyr' versions are preferred
conflicted::conflict_prefer(name = "filter", winner = "dplyr", quiet = TRUE)
conflicted::conflict_prefer(name = "select", winner = "dplyr", quiet = TRUE)
# Read the SAQ data from the SPSS file into the data frame `df`
df <- haven::read_spss(
  file = "/Users/bglinsky/Desktop/1 - Coursework/Graduate Coursework/UGA/4 - Summer 2026/Advanced Analytics (PSYC 6841)/Data/SAQ.sav"
)
# Show the structure of df (column types, variable labels, value labels)
utils::str(df)
## tibble [2,571 × 31] (S3: tbl_df/tbl/data.frame)
## $ Question_01: dbl+lbl [1:2571] 2, 1, 2, 3, 2, 2, 2, 2, 3, 2, 2, 2, 3, 2, 2, 3, 1, 2,...
## ..@ label : chr "Statistics makes me cry"
## ..@ format.spss: chr "F1.0"
## ..@ labels : Named num [1:6] 1 2 3 4 5 9
## .. ..- attr(*, "names")= chr [1:6] "Strongly agree" "Agree" "Neither" "Disagree" ...
## $ Question_02: dbl+lbl [1:2571] 1, 1, 3, 1, 1, 1, 3, 2, 3, 4, 1, 1, 1, 2, 2, 1, 2, 2,...
## ..@ label : chr "My friends will think I'm stupid for not being able to cope with SPSS"
## ..@ format.spss: chr "F1.0"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Strongly agree" "Agree" "Neither" "Disagree" ...
## $ Question_03: dbl+lbl [1:2571] 4, 4, 2, 1, 3, 3, 3, 3, 1, 4, 5, 3, 3, 1, 3, 2, 5, 3,...
## ..@ label : chr "Standard deviations excite me"
## ..@ format.spss: chr "F1.0"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Strongly agree" "Agree" "Neither" "Disagree" ...
## $ Question_04: dbl+lbl [1:2571] 2, 3, 2, 4, 2, 2, 2, 2, 4, 3, 2, 3, 4, 2, 4, 2, 2, 3,...
## ..@ label : chr "I dream that Pearson is attacking me with correlation coefficients"
## ..@ format.spss: chr "F1.0"
## ..@ labels : Named num [1:6] 1 2 3 4 5 9
## .. ..- attr(*, "names")= chr [1:6] "Strongly agree" "Agree" "Neither" "Disagree" ...
## $ Question_05: dbl+lbl [1:2571] 2, 2, 4, 3, 2, 4, 2, 2, 5, 2, 2, 4, 3, 2, 2, 2, 1, 3,...
## ..@ label : chr "I don't understand statistics"
## ..@ format.spss: chr "F1.0"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Strongly agree" "Agree" "Neither" "Disagree" ...
## $ Question_06: dbl+lbl [1:2571] 2, 2, 1, 3, 3, 4, 2, 2, 3, 1, 1, 3, 2, 2, 2, 2, 1, 4,...
## ..@ label : chr "I have little experience of computers"
## ..@ format.spss: chr "F1.0"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Strongly agree" "Agree" "Neither" "Disagree" ...
## $ Question_07: dbl+lbl [1:2571] 3, 2, 2, 4, 3, 4, 2, 2, 5, 2, 2, 3, 3, 3, 3, 2, 1, 3,...
## ..@ label : chr "All computers hate me"
## ..@ format.spss: chr "F1.0"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Strongly agree" "Agree" "Neither" "Disagree" ...
## $ Question_08: dbl+lbl [1:2571] 1, 2, 2, 2, 2, 2, 2, 2, 5, 2, 2, 1, 3, 2, 2, 2, 1, 2,...
## ..@ label : chr "I have never been good at mathematics"
## ..@ format.spss: chr "F1.0"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Strongly agree" "Agree" "Neither" "Disagree" ...
## $ Question_09: dbl+lbl [1:2571] 1, 5, 2, 2, 4, 4, 3, 4, 3, 3, 5, 3, 2, 2, 2, 2, 4, 5,...
## ..@ label : chr "My friends are better at statistics than me"
## ..@ format.spss: chr "F1.0"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Strongly agree" "Agree" "Neither" "Disagree" ...
## $ Question_10: dbl+lbl [1:2571] 2, 2, 2, 4, 2, 3, 2, 2, 3, 2, 2, 2, 3, 3, 3, 3, 1, 2,...
## ..@ label : chr "Computers are useful only for playing games"
## ..@ format.spss: chr "F1.0"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Strongly agree" "Agree" "Neither" "Disagree" ...
## $ Question_11: dbl+lbl [1:2571] 1, 2, 3, 2, 2, 2, 2, 2, 5, 2, 1, 2, 3, 2, 2, 2, 1, 3,...
## ..@ label : chr "I did badly at mathematics at school"
## ..@ format.spss: chr "F1.0"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Strongly agree" "Agree" "Neither" "Disagree" ...
## $ Question_12: dbl+lbl [1:2571] 2, 3, 3, 2, 3, 4, 2, 3, 5, 3, 3, 3, 4, 4, 3, 3, 2, 3,...
## ..@ label : chr "People try to tell you that SPSS makes statistics easier to understand but it doesn't"
## ..@ format.spss: chr "F1.0"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Strongly agree" "Agree" "Neither" "Disagree" ...
## $ Question_13: dbl+lbl [1:2571] 2, 1, 2, 2, 3, 3, 2, 2, 5, 2, 1, 2, 4, 2, 2, 2, 1, 3,...
## ..@ label : chr "I worry that I will cause irreparable damage because of my incompetence with computers"
## ..@ format.spss: chr "F1.0"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Strongly agree" "Agree" "Neither" "Disagree" ...
## $ Question_14: dbl+lbl [1:2571] 2, 3, 4, 3, 2, 3, 2, 2, 5, 1, 2, 2, 4, 4, 3, 3, 1, 3,...
## ..@ label : chr "Computers have minds of their own and deliberately go wrong whenever I use them"
## ..@ format.spss: chr "F1.0"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Strongly agree" "Agree" "Neither" "Disagree" ...
## $ Question_15: dbl+lbl [1:2571] 2, 4, 2, 3, 2, 5, 2, 3, 5, 2, 1, 3, 4, 4, 3, 2, 1, 4,...
## ..@ label : chr "Computers are out to get me"
## ..@ format.spss: chr "F1.0"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Strongly agree" "Agree" "Neither" "Disagree" ...
## $ Question_16: dbl+lbl [1:2571] 3, 3, 3, 3, 2, 2, 2, 2, 5, 3, 2, 3, 4, 4, 4, 3, 2, 3,...
## ..@ label : chr "I weep openly at the mention of central tendency"
## ..@ format.spss: chr "F1.0"
## ..@ labels : Named num [1:6] 1 2 3 4 5 9
## .. ..- attr(*, "names")= chr [1:6] "Strongly agree" "Agree" "Neither" "Disagree" ...
## $ Question_17: dbl+lbl [1:2571] 1, 2, 2, 2, 2, 3, 2, 2, 5, 2, 2, 2, 3, 2, 2, 2, 2, 2,...
## ..@ label : chr "I slip into a coma whenever I see an equation"
## ..@ format.spss: chr "F1.0"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Strongly agree" "Agree" "Neither" "Disagree" ...
## $ Question_18: dbl+lbl [1:2571] 2, 2, 3, 4, 3, 5, 2, 2, 5, 2, 2, 2, 3, 4, 3, 3, 1, 2,...
## ..@ label : chr "SPSS always crashes when I try to use it"
## ..@ format.spss: chr "F1.0"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Strongly agree" "Agree" "Neither" "Disagree" ...
## $ Question_19: dbl+lbl [1:2571] 3, 3, 1, 2, 3, 1, 3, 4, 2, 3, 5, 3, 2, 1, 3, 2, 4, 2,...
## ..@ label : chr "Everybody looks at me when I use SPSS"
## ..@ format.spss: chr "F1.0"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Strongly agree" "Agree" "Neither" "Disagree" ...
## $ Question_20: dbl+lbl [1:2571] 2, 4, 4, 4, 4, 5, 2, 3, 5, 3, 3, 4, 4, 5, 4, 3, 2, 3,...
## ..@ label : chr "I can't sleep for thoughts of eigenvectors"
## ..@ format.spss: chr "F1.0"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Strongly agree" "Agree" "Neither" "Disagree" ...
## $ Question_21: dbl+lbl [1:2571] 2, 4, 3, 4, 2, 3, 2, 2, 5, 2, 2, 3, 4, 5, 4, 2, 1, 3,...
## ..@ label : chr "I wake up under my duvet thinking that I am trapped under a normal distribution"
## ..@ format.spss: chr "F1.0"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Strongly agree" "Agree" "Neither" "Disagree" ...
## $ Question_22: dbl+lbl [1:2571] 2, 4, 2, 4, 4, 1, 4, 4, 3, 4, 5, 4, 3, 3, 4, 3, 4, 3,...
## ..@ label : chr "My friends are better at SPSS than I am"
## ..@ format.spss: chr "F1.0"
## ..@ labels : Named num [1:6] 1 2 3 4 5 9
## .. ..- attr(*, "names")= chr [1:6] "Strongly agree" "Agree" "Neither" "Disagree" ...
## $ Question_23: dbl+lbl [1:2571] 5, 2, 2, 3, 4, 4, 4, 4, 3, 4, 5, 4, 4, 1, 4, 4, 4, 4,...
## ..@ label : chr "If I'm good at statistics my friends will think I'm a nerd"
## ..@ format.spss: chr "F1.0"
## ..@ labels : Named num [1:5] 1 2 3 4 5
## .. ..- attr(*, "names")= chr [1:5] "Strongly agree" "Agree" "Neither" "Disagree" ...
## $ FAC1_1 : num [1:2571] -1.1297 -0.0448 0.1562 0.7937 -0.9825 ...
## ..- attr(*, "label")= chr "A-R factor score 1 for analysis 1"
## ..- attr(*, "format.spss")= chr "F11.5"
## ..- attr(*, "display_width")= int 13
## $ FAC2_1 : num [1:2571] 0.0509 -0.4774 -0.7224 0.6118 0.6628 ...
## ..- attr(*, "label")= chr "A-R factor score 2 for analysis 1"
## ..- attr(*, "format.spss")= chr "F11.5"
## ..- attr(*, "display_width")= int 13
## $ FAC3_1 : num [1:2571] -1.586 -0.221 0.083 -0.793 -0.358 ...
## ..- attr(*, "label")= chr "A-R factor score 3 for analysis 1"
## ..- attr(*, "format.spss")= chr "F11.5"
## ..- attr(*, "display_width")= int 13
## $ FAC4_1 : num [1:2571] -0.552 0.641 -0.909 -0.318 0.548 ...
## ..- attr(*, "label")= chr "A-R factor score 4 for analysis 1"
## ..- attr(*, "format.spss")= chr "F11.5"
## ..- attr(*, "display_width")= int 13
## $ FAC1_2 : num [1:2571] -0.9452 -0.0171 0.2749 0.8158 -1.0391 ...
## ..- attr(*, "label")= chr "A-R factor score 1 for analysis 2"
## ..- attr(*, "format.spss")= chr "F11.5"
## ..- attr(*, "display_width")= int 13
## $ FAC2_2 : num [1:2571] -0.618 0.61 -0.934 -0.227 0.516 ...
## ..- attr(*, "label")= chr "A-R factor score 2 for analysis 2"
## ..- attr(*, "format.spss")= chr "F11.5"
## ..- attr(*, "display_width")= int 13
## $ FAC3_2 : num [1:2571] 0.0378 -0.5085 -0.6518 0.7289 0.5483 ...
## ..- attr(*, "label")= chr "A-R factor score 3 for analysis 2"
## ..- attr(*, "format.spss")= chr "F11.5"
## ..- attr(*, "display_width")= int 13
## $ FAC4_2 : num [1:2571] 1.6807 0.2401 -0.0804 0.6975 0.4316 ...
## ..- attr(*, "label")= chr "A-R factor score 4 for analysis 2"
## ..- attr(*, "format.spss")= chr "F11.5"
## ..- attr(*, "display_width")= int 13
# Helper: percentage (0-100) of the elements of `x` that are NA
percentmissing <- function(x) {
  n_missing <- sum(is.na(x))
  n_missing / length(x) * 100
}
# Percentage of missing answers per participant
# (MARGIN = 1 applies the helper across rows; 2 would be columns)
missing <- apply(X = df, MARGIN = 1, FUN = percentmissing)
# Frequency table of the per-person percentages, rounded to one decimal
table(round(missing, digits = 1))
##
## 0
## 2571
# Keep only participants who are missing no more than 5% of their answers
replace_people <- subset(x = df, subset = missing <= 5)
# Fix the random seed so the imputation below is reproducible
set.seed(2026)
# Impute the remaining missing data points with the mice package
temp_nomiss <- mice::mice(data = replace_people)
##
## iter imp variable
## 1 1
## 1 2
## 1 3
## 1 4
## 1 5
## 2 1
## 2 2
## 2 3
## 2 4
## 2 5
## 3 1
## 3 2
## 3 3
## 3 4
## 3 5
## 4 1
## 4 2
## 4 3
## 4 4
## 4 5
## 5 1
## 5 2
## 5 3
## 5 4
## 5 5
# Extract the first completed (fully imputed) data set from the mice result
nomiss <- mice::complete(temp_nomiss, action = 1)
# Descriptive summary of every column in the completed data
summary(nomiss)
## Question_01 Question_02 Question_03 Question_04
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :1.000 Median :3.000 Median :3.000
## Mean :2.374 Mean :1.623 Mean :2.585 Mean :2.786
## 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Question_05 Question_06 Question_07 Question_08
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:2.000
## Median :3.000 Median :2.000 Median :3.000 Median :2.000
## Mean :2.722 Mean :2.227 Mean :2.924 Mean :2.237
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:3.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Question_09 Question_10 Question_11 Question_12
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:3.000
## Median :3.000 Median :2.000 Median :2.000 Median :3.000
## Mean :2.846 Mean :2.281 Mean :2.255 Mean :3.159
## 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Question_13 Question_14 Question_15 Question_16
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :3.000 Median :3.000 Median :3.000
## Mean :2.449 Mean :2.876 Mean :2.766 Mean :2.879
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Question_17 Question_18 Question_19 Question_20
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:3.000
## Median :2.000 Median :2.000 Median :2.000 Median :4.000
## Mean :2.467 Mean :2.569 Mean :2.292 Mean :3.624
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Question_21 Question_22 Question_23 FAC1_1
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :-3.10373
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:-0.69008
## Median :3.000 Median :3.000 Median :4.000 Median :-0.07904
## Mean :3.171 Mean :2.888 Mean :3.434 Mean : 0.00000
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.: 0.65247
## Max. :5.000 Max. :5.000 Max. :5.000 Max. : 4.60083
## FAC2_1 FAC3_1 FAC4_1 FAC1_2
## Min. :-2.5530 Min. :-3.0874 Min. :-3.2316454 Min. :-3.17112
## 1st Qu.:-0.6801 1st Qu.:-0.5885 1st Qu.:-0.7113711 1st Qu.:-0.69145
## Median :-0.1712 Median :-0.1170 Median : 0.0008567 Median :-0.06044
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000000 Mean : 0.00000
## 3rd Qu.: 0.5419 3rd Qu.: 0.4734 3rd Qu.: 0.6787526 3rd Qu.: 0.64106
## Max. : 3.7057 Max. : 4.3461 Max. : 3.4333878 Max. : 4.48099
## FAC2_2 FAC3_2 FAC4_2
## Min. :-3.310160 Min. :-2.3033 Min. :-4.1028
## 1st Qu.:-0.703546 1st Qu.:-0.6682 1st Qu.:-0.5066
## Median :-0.005424 Median :-0.1888 Median : 0.1579
## Mean : 0.000000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.668124 3rd Qu.: 0.5281 3rd Qu.: 0.5814
## Max. : 3.458260 Max. : 3.6224 Max. : 2.8292
# Restrict to the actual questionnaire items (columns 1 to 23) so that the
# saved factor-score columns are left out of the outlier screening
saq_items_only <- nomiss[, 1:23]
# Chi-square cutoff at p = .999 with df equal to the number of items
cutoff <- qchisq(p = .999, df = ncol(saq_items_only))
# Mahalanobis distance of each person's responses from the item means,
# scaled by the item covariance matrix
mahal <- mahalanobis(
  x = saq_items_only,
  center = colMeans(saq_items_only),
  cov = cov(saq_items_only)
)
# Display the cutoff value to inspect the threshold being applied
print(cutoff)
## [1] 49.72823
# Keep only participants whose Mahalanobis distance is below the cutoff;
# tidylog's filter() reports how many rows were removed
noout <- tidylog::filter(nomiss, mahal < cutoff)
## filter: removed 97 rows (4%), 2,474 rows remaining
Question 1a:
To identify multivariate outliers, I computed a Mahalanobis distance for each participant across all 23 questionnaire items. This distance tells us how far each person’s response pattern is from the average pattern. Each participant’s distance was compared against a chi-square cutoff of 49.73 (alpha = .001, df = 23). Based on this criterion, 97 rows (4% of the sample) exceeded the cutoff and were removed from the dataset, leaving 2,474 participants for future analyses.
# Reverse-scoring key: -1 = reverse the item, 1 = keep it as is.
# Only Question 3 is kept; the other 22 items are reversed.
keys <- replace(rep(-1, 23), 3, 1)
# Apply the reversal on the 1-5 response scale and overwrite df with the result
df <- as.data.frame(
  psych::reverse.code(keys = keys, items = noout[, 1:23], mini = 1, maxi = 5)
)
# Correlate all 23 items; with pairwise deletion, each correlation uses every
# participant who has data on both items in the pair
correl <- cor(df[, 1:23], use = "pairwise.complete.obs")
# Show the matrix with symbols in place of numbers so it is easier to scan
symnum(correl)
## Q_01 Q_02 Q_03 Q_04 Q_05 Q_06 Q_07 Q_08 Q_09 Q_10 Q_11 Q_12 Q_13
## Question_01- 1
## Question_02- 1
## Question_03 . . 1
## Question_04- . . 1
## Question_05- . . . 1
## Question_06- 1
## Question_07- . . . . . 1
## Question_08- . . . 1
## Question_09- . . 1
## Question_10- . 1
## Question_11- . . . . . . , 1
## Question_12- . . . . . . . 1
## Question_13- . . . . . . . . . . 1
## Question_14- . . . . . . . . .
## Question_15- . . . . . . . . .
## Question_16- . . . . . . . . .
## Question_17- . . . . . , , . .
## Question_18- . . . . . . . . . .
## Question_19- .
## Question_20- . .
## Question_21- . . . . . . . . .
## Question_22-
## Question_23-
## Q_14 Q_15 Q_16 Q_17 Q_18 Q_19 Q_20 Q_21 Q_22 Q_23
## Question_01-
## Question_02-
## Question_03
## Question_04-
## Question_05-
## Question_06-
## Question_07-
## Question_08-
## Question_09-
## Question_10-
## Question_11-
## Question_12-
## Question_13-
## Question_14- 1
## Question_15- . 1
## Question_16- . . 1
## Question_17- . . . 1
## Question_18- . . . . 1
## Question_19- 1
## Question_20- 1
## Question_21- . . . . . . 1
## Question_22- 1
## Question_23- 1
## attr(,"legend")
## [1] 0 ' ' 0.3 '.' 0.6 ',' 0.8 '+' 0.9 '*' 0.95 'B' 1
# Print the full numeric correlation matrix
print(correl)
## Question_01- Question_02- Question_03 Question_04- Question_05-
## Question_01- 1.00000000 -0.09564516 0.3394120 0.44356524 0.40683612
## Question_02- -0.09564516 1.00000000 -0.3403275 -0.11535211 -0.12750749
## Question_03 0.33941198 -0.34032755 1.0000000 0.38501916 0.31188888
## Question_04- 0.44356524 -0.11535211 0.3850192 1.00000000 0.41311634
## Question_05- 0.40683612 -0.12750749 0.3118889 0.41311634 1.00000000
## Question_06- 0.21642795 -0.09409007 0.2364366 0.27450232 0.25262203
## Question_07- 0.31830059 -0.15146503 0.3888337 0.41073259 0.35683807
## Question_08- 0.35970427 -0.05150263 0.2718204 0.35532444 0.28485684
## Question_09- -0.09735681 0.34107119 -0.3121586 -0.13853293 -0.10449784
## Question_10- 0.20718614 -0.09690482 0.2000051 0.21945052 0.24494916
## Question_11- 0.37639745 -0.14986851 0.3593173 0.37921625 0.30764567
## Question_12- 0.36296740 -0.20647399 0.4213259 0.44652908 0.35086220
## Question_13- 0.36393293 -0.16223708 0.3365851 0.34860223 0.30253499
## Question_14- 0.35652805 -0.18215985 0.3762200 0.35739840 0.32248586
## Question_15- 0.25226994 -0.17019220 0.3178184 0.34153652 0.26939559
## Question_16- 0.49696876 -0.17745629 0.4208369 0.42290818 0.40795250
## Question_17- 0.38340729 -0.09727568 0.3311392 0.37644360 0.32028257
## Question_18- 0.37623424 -0.16729714 0.3976746 0.39166256 0.34466207
## Question_19- -0.19273206 0.22571071 -0.3581167 -0.20759579 -0.18525677
## Question_20- 0.22712216 -0.21523224 0.3266163 0.25872701 0.21311727
## Question_21- 0.34426480 -0.21391337 0.4336355 0.41958580 0.36194548
## Question_22- -0.10457188 0.25028516 -0.2155666 -0.12649321 -0.14277734
## Question_23- -0.01544328 0.10567285 -0.1583330 -0.04247334 -0.05693478
## Question_06- Question_07- Question_08- Question_09- Question_10-
## Question_01- 0.21642795 0.31830059 0.359704271 -0.097356807 0.20718614
## Question_02- -0.09409007 -0.15146503 -0.051502627 0.341071191 -0.09690482
## Question_03 0.23643658 0.38883370 0.271820371 -0.312158588 0.20000510
## Question_04- 0.27450232 0.41073259 0.355324438 -0.138532934 0.21945052
## Question_05- 0.25262203 0.35683807 0.284856845 -0.104497841 0.24494916
## Question_06- 1.00000000 0.53363050 0.235239319 -0.134138078 0.32562494
## Question_07- 0.53363050 1.00000000 0.311551945 -0.137951567 0.28570275
## Question_08- 0.23523932 0.31155195 1.000000000 0.004414116 0.17208292
## Question_09- -0.13413808 -0.13795157 0.004414116 1.000000000 -0.14437725
## Question_10- 0.32562494 0.28570275 0.172082923 -0.144377253 1.00000000
## Question_11- 0.34720392 0.36394420 0.645561669 -0.125660682 0.26278685
## Question_12- 0.32463303 0.42835549 0.260041296 -0.188030097 0.25545078
## Question_13- 0.48119806 0.46913918 0.335912534 -0.191141492 0.30106099
## Question_14- 0.41067316 0.45132805 0.295700965 -0.140583541 0.26148441
## Question_15- 0.37132397 0.39896365 0.312609295 -0.209655036 0.30124219
## Question_16- 0.26157255 0.39026576 0.341856316 -0.196389345 0.28888431
## Question_17- 0.29033521 0.40050472 0.607264448 -0.060008831 0.22004286
## Question_18- 0.53564484 0.51924098 0.296357802 -0.166331510 0.32773688
## Question_19- -0.19434313 -0.28179738 -0.182826882 0.257208925 -0.13302464
## Question_20- 0.09960499 0.22221657 0.200285197 -0.168290274 0.09134585
## Question_21- 0.28028089 0.48323678 0.309998901 -0.151430462 0.20332338
## Question_22- -0.18495653 -0.18544749 -0.105454105 0.285347499 -0.12178841
## Question_23- -0.07307044 -0.06563211 -0.050427999 0.165983438 -0.06049506
## Question_11- Question_12- Question_13- Question_14- Question_15-
## Question_01- 0.3763974 0.36296740 0.36393293 0.3565280 0.25226994
## Question_02- -0.1498685 -0.20647399 -0.16223708 -0.1821598 -0.17019220
## Question_03 0.3593173 0.42132586 0.33658510 0.3762200 0.31781841
## Question_04- 0.3792163 0.44652908 0.34860223 0.3573984 0.34153652
## Question_05- 0.3076457 0.35086220 0.30253499 0.3224859 0.26939559
## Question_06- 0.3472039 0.32463303 0.48119806 0.4106732 0.37132397
## Question_07- 0.3639442 0.42835549 0.46913918 0.4513281 0.39896365
## Question_08- 0.6455617 0.26004130 0.33591253 0.2957010 0.31260929
## Question_09- -0.1256607 -0.18803010 -0.19114149 -0.1405835 -0.20965504
## Question_10- 0.2627868 0.25545078 0.30106099 0.2614844 0.30124219
## Question_11- 1.0000000 0.34140893 0.44362602 0.3463327 0.37438219
## Question_12- 0.3414089 1.00000000 0.50259162 0.4405299 0.34756600
## Question_13- 0.4436260 0.50259162 1.00000000 0.4465351 0.36827082
## Question_14- 0.3463327 0.44052991 0.44653507 1.0000000 0.38799703
## Question_15- 0.3743822 0.34756600 0.36827082 0.3879970 1.00000000
## Question_16- 0.3771590 0.41419718 0.36951002 0.4202687 0.45003587
## Question_17- 0.6093433 0.34043554 0.41248389 0.3494621 0.38806347
## Question_18- 0.3978584 0.49956410 0.55019249 0.5026376 0.36303891
## Question_19- -0.2122948 -0.28553047 -0.25575048 -0.2694598 -0.22293558
## Question_20- 0.2675185 0.30152135 0.21500906 0.2281466 0.22559760
## Question_21- 0.3591919 0.43850465 0.38500498 0.3960666 0.31533342
## Question_22- -0.1802655 -0.19247298 -0.20970778 -0.1789014 -0.18108675
## Question_23- -0.0924981 -0.04901162 -0.06044059 -0.0600950 -0.07152116
## Question_16- Question_17- Question_18- Question_19- Question_20-
## Question_01- 0.49696876 0.38340729 0.37623424 -0.1927321 0.22712216
## Question_02- -0.17745629 -0.09727568 -0.16729714 0.2257107 -0.21523224
## Question_03 0.42083690 0.33113924 0.39767457 -0.3581167 0.32661632
## Question_04- 0.42290818 0.37644360 0.39166256 -0.2075958 0.25872701
## Question_05- 0.40795250 0.32028257 0.34466207 -0.1852568 0.21311727
## Question_06- 0.26157255 0.29033521 0.53564484 -0.1943431 0.09960499
## Question_07- 0.39026576 0.40050472 0.51924098 -0.2817974 0.22221657
## Question_08- 0.34185632 0.60726445 0.29635780 -0.1828269 0.20028520
## Question_09- -0.19638935 -0.06000883 -0.16633151 0.2572089 -0.16829027
## Question_10- 0.28888431 0.22004286 0.32773688 -0.1330246 0.09134585
## Question_11- 0.37715901 0.60934335 0.39785838 -0.2122948 0.26751848
## Question_12- 0.41419718 0.34043554 0.49956410 -0.2855305 0.30152135
## Question_13- 0.36951002 0.41248389 0.55019249 -0.2557505 0.21500906
## Question_14- 0.42026874 0.34946207 0.50263764 -0.2694598 0.22814660
## Question_15- 0.45003587 0.38806347 0.36303891 -0.2229356 0.22559760
## Question_16- 1.00000000 0.41795652 0.43213319 -0.2750407 0.27996205
## Question_17- 0.41795652 1.00000000 0.38775677 -0.1875743 0.22485820
## Question_18- 0.43213319 0.38775677 1.00000000 -0.2819246 0.24510669
## Question_19- -0.27504071 -0.18757433 -0.28192464 1.0000000 -0.25279285
## Question_20- 0.27996205 0.22485820 0.24510669 -0.2527928 1.00000000
## Question_21- 0.42221207 0.36837433 0.43288139 -0.2814537 0.49045434
## Question_22- -0.17204435 -0.13450689 -0.19099701 0.2402900 -0.10112202
## Question_23- -0.09718849 -0.10123323 -0.08827266 0.1367694 -0.04440591
## Question_21- Question_22- Question_23-
## Question_01- 0.34426480 -0.1045719 -0.01544328
## Question_02- -0.21391337 0.2502852 0.10567285
## Question_03 0.43363547 -0.2155666 -0.15833302
## Question_04- 0.41958580 -0.1264932 -0.04247334
## Question_05- 0.36194548 -0.1427773 -0.05693478
## Question_06- 0.28028089 -0.1849565 -0.07307044
## Question_07- 0.48323678 -0.1854475 -0.06563211
## Question_08- 0.30999890 -0.1054541 -0.05042800
## Question_09- -0.15143046 0.2853475 0.16598344
## Question_10- 0.20332338 -0.1217884 -0.06049506
## Question_11- 0.35919193 -0.1802655 -0.09249810
## Question_12- 0.43850465 -0.1924730 -0.04901162
## Question_13- 0.38500498 -0.2097078 -0.06044059
## Question_14- 0.39606664 -0.1789014 -0.06009500
## Question_15- 0.31533342 -0.1810868 -0.07152116
## Question_16- 0.42221207 -0.1720443 -0.09718849
## Question_17- 0.36837433 -0.1345069 -0.10123323
## Question_18- 0.43288139 -0.1909970 -0.08827266
## Question_19- -0.28145374 0.2402900 0.13676942
## Question_20- 0.49045434 -0.1011220 -0.04440591
## Question_21- 1.00000000 -0.1386220 -0.06999931
## Question_22- -0.13862196 1.0000000 0.23243564
## Question_23- -0.06999931 0.2324356 1.00000000
# Bartlett's Test of Sphericity: do the items correlate significantly with
# each other? The sample size is the number of rows in df.
psych::cortest.bartlett(R = correl, n = nrow(df))
## $chisq
## [1] 19534.27
##
## $p.value
## [1] 0
##
## $df
## [1] 253
Question 1b:
To check for linearity and additivity, I created a visual correlation matrix using the symnum function. Scanning this matrix showed a clean diagonal of 1s, which is exactly what we expect because each item correlates perfectly with itself. More importantly, there weren’t any extreme correlations or identical variables hiding off the diagonal: the largest off-diagonal correlation was about .65 (Question 8 with Question 11), well below the level that would signal multicollinearity. This supports the conclusion that the items relate to each other linearly without any multicollinearity issues that might break future factor analyses.
Question 1c:
Correlation adequacy was tested using Bartlett’s Test of Sphericity to determine whether the 23 items shared enough variance to justify a factor analysis. The test yielded a chi-square statistic of 19,534.27 (df = 253, p < .001). This result allows us to confidently reject the null hypothesis that the correlation matrix is an identity matrix, indicating that meaningful, statistically significant correlations exist among the variables.
# Kaiser-Meyer-Olkin (KMO) Measure of Sampling Adequacy: is the pattern of
# correlations compact enough for factor analysis?
psych::KMO(r = correl)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = correl)
## Overall MSA = 0.93
## MSA for each item =
## Question_01- Question_02- Question_03 Question_04- Question_05- Question_06-
## 0.94 0.88 0.95 0.96 0.96 0.90
## Question_07- Question_08- Question_09- Question_10- Question_11- Question_12-
## 0.94 0.88 0.85 0.96 0.91 0.96
## Question_13- Question_14- Question_15- Question_16- Question_17- Question_18-
## 0.95 0.97 0.95 0.94 0.93 0.95
## Question_19- Question_20- Question_21- Question_22- Question_23-
## 0.95 0.89 0.93 0.89 0.79
Question 1d:
To verify that our 23 items shared enough common variance to be grouped into distinct factors, I calculated the Kaiser-Meyer-Olkin (KMO) measure of sampling adequacy. The overall MSA was 0.93, and every individual item had an MSA of at least 0.79. Because the overall value is well above the 0.70 threshold and falls in the range Kaiser labeled “marvelous” (.90 and above), we can confirm that the pattern of correlations is reliable enough for factor analysis.
# Load the MVN package to test for statistical skewness and kurtosis
library(MVN)
# Conduct Mardia's test on the 23 items, which checks for multivariate normality.
# This directly dictates whether we can use Maximum Likelihood (which requires
# normality) or whether we need to use Principal Axis Factoring.
mvn_output <- capture.output({
  mvn_result <- mvn(noout[, 1:23], mvn_test = "mardia")
  print(mvn_result)
})
# Print only the normality tables and stop just before the "$data" section,
# which would otherwise list every participant's answers (a 200+ page PDF).
# grep(...)[1] takes the first "$data" header only; if no such header exists,
# the whole captured output is printed instead of failing, and seq_len()
# correctly yields nothing when the header is the very first line.
data_start <- grep("^\\$data", mvn_output)
last_line <- if (length(data_start) > 0) data_start[1] - 1 else length(mvn_output)
cat(mvn_output[seq_len(last_line)], sep = "\n")
## $multivariate_normality
## Test Statistic p.value Method MVN
## 1 Mardia Skewness 7366.602 <0.001 asymptotic ✗ Not normal
## 2 Mardia Kurtosis 46.464 <0.001 asymptotic ✗ Not normal
##
## $univariate_normality
## Test Variable Statistic p.value Normality
## 1 Anderson-Darling Question_01 188.884 <0.001 ✗ Not normal
## 2 Anderson-Darling Question_02 264.574 <0.001 ✗ Not normal
## 3 Anderson-Darling Question_03 97.115 <0.001 ✗ Not normal
## 4 Anderson-Darling Question_04 130.914 <0.001 ✗ Not normal
## 5 Anderson-Darling Question_05 149.116 <0.001 ✗ Not normal
## 6 Anderson-Darling Question_06 156.790 <0.001 ✗ Not normal
## 7 Anderson-Darling Question_07 108.042 <0.001 ✗ Not normal
## 8 Anderson-Darling Question_08 221.595 <0.001 ✗ Not normal
## 9 Anderson-Darling Question_09 94.463 <0.001 ✗ Not normal
## 10 Anderson-Darling Question_10 216.015 <0.001 ✗ Not normal
## 11 Anderson-Darling Question_11 185.268 <0.001 ✗ Not normal
## 12 Anderson-Darling Question_12 138.521 <0.001 ✗ Not normal
## 13 Anderson-Darling Question_13 154.424 <0.001 ✗ Not normal
## 14 Anderson-Darling Question_14 113.629 <0.001 ✗ Not normal
## 15 Anderson-Darling Question_15 131.031 <0.001 ✗ Not normal
## 16 Anderson-Darling Question_16 139.879 <0.001 ✗ Not normal
## 17 Anderson-Darling Question_17 191.283 <0.001 ✗ Not normal
## 18 Anderson-Darling Question_18 109.244 <0.001 ✗ Not normal
## 19 Anderson-Darling Question_19 114.652 <0.001 ✗ Not normal
## 20 Anderson-Darling Question_20 111.207 <0.001 ✗ Not normal
## 21 Anderson-Darling Question_21 118.881 <0.001 ✗ Not normal
## 22 Anderson-Darling Question_22 101.333 <0.001 ✗ Not normal
## 23 Anderson-Darling Question_23 128.547 <0.001 ✗ Not normal
##
## $descriptives
## Variable n Mean Std.Dev Median Min Max 25th 75th Skew Kurtosis
## 1 Question_01 2474 2.357 0.801 2 1 5 2 3 0.626 3.638
## 2 Question_02 2474 1.627 0.836 1 1 5 1 2 1.410 4.741
## 3 Question_03 2474 2.612 1.059 3 1 5 2 3 0.055 2.248
## 4 Question_04 2474 2.755 0.924 3 1 5 2 3 0.373 2.756
## 5 Question_05 2474 2.711 0.942 3 1 5 2 3 0.464 2.586
## 6 Question_06 2474 2.207 1.092 2 1 5 1 3 0.930 3.232
## 7 Question_07 2474 2.890 1.074 3 1 5 2 4 0.202 2.187
## 8 Question_08 2474 2.228 0.849 2 1 5 2 3 1.050 4.584
## 9 Question_09 2474 2.847 1.247 3 1 5 2 4 -0.073 1.870
## 10 Question_10 2474 2.270 0.848 2 1 5 2 3 0.819 3.610
## 11 Question_11 2474 2.235 0.849 2 1 5 2 3 0.810 3.948
## 12 Question_12 2474 3.134 0.883 3 1 5 3 4 0.190 2.868
## 13 Question_13 2474 2.433 0.919 2 1 5 2 3 0.598 3.063
## 14 Question_14 2474 2.843 0.968 3 1 5 2 3 0.277 2.697
## 15 Question_15 2474 2.745 0.976 3 1 5 2 3 0.434 2.607
## 16 Question_16 2474 2.853 0.886 3 1 5 2 3 0.388 2.928
## 17 Question_17 2474 2.444 0.852 2 1 5 2 3 0.755 3.559
## 18 Question_18 2474 2.544 1.023 2 1 5 2 3 0.473 2.825
## 19 Question_19 2474 2.295 1.085 2 1 5 1 3 0.459 2.269
## 20 Question_20 2474 3.604 1.019 4 1 5 3 4 -0.358 2.354
## 21 Question_21 2474 3.141 0.957 3 1 5 2 4 0.129 2.250
## 22 Question_22 2474 2.889 1.014 3 1 5 2 4 -0.085 2.347
## 23 Question_23 2474 3.432 1.024 4 1 5 3 4 -0.594 2.878
Question 1:
Thus far, we have confirmed that this dataset is more than adequate for Exploratory Factor Analysis (EFA). There are no extreme individual outliers distorting the sample matrix (1a), linear relationships are clean and free of multicollinearity (1b), and the variables share significant common variance according to Bartlett’s test (1c). Furthermore, the Kaiser-Meyer-Olkin (KMO) Measure yields an overall sampling adequacy score of 0.93, which falls in the “marvelous” range (1d). Finally, Mardia’s test confirms a significant deviation from multivariate normality (MVN) for both skewness and kurtosis (p < .001). Because Maximum Likelihood extraction assumes multivariate normality, we use Principal Axis Factoring (PAF) as our extraction method instead.
# Before running EFAs it is helpful to examine histograms to understand range restriction, skew, kurtosis, and distribution patterns.
# Shrink the margins and set up a 5x5 grid so all plots fit on one output.
# The margin order is: c(bottom, left, top, right); the grid will have 2 empty cells since there are only 23 items.
# par() returns the previous values of the settings it changes, so they are kept for restoring afterwards.
old_par <- par(mar = c(2, 2, 2, 2), mfrow = c(5, 5))
# Generating the actual histograms with the item name as title and no x-axis label
for (i in seq_len(23)) {
  hist(df[[i]],
       breaks = 6,
       main = colnames(df)[i],
       xlab = "")
}
# Restoring both the margins and the plot grid so future plots don't look weird.
# Resetting only `mar` would leave mfrow = c(5, 5) active for any later plot drawn on the same device.
par(old_par)
Question 2:
To inspect the distribution of responses across all 23 items, I generated a 5×5 grid of histograms. Most items showed moderate, unimodal distributions consistent with typical Likert response patterns. A handful of items (notably those related to anxiety and fear of statistics) showed mild positive skew, with responses clustering toward the lower end of the scale, suggesting that most participants did not endorse extreme anxiety. No items showed severe range restriction or skewness that would threaten the factor analysis.
# Exploratory Factor Analysis
# Set the seed again to ensure the "random" split is reproducible.
set.seed(2026)
# Splitting the entire df into a training and a testing dataset. Rows are chosen at random from the entire sample and split 50/50 into the two subsamples.
# 1. Count the rows once.
n_rows <- nrow(df)
# 2. Draw 50% of the row indices without replacement.
#    floor() makes the truncation explicit when the row count is odd, and seq_len() stays
#    correct for an empty data frame (1:0 would yield c(1, 0)).
train_index <- sample(seq_len(n_rows), floor(0.5 * n_rows))
# 3. Create the Training dataset (the randomly selected 50%)
train_data <- df[train_index, ]
# 4. Create the Testing dataset (the remaining 50%, selected with the minus sign)
test_data <- df[-train_index, ]
Question 3:
To ensure the stability of the factor analysis and prevent overfitting, I split the final cleaned dataset into two equal halves using a 50/50 random split which put 1,237 participants in each subsample. The training set was used to conduct the EFA, while the test set was reserved to verify that the factor structure generalizes. A 50/50 split was chosen over a 70/30 or 80/20 split because the overall sample size was large enough that both halves would retain enough power for reliable analysis.
To confirm that the 5-factor solution was not specific to the training sample, the same PAF model was applied to the held-out test set using a polychoric correlation matrix. Factor congruence coefficients between the training and test solutions were computed using Tucker’s congruence coefficient. All five factors met or exceeded the 0.95 threshold for excellent replication (PA3 = 0.99, PA1 = 0.97, PA4 = 0.98, PA5 = 0.95, PA2 = 0.95), confirming that the factor structure generalizes well beyond the training data.
# Conduct a parallel analysis to decide how many factors to retain
# 'fm = "pa"' means Principal Axis factoring; 'fa = "fa"' requests the factor-analysis eigenvalues only
# (fa = "pc" would request principal components instead), which is why the output reports the number of components as NA
# NOTE(review): the raw item responses are passed here, while the EFA below is fit to a polychoric matrix;
# presumably this parallel analysis is based on Pearson correlations — confirm whether cor = "poly" is wanted for consistency
fa.parallel(train_data[, 1:23],
fm = "pa", fa = "fa")
## Parallel analysis suggests that the number of factors = 5 and the number of components = NA
# The items are ordinal Likert responses, so build a polychoric correlation matrix; it corrects for the attenuation caused by the ordinal response scale.
train_poly <- polychoric(train_data[, 1:23])
poly_cor_train <- train_poly$rho
# Initial EFA: Principal Axis Factoring (fm = "pa") with a Promax rotation.
# nfactors is the number of factors suggested by the Parallel Analysis.
# Promax allows the factors to correlate, which is more realistic for psychological research.
# Because a correlation matrix is passed instead of raw data, fa() requires n.obs.
fit <- fa(
  r = poly_cor_train,
  nfactors = 5,
  n.obs = nrow(train_data),
  rotate = "promax",
  fm = "pa"
)
# Show the solution sorted by factor, suppressing loadings below 0.3 for readability
print(fit, sort = TRUE, cut = 0.3)
## Factor Analysis using method = pa
## Call: fa(r = poly_cor_train, nfactors = 5, n.obs = nrow(train_data),
## rotate = "promax", fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
## item PA3 PA1 PA4 PA5 PA2 h2 u2 com
## Question_06- 6 1.06 0.68 0.32 1.2
## Question_18- 18 0.73 0.65 0.35 1.1
## Question_07- 7 0.69 0.56 0.44 1.1
## Question_13- 13 0.66 0.56 0.44 1.1
## Question_14- 14 0.53 0.45 0.55 1.2
## Question_10- 10 0.41 0.22 0.78 1.7
## Question_12- 12 0.36 0.54 0.46 2.8
## Question_15- 15 0.41 0.59 3.6
## Question_01- 1 0.88 0.58 0.42 1.1
## Question_16- 16 0.81 0.61 0.39 1.1
## Question_05- 5 0.65 0.40 0.60 1.1
## Question_04- 4 0.60 0.48 0.52 1.1
## Question_08- 8 0.90 0.80 0.20 1.1
## Question_11- 11 0.75 0.74 0.26 1.1
## Question_17- 17 0.64 0.68 0.32 1.2
## Question_20- 20 0.80 0.49 0.51 1.1
## Question_21- 21 0.63 0.56 0.44 1.3
## Question_03 3 0.35 0.54 0.46 3.1
## Question_09- 9 0.65 0.42 0.58 1.2
## Question_22- 22 0.55 0.31 0.69 1.1
## Question_23- 23 0.48 0.18 0.82 1.3
## Question_02- 2 -0.36 0.44 0.39 0.61 2.0
## Question_19- 19 0.30 0.28 0.72 2.2
##
## PA3 PA1 PA4 PA5 PA2
## SS loadings 3.44 2.65 2.12 1.87 1.45
## Proportion Var 0.15 0.12 0.09 0.08 0.06
## Cumulative Var 0.15 0.26 0.36 0.44 0.50
## Proportion Explained 0.30 0.23 0.18 0.16 0.13
## Cumulative Proportion 0.30 0.53 0.71 0.87 1.00
##
## With factor correlations of
## PA3 PA1 PA4 PA5 PA2
## PA3 1.00 0.74 0.55 0.59 -0.46
## PA1 0.74 1.00 0.62 0.72 -0.40
## PA4 0.55 0.62 1.00 0.37 -0.25
## PA5 0.59 0.72 0.37 1.00 -0.50
## PA2 -0.46 -0.40 -0.25 -0.50 1.00
##
## Mean item complexity = 1.5
## Test of the hypothesis that 5 factors are sufficient.
##
## df null model = 253 with the objective function = 10.4 with Chi Square = 12771.09
## df of the model are 148 and the objective function was 0.56
##
## The root mean square of the residuals (RMSR) is 0.02
## The df corrected root mean square of the residuals is 0.03
##
## The harmonic n.obs is 1237 with the empirical chi square 340.46 with prob < 3.3e-17
## The total n.obs was 1237 with Likelihood Chi Square = 686.32 with prob < 3.2e-70
##
## Tucker Lewis Index of factoring reliability = 0.926
## RMSEA index = 0.054 and the 90 % confidence intervals are 0.05 0.058
## BIC = -367.5
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## PA3 PA1 PA4 PA5 PA2
## Correlation of (regression) scores with factors 0.95 0.94 0.94 0.90 0.85
## Multiple R square of scores with factors 0.91 0.88 0.89 0.81 0.73
## Minimum correlation of possible factor scores 0.81 0.77 0.78 0.62 0.46
# Cross-validation: repeat the analysis on the held-out test dataset
# Polychoric correlations for the 23 test-set items
test_poly <- polychoric(test_data[, 1:23])
poly_cor_test <- test_poly$rho
# Same specification as the training model: 5 factors, PAF extraction, Promax rotation
fit_test <- fa(
  r = poly_cor_test,
  nfactors = 5,
  n.obs = nrow(test_data),
  rotate = "promax",
  fm = "pa"
)
# Congruence coefficients between the training solution and the test solution
factor.congruence(fit, fit_test)
## PA1 PA5 PA4 PA3 PA2
## PA3 0.99 0.01 0.04 0.02 0.06
## PA1 -0.02 0.97 0.06 0.08 -0.03
## PA4 0.03 0.07 0.98 0.18 0.07
## PA5 0.02 0.04 0.14 0.95 -0.16
## PA2 -0.02 0.08 0.06 -0.04 0.95
Question 4 & 5:
To determine the optimal factor structure, I first conducted a Parallel Analysis, which suggested a 5-factor solution. I then performed an Exploratory Factor Analysis (EFA) using Principal Axis Factoring (PAF) as the extraction method. PAF was selected over Maximum Likelihood because Mardia’s test confirmed a significant violation of multivariate normality in the data (p < .001), and PAF does not require this assumption. Since items are measured on an ordinal Likert scale, a polychoric correlation matrix was also computed and used as input to the factor analysis rather than a raw Pearson correlation matrix. This corrects for the attenuation in correlations caused by ordinal responses.
For rotation, I used Promax, an oblique method, which was chosen over orthogonal alternatives such as Varimax because it allows the extracted factors to correlate with one another. In psychological research, constructs are rarely truly independent, so allowing for factor correlations produces a more theoretically realistic and interpretable solution.
I identified that Question 02 (“My friends will think I’m stupid for not being able to cope with SPSS”) exhibited complexity by cross-loading on both PA5 (-0.36) and PA2 (0.44). This indicates that the item captures a unique intersection between the two factors. I have chosen to retain this item within PA2, as its negative loading on PA5 provides a valuable, nuanced interpretation of the construct. It highlights the social pressure inherent in the learning environment rather than indicating a failure of the factor structure. Also, Question 15 appears in the initial pattern matrix output above as it was part of the original solution, but was excluded from all subsequent scale construction due to failing the 0.30 loading threshold and exhibiting high item complexity (com = 3.6).
# Determining Alpha coefficients of each factor while also giving them preliminary names based on their underlying themes
# Explicitly telling R to use the psych package's 'alpha' since both it and the ggplot2 package include the 'alpha' function
conflicts_prefer(psych::alpha)
## [conflicted] Will prefer psych::alpha over any other package.
# Cronbach's alpha for each factor-based scale, computed on the training dataset.
# check.keys=TRUE lets alpha() automatically reverse any item that correlates negatively with the
# scale's first principal component; it issues a warning when it does so.
# Factor 1 (PA3): Computer/Technology Anxiety
alpha_pa3 <- alpha(train_data[, c("Question_06-", "Question_18-", "Question_13-", "Question_07-", "Question_14-", "Question_10-", "Question_12-")], check.keys=TRUE)
# Factor 2 (PA1): Statistical Competence
alpha_pa1 <- alpha(train_data[, c("Question_01-", "Question_16-", "Question_05-", "Question_04-")], check.keys=TRUE)
# Factor 3 (PA4): Interest in Statistics
alpha_pa4 <- alpha(train_data[, c("Question_08-", "Question_11-", "Question_17-")], check.keys=TRUE)
# Factor 4 (PA5): Study Engagement
alpha_pa5 <- alpha(train_data[, c("Question_20-", "Question_21-")], check.keys=TRUE)
# Factor 5 (PA2): Social Comparison
# NOTE(review): in the sorted pattern matrix, Question_03 is listed with the PA5 items (Questions 20 and 21)
# rather than with the PA2 items, so its only loading above the 0.3 cut appears to be on PA5.
# It is nevertheless scored on PA2 here — confirm whether it belongs in alpha_pa5 instead.
alpha_pa2 <- alpha(train_data[, c("Question_09-", "Question_22-", "Question_23-", "Question_02-", "Question_19-", "Question_03")], check.keys=TRUE)
## Warning in alpha(train_data[, c("Question_09-", "Question_22-", "Question_23-", : Some items were negatively correlated with the first principal component and were automatically reversed.
## This is indicated by a negative sign for the variable name.
# Print the alpha results to inspect the scale reliabilities
print(alpha_pa3)
##
## Reliability analysis
## Call: alpha(x = train_data[, c("Question_06-", "Question_18-", "Question_13-",
## "Question_07-", "Question_14-", "Question_10-", "Question_12-")],
## check.keys = TRUE)
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.84 0.83 0.82 0.42 5 0.0069 3.4 0.69 0.44
##
## 95% confidence boundaries
## lower alpha upper
## Feldt 0.82 0.84 0.85
## Duhachek 0.82 0.84 0.85
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## Question_06- 0.81 0.81 0.79 0.42 4.3 0.0081 0.0124 0.44
## Question_18- 0.79 0.79 0.77 0.39 3.8 0.0089 0.0094 0.43
## Question_13- 0.80 0.80 0.78 0.40 4.0 0.0084 0.0119 0.43
## Question_07- 0.81 0.81 0.79 0.41 4.2 0.0083 0.0128 0.44
## Question_14- 0.82 0.81 0.80 0.42 4.4 0.0079 0.0129 0.47
## Question_10- 0.84 0.84 0.83 0.48 5.4 0.0068 0.0037 0.48
## Question_12- 0.82 0.81 0.79 0.42 4.3 0.0079 0.0125 0.44
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## Question_06- 1237 0.74 0.72 0.66 0.61 3.8 1.11
## Question_18- 1237 0.81 0.80 0.77 0.71 3.5 1.02
## Question_13- 1237 0.75 0.76 0.71 0.65 3.6 0.92
## Question_07- 1237 0.75 0.74 0.68 0.62 3.1 1.08
## Question_14- 1237 0.70 0.70 0.62 0.57 3.1 0.96
## Question_10- 1237 0.52 0.54 0.40 0.37 3.7 0.85
## Question_12- 1237 0.69 0.71 0.64 0.58 2.9 0.86
##
## Non missing response frequency for each item
## 1 2 3 4 5 miss
## Question_06- 0.05 0.10 0.14 0.44 0.26 0
## Question_18- 0.05 0.12 0.30 0.39 0.14 0
## Question_13- 0.02 0.11 0.26 0.49 0.11 0
## Question_07- 0.07 0.24 0.26 0.35 0.07 0
## Question_14- 0.06 0.18 0.39 0.33 0.05 0
## Question_10- 0.01 0.10 0.19 0.57 0.14 0
## Question_12- 0.06 0.23 0.49 0.19 0.02 0
print(alpha_pa1)
##
## Reliability analysis
## Call: alpha(x = train_data[, c("Question_01-", "Question_16-", "Question_05-",
## "Question_04-")], check.keys = TRUE)
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.75 0.76 0.7 0.44 3.1 0.011 3.3 0.68 0.43
##
## 95% confidence boundaries
## lower alpha upper
## Feldt 0.73 0.75 0.78
## Duhachek 0.73 0.75 0.78
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## Question_01- 0.69 0.69 0.59 0.42 2.2 0.015 0.00012 0.43
## Question_16- 0.68 0.69 0.60 0.42 2.2 0.016 0.00076 0.41
## Question_05- 0.72 0.72 0.64 0.47 2.6 0.014 0.00192 0.45
## Question_04- 0.70 0.71 0.62 0.45 2.4 0.015 0.00334 0.43
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## Question_01- 1237 0.76 0.78 0.67 0.58 3.6 0.79
## Question_16- 1237 0.77 0.78 0.67 0.58 3.2 0.89
## Question_05- 1237 0.75 0.74 0.59 0.52 3.3 0.95
## Question_04- 1237 0.76 0.75 0.62 0.54 3.3 0.95
##
## Non missing response frequency for each item
## 1 2 3 4 5 miss
## Question_01- 0.01 0.06 0.29 0.53 0.10 0
## Question_16- 0.04 0.16 0.43 0.33 0.04 0
## Question_05- 0.03 0.19 0.30 0.42 0.06 0
## Question_04- 0.04 0.15 0.37 0.37 0.07 0
print(alpha_pa4)
##
## Reliability analysis
## Call: alpha(x = train_data[, c("Question_08-", "Question_11-", "Question_17-")],
## check.keys = TRUE)
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.84 0.84 0.78 0.64 5.2 0.0079 3.7 0.76 0.62
##
## 95% confidence boundaries
## lower alpha upper
## Feldt 0.82 0.84 0.85
## Duhachek 0.82 0.84 0.85
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## Question_08- 0.77 0.77 0.62 0.62 3.3 0.013 NA 0.62
## Question_11- 0.76 0.76 0.62 0.62 3.2 0.014 NA 0.62
## Question_17- 0.80 0.80 0.67 0.67 4.0 0.011 NA 0.67
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## Question_08- 1237 0.88 0.87 0.78 0.71 3.8 0.89
## Question_11- 1237 0.88 0.88 0.79 0.72 3.7 0.86
## Question_17- 1237 0.86 0.86 0.74 0.68 3.5 0.88
##
## Non missing response frequency for each item
## 1 2 3 4 5 miss
## Question_08- 0.03 0.06 0.18 0.58 0.15 0
## Question_11- 0.02 0.06 0.23 0.54 0.15 0
## Question_17- 0.03 0.10 0.26 0.53 0.08 0
print(alpha_pa5)
##
## Reliability analysis
## Call: alpha(x = train_data[, c("Question_20-", "Question_21-")], check.keys = TRUE)
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.65 0.65 0.49 0.49 1.9 0.02 2.6 0.85 0.49
##
## 95% confidence boundaries
## lower alpha upper
## Feldt 0.61 0.65 0.69
## Duhachek 0.62 0.65 0.69
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## Question_20- 0.51 0.49 0.24 0.49 0.95 NA 0 0.49
## Question_21- 0.46 0.49 0.24 0.49 0.95 NA 0 0.49
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## Question_20- 1237 0.87 0.86 0.6 0.49 2.4 1.02
## Question_21- 1237 0.85 0.86 0.6 0.49 2.9 0.97
##
## Non missing response frequency for each item
## 1 2 3 4 5 miss
## Question_20- 0.20 0.39 0.25 0.15 0.02 0
## Question_21- 0.08 0.28 0.34 0.28 0.02 0
print(alpha_pa2)
##
## Reliability analysis
## Call: alpha(x = train_data[, c("Question_09-", "Question_22-", "Question_23-",
## "Question_02-", "Question_19-", "Question_03")], check.keys = TRUE)
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.65 0.66 0.63 0.24 1.9 0.015 3.4 0.64 0.25
##
## 95% confidence boundaries
## lower alpha upper
## Feldt 0.62 0.65 0.68
## Duhachek 0.62 0.65 0.68
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## Question_09- 0.60 0.60 0.56 0.23 1.5 0.018 0.0070 0.23
## Question_22- 0.61 0.61 0.57 0.24 1.6 0.017 0.0085 0.23
## Question_23- 0.65 0.66 0.61 0.28 1.9 0.016 0.0030 0.27
## Question_02- 0.61 0.61 0.56 0.24 1.6 0.017 0.0037 0.24
## Question_19- 0.61 0.62 0.58 0.24 1.6 0.017 0.0063 0.26
## Question_03- 0.59 0.59 0.55 0.22 1.5 0.018 0.0047 0.24
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## Question_09- 1237 0.68 0.64 0.53 0.43 3.2 1.25
## Question_22- 1237 0.61 0.61 0.48 0.39 3.2 1.03
## Question_23- 1237 0.51 0.51 0.33 0.27 2.6 1.04
## Question_02- 1237 0.57 0.62 0.51 0.40 4.4 0.81
## Question_19- 1237 0.61 0.60 0.48 0.38 3.7 1.08
## Question_03- 1237 0.65 0.66 0.56 0.44 3.4 1.05
##
## Non missing response frequency for each item
## 1 2 3 4 5 miss
## Question_09- 0.08 0.27 0.24 0.21 0.20 0
## Question_22- 0.04 0.24 0.34 0.28 0.10 0
## Question_23- 0.11 0.43 0.27 0.12 0.06 0
## Question_02- 0.00 0.04 0.07 0.32 0.57 0
## Question_19- 0.02 0.14 0.23 0.34 0.27 0
## Question_03 0.18 0.28 0.35 0.16 0.03 0
# Printing out only the specific raw alpha value for each factor with revised naming conventions
cat("PA3 (Anxiety): ", alpha_pa3$total$raw_alpha, "\n")
## PA3 (Anxiety): 0.8358727
cat("PA1 (Competence): ", alpha_pa1$total$raw_alpha, "\n")
## PA1 (Competence): 0.7545038
cat("PA4 (Interest): ", alpha_pa4$total$raw_alpha, "\n")
## PA4 (Interest): 0.8392794
cat("PA5 (Engagement): ", alpha_pa5$total$raw_alpha, "\n")
## PA5 (Engagement): 0.6543826
cat("PA2 (Comparison): ", alpha_pa2$total$raw_alpha, "\n")
## PA2 (Comparison): 0.653564
# Reprinting the pattern matrix for reference in Q6
print(fit, cut = 0.3, sort = TRUE)
## Factor Analysis using method = pa
## Call: fa(r = poly_cor_train, nfactors = 5, n.obs = nrow(train_data),
## rotate = "promax", fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
## item PA3 PA1 PA4 PA5 PA2 h2 u2 com
## Question_06- 6 1.06 0.68 0.32 1.2
## Question_18- 18 0.73 0.65 0.35 1.1
## Question_07- 7 0.69 0.56 0.44 1.1
## Question_13- 13 0.66 0.56 0.44 1.1
## Question_14- 14 0.53 0.45 0.55 1.2
## Question_10- 10 0.41 0.22 0.78 1.7
## Question_12- 12 0.36 0.54 0.46 2.8
## Question_15- 15 0.41 0.59 3.6
## Question_01- 1 0.88 0.58 0.42 1.1
## Question_16- 16 0.81 0.61 0.39 1.1
## Question_05- 5 0.65 0.40 0.60 1.1
## Question_04- 4 0.60 0.48 0.52 1.1
## Question_08- 8 0.90 0.80 0.20 1.1
## Question_11- 11 0.75 0.74 0.26 1.1
## Question_17- 17 0.64 0.68 0.32 1.2
## Question_20- 20 0.80 0.49 0.51 1.1
## Question_21- 21 0.63 0.56 0.44 1.3
## Question_03 3 0.35 0.54 0.46 3.1
## Question_09- 9 0.65 0.42 0.58 1.2
## Question_22- 22 0.55 0.31 0.69 1.1
## Question_23- 23 0.48 0.18 0.82 1.3
## Question_02- 2 -0.36 0.44 0.39 0.61 2.0
## Question_19- 19 0.30 0.28 0.72 2.2
##
## PA3 PA1 PA4 PA5 PA2
## SS loadings 3.44 2.65 2.12 1.87 1.45
## Proportion Var 0.15 0.12 0.09 0.08 0.06
## Cumulative Var 0.15 0.26 0.36 0.44 0.50
## Proportion Explained 0.30 0.23 0.18 0.16 0.13
## Cumulative Proportion 0.30 0.53 0.71 0.87 1.00
##
## With factor correlations of
## PA3 PA1 PA4 PA5 PA2
## PA3 1.00 0.74 0.55 0.59 -0.46
## PA1 0.74 1.00 0.62 0.72 -0.40
## PA4 0.55 0.62 1.00 0.37 -0.25
## PA5 0.59 0.72 0.37 1.00 -0.50
## PA2 -0.46 -0.40 -0.25 -0.50 1.00
##
## Mean item complexity = 1.5
## Test of the hypothesis that 5 factors are sufficient.
##
## df null model = 253 with the objective function = 10.4 with Chi Square = 12771.09
## df of the model are 148 and the objective function was 0.56
##
## The root mean square of the residuals (RMSR) is 0.02
## The df corrected root mean square of the residuals is 0.03
##
## The harmonic n.obs is 1237 with the empirical chi square 340.46 with prob < 3.3e-17
## The total n.obs was 1237 with Likelihood Chi Square = 686.32 with prob < 3.2e-70
##
## Tucker Lewis Index of factoring reliability = 0.926
## RMSEA index = 0.054 and the 90 % confidence intervals are 0.05 0.058
## BIC = -367.5
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## PA3 PA1 PA4 PA5 PA2
## Correlation of (regression) scores with factors 0.95 0.94 0.94 0.90 0.85
## Multiple R square of scores with factors 0.91 0.88 0.89 0.81 0.73
## Minimum correlation of possible factor scores 0.81 0.77 0.78 0.62 0.46
Question 6:
The final solution retained 5 factors, which together account for 50% of the total variance. The pattern matrix is displayed above. Loadings below 0.30 have been suppressed for clarity, and Question 15 has been excluded from scale construction as noted in Q4 & Q5.
Computer/Technology Anxiety (PA3): Questions 6, 18, 13, 7, 14, 10, and 12
Statistical Competence (PA1): Questions 1, 16, 5, and 4
Interest in Statistics (PA4): Questions 8, 11, and 17
Study Engagement (PA5): Questions 20 and 21
Social Comparison (PA2): Questions 9, 22, 23, 2, 19, and 3
Question 7:
The Tucker Lewis Index (TLI) of the final model was 0.926. This exceeds the conventional 0.90 threshold, indicating adequate model fit.
Question 8:
The RMSEA of the final model was 0.054 (90% CI [0.05, 0.058]). This falls below the 0.06 threshold for good fit, indicating the model adequately represents the data.
Question 9:
The five scales derived from the factor solution are:
Computer/Technology Anxiety (PA3): Questions 6, 18, 13, 7, 14, 10, 12 (all reverse scored)
Statistical Competence (PA1): Questions 1, 16, 5, 4 (all reverse scored)
Interest in Statistics (PA4): Questions 8, 11, 17 (all reverse scored)
Study Engagement (PA5): Questions 20, 21 (both reverse scored)
Social Comparison (PA2): Questions 9, 22, 23, 2, 19, 3 (all reverse scored except item 3)
The five factors were named based on the shared content of their highest-loading items, and are described below.
Scale 1 — Computer/Technology Anxiety (PA3): α = 0.84. This exceeds the conventional 0.70 benchmark and indicates strong reliability. Specific items in this scale include questions 6, 18, 13, 7, 14, 10, and 12 (all reverse scored).
Scale 2 — Statistical Competence (PA1): α = 0.75. This meets the conventional benchmark and indicates good reliability. Specific items in this scale include questions 1, 16, 5, and 4 (all reverse scored)
Scale 3 — Interest in Statistics (PA4): α = 0.84. This exceeds the conventional 0.70 benchmark and indicates strong reliability. Specific items in this scale include questions 8, 11, and 17 (all reverse scored).
Scale 4 — Study Engagement (PA5): α = 0.65. This falls below the 0.70 benchmark, though a lower alpha is typically expected and accepted for a two-item scale in exploratory work. No immediate action is recommended, but adding a third item that gets at the heart of this construct in a future revision of the instrument would strengthen the scale. Specific items in the current scale include questions 20 and 21 (both reverse scored)
Scale 5 — Social Comparison (PA2): α = 0.65. Unlike PA5, this scale has six items, so the two-item explanation does not apply. The below-threshold alpha suggests the items within this factor may not be measuring a single cohesive construct as cleanly as hoped. In future revisions of this scale, I would examine the item-total correlations from the full alpha output. By looking at the r.drop column in that output, one can determine which items contribute the least to this scale’s coherence. Items with r.drop values below 0.30 would be candidates for removal or rewording; in the output above, Question 23 (r.drop = 0.27) is the only item below that cutoff. Specific items in the current scale include questions 9, 22, 23, 2, 19, and 3 (all reverse scored except for item 3).
Question 10:
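During the reliability analysis, the check.keys=TRUE parameter was used within the alpha() function to confirm all items were properly aligned. For the Social Comparison scale, this automatically reversed Question 3, which correlated negatively with the other items (see the warning and the “Question_03-” label in the alpha output above). Cronbach’s alpha was calculated for each scale using the training dataset.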
Computer/Technology Anxiety (PA3): α = 0.84 — exceeds the 0.70 benchmark, strong reliability ✓
Statistical Competence (PA1): α = 0.75 — meets the benchmark, good reliability ✓
Interest in Statistics (PA4): α = 0.84 — exceeds the benchmark, strong reliability ✓
Study Engagement (PA5): α = 0.65 — falls below the 0.70 benchmark. However, a lower alpha is typically expected for two-item scales in exploratory work. Adding a third item in a future revision of the instrument would strengthen this scale.
Social Comparison (PA2): α = 0.65 — falls below the 0.70 benchmark. Unlike PA5, this scale has six items, so the two-item explanation does not apply. The below-threshold alpha suggests the items may not be measuring a single cohesive construct. As a next step, examining the r.drop column in the full alpha output would identify which items are contributing least to scale coherence. Items with r.drop values below 0.30 would be candidates for removal or rewording.