1 PCA for Motivation

Load necessary packages.

library(readr)
library(dplyr)
library(psych)
library(factoextra)
library(GPArotation)

Set Working Directory.

# NOTE(review): absolute, user-specific path -- this call only succeeds on a
# machine that has this exact directory. The relative file names passed to
# read_csv() later in this report are resolved against it.
setwd("/Users/sanshirohogawa/Library/Mobile Documents/com~apple~CloudDocs/Stats Consulting/Abbie")

Load data.

# Read the writing-task file; show_col_types = FALSE silences the column-spec message
dat2 <- read_csv("Writing_Tasks_with_IDs.csv", show_col_types = FALSE)

# Store participant IDs as a factor
dat2[["Participant"]] <- as.factor(dat2[["Participant"]])

# Should print TRUE
is.factor(dat2[["Participant"]])
## [1] TRUE

1.1 A few problems:

  • There should be 160 participants, but there are only 159 levels for “Participant”
  • There should be 4 rows for each participant. However, there are 8 rows for “56”, no rows for 128, 3 rows for “48” and 5 rows for “49”
  • Therefore, I will remove Participants 56, 48, and 49 (16 rows) for now.
# Exclude the three participants whose row counts are wrong (56, 48, 49).
# Conditions separated by commas are combined with AND.
dat2 <- filter(
  dat2,
  Participant != "56",
  Participant != "48",
  Participant != "49"
)

# Discard factor levels that no longer have any rows
dat2$Participant <- droplevels(dat2$Participant)

# List the participant IDs that remain
unique(dat2$Participant)
##   [1] 1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16  17  18 
##  [19] 19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##  [37] 37  38  39  40  41  42  43  44  45  46  47  50  51  52  53  54  55  57 
##  [55] 58  59  60  62  61  63  64  65  66  67  68  69  70  71  72  73  74  75 
##  [73] 76  77  78  79  80  81  82  83  122 84  85  86  87  88  89  90  91  92 
##  [91] 93  94  95  96  97  98  99  100 101 103 102 104 105 106 107 108 109 110
## [109] 111 112 113 114 115 116 117 118 119 120 121 123 124 125 126 127 129 130
## [127] 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148
## [145] 149 150 151 152 153 154 155 156 157 158 159 160
## 156 Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ... 160

Extract relevant columns, and make sure there is only 1 row per 1 participant.

# Keep only the variables you need
mot <- dat2 %>%
  select(Participant, 
         `Task Value`, 
         `Mastery Goal`,
         `Self Efficacy`,
         Attribution,
         `Avoidance Goal`,
         `Performance Goal`)

# Keep only one unique row per participant
mot <- distinct(mot, Participant, .keep_all = TRUE)

# Confirm there is only one row for each participant
table(mot$Participant)
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  41  42  43  44  45  46  47  50  51  52  53  54  55  57  58  59  60  61  62  63 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 124 125 126 127 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1

Make sure there are no missing values.

colSums(is.na(mot))
##      Participant       Task Value     Mastery Goal    Self Efficacy 
##                0                0                0                0 
##      Attribution   Avoidance Goal Performance Goal 
##                0                0                0

No missing values.

Descriptives.

describe(mot)
##                  vars   n  mean    sd median trimmed   mad min max range  skew
## Participant*        1 156 78.50 45.18   78.5   78.50 57.82   1 156   155  0.00
## Task Value          2 156 27.78  7.94   27.5   27.58  8.15   7  50    43  0.19
## Mastery Goal        3 156 22.69  5.84   22.5   22.67  5.19   7  39    32  0.10
## Self Efficacy       4 156 39.70  6.47   39.5   39.48  5.93  24  58    34  0.34
## Attribution         5 156 19.33  3.38   20.0   19.44  2.97  10  30    20 -0.21
## Avoidance Goal      6 156 40.27  8.46   41.0   40.33  8.90  17  59    42 -0.12
## Performance Goal    7 156 23.76  6.70   24.0   23.76  5.93   3  40    37 -0.06
##                  kurtosis   se
## Participant*        -1.22 3.62
## Task Value           0.11 0.64
## Mastery Goal         0.31 0.47
## Self Efficacy        0.29 0.52
## Attribution          0.68 0.27
## Avoidance Goal      -0.49 0.68
## Performance Goal     0.28 0.54

1.2 PCA Version 1

1.2.1 Get the correlation matrix

cor.mat.mot <- cor(mot[, 2:7])

1.2.2 Get the Eigenvalues.

eig.vals.mot <- eigen(cor.mat.mot)$values

1.2.3 Plot Eigenvalues.

# Scree plot of the eigenvalues of the motivation correlation matrix
plot(
  eig.vals.mot,
  type = "b",
  xlab = "Index",
  ylab = "Eigenvalue",
  main = "Scree Plot"
)
# Dashed horizontal reference line at eigenvalue = 1 (Kaiser criterion)
abline(h = 1, lty = 2)

The scree plot supports two components (i.e., two circles above the cut off value)

1.3 PCA Version 2

1.3.1 Run PCA (# of components not pre-determined)

# Unrotated PCA on the six motivation scores (columns 2-7 of mot); the number
# of components is not fixed in advance.
pca.motivation <- prcomp(mot[, 2:7], scale. = TRUE) # scale. = TRUE standardizes each variable first
summary(pca.motivation) # Look at the Cumulative Proportion row to decide how many components to keep
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5    PC6
## Standard deviation     1.6584 1.0432 0.9381 0.7784 0.65586 0.4954
## Proportion of Variance 0.4584 0.1814 0.1467 0.1010 0.07169 0.0409
## Cumulative Proportion  0.4584 0.6397 0.7864 0.8874 0.95910 1.0000

PC1 + PC2 explain 63.97% of variance. Let’s go with two components.

1.3.2 Scree plot

fviz_eig(pca.motivation, addlabels = TRUE)

The scree plot supports two components.

1.4 Run PCA with 2 Components

# Two-component PCA on the six motivation scores. The oblique (oblimin)
# rotation lets the components correlate; scores = TRUE keeps the
# per-participant component scores in the result.
pca.mot <- principal(
  mot[, 2:7],
  nfactors = 2,
  rotate = "oblimin",
  scores = TRUE
)

1.4.1 PCA Results for Motivation

Loadings (hide loadings below |.40|)

print(pca.mot$loadings, cutoff = .40, sort = TRUE)
## 
## Loadings:
##                  TC1    TC2   
## Task Value        0.803       
## Mastery Goal      0.739       
## Avoidance Goal   -0.860       
## Self Efficacy            0.739
## Attribution              0.682
## Performance Goal         0.549
## 
##                  TC1   TC2
## SS loadings    2.104 1.571
## Proportion Var 0.351 0.262
## Cumulative Var 0.351 0.613
  • TC1 = Task Value, Mastery Goal, Avoidance Goal (loading negative) (TMA)
  • TC2 = Self Efficacy, Attribution, Performance Goal (SAP)

Variance Explained

pca.mot$Vaccounted # Cumulative Variance Explained = 63.97%
##                             TC1       TC2
## SS loadings           2.1856698 1.6527610
## Proportion Var        0.3642783 0.2754602
## Cumulative Var        0.3642783 0.6397385
## Proportion Explained  0.5694175 0.4305825
## Cumulative Proportion 0.5694175 1.0000000

Extract the component scores and save as a dataframe

pca.mot.scores.df <- as.data.frame(pca.mot$scores)

Rename the components. TC1 = TMA, TC2 = SAP for now, but come up with names that best represent each component.

colnames(pca.mot.scores.df) <- c("TMA", "SAP")

Add participant IDs to the dataframe.

pca.mot.scores.df$Participant <- mot$Participant

Combine the original dataset with the component scores based on the Participant IDs.

dat2.mot.scores <- left_join(dat2, pca.mot.scores.df, by = "Participant")

With the new dataset (dat2.mot.scores), you can use TMA and SAP as new variables.

2 PCA for Anxiety

Extract relevant columns, and make sure there is only 1 row per 1 participant.

# Keep only the variables you need
anx <- dat2 %>%
  select(Participant, 
         Somatic, 
         Avoidance, 
         Cognitive)

# Keep only one unique row per participant
anx <- distinct(anx, Participant, .keep_all = TRUE)

# Confirm there is only one row for each participant
table(anx$Participant)
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  41  42  43  44  45  46  47  50  51  52  53  54  55  57  58  59  60  61  62  63 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 124 125 126 127 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1

Make sure there are no missing values.

colSums(is.na(anx))
## Participant     Somatic   Avoidance   Cognitive 
##           0           0           0           0

No missing values.

Descriptives.

describe(anx)
##              vars   n  mean    sd median trimmed   mad min max range  skew
## Participant*    1 156 78.50 45.18   78.5   78.50 57.82   1 156   155  0.00
## Somatic         2 156 22.11  6.67   21.0   21.97  7.41  10  37    27  0.18
## Avoidance       3 156 17.18  4.02   17.0   17.10  4.45   7  29    22  0.20
## Cognitive       4 156 33.94  5.12   35.0   34.13  5.93  19  44    25 -0.36
##              kurtosis   se
## Participant*    -1.22 3.62
## Somatic         -0.83 0.53
## Avoidance        0.14 0.32
## Cognitive       -0.61 0.41

2.1 PCA Version 1

2.1.1 Get the correlation matrix

cor.mat.anx <- cor(anx[, 2:4])

2.1.2 Get the Eigenvalues.

eig.vals.anx <- eigen(cor.mat.anx)$values

2.1.3 Plot Eigenvalues.

# Scree plot of the eigenvalues of the anxiety correlation matrix
plot(
  eig.vals.anx,
  type = "b",
  xlab = "Index",
  ylab = "Eigenvalue",
  main = "Scree Plot"
)
# Dashed horizontal reference line at eigenvalue = 1 (Kaiser criterion)
abline(h = 1, lty = 2)

The scree plot supports one component … (i.e., one circle above the cut off value)

2.2 PCA Version 2

2.2.1 Run PCA (# of components not pre-determined)

# Unrotated PCA on the three anxiety scores (columns 2-4 of anx); the number of
# components is not fixed in advance.
pca.anxiety <- prcomp(anx[, 2:4], scale. = TRUE) # scale. = TRUE standardizes each variable first
summary(pca.anxiety) # Look at the Cumulative Proportion row to decide how many components to keep
## Importance of components:
##                           PC1    PC2    PC3
## Standard deviation     1.2959 0.9150 0.6953
## Proportion of Variance 0.5598 0.2791 0.1611
## Cumulative Proportion  0.5598 0.8388 1.0000

PC1 + PC2 explain 83.88% of variance. Let’s go with two components because PC1 only explains 55.98% of variance.

2.2.2 Scree plot

fviz_eig(pca.anxiety, addlabels = TRUE)

The scree plot supports two components.

2.3 Run PCA with 2 Components

# Two-component PCA on the three anxiety scores. The oblique (oblimin)
# rotation lets the components correlate; scores = TRUE keeps the
# per-participant component scores in the result.
pca.anx <- principal(
  anx[, 2:4],
  nfactors = 2,
  rotate = "oblimin",
  scores = TRUE
)

2.3.1 PCA Results for Anxiety

Loadings (hide loadings below |.40|)

print(pca.anx$loadings, cutoff = .40, sort = TRUE)
## 
## Loadings:
##           TC1    TC2   
## Somatic    0.931       
## Avoidance  0.745       
## Cognitive         0.980
## 
##                  TC1   TC2
## SS loadings    1.421 1.053
## Proportion Var 0.474 0.351
## Cumulative Var 0.474 0.825
  • TC1 = Somatic + Avoidance (SA)
  • TC2 = Cognitive (C)

Variance Explained

pca.anx$Vaccounted # Cumulative Variance Explained = 83.88%
##                             TC1       TC2
## SS loadings           1.4424500 1.0740864
## Proportion Var        0.4808167 0.3580288
## Cumulative Var        0.4808167 0.8388455
## Proportion Explained  0.5731886 0.4268114
## Cumulative Proportion 0.5731886 1.0000000

Extract the component scores and save as a dataframe

pca.anx.scores.df <- as.data.frame(pca.anx$scores)

Rename the components. TC1 = SA, TC2 = C for now, but come up with names that best represent each component.

colnames(pca.anx.scores.df) <- c("SA", "C")

Add participant IDs to the dataframe.

pca.anx.scores.df$Participant <- anx$Participant

Combine the original dataset with the component scores based on the Participant IDs.

dat2.anx.scores <- left_join(dat2, pca.anx.scores.df, by = "Participant")

With the new dataset (dat2.anx.scores), you can use SA and C as new variables.

3 PCA for Perceived Task Difficulty

# Read the RQ2b file; show_col_types = FALSE silences the column-spec message
dat3 <- read_csv("Cleaned_RQ2b_Dataset.csv", show_col_types = FALSE)

# Store participant IDs as a factor
dat3[["Participant"]] <- as.factor(dat3[["Participant"]])

# Should print TRUE
is.factor(dat3[["Participant"]])
## [1] TRUE

3.1 Check for issues with data

colSums(is.na(dat3))
##          Participant                 Task                Genre 
##                    0                    0                    0 
##           Complexity         mentaleffort           difficulty 
##                    0                    1                    1 
##      contentplanning linguisticchallenges          MasteryGoal 
##                    1                    1                    0 
##        AvoidancegOAL      PerformanceGoal         SelfEfficacy 
##                    0                    0                    0 
##            TaskValue          Attribution         SomaticTotal 
##                    0                    0                    0 
##       AvoidanceTotal       CognitiveTotal        OSPANabsolute 
##                    0                    0                    0 
##           OSPANtotal            OSPANmath         RSPANPARTIAL 
##                    0                    0                    0 
##           RSPANTOTAL 
##                    0
rowSums(is.na(dat3))
##   [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [149] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [223] 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [260] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [297] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [334] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [371] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [408] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [445] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [482] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [519] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [556] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [593] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [630] 0 0 0 0 0 0 0 0 0 0 0

Missing values present.

which(rowSums(is.na(dat3)) > 0)
## [1] 223

Row 223 has missing values.

# Print the incomplete row(s) so the participant they belong to is visible in
# the rendered output, not only in the interactive viewer.
dat3[rowSums(is.na(dat3)) > 0, ]

# View() opens an interactive data viewer, so only call it in an interactive
# session; this lets the script also be run or rendered non-interactively.
if (interactive()) {
  View(dat3)
}

Row 223 belongs to Participant 56. Therefore, I will remove Participant 56.

# Remove every row belonging to participant 56
dat3 <- filter(dat3, Participant != "56")

# Discard the factor level that no longer has any rows
dat3$Participant <- droplevels(dat3$Participant)

# List the participant IDs that remain
unique(dat3$Participant)
##   [1] 1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16  17  18 
##  [19] 19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36 
##  [37] 37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54 
##  [55] 55  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73 
##  [73] 74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91 
##  [91] 92  93  94  95  96  97  98  99  100 101 102 103 104 105 106 107 108 109
## [109] 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
## [127] 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
## [145] 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 159 Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ... 160

Extract relevant columns, and make sure there is only 1 row per 1 participant.

# Keep only the variables you need
diff <- dat3 %>%
  select(Participant, 
         mentaleffort, 
         difficulty,
         contentplanning,
         linguisticchallenges)

# Keep only one unique row per participant
diff <- distinct(diff, Participant, .keep_all = TRUE)

# Confirm there is only one row for each participant
table(diff$Participant)
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  57  58  59  60  61 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1

Make sure there are no missing values.

# Check the PCA input (diff) itself, consistent with the motivation and
# anxiety sections, which check mot and anx rather than the full dataset.
# NOTE(review): the printed output below was produced from dat3 and needs to
# be regenerated.
colSums(is.na(diff))
##          Participant                 Task                Genre 
##                    0                    0                    0 
##           Complexity         mentaleffort           difficulty 
##                    0                    0                    0 
##      contentplanning linguisticchallenges          MasteryGoal 
##                    0                    0                    0 
##        AvoidancegOAL      PerformanceGoal         SelfEfficacy 
##                    0                    0                    0 
##            TaskValue          Attribution         SomaticTotal 
##                    0                    0                    0 
##       AvoidanceTotal       CognitiveTotal        OSPANabsolute 
##                    0                    0                    0 
##           OSPANtotal            OSPANmath         RSPANPARTIAL 
##                    0                    0                    0 
##           RSPANTOTAL 
##                    0

No missing values.

Descriptives.

# Describe the data that actually enter the PCA (diff, one row per
# participant), consistent with describe(mot) and describe(anx) above.
# NOTE(review): the printed table below was produced from the full dat3
# (n = 636) and needs to be regenerated.
describe(diff)
##                      vars   n  mean    sd median trimmed   mad min max range
## Participant*            1 636 80.00 45.93   80.0   80.00 59.30   1 159   158
## Task*                   2 636  2.50  1.12    2.5    2.50  1.48   1   4     3
## Genre*                  3 636  1.50  0.50    1.5    1.50  0.74   1   2     1
## Complexity*             4 636  1.50  0.50    1.5    1.50  0.74   1   2     1
## mentaleffort            5 636  6.07  1.63    6.0    6.13  1.48   1   9     8
## difficulty              6 636  5.93  1.73    6.0    6.00  1.48   1   9     8
## contentplanning         7 636  5.32  1.95    5.0    5.33  1.48   1   9     8
## linguisticchallenges    8 636  6.48  1.71    7.0    6.56  1.48   1   9     8
## MasteryGoal             9 636 22.70  5.80   22.0   22.68  5.93   7  39    32
## AvoidancegOAL          10 636 40.31  8.39   41.0   40.37  8.90  17  59    42
## PerformanceGoal        11 636 23.81  6.68   24.0   23.81  5.93   3  40    37
## SelfEfficacy           12 636 39.70  6.41   40.0   39.48  5.93  24  58    34
## TaskValue              13 636 27.87  7.96   27.0   27.64  7.41   7  50    43
## Attribution            14 636 19.35  3.34   20.0   19.48  2.97  10  30    20
## SomaticTotal           15 636 22.19  6.63   22.0   22.08  7.41  10  37    27
## AvoidanceTotal         16 636 17.19  3.98   17.0   17.12  4.45   7  29    22
## CognitiveTotal         17 636 33.93  5.10   35.0   34.13  5.93  19  44    25
## OSPANabsolute          18 636 38.31 15.54   39.0   38.79 13.34   0  75    75
## OSPANtotal             19 636 52.31 14.99   55.0   54.15 11.86   1  78    77
## OSPANmath*             20 636 15.42 10.46   16.0   15.45 14.83   1  29    28
## RSPANPARTIAL           21 636 15.94  6.27   16.0   15.78  5.93   1  36    35
## RSPANTOTAL             22 636 29.04  8.90   29.0   28.87  7.41   7  54    47
##                       skew kurtosis   se
## Participant*          0.00    -1.21 1.82
## Task*                 0.00    -1.37 0.04
## Genre*                0.00    -2.00 0.02
## Complexity*           0.00    -2.00 0.02
## mentaleffort         -0.37    -0.10 0.06
## difficulty           -0.32    -0.37 0.07
## contentplanning      -0.05    -0.75 0.08
## linguisticchallenges -0.44    -0.32 0.07
## MasteryGoal           0.11     0.35 0.23
## AvoidancegOAL        -0.13    -0.45 0.33
## PerformanceGoal      -0.07     0.29 0.26
## SelfEfficacy          0.34     0.35 0.25
## TaskValue             0.22     0.14 0.32
## Attribution          -0.23     0.77 0.13
## SomaticTotal          0.16    -0.81 0.26
## AvoidanceTotal        0.20     0.20 0.16
## CognitiveTotal       -0.37    -0.60 0.20
## OSPANabsolute        -0.28     0.06 0.62
## OSPANtotal           -1.13     1.20 0.59
## OSPANmath*           -0.15    -1.65 0.41
## RSPANPARTIAL          0.35     0.34 0.25
## RSPANTOTAL            0.14     0.32 0.35

3.2 PCA Version 1

3.2.1 Get the correlation matrix

cor.mat.diff <- cor(diff[, 2:5])

3.2.2 Get the Eigenvalues.

eig.vals.diff <- eigen(cor.mat.diff)$values

3.2.3 Plot Eigenvalues.

# Scree plot of the eigenvalues of the perceived-difficulty correlation matrix
plot(
  eig.vals.diff,
  type = "b",
  xlab = "Index",
  ylab = "Eigenvalue",
  main = "Scree Plot"
)
# Dashed horizontal reference line at eigenvalue = 1 (Kaiser criterion)
abline(h = 1, lty = 2)

The scree plot supports one component … (i.e., one circle above the cut off value)

3.3 PCA Version 2

3.3.1 Run PCA (# of components not pre-determined)

# Unrotated PCA on the four perceived-difficulty ratings (columns 2-5 of diff);
# the number of components is not fixed in advance.
# The previous call passed mot[, 2:5] (motivation scores) by mistake.
# NOTE(review): the printed summary and the percentages quoted below were
# produced from the motivation data and must be regenerated.
pca.difficulty <- prcomp(diff[, 2:5], scale. = TRUE) # scale. = TRUE standardizes each variable first
summary(pca.difficulty) # Look at the Cumulative Proportion
## Importance of components:
##                           PC1    PC2    PC3     PC4
## Standard deviation     1.4663 0.9761 0.7994 0.50809
## Proportion of Variance 0.5375 0.2382 0.1598 0.06454
## Cumulative Proportion  0.5375 0.7757 0.9355 1.00000

PC1 + PC2 explain 77.57% of variance. Let’s go with two components because PC1 only explains 53.75% of variance.

3.3.2 Scree plot

fviz_eig(pca.difficulty, addlabels = TRUE)

The scree plot supports two components.

3.4 Run PCA with 2 Components

# Two-component PCA on the four perceived-difficulty ratings. The oblique
# (oblimin) rotation lets the components correlate; scores = TRUE keeps the
# per-participant component scores in the result.
pca.diff <- principal(
  diff[, 2:5],
  nfactors = 2,
  rotate = "oblimin",
  scores = TRUE
)

3.4.1 PCA Results for Perceived Task Difficulty

Loadings (hide loadings below |.40|)

print(pca.diff$loadings, cutoff = .40, sort = TRUE)
## 
## Loadings:
##                      TC1    TC2   
## mentaleffort          0.843       
## difficulty            0.919       
## linguisticchallenges  0.880       
## contentplanning              0.997
## 
##                  TC1   TC2
## SS loadings    2.331 1.004
## Proportion Var 0.583 0.251
## Cumulative Var 0.583 0.834
  • TC1 = mentaleffort + difficulty + linguisticchallenges (MDL)
  • TC2 = contentplanning (CONT)

Variance Explained

pca.diff$Vaccounted # Cumulative Variance Explained = 83.71%
##                             TC1       TC2
## SS loadings           2.3375134 1.0108791
## Proportion Var        0.5843783 0.2527198
## Cumulative Var        0.5843783 0.8370981
## Proportion Explained  0.6981002 0.3018998
## Cumulative Proportion 0.6981002 1.0000000

Extract the component scores and save as a dataframe

pca.diff.scores.df <- as.data.frame(pca.diff$scores)

Rename the components. TC1 = MDL, TC2 = CONT for now, but come up with names that best represent each component.

colnames(pca.diff.scores.df) <- c("MDL", "CONT")

Add participant IDs to the dataframe.

pca.diff.scores.df$Participant <- diff$Participant

Combine the original dataset with the component scores based on the Participant IDs.

dat3.diff.scores <- left_join(dat3, pca.diff.scores.df, by = "Participant")

With the new dataset (dat3.diff.scores), you can use MDL and CONT as new variables.