# Attaching the packages to work with
library(foreign)
library(dplyr)
library(psych)
library(polycor)
library(GPArotation)
library(sjPlot)
library(ggpubr)
library(ggplot2)
library(car)
alpha <- psych::alpha
For the purpose of this particular task I wanted to try out a new set of variables for factor abalysis. My main concern with the variables that we used for the previous task was that all of them very different - they used different scales, some of them were subjective and others were more objective, some were about the respondents themselves, others - about their perception of other participants. Just as it was mentioned in the task itself, the data was simply not good for factor analysis. We observed, that one of the factors with the highest proportion of explained variance was the one variables about one’s interests loaded on. Here I wanted to focus on these variables and see if personal interests of respondents could be divided into several groups. I suppose that we will be able to observe a clear division of them into more and less active ones, more artistic ones, more sport-related ones etc.
# Loading the dataset
dating <-read.csv("Speed Dating Data.csv")
names(dating)
## [1] "iid" "id" "gender" "idg" "condtn" "wave"
## [7] "round" "position" "positin1" "order" "partner" "pid"
## [13] "match" "int_corr" "samerace" "age_o" "race_o" "pf_o_att"
## [19] "pf_o_sin" "pf_o_int" "pf_o_fun" "pf_o_amb" "pf_o_sha" "dec_o"
## [25] "attr_o" "sinc_o" "intel_o" "fun_o" "amb_o" "shar_o"
## [31] "like_o" "prob_o" "met_o" "age" "field" "field_cd"
## [37] "undergra" "mn_sat" "tuition" "race" "imprace" "imprelig"
## [43] "from" "zipcode" "income" "goal" "date" "go_out"
## [49] "career" "career_c" "sports" "tvsports" "exercise" "dining"
## [55] "museums" "art" "hiking" "gaming" "clubbing" "reading"
## [61] "tv" "theater" "movies" "concerts" "music" "shopping"
## [67] "yoga" "exphappy" "expnum" "attr1_1" "sinc1_1" "intel1_1"
## [73] "fun1_1" "amb1_1" "shar1_1" "attr4_1" "sinc4_1" "intel4_1"
## [79] "fun4_1" "amb4_1" "shar4_1" "attr2_1" "sinc2_1" "intel2_1"
## [85] "fun2_1" "amb2_1" "shar2_1" "attr3_1" "sinc3_1" "fun3_1"
## [91] "intel3_1" "amb3_1" "attr5_1" "sinc5_1" "intel5_1" "fun5_1"
## [97] "amb5_1" "dec" "attr" "sinc" "intel" "fun"
## [103] "amb" "shar" "like" "prob" "met" "match_es"
## [109] "attr1_s" "sinc1_s" "intel1_s" "fun1_s" "amb1_s" "shar1_s"
## [115] "attr3_s" "sinc3_s" "intel3_s" "fun3_s" "amb3_s" "satis_2"
## [121] "length" "numdat_2" "attr7_2" "sinc7_2" "intel7_2" "fun7_2"
## [127] "amb7_2" "shar7_2" "attr1_2" "sinc1_2" "intel1_2" "fun1_2"
## [133] "amb1_2" "shar1_2" "attr4_2" "sinc4_2" "intel4_2" "fun4_2"
## [139] "amb4_2" "shar4_2" "attr2_2" "sinc2_2" "intel2_2" "fun2_2"
## [145] "amb2_2" "shar2_2" "attr3_2" "sinc3_2" "intel3_2" "fun3_2"
## [151] "amb3_2" "attr5_2" "sinc5_2" "intel5_2" "fun5_2" "amb5_2"
## [157] "you_call" "them_cal" "date_3" "numdat_3" "num_in_3" "attr1_3"
## [163] "sinc1_3" "intel1_3" "fun1_3" "amb1_3" "shar1_3" "attr7_3"
## [169] "sinc7_3" "intel7_3" "fun7_3" "amb7_3" "shar7_3" "attr4_3"
## [175] "sinc4_3" "intel4_3" "fun4_3" "amb4_3" "shar4_3" "attr2_3"
## [181] "sinc2_3" "intel2_3" "fun2_3" "amb2_3" "shar2_3" "attr3_3"
## [187] "sinc3_3" "intel3_3" "fun3_3" "amb3_3" "attr5_3" "sinc5_3"
## [193] "intel5_3" "fun5_3" "amb5_3"
# Choosing the variables we think belong to factors
dating3<- dating[c("sports", "tvsports", "exercise",
"dining" , "museums", "art",
"hiking", "gaming", "clubbing",
"reading", "tv", "theater", "movies",
"concerts", "music", "shopping", "yoga")]
dating3 <- as.data.frame(dating3)
dim(dating3)
## [1] 8378 17
summary(dating3)
## sports tvsports exercise dining
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 4.000 1st Qu.: 2.000 1st Qu.: 5.000 1st Qu.: 7.000
## Median : 7.000 Median : 4.000 Median : 6.000 Median : 8.000
## Mean : 6.425 Mean : 4.575 Mean : 6.246 Mean : 7.784
## 3rd Qu.: 9.000 3rd Qu.: 7.000 3rd Qu.: 8.000 3rd Qu.: 9.000
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.000
## NA's :79 NA's :79 NA's :79 NA's :79
## museums art hiking gaming
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 6.000 1st Qu.: 5.000 1st Qu.: 4.000 1st Qu.: 2.000
## Median : 7.000 Median : 7.000 Median : 6.000 Median : 3.000
## Mean : 6.986 Mean : 6.715 Mean : 5.737 Mean : 3.881
## 3rd Qu.: 9.000 3rd Qu.: 8.000 3rd Qu.: 8.000 3rd Qu.: 6.000
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :14.000
## NA's :79 NA's :79 NA's :79 NA's :79
## clubbing reading tv theater
## Min. : 0.000 Min. : 1.000 Min. : 1.000 Min. : 0.000
## 1st Qu.: 4.000 1st Qu.: 7.000 1st Qu.: 3.000 1st Qu.: 5.000
## Median : 6.000 Median : 8.000 Median : 6.000 Median : 7.000
## Mean : 5.746 Mean : 7.679 Mean : 5.304 Mean : 6.776
## 3rd Qu.: 8.000 3rd Qu.: 9.000 3rd Qu.: 7.000 3rd Qu.: 9.000
## Max. :10.000 Max. :13.000 Max. :10.000 Max. :10.000
## NA's :79 NA's :79 NA's :79 NA's :79
## movies concerts music shopping
## Min. : 0.00 Min. : 0.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 7.00 1st Qu.: 5.000 1st Qu.: 7.000 1st Qu.: 4.000
## Median : 8.00 Median : 7.000 Median : 8.000 Median : 6.000
## Mean : 7.92 Mean : 6.825 Mean : 7.851 Mean : 5.631
## 3rd Qu.: 9.00 3rd Qu.: 8.000 3rd Qu.: 9.000 3rd Qu.: 8.000
## Max. :10.00 Max. :10.000 Max. :10.000 Max. :10.000
## NA's :79 NA's :79 NA's :79 NA's :79
## yoga
## Min. : 0.000
## 1st Qu.: 2.000
## Median : 4.000
## Mean : 4.339
## 3rd Qu.: 7.000
## Max. :10.000
## NA's :79
str(dating3)
## 'data.frame': 8378 obs. of 17 variables:
## $ sports : int 9 9 9 9 9 9 9 9 9 9 ...
## $ tvsports: int 2 2 2 2 2 2 2 2 2 2 ...
## $ exercise: int 8 8 8 8 8 8 8 8 8 8 ...
## $ dining : int 9 9 9 9 9 9 9 9 9 9 ...
## $ museums : int 1 1 1 1 1 1 1 1 1 1 ...
## $ art : int 1 1 1 1 1 1 1 1 1 1 ...
## $ hiking : int 5 5 5 5 5 5 5 5 5 5 ...
## $ gaming : int 1 1 1 1 1 1 1 1 1 1 ...
## $ clubbing: int 5 5 5 5 5 5 5 5 5 5 ...
## $ reading : int 6 6 6 6 6 6 6 6 6 6 ...
## $ tv : int 9 9 9 9 9 9 9 9 9 9 ...
## $ theater : int 1 1 1 1 1 1 1 1 1 1 ...
## $ movies : int 10 10 10 10 10 10 10 10 10 10 ...
## $ concerts: int 10 10 10 10 10 10 10 10 10 10 ...
## $ music : int 9 9 9 9 9 9 9 9 9 9 ...
## $ shopping: int 8 8 8 8 8 8 8 8 8 8 ...
## $ yoga : int 1 1 1 1 1 1 1 1 1 1 ...
#Deleting NAs
dating3 <- na.omit(dating3)
First things first, we need to take a look at the screeplot to determine what is the number of factors recommended for this particular set of variables.
fa.parallel(dating3, fa="both", n.iter=100)
## Parallel analysis suggests that the number of factors = 7 and the number of components = 5
We observe that for FA line, there are a total of 7 points above the line (observed eigenvalues are higher than coresponding randomly generated ones). This number also appears in the output.
# No rotation
fa(dating3, nfactors = 7, rotate="none", fm="ml")
## Factor Analysis using method = ml
## Call: fa(r = dating3, nfactors = 7, rotate = "none", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
## ML1 ML2 ML4 ML3 ML5 ML6 ML7 h2 u2 com
## sports -0.13 -0.09 0.79 0.19 -0.19 0.08 0.07 0.74 0.263 1.4
## tvsports -0.08 0.10 0.60 0.19 0.19 0.25 -0.18 0.54 0.463 2.1
## exercise -0.01 0.08 0.43 -0.01 -0.17 0.13 0.21 0.28 0.718 2.1
## dining 0.40 0.33 0.02 0.00 -0.02 -0.10 0.15 0.30 0.701 2.4
## museums 1.00 -0.02 0.01 -0.01 0.00 0.00 0.00 1.00 0.005 1.0
## art 0.86 0.01 -0.04 0.07 -0.02 -0.02 0.08 0.76 0.241 1.0
## hiking 0.18 -0.16 0.17 0.21 0.01 -0.24 0.32 0.29 0.709 4.6
## gaming -0.07 0.17 0.30 0.21 0.60 -0.36 0.03 0.65 0.351 2.8
## clubbing 0.12 0.19 0.11 0.09 0.08 -0.15 0.08 0.11 0.894 4.8
## reading 0.33 -0.11 -0.13 -0.03 0.03 0.04 0.03 0.14 0.860 1.7
## tv 0.04 0.52 0.03 -0.02 0.40 0.36 -0.07 0.57 0.432 2.8
## theater 0.56 0.23 -0.25 0.17 0.13 0.24 0.28 0.61 0.388 3.2
## movies 0.33 0.27 -0.21 0.28 0.19 0.34 0.18 0.49 0.513 5.8
## concerts 0.40 0.23 -0.12 0.78 -0.08 -0.04 -0.05 0.85 0.154 1.8
## music 0.29 0.23 -0.08 0.59 -0.12 0.01 -0.13 0.53 0.474 2.1
## shopping 0.25 0.92 0.03 -0.13 -0.06 -0.04 -0.01 0.93 0.068 1.2
## yoga 0.27 0.15 -0.03 0.15 -0.11 -0.08 0.29 0.22 0.778 3.6
##
## ML1 ML2 ML4 ML3 ML5 ML6 ML7
## SS loadings 2.88 1.60 1.45 1.29 0.72 0.61 0.44
## Proportion Var 0.17 0.09 0.09 0.08 0.04 0.04 0.03
## Cumulative Var 0.17 0.26 0.35 0.42 0.47 0.50 0.53
## Proportion Explained 0.32 0.18 0.16 0.14 0.08 0.07 0.05
## Cumulative Proportion 0.32 0.50 0.66 0.80 0.88 0.95 1.00
##
## Mean item complexity = 2.6
## Test of the hypothesis that 7 factors are sufficient.
##
## The degrees of freedom for the null model are 136 and the objective function was 5.23 with Chi Square of 43392.29
## The degrees of freedom for the model are 38 and the objective function was 0.18
##
## The root mean square of the residuals (RMSR) is 0.02
## The df corrected root mean square of the residuals is 0.04
##
## The harmonic number of observations is 8299 with the empirical chi square 1207.59 with prob < 1.1e-228
## The total number of observations was 8299 with Likelihood Chi Square = 1483.92 with prob < 4.4e-287
##
## Tucker Lewis Index of factoring reliability = 0.88
## RMSEA index = 0.068 and the 90 % confidence intervals are 0.065 0.071
## BIC = 1141.01
## Fit based upon off diagonal values = 0.99
## Measures of factor score adequacy
## ML1 ML2 ML4 ML3 ML5 ML6
## Correlation of (regression) scores with factors 1.00 0.97 0.90 0.92 0.81 0.76
## Multiple R square of scores with factors 1.00 0.93 0.81 0.85 0.66 0.58
## Minimum correlation of possible factor scores 0.99 0.87 0.61 0.70 0.32 0.16
## ML7
## Correlation of (regression) scores with factors 0.67
## Multiple R square of scores with factors 0.45
## Minimum correlation of possible factor scores -0.10
fa.diagram(fa(dating3, nfactors=7, rotate="none", fm="ml"))
However, when we use 7 factors, a lot of them have only one or two variables loaded on them (Ml2, ML3, ML5, ML6, ML7) with only two of them actually having three and more variables loaded on them (ML1, ML4). The difference in “Proportion Explained” values is also striking since the very first factor has a value of 0.32, the last one - a mere 0.05. ML1 is also the only factor that has “Proportion Var” higher than 1. In general, the cumulative variance explained is equal to 0.53 which is still quite bad, but better than what we had before when we were working with variables from totally different parts of the questionnarie. The root mean square of the residuals (RMSR) is 0.02 which is actually quite good since we want it to be at least less than 0.05. RMSEA index is equal to 0.068 which is considered acceptable but not excellent. Tucker Lewis index of factoring reliability is equal to 0.88 which is still too small to be considered acceptable (>.90 acceptable, >.95 excellent).
I would want to first lower the number of factors up to 5 and then also add varimax rotation.
# No rotation
fa(dating3, nfactors=5, rotate="none", fm="ml")
## Factor Analysis using method = ml
## Call: fa(r = dating3, nfactors = 5, rotate = "none", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
## ML1 ML2 ML4 ML3 ML5 h2 u2 com
## sports -0.15 -0.07 0.84 0.20 0.00 0.77 0.231 1.2
## tvsports -0.04 0.28 0.56 0.14 -0.09 0.42 0.581 1.7
## exercise -0.03 0.03 0.45 0.00 0.17 0.23 0.769 1.3
## dining 0.42 0.06 0.02 -0.09 0.46 0.40 0.599 2.1
## museums 0.89 -0.18 0.07 -0.32 -0.04 0.94 0.061 1.4
## art 0.84 -0.17 0.04 -0.23 -0.01 0.79 0.211 1.2
## hiking 0.19 -0.19 0.19 0.10 -0.06 0.12 0.878 3.7
## gaming 0.00 0.23 0.17 0.15 0.05 0.10 0.896 2.7
## clubbing 0.16 0.02 0.09 0.05 0.31 0.13 0.869 1.8
## reading 0.28 -0.08 -0.09 -0.16 -0.13 0.14 0.861 2.6
## tv 0.17 0.94 0.03 -0.12 -0.05 0.93 0.073 1.1
## theater 0.64 0.13 -0.16 -0.05 0.07 0.46 0.535 1.2
## movies 0.47 0.29 -0.12 0.09 0.03 0.33 0.669 1.9
## concerts 0.68 0.04 -0.05 0.67 -0.03 0.92 0.080 2.0
## music 0.49 0.06 -0.01 0.47 0.06 0.47 0.528 2.1
## shopping 0.34 0.44 -0.01 -0.05 0.50 0.57 0.432 2.8
## yoga 0.33 -0.02 0.01 0.06 0.15 0.14 0.861 1.5
##
## ML1 ML2 ML4 ML3 ML5
## SS loadings 3.45 1.42 1.35 0.99 0.65
## Proportion Var 0.20 0.08 0.08 0.06 0.04
## Cumulative Var 0.20 0.29 0.37 0.42 0.46
## Proportion Explained 0.44 0.18 0.17 0.13 0.08
## Cumulative Proportion 0.44 0.62 0.79 0.92 1.00
##
## Mean item complexity = 1.9
## Test of the hypothesis that 5 factors are sufficient.
##
## The degrees of freedom for the null model are 136 and the objective function was 5.23 with Chi Square of 43392.29
## The degrees of freedom for the model are 61 and the objective function was 0.41
##
## The root mean square of the residuals (RMSR) is 0.04
## The df corrected root mean square of the residuals is 0.06
##
## The harmonic number of observations is 8299 with the empirical chi square 3369.42 with prob < 0
## The total number of observations was 8299 with Likelihood Chi Square = 3426.02 with prob < 0
##
## Tucker Lewis Index of factoring reliability = 0.826
## RMSEA index = 0.082 and the 90 % confidence intervals are 0.079 0.084
## BIC = 2875.56
## Fit based upon off diagonal values = 0.97
## Measures of factor score adequacy
## ML1 ML2 ML4 ML3 ML5
## Correlation of (regression) scores with factors 0.98 0.97 0.90 0.95 0.74
## Multiple R square of scores with factors 0.96 0.93 0.81 0.90 0.55
## Minimum correlation of possible factor scores 0.92 0.86 0.61 0.79 0.11
fa.diagram(fa(dating3, nfactors=5, rotate= "none", fm="ml"))
# Varimax rotation
fa(dating3, nfactors=5, rotate="varimax", fm="ml")
## Factor Analysis using method = ml
## Call: fa(r = dating3, nfactors = 5, rotate = "varimax", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
## ML1 ML3 ML4 ML2 ML5 h2 u2 com
## sports -0.12 0.00 0.85 -0.17 -0.03 0.77 0.231 1.1
## tvsports -0.08 0.06 0.60 0.22 -0.06 0.42 0.581 1.4
## exercise -0.03 -0.08 0.44 -0.02 0.17 0.23 0.769 1.4
## dining 0.24 0.10 -0.01 0.06 0.58 0.40 0.599 1.4
## museums 0.92 0.17 -0.01 -0.02 0.26 0.94 0.061 1.2
## art 0.82 0.22 -0.03 -0.03 0.26 0.79 0.211 1.4
## hiking 0.17 0.16 0.18 -0.18 -0.01 0.12 0.878 4.0
## gaming -0.11 0.11 0.21 0.18 0.05 0.10 0.896 3.3
## clubbing 0.02 0.09 0.08 -0.01 0.34 0.13 0.869 1.3
## reading 0.35 0.03 -0.12 0.00 -0.03 0.14 0.861 1.3
## tv 0.03 0.03 0.08 0.96 0.08 0.93 0.073 1.0
## theater 0.48 0.30 -0.18 0.21 0.27 0.46 0.535 3.2
## movies 0.25 0.34 -0.10 0.33 0.18 0.33 0.669 3.6
## concerts 0.21 0.93 0.02 0.03 0.14 0.92 0.080 1.1
## music 0.13 0.65 0.04 0.05 0.18 0.47 0.528 1.3
## shopping 0.06 0.10 -0.01 0.42 0.62 0.57 0.432 1.9
## yoga 0.19 0.20 0.01 -0.01 0.24 0.14 0.861 2.9
##
## ML1 ML3 ML4 ML2 ML5
## SS loadings 2.15 1.67 1.42 1.40 1.22
## Proportion Var 0.13 0.10 0.08 0.08 0.07
## Cumulative Var 0.13 0.22 0.31 0.39 0.46
## Proportion Explained 0.27 0.21 0.18 0.18 0.16
## Cumulative Proportion 0.27 0.49 0.67 0.84 1.00
##
## Mean item complexity = 1.9
## Test of the hypothesis that 5 factors are sufficient.
##
## The degrees of freedom for the null model are 136 and the objective function was 5.23 with Chi Square of 43392.29
## The degrees of freedom for the model are 61 and the objective function was 0.41
##
## The root mean square of the residuals (RMSR) is 0.04
## The df corrected root mean square of the residuals is 0.06
##
## The harmonic number of observations is 8299 with the empirical chi square 3369.42 with prob < 0
## The total number of observations was 8299 with Likelihood Chi Square = 3426.02 with prob < 0
##
## Tucker Lewis Index of factoring reliability = 0.826
## RMSEA index = 0.082 and the 90 % confidence intervals are 0.079 0.084
## BIC = 2875.56
## Fit based upon off diagonal values = 0.97
## Measures of factor score adequacy
## ML1 ML3 ML4 ML2 ML5
## Correlation of (regression) scores with factors 0.95 0.95 0.90 0.96 0.77
## Multiple R square of scores with factors 0.91 0.91 0.81 0.93 0.60
## Minimum correlation of possible factor scores 0.82 0.82 0.62 0.85 0.19
fa.diagram(fa(dating3, nfactors=5, rotate="varimax", fm="ml"))
The rotation helps us to get four defined factor groupings instead of three. After the rotation is performed we observe that the range of values in “Proportion Explained” becomes smaller: the highest value in case of no rotation was 0.44, lowest - 0.08; now the difference between the highest and lowest value is only 0.11 (0.27 and 0.16). Because of the rotation the highest loadings were maximized (ex. “tvsports” was loaded on ML4 with 0.56 and now it’s 0.6) and we can see those slight changes in the output tables.
In comparison to the very first model, values of “Proportion Var” for both ML1 and ML3 and now equal or higher than 0.1, with remaining values (0.8, 0.8, 0.7) being quite close to 0.1 treshold - which is good. Cumulative explained variance is equal to 0.46 which is lower than what we had in case of the first model due to the smaller number of factors. The root mean square of the residuals (RMSR) is 0.04 is still less than 0.05 which is good, RMSEA index is now equal to 0.082 which is a bit worse than what we previously had (it doesn’t meet the requirement now), Tucker Lewis index of factoring reliability is equal to 0.826 and still not high enough to be acceptable.
We have to keep in mind that it’s quite hard to carry out actually good factor analysis with the data we have.
By looking at the graph, we can see that “museums”, “art”, “theater” and “reading” were loaded on ML1 factor. As it was hypothesized in the first part of this practical task, those are the variables that indicate one’s interest in art and less active, more sophisticated ways of spending time. “Concerts”, “music”, “movies” were loaded on the ML3 factor and can be described as variables connected to one’s interest in more mainstream, popular culture (not fine art). These can still be considered as one’s interest in less active ways of spending time, but less sophisticated, and more common ones in comparison to those loaded on Ml1. Variables that were loaded on ML4 are all connected with one’s interest in sport activities: “sports”, “tvsports”, “exercise”, there are the ones we would label as more active than those we observed for the first two factors. Finally, such variables as “shopping”, “dining”, “clubbing” were loaded on the factor ML5. We would still consider these to be active ways of spending time, but they indicate one’s interest in more fun activities not connected with any kind of art, self-development or any other sophisticated and deep reasons for doing these.
We can now assess scale reliablity of our factors. Since all of the variables in this subset have the same scale (1-10) we don’t have to worry about the ‘check.keys’ attribute.
ML1 <- as.data.frame(dating3[c("museums", "art", "theater", "reading")], check.keys = TRUE)
alpha(ML1)
##
## Reliability analysis
## Call: alpha(x = ML1)
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.77 0.77 0.78 0.45 3.3 0.0041 7 1.6 0.43
##
## lower alpha upper 95% confidence boundaries
## 0.76 0.77 0.78
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## museums 0.60 0.59 0.52 0.33 1.4 0.0075 0.031 0.24
## art 0.63 0.63 0.57 0.36 1.7 0.0070 0.030 0.33
## theater 0.74 0.73 0.76 0.47 2.7 0.0053 0.112 0.33
## reading 0.84 0.85 0.83 0.65 5.5 0.0032 0.034 0.55
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## museums 8299 0.89 0.89 0.92 0.79 7.0 2.1
## art 8299 0.86 0.85 0.87 0.71 6.7 2.3
## theater 8299 0.76 0.75 0.59 0.54 6.8 2.2
## reading 8299 0.56 0.58 0.33 0.29 7.7 2.0
We observe that raw alpha and standartized alpha is equal to 0.77 and, as we know, we need it to be higher that 0.7 for the scale to be considered reliable. If we take a look at the output of the “Reliability if an item is dropped” table, we observe that in case of all variables the raw alpha drops when they are excluded, with an exception of the “reading variable”. If “readng” is excluded the raw alpha increases up to 0.84 which is better. We might actually consider dropping that variable. However, dropping this variables does not change the alpha output in terms of it exceeding or not exceeding the treshold (in which case dropping it would be a more serious question). It actually does make sense: we would probably pick reading as an “out-of-place” item even if we just take a look at other variables that were loaded on the ML1 factor - “museums”, “art”, “theater”. Though reading is still a non-active hobby that indicates one’s interest in art, it wouldn’t be considered to be that fine art represented by the remaining variables.
ML3 <- as.data.frame(dating3[c("concerts", "music", "movies")], check.keys = TRUE)
alpha(ML3)
##
## Reliability analysis
## Call: alpha(x = ML3)
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.72 0.72 0.66 0.46 2.5 0.0051 7.5 1.5 0.39
##
## lower alpha upper 95% confidence boundaries
## 0.71 0.72 0.73
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## concerts 0.49 0.49 0.33 0.33 0.98 0.0111 NA 0.33
## music 0.55 0.56 0.39 0.39 1.29 0.0095 NA 0.39
## movies 0.78 0.79 0.66 0.66 3.81 0.0046 NA 0.66
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## concerts 8299 0.88 0.85 0.77 0.65 6.8 2.2
## music 8299 0.83 0.83 0.73 0.61 7.9 1.8
## movies 8299 0.69 0.72 0.45 0.40 7.9 1.7
The raw and standartized alpha is case of factor ML3 is still higher than 0.7 (0.72 > 0.7) which means the scale is reliable. Similarly, we only have one variable that can be excluded and lead to a higher alpha value - “movies” (0.72 -> 0.78). While both “concerts” and “music” are connected to the music, “movies”, while still being a less active and more pop culture activity, feels a bit out of place.
ML4 <- as.data.frame(dating3[c("sports", "tvsports", "exercise")], check.keys = TRUE)
alpha(ML4)
##
## Reliability analysis
## Call: alpha(x = ML4)
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.64 0.64 0.56 0.37 1.7 0.0069 5.7 2 0.4
##
## lower alpha upper 95% confidence boundaries
## 0.62 0.64 0.65
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## sports 0.37 0.37 0.23 0.23 0.59 0.0137 NA 0.23
## tvsports 0.57 0.57 0.40 0.40 1.31 0.0095 NA 0.40
## exercise 0.65 0.65 0.48 0.48 1.85 0.0077 NA 0.48
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## sports 8299 0.82 0.82 0.70 0.56 6.4 2.6
## tvsports 8299 0.77 0.75 0.55 0.43 4.6 2.8
## exercise 8299 0.69 0.71 0.46 0.36 6.2 2.4
##
## Non missing response frequency for each item
## 1 2 3 4 5 6 7 8 9 10 miss
## sports 0.04 0.06 0.08 0.07 0.10 0.09 0.14 0.16 0.13 0.13 0
## tvsports 0.18 0.14 0.11 0.09 0.10 0.08 0.11 0.09 0.06 0.05 0
## exercise 0.04 0.05 0.07 0.07 0.13 0.14 0.14 0.16 0.11 0.08 0
ML5 <- as.data.frame(dating3[c("shopping", "dining", "clubbing")], check.keys = TRUE)
alpha(ML5)
##
## Reliability analysis
## Call: alpha(x = ML5)
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.51 0.54 0.45 0.28 1.2 0.0092 6.4 1.6 0.23
##
## lower alpha upper 95% confidence boundaries
## 0.49 0.51 0.53
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## shopping 0.36 0.38 0.23 0.23 0.60 0.0132 NA 0.23
## dining 0.33 0.33 0.20 0.20 0.49 0.0147 NA 0.20
## clubbing 0.55 0.58 0.41 0.41 1.37 0.0091 NA 0.41
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## shopping 8299 0.77 0.74 0.54 0.36 5.6 2.6
## dining 8299 0.69 0.76 0.57 0.41 7.8 1.8
## clubbing 8299 0.69 0.66 0.34 0.25 5.7 2.5
In case of the ML4 factor the alpha does not exceed the 0.7 thershold (0.7 > 0.64) and dropping any of the items doesn’t help: alpha only increases up to 0.65 if “exercise” is dropped and decreases in case of all other variables. The alpha is even lower in case of the ML5 factor: raw - 0.51, standartized - 0.54. Dropping “clubbing” would similarly lead to an insignificant increase in alpha.
Now we proceed to save scores for the future analysis since we want to to concenrate on observations and not variables (loadings).
# Specifying scores = T
fa1<-fa(dating3, nfactors=5, rotate="varimax", fm="ml", scores=T)
load <- fa1$loadings[,1:2]
plot(load) # set up plot
fascores<-as.data.frame(fa1$scores)
# Adding these as new columns
datingfa<-cbind(dating3, fascores)
names(datingfa)
## [1] "sports" "tvsports" "exercise" "dining" "museums" "art"
## [7] "hiking" "gaming" "clubbing" "reading" "tv" "theater"
## [13] "movies" "concerts" "music" "shopping" "yoga" "ML1"
## [19] "ML3" "ML4" "ML2" "ML5"
We can now use all of those factor scores as variables in the regression analysis. Let’s suppose we want to predict one’s interest in gaming (variable “gaming” corresponding to “How interested are you in gaming”, scale 1 - 10). As one of predictor variables, I would choose one’s interest in watching TV as a more of a laid-back activity that can lead to binge watching, is often assosiated with introvert people who don’t like going out and spending time doing activities (a highly stereotypical perspective). Secondly, since now we can use our factors in linear regression as predictors, I would take ML1 (interest in fine art) as the other predictor. We mighr hypothesize that people who like fine art are quite unlikely to rate gaming as a highly interesting activity for them.
We first build scatterplots to see what the general pattern is.
g1 <- ggplot(datingfa, aes(x= tv, y = gaming))+
geom_point(position = "jitter")+
labs(title = "Relationship between gaming and one's interest in TV", y="Importance of gaming", x="Importance of TV")+
theme_bw()+
geom_smooth(method = "lm")
g2 <- ggplot(datingfa, aes(x= ML1, y = gaming))+
geom_point(position = "jitter")+
labs(title = "Relationship between gaming and one's interest in fine art", y="Importance of gaming", x="Level of free choice")+
theme_bw()+
geom_smooth(method = "lm")
ggarrange(g1, g2 + rremove("x.text"),
labels = c("A", "B"),
ncol = 2, nrow = 1)
cor.test(datingfa$gaming, datingfa$tv)
##
## Pearson's product-moment correlation
##
## data: datingfa$gaming and datingfa$tv
## t = 18.506, df = 8297, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1783520 0.2196774
## sample estimates:
## cor
## 0.1991032
cor.test(datingfa$gaming, datingfa$ML1)
##
## Pearson's product-moment correlation
##
## data: datingfa$gaming and datingfa$ML1
## t = -10.932, df = 8297, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1403146 -0.0978948
## sample estimates:
## cor
## -0.1191591
We observe a negative weak correlation in case of the “Fine art interest” and a weak positive correlation in case of the “Interest in watching TV” variables. We proceed to build a model with these predictors.
modelf <- lm(datingfa$gaming ~ datingfa$tv + datingfa$ML1)
summary(modelf)
##
## Call:
## lm(formula = datingfa$gaming ~ datingfa$tv + datingfa$ML1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8000 -2.0795 -0.4287 1.7871 9.9588
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.76679 0.06499 42.57 <2e-16 ***
## datingfa$tv 0.21010 0.01106 18.99 <2e-16 ***
## datingfa$ML1 -0.34385 0.02934 -11.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.547 on 8296 degrees of freedom
## Multiple R-squared: 0.05528, Adjusted R-squared: 0.05505
## F-statistic: 242.7 on 2 and 8296 DF, p-value: < 2.2e-16
The model explains about 5% of the variance in the outcome variable but is still better than the model based solely on the mean values (p-value: < 2.2e-16). Both predictors are statistically significant and if we were to write down the regression equation:
\[Predicted gaming = 2.76679 + 0.21010 * TV - 0.34385 * FineArt\]
With a one point increase in one’s interest in watching TV, one’s interest in gaming increases by 0.21010. On the contrary, with a one-point increase in value of one’s interest in Fine arts, one’s interest in gamin decreases by 0.34385.
par(mfrow=c(2,2))
plot(modelf)
No points are observed on the Cook’s distance line (it is not present on the graph) which indicates that we don’t have highly influential observations. In terms of the Q-Q plot, we can see how some observations at the upper part of it are a bit further away from the line, but in general it looks fine. We are looking for a straight horizontal line on the first graph and it seems fine except for the those values at the right end that drive the line a bit down. The line we observe on the Scale-Location graph is not ideal, but actually quite fine.
The end