Preliminary preparations

# Attaching the packages to work with
library(foreign)
library(dplyr)
library(psych)
library(polycor)
library(GPArotation)
library(sjPlot)
library(ggpubr)
library(ggplot2)
library(car)
alpha <- psych::alpha

For the purpose of this particular task I wanted to try out a new set of variables for factor abalysis. My main concern with the variables that we used for the previous task was that all of them very different - they used different scales, some of them were subjective and others were more objective, some were about the respondents themselves, others - about their perception of other participants. Just as it was mentioned in the task itself, the data was simply not good for factor analysis. We observed, that one of the factors with the highest proportion of explained variance was the one variables about one’s interests loaded on. Here I wanted to focus on these variables and see if personal interests of respondents could be divided into several groups. I suppose that we will be able to observe a clear division of them into more and less active ones, more artistic ones, more sport-related ones etc.

# Loading the dataset
dating <-read.csv("Speed Dating Data.csv")
names(dating)
##   [1] "iid"      "id"       "gender"   "idg"      "condtn"   "wave"    
##   [7] "round"    "position" "positin1" "order"    "partner"  "pid"     
##  [13] "match"    "int_corr" "samerace" "age_o"    "race_o"   "pf_o_att"
##  [19] "pf_o_sin" "pf_o_int" "pf_o_fun" "pf_o_amb" "pf_o_sha" "dec_o"   
##  [25] "attr_o"   "sinc_o"   "intel_o"  "fun_o"    "amb_o"    "shar_o"  
##  [31] "like_o"   "prob_o"   "met_o"    "age"      "field"    "field_cd"
##  [37] "undergra" "mn_sat"   "tuition"  "race"     "imprace"  "imprelig"
##  [43] "from"     "zipcode"  "income"   "goal"     "date"     "go_out"  
##  [49] "career"   "career_c" "sports"   "tvsports" "exercise" "dining"  
##  [55] "museums"  "art"      "hiking"   "gaming"   "clubbing" "reading" 
##  [61] "tv"       "theater"  "movies"   "concerts" "music"    "shopping"
##  [67] "yoga"     "exphappy" "expnum"   "attr1_1"  "sinc1_1"  "intel1_1"
##  [73] "fun1_1"   "amb1_1"   "shar1_1"  "attr4_1"  "sinc4_1"  "intel4_1"
##  [79] "fun4_1"   "amb4_1"   "shar4_1"  "attr2_1"  "sinc2_1"  "intel2_1"
##  [85] "fun2_1"   "amb2_1"   "shar2_1"  "attr3_1"  "sinc3_1"  "fun3_1"  
##  [91] "intel3_1" "amb3_1"   "attr5_1"  "sinc5_1"  "intel5_1" "fun5_1"  
##  [97] "amb5_1"   "dec"      "attr"     "sinc"     "intel"    "fun"     
## [103] "amb"      "shar"     "like"     "prob"     "met"      "match_es"
## [109] "attr1_s"  "sinc1_s"  "intel1_s" "fun1_s"   "amb1_s"   "shar1_s" 
## [115] "attr3_s"  "sinc3_s"  "intel3_s" "fun3_s"   "amb3_s"   "satis_2" 
## [121] "length"   "numdat_2" "attr7_2"  "sinc7_2"  "intel7_2" "fun7_2"  
## [127] "amb7_2"   "shar7_2"  "attr1_2"  "sinc1_2"  "intel1_2" "fun1_2"  
## [133] "amb1_2"   "shar1_2"  "attr4_2"  "sinc4_2"  "intel4_2" "fun4_2"  
## [139] "amb4_2"   "shar4_2"  "attr2_2"  "sinc2_2"  "intel2_2" "fun2_2"  
## [145] "amb2_2"   "shar2_2"  "attr3_2"  "sinc3_2"  "intel3_2" "fun3_2"  
## [151] "amb3_2"   "attr5_2"  "sinc5_2"  "intel5_2" "fun5_2"   "amb5_2"  
## [157] "you_call" "them_cal" "date_3"   "numdat_3" "num_in_3" "attr1_3" 
## [163] "sinc1_3"  "intel1_3" "fun1_3"   "amb1_3"   "shar1_3"  "attr7_3" 
## [169] "sinc7_3"  "intel7_3" "fun7_3"   "amb7_3"   "shar7_3"  "attr4_3" 
## [175] "sinc4_3"  "intel4_3" "fun4_3"   "amb4_3"   "shar4_3"  "attr2_3" 
## [181] "sinc2_3"  "intel2_3" "fun2_3"   "amb2_3"   "shar2_3"  "attr3_3" 
## [187] "sinc3_3"  "intel3_3" "fun3_3"   "amb3_3"   "attr5_3"  "sinc5_3" 
## [193] "intel5_3" "fun5_3"   "amb5_3"
# Choosing the variables we think belong to factors
dating3<- dating[c("sports", "tvsports", "exercise",  
                   "dining" , "museums",  "art",  
                   "hiking", "gaming",  "clubbing",  
                   "reading", "tv",  "theater", "movies", 
                   "concerts",  "music",   "shopping", "yoga")]
dating3 <- as.data.frame(dating3)
dim(dating3)                
## [1] 8378   17
summary(dating3)
##      sports          tvsports         exercise          dining      
##  Min.   : 1.000   Min.   : 1.000   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.: 4.000   1st Qu.: 2.000   1st Qu.: 5.000   1st Qu.: 7.000  
##  Median : 7.000   Median : 4.000   Median : 6.000   Median : 8.000  
##  Mean   : 6.425   Mean   : 4.575   Mean   : 6.246   Mean   : 7.784  
##  3rd Qu.: 9.000   3rd Qu.: 7.000   3rd Qu.: 8.000   3rd Qu.: 9.000  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000  
##  NA's   :79       NA's   :79       NA's   :79       NA's   :79      
##     museums            art             hiking           gaming      
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 6.000   1st Qu.: 5.000   1st Qu.: 4.000   1st Qu.: 2.000  
##  Median : 7.000   Median : 7.000   Median : 6.000   Median : 3.000  
##  Mean   : 6.986   Mean   : 6.715   Mean   : 5.737   Mean   : 3.881  
##  3rd Qu.: 9.000   3rd Qu.: 8.000   3rd Qu.: 8.000   3rd Qu.: 6.000  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :14.000  
##  NA's   :79       NA's   :79       NA's   :79       NA's   :79      
##     clubbing         reading             tv            theater      
##  Min.   : 0.000   Min.   : 1.000   Min.   : 1.000   Min.   : 0.000  
##  1st Qu.: 4.000   1st Qu.: 7.000   1st Qu.: 3.000   1st Qu.: 5.000  
##  Median : 6.000   Median : 8.000   Median : 6.000   Median : 7.000  
##  Mean   : 5.746   Mean   : 7.679   Mean   : 5.304   Mean   : 6.776  
##  3rd Qu.: 8.000   3rd Qu.: 9.000   3rd Qu.: 7.000   3rd Qu.: 9.000  
##  Max.   :10.000   Max.   :13.000   Max.   :10.000   Max.   :10.000  
##  NA's   :79       NA's   :79       NA's   :79       NA's   :79      
##      movies         concerts          music           shopping     
##  Min.   : 0.00   Min.   : 0.000   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.: 7.00   1st Qu.: 5.000   1st Qu.: 7.000   1st Qu.: 4.000  
##  Median : 8.00   Median : 7.000   Median : 8.000   Median : 6.000  
##  Mean   : 7.92   Mean   : 6.825   Mean   : 7.851   Mean   : 5.631  
##  3rd Qu.: 9.00   3rd Qu.: 8.000   3rd Qu.: 9.000   3rd Qu.: 8.000  
##  Max.   :10.00   Max.   :10.000   Max.   :10.000   Max.   :10.000  
##  NA's   :79      NA's   :79       NA's   :79       NA's   :79      
##       yoga       
##  Min.   : 0.000  
##  1st Qu.: 2.000  
##  Median : 4.000  
##  Mean   : 4.339  
##  3rd Qu.: 7.000  
##  Max.   :10.000  
##  NA's   :79
str(dating3)
## 'data.frame':    8378 obs. of  17 variables:
##  $ sports  : int  9 9 9 9 9 9 9 9 9 9 ...
##  $ tvsports: int  2 2 2 2 2 2 2 2 2 2 ...
##  $ exercise: int  8 8 8 8 8 8 8 8 8 8 ...
##  $ dining  : int  9 9 9 9 9 9 9 9 9 9 ...
##  $ museums : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ art     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ hiking  : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ gaming  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ clubbing: int  5 5 5 5 5 5 5 5 5 5 ...
##  $ reading : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ tv      : int  9 9 9 9 9 9 9 9 9 9 ...
##  $ theater : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ movies  : int  10 10 10 10 10 10 10 10 10 10 ...
##  $ concerts: int  10 10 10 10 10 10 10 10 10 10 ...
##  $ music   : int  9 9 9 9 9 9 9 9 9 9 ...
##  $ shopping: int  8 8 8 8 8 8 8 8 8 8 ...
##  $ yoga    : int  1 1 1 1 1 1 1 1 1 1 ...
#Deleting NAs
dating3 <- na.omit(dating3)

First model: 7 factors

First things first, we need to take a look at the screeplot to determine what is the number of factors recommended for this particular set of variables.

fa.parallel(dating3, fa="both", n.iter=100) 

## Parallel analysis suggests that the number of factors =  7  and the number of components =  5

We observe that for FA line, there are a total of 7 points above the line (observed eigenvalues are higher than coresponding randomly generated ones). This number also appears in the output.

# No rotation
fa(dating3, nfactors = 7, rotate="none", fm="ml") 
## Factor Analysis using method =  ml
## Call: fa(r = dating3, nfactors = 7, rotate = "none", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
##            ML1   ML2   ML4   ML3   ML5   ML6   ML7   h2    u2 com
## sports   -0.13 -0.09  0.79  0.19 -0.19  0.08  0.07 0.74 0.263 1.4
## tvsports -0.08  0.10  0.60  0.19  0.19  0.25 -0.18 0.54 0.463 2.1
## exercise -0.01  0.08  0.43 -0.01 -0.17  0.13  0.21 0.28 0.718 2.1
## dining    0.40  0.33  0.02  0.00 -0.02 -0.10  0.15 0.30 0.701 2.4
## museums   1.00 -0.02  0.01 -0.01  0.00  0.00  0.00 1.00 0.005 1.0
## art       0.86  0.01 -0.04  0.07 -0.02 -0.02  0.08 0.76 0.241 1.0
## hiking    0.18 -0.16  0.17  0.21  0.01 -0.24  0.32 0.29 0.709 4.6
## gaming   -0.07  0.17  0.30  0.21  0.60 -0.36  0.03 0.65 0.351 2.8
## clubbing  0.12  0.19  0.11  0.09  0.08 -0.15  0.08 0.11 0.894 4.8
## reading   0.33 -0.11 -0.13 -0.03  0.03  0.04  0.03 0.14 0.860 1.7
## tv        0.04  0.52  0.03 -0.02  0.40  0.36 -0.07 0.57 0.432 2.8
## theater   0.56  0.23 -0.25  0.17  0.13  0.24  0.28 0.61 0.388 3.2
## movies    0.33  0.27 -0.21  0.28  0.19  0.34  0.18 0.49 0.513 5.8
## concerts  0.40  0.23 -0.12  0.78 -0.08 -0.04 -0.05 0.85 0.154 1.8
## music     0.29  0.23 -0.08  0.59 -0.12  0.01 -0.13 0.53 0.474 2.1
## shopping  0.25  0.92  0.03 -0.13 -0.06 -0.04 -0.01 0.93 0.068 1.2
## yoga      0.27  0.15 -0.03  0.15 -0.11 -0.08  0.29 0.22 0.778 3.6
## 
##                        ML1  ML2  ML4  ML3  ML5  ML6  ML7
## SS loadings           2.88 1.60 1.45 1.29 0.72 0.61 0.44
## Proportion Var        0.17 0.09 0.09 0.08 0.04 0.04 0.03
## Cumulative Var        0.17 0.26 0.35 0.42 0.47 0.50 0.53
## Proportion Explained  0.32 0.18 0.16 0.14 0.08 0.07 0.05
## Cumulative Proportion 0.32 0.50 0.66 0.80 0.88 0.95 1.00
## 
## Mean item complexity =  2.6
## Test of the hypothesis that 7 factors are sufficient.
## 
## The degrees of freedom for the null model are  136  and the objective function was  5.23 with Chi Square of  43392.29
## The degrees of freedom for the model are 38  and the objective function was  0.18 
## 
## The root mean square of the residuals (RMSR) is  0.02 
## The df corrected root mean square of the residuals is  0.04 
## 
## The harmonic number of observations is  8299 with the empirical chi square  1207.59  with prob <  1.1e-228 
## The total number of observations was  8299  with Likelihood Chi Square =  1483.92  with prob <  4.4e-287 
## 
## Tucker Lewis Index of factoring reliability =  0.88
## RMSEA index =  0.068  and the 90 % confidence intervals are  0.065 0.071
## BIC =  1141.01
## Fit based upon off diagonal values = 0.99
## Measures of factor score adequacy             
##                                                    ML1  ML2  ML4  ML3  ML5  ML6
## Correlation of (regression) scores with factors   1.00 0.97 0.90 0.92 0.81 0.76
## Multiple R square of scores with factors          1.00 0.93 0.81 0.85 0.66 0.58
## Minimum correlation of possible factor scores     0.99 0.87 0.61 0.70 0.32 0.16
##                                                     ML7
## Correlation of (regression) scores with factors    0.67
## Multiple R square of scores with factors           0.45
## Minimum correlation of possible factor scores     -0.10
fa.diagram(fa(dating3, nfactors=7, rotate="none", fm="ml"))

However, when we use 7 factors, a lot of them have only one or two variables loaded on them (Ml2, ML3, ML5, ML6, ML7) with only two of them actually having three and more variables loaded on them (ML1, ML4). The difference in “Proportion Explained” values is also striking since the very first factor has a value of 0.32, the last one - a mere 0.05. ML1 is also the only factor that has “Proportion Var” higher than 1. In general, the cumulative variance explained is equal to 0.53 which is still quite bad, but better than what we had before when we were working with variables from totally different parts of the questionnarie. The root mean square of the residuals (RMSR) is 0.02 which is actually quite good since we want it to be at least less than 0.05. RMSEA index is equal to 0.068 which is considered acceptable but not excellent. Tucker Lewis index of factoring reliability is equal to 0.88 which is still too small to be considered acceptable (>.90 acceptable, >.95 excellent).

I would want to first lower the number of factors up to 5 and then also add varimax rotation.

Second model: 5 factors

# No rotation
fa(dating3, nfactors=5, rotate="none", fm="ml") 
## Factor Analysis using method =  ml
## Call: fa(r = dating3, nfactors = 5, rotate = "none", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
##            ML1   ML2   ML4   ML3   ML5   h2    u2 com
## sports   -0.15 -0.07  0.84  0.20  0.00 0.77 0.231 1.2
## tvsports -0.04  0.28  0.56  0.14 -0.09 0.42 0.581 1.7
## exercise -0.03  0.03  0.45  0.00  0.17 0.23 0.769 1.3
## dining    0.42  0.06  0.02 -0.09  0.46 0.40 0.599 2.1
## museums   0.89 -0.18  0.07 -0.32 -0.04 0.94 0.061 1.4
## art       0.84 -0.17  0.04 -0.23 -0.01 0.79 0.211 1.2
## hiking    0.19 -0.19  0.19  0.10 -0.06 0.12 0.878 3.7
## gaming    0.00  0.23  0.17  0.15  0.05 0.10 0.896 2.7
## clubbing  0.16  0.02  0.09  0.05  0.31 0.13 0.869 1.8
## reading   0.28 -0.08 -0.09 -0.16 -0.13 0.14 0.861 2.6
## tv        0.17  0.94  0.03 -0.12 -0.05 0.93 0.073 1.1
## theater   0.64  0.13 -0.16 -0.05  0.07 0.46 0.535 1.2
## movies    0.47  0.29 -0.12  0.09  0.03 0.33 0.669 1.9
## concerts  0.68  0.04 -0.05  0.67 -0.03 0.92 0.080 2.0
## music     0.49  0.06 -0.01  0.47  0.06 0.47 0.528 2.1
## shopping  0.34  0.44 -0.01 -0.05  0.50 0.57 0.432 2.8
## yoga      0.33 -0.02  0.01  0.06  0.15 0.14 0.861 1.5
## 
##                        ML1  ML2  ML4  ML3  ML5
## SS loadings           3.45 1.42 1.35 0.99 0.65
## Proportion Var        0.20 0.08 0.08 0.06 0.04
## Cumulative Var        0.20 0.29 0.37 0.42 0.46
## Proportion Explained  0.44 0.18 0.17 0.13 0.08
## Cumulative Proportion 0.44 0.62 0.79 0.92 1.00
## 
## Mean item complexity =  1.9
## Test of the hypothesis that 5 factors are sufficient.
## 
## The degrees of freedom for the null model are  136  and the objective function was  5.23 with Chi Square of  43392.29
## The degrees of freedom for the model are 61  and the objective function was  0.41 
## 
## The root mean square of the residuals (RMSR) is  0.04 
## The df corrected root mean square of the residuals is  0.06 
## 
## The harmonic number of observations is  8299 with the empirical chi square  3369.42  with prob <  0 
## The total number of observations was  8299  with Likelihood Chi Square =  3426.02  with prob <  0 
## 
## Tucker Lewis Index of factoring reliability =  0.826
## RMSEA index =  0.082  and the 90 % confidence intervals are  0.079 0.084
## BIC =  2875.56
## Fit based upon off diagonal values = 0.97
## Measures of factor score adequacy             
##                                                    ML1  ML2  ML4  ML3  ML5
## Correlation of (regression) scores with factors   0.98 0.97 0.90 0.95 0.74
## Multiple R square of scores with factors          0.96 0.93 0.81 0.90 0.55
## Minimum correlation of possible factor scores     0.92 0.86 0.61 0.79 0.11
fa.diagram(fa(dating3, nfactors=5, rotate= "none", fm="ml"))

# Varimax rotation
fa(dating3, nfactors=5, rotate="varimax", fm="ml") 
## Factor Analysis using method =  ml
## Call: fa(r = dating3, nfactors = 5, rotate = "varimax", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
##            ML1   ML3   ML4   ML2   ML5   h2    u2 com
## sports   -0.12  0.00  0.85 -0.17 -0.03 0.77 0.231 1.1
## tvsports -0.08  0.06  0.60  0.22 -0.06 0.42 0.581 1.4
## exercise -0.03 -0.08  0.44 -0.02  0.17 0.23 0.769 1.4
## dining    0.24  0.10 -0.01  0.06  0.58 0.40 0.599 1.4
## museums   0.92  0.17 -0.01 -0.02  0.26 0.94 0.061 1.2
## art       0.82  0.22 -0.03 -0.03  0.26 0.79 0.211 1.4
## hiking    0.17  0.16  0.18 -0.18 -0.01 0.12 0.878 4.0
## gaming   -0.11  0.11  0.21  0.18  0.05 0.10 0.896 3.3
## clubbing  0.02  0.09  0.08 -0.01  0.34 0.13 0.869 1.3
## reading   0.35  0.03 -0.12  0.00 -0.03 0.14 0.861 1.3
## tv        0.03  0.03  0.08  0.96  0.08 0.93 0.073 1.0
## theater   0.48  0.30 -0.18  0.21  0.27 0.46 0.535 3.2
## movies    0.25  0.34 -0.10  0.33  0.18 0.33 0.669 3.6
## concerts  0.21  0.93  0.02  0.03  0.14 0.92 0.080 1.1
## music     0.13  0.65  0.04  0.05  0.18 0.47 0.528 1.3
## shopping  0.06  0.10 -0.01  0.42  0.62 0.57 0.432 1.9
## yoga      0.19  0.20  0.01 -0.01  0.24 0.14 0.861 2.9
## 
##                        ML1  ML3  ML4  ML2  ML5
## SS loadings           2.15 1.67 1.42 1.40 1.22
## Proportion Var        0.13 0.10 0.08 0.08 0.07
## Cumulative Var        0.13 0.22 0.31 0.39 0.46
## Proportion Explained  0.27 0.21 0.18 0.18 0.16
## Cumulative Proportion 0.27 0.49 0.67 0.84 1.00
## 
## Mean item complexity =  1.9
## Test of the hypothesis that 5 factors are sufficient.
## 
## The degrees of freedom for the null model are  136  and the objective function was  5.23 with Chi Square of  43392.29
## The degrees of freedom for the model are 61  and the objective function was  0.41 
## 
## The root mean square of the residuals (RMSR) is  0.04 
## The df corrected root mean square of the residuals is  0.06 
## 
## The harmonic number of observations is  8299 with the empirical chi square  3369.42  with prob <  0 
## The total number of observations was  8299  with Likelihood Chi Square =  3426.02  with prob <  0 
## 
## Tucker Lewis Index of factoring reliability =  0.826
## RMSEA index =  0.082  and the 90 % confidence intervals are  0.079 0.084
## BIC =  2875.56
## Fit based upon off diagonal values = 0.97
## Measures of factor score adequacy             
##                                                    ML1  ML3  ML4  ML2  ML5
## Correlation of (regression) scores with factors   0.95 0.95 0.90 0.96 0.77
## Multiple R square of scores with factors          0.91 0.91 0.81 0.93 0.60
## Minimum correlation of possible factor scores     0.82 0.82 0.62 0.85 0.19
fa.diagram(fa(dating3, nfactors=5, rotate="varimax", fm="ml"))

The rotation helps us to get four defined factor groupings instead of three. After the rotation is performed we observe that the range of values in “Proportion Explained” becomes smaller: the highest value in case of no rotation was 0.44, lowest - 0.08; now the difference between the highest and lowest value is only 0.11 (0.27 and 0.16). Because of the rotation the highest loadings were maximized (ex. “tvsports” was loaded on ML4 with 0.56 and now it’s 0.6) and we can see those slight changes in the output tables.

In comparison to the very first model, values of “Proportion Var” for both ML1 and ML3 and now equal or higher than 0.1, with remaining values (0.8, 0.8, 0.7) being quite close to 0.1 treshold - which is good. Cumulative explained variance is equal to 0.46 which is lower than what we had in case of the first model due to the smaller number of factors. The root mean square of the residuals (RMSR) is 0.04 is still less than 0.05 which is good, RMSEA index is now equal to 0.082 which is a bit worse than what we previously had (it doesn’t meet the requirement now), Tucker Lewis index of factoring reliability is equal to 0.826 and still not high enough to be acceptable.

We have to keep in mind that it’s quite hard to carry out actually good factor analysis with the data we have.

By looking at the graph, we can see that “museums”, “art”, “theater” and “reading” were loaded on ML1 factor. As it was hypothesized in the first part of this practical task, those are the variables that indicate one’s interest in art and less active, more sophisticated ways of spending time. “Concerts”, “music”, “movies” were loaded on the ML3 factor and can be described as variables connected to one’s interest in more mainstream, popular culture (not fine art). These can still be considered as one’s interest in less active ways of spending time, but less sophisticated, and more common ones in comparison to those loaded on Ml1. Variables that were loaded on ML4 are all connected with one’s interest in sport activities: “sports”, “tvsports”, “exercise”, there are the ones we would label as more active than those we observed for the first two factors. Finally, such variables as “shopping”, “dining”, “clubbing” were loaded on the factor ML5. We would still consider these to be active ways of spending time, but they indicate one’s interest in more fun activities not connected with any kind of art, self-development or any other sophisticated and deep reasons for doing these.

We can now assess scale reliablity of our factors. Since all of the variables in this subset have the same scale (1-10) we don’t have to worry about the ‘check.keys’ attribute.

Fit of the scales

ML1 <- as.data.frame(dating3[c("museums", "art", "theater", "reading")], check.keys = TRUE)
alpha(ML1)
## 
## Reliability analysis   
## Call: alpha(x = ML1)
## 
##   raw_alpha std.alpha G6(smc) average_r S/N    ase mean  sd median_r
##       0.77      0.77    0.78      0.45 3.3 0.0041    7 1.6     0.43
## 
##  lower alpha upper     95% confidence boundaries
## 0.76 0.77 0.78 
## 
##  Reliability if an item is dropped:
##         raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## museums      0.60      0.59    0.52      0.33 1.4   0.0075 0.031  0.24
## art          0.63      0.63    0.57      0.36 1.7   0.0070 0.030  0.33
## theater      0.74      0.73    0.76      0.47 2.7   0.0053 0.112  0.33
## reading      0.84      0.85    0.83      0.65 5.5   0.0032 0.034  0.55
## 
##  Item statistics 
##            n raw.r std.r r.cor r.drop mean  sd
## museums 8299  0.89  0.89  0.92   0.79  7.0 2.1
## art     8299  0.86  0.85  0.87   0.71  6.7 2.3
## theater 8299  0.76  0.75  0.59   0.54  6.8 2.2
## reading 8299  0.56  0.58  0.33   0.29  7.7 2.0

We observe that raw alpha and standartized alpha is equal to 0.77 and, as we know, we need it to be higher that 0.7 for the scale to be considered reliable. If we take a look at the output of the “Reliability if an item is dropped” table, we observe that in case of all variables the raw alpha drops when they are excluded, with an exception of the “reading variable”. If “readng” is excluded the raw alpha increases up to 0.84 which is better. We might actually consider dropping that variable. However, dropping this variables does not change the alpha output in terms of it exceeding or not exceeding the treshold (in which case dropping it would be a more serious question). It actually does make sense: we would probably pick reading as an “out-of-place” item even if we just take a look at other variables that were loaded on the ML1 factor - “museums”, “art”, “theater”. Though reading is still a non-active hobby that indicates one’s interest in art, it wouldn’t be considered to be that fine art represented by the remaining variables.

ML3 <- as.data.frame(dating3[c("concerts", "music", "movies")], check.keys = TRUE)
alpha(ML3)
## 
## Reliability analysis   
## Call: alpha(x = ML3)
## 
##   raw_alpha std.alpha G6(smc) average_r S/N    ase mean  sd median_r
##       0.72      0.72    0.66      0.46 2.5 0.0051  7.5 1.5     0.39
## 
##  lower alpha upper     95% confidence boundaries
## 0.71 0.72 0.73 
## 
##  Reliability if an item is dropped:
##          raw_alpha std.alpha G6(smc) average_r  S/N alpha se var.r med.r
## concerts      0.49      0.49    0.33      0.33 0.98   0.0111    NA  0.33
## music         0.55      0.56    0.39      0.39 1.29   0.0095    NA  0.39
## movies        0.78      0.79    0.66      0.66 3.81   0.0046    NA  0.66
## 
##  Item statistics 
##             n raw.r std.r r.cor r.drop mean  sd
## concerts 8299  0.88  0.85  0.77   0.65  6.8 2.2
## music    8299  0.83  0.83  0.73   0.61  7.9 1.8
## movies   8299  0.69  0.72  0.45   0.40  7.9 1.7

The raw and standartized alpha is case of factor ML3 is still higher than 0.7 (0.72 > 0.7) which means the scale is reliable. Similarly, we only have one variable that can be excluded and lead to a higher alpha value - “movies” (0.72 -> 0.78). While both “concerts” and “music” are connected to the music, “movies”, while still being a less active and more pop culture activity, feels a bit out of place.

ML4 <- as.data.frame(dating3[c("sports", "tvsports", "exercise")], check.keys = TRUE)
alpha(ML4)
## 
## Reliability analysis   
## Call: alpha(x = ML4)
## 
##   raw_alpha std.alpha G6(smc) average_r S/N    ase mean sd median_r
##       0.64      0.64    0.56      0.37 1.7 0.0069  5.7  2      0.4
## 
##  lower alpha upper     95% confidence boundaries
## 0.62 0.64 0.65 
## 
##  Reliability if an item is dropped:
##          raw_alpha std.alpha G6(smc) average_r  S/N alpha se var.r med.r
## sports        0.37      0.37    0.23      0.23 0.59   0.0137    NA  0.23
## tvsports      0.57      0.57    0.40      0.40 1.31   0.0095    NA  0.40
## exercise      0.65      0.65    0.48      0.48 1.85   0.0077    NA  0.48
## 
##  Item statistics 
##             n raw.r std.r r.cor r.drop mean  sd
## sports   8299  0.82  0.82  0.70   0.56  6.4 2.6
## tvsports 8299  0.77  0.75  0.55   0.43  4.6 2.8
## exercise 8299  0.69  0.71  0.46   0.36  6.2 2.4
## 
## Non missing response frequency for each item
##             1    2    3    4    5    6    7    8    9   10 miss
## sports   0.04 0.06 0.08 0.07 0.10 0.09 0.14 0.16 0.13 0.13    0
## tvsports 0.18 0.14 0.11 0.09 0.10 0.08 0.11 0.09 0.06 0.05    0
## exercise 0.04 0.05 0.07 0.07 0.13 0.14 0.14 0.16 0.11 0.08    0
ML5 <- as.data.frame(dating3[c("shopping", "dining", "clubbing")], check.keys = TRUE)
alpha(ML5)
## 
## Reliability analysis   
## Call: alpha(x = ML5)
## 
##   raw_alpha std.alpha G6(smc) average_r S/N    ase mean  sd median_r
##       0.51      0.54    0.45      0.28 1.2 0.0092  6.4 1.6     0.23
## 
##  lower alpha upper     95% confidence boundaries
## 0.49 0.51 0.53 
## 
##  Reliability if an item is dropped:
##          raw_alpha std.alpha G6(smc) average_r  S/N alpha se var.r med.r
## shopping      0.36      0.38    0.23      0.23 0.60   0.0132    NA  0.23
## dining        0.33      0.33    0.20      0.20 0.49   0.0147    NA  0.20
## clubbing      0.55      0.58    0.41      0.41 1.37   0.0091    NA  0.41
## 
##  Item statistics 
##             n raw.r std.r r.cor r.drop mean  sd
## shopping 8299  0.77  0.74  0.54   0.36  5.6 2.6
## dining   8299  0.69  0.76  0.57   0.41  7.8 1.8
## clubbing 8299  0.69  0.66  0.34   0.25  5.7 2.5

In case of the ML4 factor the alpha does not exceed the 0.7 thershold (0.7 > 0.64) and dropping any of the items doesn’t help: alpha only increases up to 0.65 if “exercise” is dropped and decreases in case of all other variables. The alpha is even lower in case of the ML5 factor: raw - 0.51, standartized - 0.54. Dropping “clubbing” would similarly lead to an insignificant increase in alpha.

Part 3

Now we proceed to save scores for the future analysis since we want to to concenrate on observations and not variables (loadings).

# Specifying scores = T
fa1<-fa(dating3, nfactors=5, rotate="varimax", fm="ml", scores=T) 
load <- fa1$loadings[,1:2] 
plot(load) # set up plot 

fascores<-as.data.frame(fa1$scores)
# Adding these as new columns
datingfa<-cbind(dating3, fascores)
names(datingfa)
##  [1] "sports"   "tvsports" "exercise" "dining"   "museums"  "art"     
##  [7] "hiking"   "gaming"   "clubbing" "reading"  "tv"       "theater" 
## [13] "movies"   "concerts" "music"    "shopping" "yoga"     "ML1"     
## [19] "ML3"      "ML4"      "ML2"      "ML5"

We can now use all of those factor scores as variables in the regression analysis. Let’s suppose we want to predict one’s interest in gaming (variable “gaming” corresponding to “How interested are you in gaming”, scale 1 - 10). As one of predictor variables, I would choose one’s interest in watching TV as a more of a laid-back activity that can lead to binge watching, is often assosiated with introvert people who don’t like going out and spending time doing activities (a highly stereotypical perspective). Secondly, since now we can use our factors in linear regression as predictors, I would take ML1 (interest in fine art) as the other predictor. We mighr hypothesize that people who like fine art are quite unlikely to rate gaming as a highly interesting activity for them.

We first build scatterplots to see what the general pattern is.

g1 <-  ggplot(datingfa, aes(x= tv, y = gaming))+
  geom_point(position = "jitter")+
  labs(title = "Relationship between gaming and one's interest in TV", y="Importance of gaming", x="Importance of TV")+
  theme_bw()+
  geom_smooth(method = "lm")

g2 <- ggplot(datingfa, aes(x= ML1, y = gaming))+
  geom_point(position = "jitter")+
  labs(title = "Relationship between gaming and one's interest in fine art", y="Importance of gaming", x="Level of free choice")+
  theme_bw()+
  geom_smooth(method = "lm")

ggarrange(g1, g2 + rremove("x.text"), 
          labels = c("A", "B"),
          ncol = 2, nrow = 1)

cor.test(datingfa$gaming, datingfa$tv)
## 
##  Pearson's product-moment correlation
## 
## data:  datingfa$gaming and datingfa$tv
## t = 18.506, df = 8297, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1783520 0.2196774
## sample estimates:
##       cor 
## 0.1991032
cor.test(datingfa$gaming, datingfa$ML1)
## 
##  Pearson's product-moment correlation
## 
## data:  datingfa$gaming and datingfa$ML1
## t = -10.932, df = 8297, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1403146 -0.0978948
## sample estimates:
##        cor 
## -0.1191591

We observe a negative weak correlation in case of the “Fine art interest” and a weak positive correlation in case of the “Interest in watching TV” variables. We proceed to build a model with these predictors.

modelf <- lm(datingfa$gaming ~ datingfa$tv + datingfa$ML1)
summary(modelf)
## 
## Call:
## lm(formula = datingfa$gaming ~ datingfa$tv + datingfa$ML1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8000 -2.0795 -0.4287  1.7871  9.9588 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.76679    0.06499   42.57   <2e-16 ***
## datingfa$tv   0.21010    0.01106   18.99   <2e-16 ***
## datingfa$ML1 -0.34385    0.02934  -11.72   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.547 on 8296 degrees of freedom
## Multiple R-squared:  0.05528,    Adjusted R-squared:  0.05505 
## F-statistic: 242.7 on 2 and 8296 DF,  p-value: < 2.2e-16

The model explains about 5% of the variance in the outcome variable but is still better than the model based solely on the mean values (p-value: < 2.2e-16). Both predictors are statistically significant and if we were to write down the regression equation:

\[Predicted gaming = 2.76679 + 0.21010 * TV - 0.34385 * FineArt\]

With a one point increase in one’s interest in watching TV, one’s interest in gaming increases by 0.21010. On the contrary, with a one-point increase in value of one’s interest in Fine arts, one’s interest in gamin decreases by 0.34385.

par(mfrow=c(2,2)) 
plot(modelf)

No points are observed on the Cook’s distance line (it is not present on the graph) which indicates that we don’t have highly influential observations. In terms of the Q-Q plot, we can see how some observations at the upper part of it are a bit further away from the line, but in general it looks fine. We are looking for a straight horizontal line on the first graph and it seems fine except for the those values at the right end that drive the line a bit down. The line we observe on the Scale-Location graph is not ideal, but actually quite fine.

The end