# Steps in Factor Analysis
# A)    Test assumptions of Factor Analysis such as Factorability
# B)    Exploratory Factor Analysis (EFA)
# C)    Determine the number of Factors
# D)    Identify which item belong to which factor
# E)    Drop items as necessary and repeat steps c and d
# F)    Name and define factors. For example - For Factor 1 - Pain Reliever Action and Factor 2 - Pain Reliever side effects. You may give good names.
# G)    Examine correlations among factors
# H)    Analyze internal reliability



#Load required libraries

library(corpcor)
library(GPArotation)
library(psych)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
library(MASS)
library(MVN)
## sROC 0.1-2 loaded
## 
## Attaching package: 'MVN'
## The following object is masked from 'package:psych':
## 
##     mardia
library(psy)
## 
## Attaching package: 'psy'
## The following object is masked from 'package:psych':
## 
##     wkappa
library(readxl)

#load data

painReliefData <-read_excel("D:/PGPBABI/Advance stats/practise assignment-pain relief/pain relief Factor Analysis.xlsx", col_names = TRUE)
str(painReliefData)
## Classes 'tbl_df', 'tbl' and 'data.frame':    100 obs. of  6 variables:
##  $ do not upset stomach: num  -2.6075 -2.5977 -0.6027 0.0515 -2.5975 ...
##  $ no bad side effects : num  -2.0409 0.5938 0.0568 -1.4052 0.3321 ...
##  $ stops the pain      : num  0.668 -0.69 -2.635 -1.335 -0.825 ...
##  $ works quickly       : num  0.405 -0.814 -3.839 -1.148 -1.125 ...
##  $ keeps me awake      : num  1.3211 -0.5492 -0.5083 -0.0914 2.1677 ...
##  $ limited relief      : num  -0.351 2.948 1.526 1.522 0.278 ...
#Normality 
hzTest(painReliefData, qqplot = TRUE)

##   Henze-Zirkler's Multivariate Normality Test 
## --------------------------------------------- 
##   data : painReliefData 
## 
##   HZ      : 0.8930289 
##   p-value : 0.6047229 
## 
##   Result  : Data are multivariate normal. 
## ---------------------------------------------
#We conclude data is multivariate normal.
#Linear relation between variables : Test by visually examining all or at least some of the #bivariate scatterplots
plot(painReliefData)

#We conclude that at least some of the variables are linear / almost linear.


#Testing of assumptions for Factor Analysis
# Re: https://en.wikiversity.org/wiki/Exploratory_factor_analysis/Assumptions
# 1.    Factorability is the assumption that there are at least some correlations amongst the #variables so that coherent factors can be identified. Basically, there should be some #degree of collinearity among the variables but not an extreme degree or singularity among the variables.
# Factorability can be examined via any of the following:
#   a. Inter-item correlations (correlation matrix) - are there at least several sizable correlations - e.g. > 0.5?
# Correlation Matrix: To do the factor analysis we must have variables that correlate fairly #well with each other. The correlation matrix is generated in R to check the pattern of #relationship between variables.
# #create a correlation matrix

PainReliefMatrix<-cor(painReliefData)
round(PainReliefMatrix, 2)
##                      do not upset stomach no bad side effects
## do not upset stomach                 1.00                0.60
## no bad side effects                  0.60                1.00
## stops the pain                       0.16                0.13
## works quickly                        0.09               -0.07
## keeps me awake                      -0.59               -0.64
## limited relief                      -0.17               -0.06
##                      stops the pain works quickly keeps me awake
## do not upset stomach           0.16          0.09          -0.59
## no bad side effects            0.13         -0.07          -0.64
## stops the pain                 1.00          0.63          -0.03
## works quickly                  0.63          1.00           0.06
## keeps me awake                -0.03          0.06           1.00
## limited relief                -0.63         -0.61           0.03
##                      limited relief
## do not upset stomach          -0.17
## no bad side effects           -0.06
## stops the pain                -0.63
## works quickly                 -0.61
## keeps me awake                 0.03
## limited relief                 1.00
# From the above correlation matrix we can see that all the variables value of 
# correlation coefficient greater than 0.5 with at least one other variable. 
# Hence we can assume that variables are fairly correlated with each other and we 
# can run Factor Analysis on this data.
# Anti-image correlation matrix diagonals - they should be > 0.5
X <- cor(painReliefData) 
iX <- ginv(X) 
S2 <- diag(diag((iX^-1)))
AIS <- S2%*%iX%*%S2                      # anti-image covariance matrix
IS <- X+AIS-2*S2                         # image covariance matrix
Dai <- sqrt(diag(diag(AIS)))
IR <- ginv(Dai)%*%IS%*%ginv(Dai)         # image correlation matrix
AIR <- ginv(Dai)%*%AIS%*%ginv(Dai)       # anti-image correlation matrix

print(diag(AIR), row.names = FALSE)
## [1] 1 1 1 1 1 1
#We observe that the diagonals of the Anti - Image Correlation matrix to be 1.
#Since Anti - Image Correlation matrix diagonals > 0.5, we can run Factor Analysis on this #data.
# c. Measure of Sampling Accuracy (MSA)
# . Kaisar-Meyer-Olkin (KMO) should be > 0.5

KMO(r = PainReliefMatrix)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = PainReliefMatrix)
## Overall MSA =  0.71
## MSA for each item = 
## do not upset stomach  no bad side effects       stops the pain 
##                 0.75                 0.67                 0.70 
##        works quickly       keeps me awake       limited relief 
##                 0.70                 0.70                 0.75
#We have applied the function, KMO() on the correlation matrix and it has returned the 
#following:
#a) Overall MSA to be 0.71 which yields a degree of common variance middling
#b) The estimates of MSA for each item to be 0.75, 0.67, 0.70, 0.70. 0.70 and 0.75

#Since MSA > 0.5, we can run Factor Analysis on this data.
#.  Bartlett's test of sphericity (Should be significant)

#Bartlett's Test of Sphericity:

cortest.bartlett(PainReliefMatrix, n = NULL,diag=TRUE)
## Warning in cortest.bartlett(PainReliefMatrix, n = NULL, diag = TRUE): n not
## specified, 100 used
## $chisq
## [1] 226.1
## 
## $p.value
## [1] 1.006033e-39
## 
## $df
## [1] 15
# Bartletta's test was conducted in R and it was found to be significant ( P < 0.001) .
# The significance of this test tells us that the correlation matrix is not an identity #matrix.
# Hence, we assume that there is some relationship between the variable.

# 2.Sample Size - The sample size should be large enough to yield reliable estimates of 
# correlations among the variables:
#a. Ideally, there should be a large ratio of N / k (Cases / items)
#b. EFA can still be reasonably done with > ~ 5:1
#c. Bare min. for pilot study purposes as low as ~3:! 17 : 1
#In this case we have N = 100 and k = 6 and the ratio is 100 : 6 = ~ 17 : 1.
#Hence the sample size is large enough to yield reliable estimates of the correlations 
# among the variables.


# Exploratory Factor Analysis (EFA) is generally used to discover structure of a measure and
# to examine its internal reliability. EFA is often recommended when researchers have no 
# hypotheses about the nature of the underlying factor structure of their measure.
# Exploratory factor analysis has three basic decision points:
#   (1) Decide the number of factors
# (2)   Choosing an extraction method
# (3)   Choosing a rotation method
# (1) Decide the number of factors
# The most common approach to deciding the number of factors is to generate a scree plot.
# The scree plot is a two-dimensional graph with factors on the x-axis and eigenvalues on #the
# y-axis.
# Eigenvalues are produced by a process called principal component analysis (PCA) and 
# represent the variance accounted for by each underlying factor. They are represented by 
# scores that total to the number of items.
# A 12-item scale will theoretically have 12 possible underlying factors, each factor will 
# have an eigen value that indicates the amount of variation in the items accounted by each 
# factor. If a, the first factor has an eigen value of 3.0, it accounts for 25% of the #variance
# (3/12= .25). The total of all the eigen values will be 12 if there are 12 items, so some 
# factors will have smaller eigenvalues. They are typically arranged in a scree plot in 
# decending order.
# Choosing an extraction method and extraction
# Once the number of factors are decided, you need to decide which mathematical solution to 
# find the loadings. There are five basic extraction methods:
# 1.    PCA - which assumes there is no measurement error and is considered not to be true 
#     exploratory factor analysis.
# 2.    Maximum Likelihood (a.k.a canonical factoring)
# 3.    Alpha Factoring
# 4.    Image Factoring
# 5.    Principal axis factoring with iterated communalities (a.k.a least squares)
# Calculate initial factor loadings:
#   This can be done in a number of different ways: the two most common methods are 
#   described very briefly below:
#   .   Principal Component Analysis (PCA) Method
#     As the name suggests, this method uses the method used to carry out a principal 
#     component analysis. However, the factors obtained will not actually be the principal 
#     components (although the loadings for the kth factor will be proportional to the 
#     coefficients of the kth principal component.

# . Principal Axis Factoring
# This is a method which tries to find the lowest number of factors which can account for 
# the variability in the original variables, that is associated with these factors 
# (this is in contrast to the principal components method which looks for a set of factors
#  which can account for the total variability in the original variables).
#  These two methods will tend to give similar results if the variables are quite highly 
# correlated and / or the number of original variables is quite high. Whichever method is 
# used, the resulting factors at this stage will be uncorrelated.
# PCA
# Let us perform an initial PCA and understand the eigen values and the variance explained 
# by them.
# #On raw data

pc1 <-  principal(painReliefData, nfactors = length(painReliefData), rotate = "none")
pc1
## Principal Components Analysis
## Call: principal(r = painReliefData, nfactors = length(painReliefData), 
##     rotate = "none")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                        PC1   PC2   PC3   PC4  PC5   PC6 h2       u2 com
## do not upset stomach  0.67 -0.52 -0.39  0.30 0.18 -0.09  1  0.0e+00 3.2
## no bad side effects   0.59 -0.64  0.36  0.06 0.08  0.31  1  7.8e-16 3.1
## stops the pain        0.71  0.51  0.31  0.00 0.23 -0.30  1 -2.2e-16 3.0
## works quickly         0.60  0.63 -0.25 -0.30 0.17  0.25  1 -1.1e-15 3.3
## keeps me awake       -0.55  0.68  0.05  0.39 0.23  0.18  1  1.1e-16 3.1
## limited relief       -0.68 -0.52  0.00 -0.22 0.45 -0.05  1 -6.7e-16 2.9
## 
##                        PC1  PC2  PC3  PC4  PC5  PC6
## SS loadings           2.43 2.07 0.44 0.39 0.38 0.29
## Proportion Var        0.41 0.34 0.07 0.06 0.06 0.05
## Cumulative Var        0.41 0.75 0.82 0.89 0.95 1.00
## Proportion Explained  0.41 0.34 0.07 0.06 0.06 0.05
## Cumulative Proportion 0.41 0.75 0.82 0.89 0.95 1.00
## 
## Mean item complexity =  3.1
## Test of the hypothesis that 6 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0 
##  with the empirical chi square  0  with prob <  NA 
## 
## Fit based upon off diagonal values = 1
cat("\n\n Eigen values \n")
## 
## 
##  Eigen values
##  Eigen values
print(pc1$values)
## [1] 2.4307444 2.0698505 0.4391380 0.3869706 0.3803800 0.2929165
# Observation
# 1)Output of this analysis show us that only 2 components have eigenvalues greater than 1,
# suggesting that we extract 2 components.
# 2)    The above output also suggests that extracting 2 components explains 75% of the total 
# variance.
# We shall draw a scree plot to decide on the number of factors.
# Scree Plot:
  ### Scree plot 
  
plot(pc1$values, type = "b",xlab = "Factors", ylab = "Eigen values", main = "SCREE PLOT") 

# Observation
# From the scree plot, we notice a steep curve before the factor 3, that starts the flat 
# line. We retain those components or factors in the steep curve before the first point that 
# starts the flat line. We notice that 2 of those factors explain most of the variation - 
#75%.
# So we shall use 2 as the number of factors for performing Factor Analysis.
# Principal axis factoring
# We shall use Principal axis factoring, (fm="pa") because we are most interested in 
# identifying the underlying constructs in the data.
# The extraction method will produce factor loadings for every item in every extracted
# factor.
# Now, We will use fa() function from the psych package, which received the following 
# primary arguments:
# . r: the correlation matrix
# . nfactors: number of factors to be extracted (default 1)
# . rotate: one of several matrix rotation methods, such as "varimax" or "oblimin" or "none"
# . fm: one of several factoring methodsm such as pa (principal axis) or ml (maximum 
# likelihood)
solution <- fa(r=PainReliefMatrix, nfactors = 2, rotate="none",fm="pa")

print(solution)
## Factor Analysis using method =  pa
## Call: fa(r = PainReliefMatrix, nfactors = 2, rotate = "none", fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                        PA1   PA2   h2   u2 com
## do not upset stomach  0.60 -0.46 0.57 0.43 1.9
## no bad side effects   0.55 -0.60 0.65 0.35 2.0
## stops the pain        0.67  0.46 0.65 0.35 1.8
## works quickly         0.56  0.57 0.63 0.37 2.0
## keeps me awake       -0.50  0.62 0.64 0.36 1.9
## limited relief       -0.63 -0.46 0.61 0.39 1.8
## 
##                        PA1  PA2
## SS loadings           2.06 1.70
## Proportion Var        0.34 0.28
## Cumulative Var        0.34 0.63
## Proportion Explained  0.55 0.45
## Cumulative Proportion 0.55 1.00
## 
## Mean item complexity =  1.9
## Test of the hypothesis that 2 factors are sufficient.
## 
## The degrees of freedom for the null model are  15  and the objective function was  2.35
## The degrees of freedom for the model are 4  and the objective function was  0.03 
## 
## The root mean square of the residuals (RMSR) is  0.02 
## The df corrected root mean square of the residuals is  0.03 
## 
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                 PA1  PA2
## Correlation of scores with factors             0.92 0.91
## Multiple R square of scores with factors       0.85 0.82
## Minimum correlation of possible factor scores  0.69 0.64
fa.diagram(solution)

#####  Choosing a rotation method  #####
# We observe that the components loadings are not clear. Once an initial solution is 
# obtained, the loadings are rotated. Factor Rotation is used to increase interpretability.
# Rotation is a way of maximizing high loadings and minimizing low loadings so that the 
# simplest possible structure is achieved.
# There are two types of rotation method, orthogonal and oblique rotation. In orthogonal 
# rotation the rotated factors will remain uncorrelated whereas in oblique rotation the 
# resulting factors will be correlated.
# There are a number of different methods of rotation of each type.
# The most common orthogonal method is called varimax rotation; this is the method that many
# books recommend.
# Orthogonal Rotation (varimax):
# Assuming that there is no correlation between the extracted factors, we will carry out a 
# varimax rotation.
# Rotated component matrix obtained after varimax rotation is shown below:
  # Orthogonal varimax rotation
  
solution1 <- fa(r=PainReliefMatrix, nfactors = 2, rotate="varimax",fm="pa")
print(solution1)
## Factor Analysis using method =  pa
## Call: fa(r = PainReliefMatrix, nfactors = 2, rotate = "varimax", fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                        PA1   PA2   h2   u2 com
## do not upset stomach  0.14  0.74 0.57 0.43 1.1
## no bad side effects   0.01  0.81 0.65 0.35 1.0
## stops the pain        0.80  0.10 0.65 0.35 1.0
## works quickly         0.79 -0.05 0.63 0.37 1.0
## keeps me awake        0.04 -0.80 0.64 0.36 1.0
## limited relief       -0.78 -0.08 0.61 0.39 1.0
## 
##                        PA1  PA2
## SS loadings           1.90 1.86
## Proportion Var        0.32 0.31
## Cumulative Var        0.32 0.63
## Proportion Explained  0.51 0.49
## Cumulative Proportion 0.51 1.00
## 
## Mean item complexity =  1
## Test of the hypothesis that 2 factors are sufficient.
## 
## The degrees of freedom for the null model are  15  and the objective function was  2.35
## The degrees of freedom for the model are 4  and the objective function was  0.03 
## 
## The root mean square of the residuals (RMSR) is  0.02 
## The df corrected root mean square of the residuals is  0.03 
## 
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                 PA1  PA2
## Correlation of scores with factors             0.91 0.91
## Multiple R square of scores with factors       0.84 0.83
## Minimum correlation of possible factor scores  0.67 0.66
print(solution1$loadings, cutoff=0.3)
## 
## Loadings:
##                      PA1    PA2   
## do not upset stomach         0.742
## no bad side effects          0.808
## stops the pain        0.801       
## works quickly         0.795       
## keeps me awake              -0.797
## limited relief       -0.777       
## 
##                  PA1   PA2
## SS loadings    1.897 1.857
## Proportion Var 0.316 0.309
## Cumulative Var 0.316 0.626
# Observation:
# From the above output we can see that on applying varimax rotation, the component loadings
# are very clear.
# Visualization
# We can visualize the factors by calling the function fa.diagram.

fa.diagram(solution1)

# Observation
# The square boxes are the observed variables, and the ovals are the unobserved factors. 
# The straight arrows are the loadings, the correlation between the factor and the observed 
# variable(s). The curved arrows are the correlations between the factors. If no curved arrow 
# is present, then the correlation between the factors is not great.
#### Identify which item belong in which factor
# Criteria for Practical and Statistical Significance of Factor Loadings:
# Factor loading can be classified based on their magnitude.
# Value Interpretation
# > 30  Minimum consideration level
# > 40  More important
# > 50  Practically significant
# Items belonging to factor - PA1:
#   1.  Stops the pain
# 2.    Works quickly
# 3.    Provides Limited Relief
# Items belonging to factor - PA2:
# 1.    Does not upset stomach
# 2.    No bad side effects
# 3.    Keeps me awake
#### Drop items as necessary and repeat steps (C) and (D)
#### Dropping items are not necessary.
#
#### Name and define factors
# The variables that load highly on factor - PA1:
# 1.    Stops the pain
# 2.    Works quickly
# 3.    Provides Limited Relief
# All these items are related to the action of pain reliever, so we can label this factor as
# Pain reliever Action.
# The variables that load highly on factor - PA2:
# 1.    Does not upset stomach
# 2.    No bad side effects
# 3.    Keeps me awake
# All these items are related to the side effects of pain reliever, so we can label this 
# factor as Pain reliever side effects.
# Oblique oblimin rotation
# Oblique Rotation (oblimin): AWe will use oblique rotation, which recognizes that there is 
# likely to be some correlation between pain relief factors in the real world.
# Rotation matrix obtained after oblimin rotation is shown below:

solution2 <- fa(r=PainReliefMatrix, nfactors = 2, rotate="oblimin",fm="pa")
print(solution2)
## Factor Analysis using method =  pa
## Call: fa(r = PainReliefMatrix, nfactors = 2, rotate = "oblimin", fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
##                        PA1   PA2   h2   u2 com
## do not upset stomach  0.11  0.74 0.57 0.43   1
## no bad side effects  -0.02  0.81 0.65 0.35   1
## stops the pain        0.80  0.06 0.65 0.35   1
## works quickly         0.80 -0.09 0.63 0.37   1
## keeps me awake        0.07 -0.80 0.64 0.36   1
## limited relief       -0.78 -0.04 0.61 0.39   1
## 
##                        PA1  PA2
## SS loadings           1.90 1.85
## Proportion Var        0.32 0.31
## Cumulative Var        0.32 0.63
## Proportion Explained  0.51 0.49
## Cumulative Proportion 0.51 1.00
## 
##  With factor correlations of 
##      PA1  PA2
## PA1 1.00 0.08
## PA2 0.08 1.00
## 
## Mean item complexity =  1
## Test of the hypothesis that 2 factors are sufficient.
## 
## The degrees of freedom for the null model are  15  and the objective function was  2.35
## The degrees of freedom for the model are 4  and the objective function was  0.03 
## 
## The root mean square of the residuals (RMSR) is  0.02 
## The df corrected root mean square of the residuals is  0.03 
## 
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                 PA1  PA2
## Correlation of scores with factors             0.92 0.91
## Multiple R square of scores with factors       0.84 0.83
## Minimum correlation of possible factor scores  0.68 0.66
print(solution2$loadings, cutoff=0.3)
## 
## Loadings:
##                      PA1    PA2   
## do not upset stomach         0.736
## no bad side effects          0.809
## stops the pain        0.800       
## works quickly         0.799       
## keeps me awake              -0.801
## limited relief       -0.777       
## 
##                  PA1   PA2
## SS loadings    1.899 1.852
## Proportion Var 0.317 0.309
## Cumulative Var 0.317 0.625
fa.diagram(solution2)
# From the above output we can conclude that oblimin rotation also produce the same factor #loading as varimax rotation.
#### Examine correlations among factors
# We notice that our factors are least correlated at 8% and recall our choice of oblique #rotation allowed for the recognition of good relationship; hence our assumption of #correlation between components while running oblique rotation is violated.
#### Analyze internal reliability
# Crombach's alpha is a measure of internal consistency, that is, how closely related a set
# of items are as a group.
# Cronbach's alpha is computed by correating the score for each scale item with the total 
# score for each observation (usually individual survey responds or test takers), and then 
# comparing that to the variance for all individual item scores.
# Cronbach's alpha is a function of the number of items in a test, the average covariance 
# between pairs of items, and the variance of the total score.
# Interpretation of Cronbach's alpha
# The alpha coefficient of reliability ranges from 0 to 1 in providing this overall
# assessment of a measure's reliability. If all of the scale items are entirely independent
# from one another (i.e., are not correlated or share no covariance), the alpha = 0; and, if
# all of the items have high covariances, then alpha will approach 1 as more the items in the
# scale approaches infinity.
# A good alpha coefficient depends on your theoretical knowledge of the scale in question. 
# Many methodologies recommend a minimum alpha coefficient between 0.65 and 0.80; alpha 
# coefficients less than 0.50 are usually unacceptable.
# Conclusion
# Overall assessment of this measure's reliability is good since the absolute value of the
# alpha coefficient is above 0.65.

# Observation
# . The graphs also confirm our interpretation that the factor 1 heavily loads on
# - stops the pain
# - works quickly
# - provides limited relief
# . The graphs also confirm our interpretation that the factor 2 heavily loads on
# - does not upset stomach
# - no bad side effects
# - keeps me awake