Week 5 Coding Practice - Factor Analysis in R

Data Exploration

Import the data

data <- read_csv("data/Factor-Hair-Revised.csv", show_col_types=FALSE)

Basic Descriptive Statistics

dim(data)
## [1] 100  13
str(data)
## spc_tbl_ [100 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ID          : num [1:100] 1 2 3 4 5 6 7 8 9 10 ...
##  $ ProdQual    : num [1:100] 8.5 8.2 9.2 6.4 9 6.5 6.9 6.2 5.8 6.4 ...
##  $ Ecom        : num [1:100] 3.9 2.7 3.4 3.3 3.4 2.8 3.7 3.3 3.6 4.5 ...
##  $ TechSup     : num [1:100] 2.5 5.1 5.6 7 5.2 3.1 5 3.9 5.1 5.1 ...
##  $ CompRes     : num [1:100] 5.9 7.2 5.6 3.7 4.6 4.1 2.6 4.8 6.7 6.1 ...
##  $ Advertising : num [1:100] 4.8 3.4 5.4 4.7 2.2 4 2.1 4.6 3.7 4.7 ...
##  $ ProdLine    : num [1:100] 4.9 7.9 7.4 4.7 6 4.3 2.3 3.6 5.9 5.7 ...
##  $ SalesFImage : num [1:100] 6 3.1 5.8 4.5 4.5 3.7 5.4 5.1 5.8 5.7 ...
##  $ ComPricing  : num [1:100] 6.8 5.3 4.5 8.8 6.8 8.5 8.9 6.9 9.3 8.4 ...
##  $ WartyClaim  : num [1:100] 4.7 5.5 6.2 7 6.1 5.1 4.8 5.4 5.9 5.4 ...
##  $ OrdBilling  : num [1:100] 5 3.9 5.4 4.3 4.5 3.6 2.1 4.3 4.4 4.1 ...
##  $ DelSpeed    : num [1:100] 3.7 4.9 4.5 3 3.5 3.3 2 3.7 4.6 4.4 ...
##  $ Satisfaction: num [1:100] 8.2 5.7 8.9 4.8 7.1 4.7 5.7 6.3 7 5.5 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ID = col_double(),
##   ..   ProdQual = col_double(),
##   ..   Ecom = col_double(),
##   ..   TechSup = col_double(),
##   ..   CompRes = col_double(),
##   ..   Advertising = col_double(),
##   ..   ProdLine = col_double(),
##   ..   SalesFImage = col_double(),
##   ..   ComPricing = col_double(),
##   ..   WartyClaim = col_double(),
##   ..   OrdBilling = col_double(),
##   ..   DelSpeed = col_double(),
##   ..   Satisfaction = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
names(data)
##  [1] "ID"           "ProdQual"     "Ecom"         "TechSup"      "CompRes"     
##  [6] "Advertising"  "ProdLine"     "SalesFImage"  "ComPricing"   "WartyClaim"  
## [11] "OrdBilling"   "DelSpeed"     "Satisfaction"
describe(data)
## data 
## 
##  13  Variables      100  Observations
## --------------------------------------------------------------------------------
## ID 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0      100        1     50.5    33.67     5.95    10.90 
##      .25      .50      .75      .90      .95 
##    25.75    50.50    75.25    90.10    95.05 
## 
## lowest :   1   2   3   4   5, highest:  96  97  98  99 100
## --------------------------------------------------------------------------------
## ProdQual 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0       43    0.999     7.81     1.61    5.595    5.790 
##      .25      .50      .75      .90      .95 
##    6.575    8.000    9.100    9.410    9.900 
## 
## lowest : 5   5.1 5.2 5.5 5.6, highest: 9.4 9.5 9.6 9.9 10 
## --------------------------------------------------------------------------------
## Ecom 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0       27    0.996    3.672   0.7674    2.595    2.800 
##      .25      .50      .75      .90      .95 
##    3.275    3.600    3.925    4.530    5.100 
## 
## lowest : 2.2 2.4 2.5 2.6 2.7, highest: 4.9 5.1 5.5 5.6 5.7
## --------------------------------------------------------------------------------
## TechSup 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0       50    0.999    5.365    1.755    2.700    3.280 
##      .25      .50      .75      .90      .95 
##    4.250    5.400    6.625    7.210    7.605 
## 
## lowest : 1.3 2.5 2.6 2.7 3  , highest: 7.7 7.9 8   8.4 8.5
## --------------------------------------------------------------------------------
## CompRes 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0       45    0.999    5.442    1.388    3.595    3.900 
##      .25      .50      .75      .90      .95 
##    4.600    5.450    6.325    7.010    7.305 
## 
## lowest : 2.6 3   3.2 3.5 3.6, highest: 7.4 7.5 7.6 7.7 7.8
## --------------------------------------------------------------------------------
## Advertising 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0       41    0.999     4.01    1.302    2.200    2.400 
##      .25      .50      .75      .90      .95 
##    3.175    4.000    4.800    5.510    5.800 
## 
## lowest : 1.9 2.1 2.2 2.3 2.4, highest: 5.7 5.8 5.9 6.3 6.5
## --------------------------------------------------------------------------------
## ProdLine 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0       42    0.999    5.805    1.509    3.900    4.190 
##      .25      .50      .75      .90      .95 
##    4.700    5.750    6.800    7.600    7.805 
## 
## lowest : 2.3 2.9 3.3 3.6 3.9, highest: 7.7 7.8 7.9 8.3 8.4
## --------------------------------------------------------------------------------
## SalesFImage 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0       35    0.997    5.123     1.19    3.385    3.790 
##      .25      .50      .75      .90      .95 
##    4.500    4.900    5.800    6.610    7.100 
## 
## lowest : 2.9 3   3.1 3.4 3.5, highest: 6.8 6.9 7.1 7.8 8.2
## --------------------------------------------------------------------------------
## ComPricing 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0       45    0.998    6.974    1.778    4.500    4.800 
##      .25      .50      .75      .90      .95 
##    5.875    7.100    8.400    8.810    9.105 
## 
## lowest : 3.7 3.8 4.4 4.5 4.6, highest: 9.2 9.3 9.6 9.7 9.9
## --------------------------------------------------------------------------------
## WartyClaim 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0       34    0.998    6.043   0.9372    4.795    5.000 
##      .25      .50      .75      .90      .95 
##    5.400    6.100    6.600    7.200    7.305 
## 
## lowest : 4.1 4.3 4.5 4.7 4.8, highest: 7.3 7.4 7.5 7.7 8.1
## --------------------------------------------------------------------------------
## OrdBilling 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0       37    0.998    4.278    1.033    2.595    3.000 
##      .25      .50      .75      .90      .95 
##    3.700    4.400    4.800    5.400    5.605 
## 
## lowest : 2   2.1 2.4 2.5 2.6, highest: 5.5 5.6 5.7 6.5 6.7
## --------------------------------------------------------------------------------
## DelSpeed 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0       30    0.997    3.886   0.8267    2.595    2.990 
##      .25      .50      .75      .90      .95 
##    3.400    3.900    4.425    4.710    4.900 
## 
## lowest : 1.6 2   2.4 2.5 2.6, highest: 4.7 4.8 4.9 5.2 5.5
## --------------------------------------------------------------------------------
## Satisfaction 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      100        0       29    0.997    6.918    1.371    5.190    5.400 
##      .25      .50      .75      .90      .95 
##    6.000    7.050    7.625    8.600    8.900 
## 
## lowest : 4.7 4.8 5   5.2 5.4, highest: 8.6 8.7 8.9 9   9.9
## --------------------------------------------------------------------------------

Goal: Predict Satisfaction with a Regression Model Variable ID is a unique number identifier, can be dropped from the regression variables:

data_X <- subset(data, select=-c(1))

Correlation Analysis

Inter-Item Correlation Matrix

Note: We want to check X only (predictors = independent variables here). Exclude Satisfaction (column 12)

datamatrix <- cor(data_X[-12]) # from library corrplot
corrplot(datamatrix, method="number")

corrplot(datamatrix, order="hclust", type='upper',tl.srt = 45)

Highly Correlated Variables:

  1. CompRes and DelSpeed
  2. OrdBilling and CompRes
  3. WartyClaim and TechSupport
  4. CompRes and OrdBilling
  5. OrdBilling and DelSpeed
  6. Ecom and SalesFImage

Correlation Coefficient

Compute the partial correlation coefficients, the t-statistics, and corresponding p-values for the independent variables.

  1. To Extract the correlation coefficients, use coeff$estimate
  2. To Extract p-values, use coeff$p.value
coeff <- pcor(data_X[-12], method="pearson") # from library ppcor


corrplot(coeff$estimate, type="upper", order="hclust",
         p.mat = coeff$p.value, sig.level = 0.01, insig = "blank")

From the plot:

  • Dark blue icon: Highly significant correlation
    • Ex: Between sales force image (SalesFImage) and e-commerce (Ecom))
  • Mid-blue icon: Significant correlation
    • Ex: Between delivery speed (DelSpeed) and order billing (OrdBilling)

We can assume that there is a high degree of collinearity between the independent variables.

VIF

  • Variance Inflation Factor (VIF) can statistically assess multicollinearity.
  • Measures how much of the variance of a regression coefficient is inflated due to multicollinearity in the model.
  • VIF value is considered high when it is above 5 or 10.
  • Steps:
    1. Build a regression model with all predictor variables.
    2. Run vif() from the car library.
model <- lm(Satisfaction ~., data = data_X)
vif(model)
##    ProdQual        Ecom     TechSup     CompRes Advertising    ProdLine 
##    1.635797    2.756694    2.976796    4.730448    1.508933    3.488185 
## SalesFImage  ComPricing  WartyClaim  OrdBilling    DelSpeed 
##    3.439420    1.635000    3.198337    2.902999    6.516014
# Rearrange VIFs in descending Order
vif_df = as.data.frame(vif(model))
arrange(vif_df, desc(vif(model)))
##             vif(model)
## DelSpeed      6.516014
## CompRes       4.730448
## ProdLine      3.488185
## SalesFImage   3.439420
## WartyClaim    3.198337
## TechSup       2.976796
## OrdBilling    2.902999
## Ecom          2.756694
## ProdQual      1.635797
## ComPricing    1.635000
## Advertising   1.508933
  • DelSpeed has high VIF value. The linear regression model assumption will be violated.

  • 2 of the most commonly used methods to deal with multicollinearity:

    1. Remove highly correlated variables using VIF or stepwise algorithms
    2. Perform an analysis design like principal component analysis (PCA)/ Factor Analysis on the correlated variables

Factor Analysis Test KMO

Note - exclude your Y (dependent variable = predicted = outcome).

  • KMO (Kaiser-Meyer-Olkin) takes values between 0 and 1.
  • A value greater than 0.5 indicates a fit for factor analysis.
data_fa <- data_X[,-12]
datamatrix <- cor(data_fa)
KMO(r=datamatrix)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = datamatrix)
## Overall MSA =  0.65
## MSA for each item = 
##    ProdQual        Ecom     TechSup     CompRes Advertising    ProdLine 
##        0.51        0.63        0.52        0.79        0.78        0.62 
## SalesFImage  ComPricing  WartyClaim  OrdBilling    DelSpeed 
##        0.62        0.75        0.51        0.76        0.67
  • Since MSA > 0.5, we can run Factor Analysis on this data

Bartlett Test

  • Measures the sampling adequacy and determines the appropriateness of Factor Analysis.
  • The approximate of Chi-square is 619 with 55 degrees of freedom, which is significant at 0.05 Level of significance.
cortest.bartlett(datamatrix, nrow(data_X))
## $chisq
## [1] 619.2726
## 
## $p.value
## [1] 1.79337e-96
## 
## $df
## [1] 55

Factor Analysis

Scree Plot

  • Examine the scree plot of the eigenvalues to determine the number of factors for Factor Analysis.
  • Sharp breaks in the plot suggest the appropriate number of components or factors.
ev <- eigen(cor(data_fa))
ev$values
##  [1] 3.42697133 2.55089671 1.69097648 1.08655606 0.60942409 0.55188378
##  [7] 0.40151815 0.24695154 0.20355327 0.13284158 0.09842702
#Plot a Scree plot using base plot:
Factor = c(1,2,3,4,5,6,7,8,9,10,11)
Eigen_Values <-ev$values
Scree <- data.frame(Factor, Eigen_Values)
plot(Scree, main = "Scree Plot", col= "Blue",ylim=c(0,4))
lines(Scree,col='Red')
abline(h = 1, col="Green")

Varimax Rotation

nfactors <- 4
fit1 <-factanal(data_fa,nfactors,scores = c("regression"),rotation = "varimax")
print(fit1)
## 
## Call:
## factanal(x = data_fa, factors = nfactors, scores = c("regression"),     rotation = "varimax")
## 
## Uniquenesses:
##    ProdQual        Ecom     TechSup     CompRes Advertising    ProdLine 
##       0.682       0.360       0.228       0.178       0.679       0.005 
## SalesFImage  ComPricing  WartyClaim  OrdBilling    DelSpeed 
##       0.017       0.636       0.163       0.347       0.076 
## 
## Loadings:
##             Factor1 Factor2 Factor3 Factor4
## ProdQual                             0.557 
## Ecom                 0.793                 
## TechSup                      0.872   0.102 
## CompRes      0.884   0.142           0.135 
## Advertising  0.190   0.521          -0.110 
## ProdLine     0.502           0.104   0.856 
## SalesFImage  0.119   0.974          -0.130 
## ComPricing           0.225  -0.216  -0.514 
## WartyClaim                   0.894   0.158 
## OrdBilling   0.794   0.101   0.105         
## DelSpeed     0.928   0.189           0.164 
## 
##                Factor1 Factor2 Factor3 Factor4
## SS loadings      2.592   1.977   1.638   1.423
## Proportion Var   0.236   0.180   0.149   0.129
## Cumulative Var   0.236   0.415   0.564   0.694
## 
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 24.26 on 17 degrees of freedom.
## The p-value is 0.113

Diagram

fa_var <-  fa(r=data_fa, nfactors = 4, rotate="varimax",fm="pa")
fa.diagram(fa_var)

head(fa_var$scores)
##             PA1        PA2          PA3         PA4
## [1,] -0.1338871  0.9175166 -1.719604873  0.09135411
## [2,]  1.6297604 -2.0090053 -0.596361722  0.65808192
## [3,]  0.3637658  0.8361736  0.002979966  1.37548765
## [4,] -1.2225230 -0.5491336  1.245473305 -0.64421384
## [5,] -0.4854209 -0.4276223 -0.026980304  0.47360747
## [6,] -0.5950924 -1.3035333 -1.183019401 -0.95913571

Regression

  • Factors can be re-labeled based on your knowledge of data
regdata <- cbind(data_X[12], fa_var$scores)
#Labeling the data

names(regdata) <- c("Satisfaction", "Purchase", "Marketing",
                    "Post_purchase", "Prod_positioning")
head(regdata)
##   Satisfaction   Purchase  Marketing Post_purchase Prod_positioning
## 1          8.2 -0.1338871  0.9175166  -1.719604873       0.09135411
## 2          5.7  1.6297604 -2.0090053  -0.596361722       0.65808192
## 3          8.9  0.3637658  0.8361736   0.002979966       1.37548765
## 4          4.8 -1.2225230 -0.5491336   1.245473305      -0.64421384
## 5          7.1 -0.4854209 -0.4276223  -0.026980304       0.47360747
## 6          4.7 -0.5950924 -1.3035333  -1.183019401      -0.95913571
set.seed(100)
indices= sample(1:nrow(regdata), 0.7*nrow(regdata))
train=regdata[indices,]
test = regdata[-indices,]
#Regression Model using train data
model1 = lm(Satisfaction~., train)
summary(model1)
## 
## Call:
## lm(formula = Satisfaction ~ ., data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.76280 -0.48717  0.06799  0.46459  1.24022 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       6.91852    0.08068  85.757  < 2e-16 ***
## Purchase          0.50230    0.07641   6.574 9.74e-09 ***
## Marketing         0.75488    0.08390   8.998 5.00e-13 ***
## Post_purchase     0.08755    0.08216   1.066    0.291    
## Prod_positioning  0.58074    0.08781   6.614 8.30e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6629 on 65 degrees of freedom
## Multiple R-squared:  0.7261, Adjusted R-squared:  0.7093 
## F-statistic: 43.08 on 4 and 65 DF,  p-value: < 2.2e-16

Testing VIF

print(vif(model1))
##         Purchase        Marketing    Post_purchase Prod_positioning 
##         1.009648         1.008235         1.015126         1.024050

Model Performance metrics

#Model 1:
pred_test1 <- predict(model1, newdata = test, type = "response")
pred_test1
##        6        8       11       13       17       19       26       33 
## 4.975008 5.908267 6.951629 8.677431 6.613838 6.963113 6.313513 6.141338 
##       34       35       37       40       42       44       49       50 
## 6.158993 7.415742 6.589746 6.858206 7.133989 8.533080 8.765145 8.078744 
##       53       56       57       60       65       67       71       73 
## 7.395438 7.468360 8.744402 6.276660 5.936570 6.650322 8.299545 7.685564 
##       75       80       96       97       99      100 
## 7.330191 6.719528 7.540233 6.143172 8.084583 5.799897
test$Satisfaction_Predicted <- pred_test1
head(test[c(1,6)], 10)
##    Satisfaction Satisfaction_Predicted
## 6           4.7               4.975008
## 8           6.3               5.908267
## 11          7.4               6.951629
## 13          8.4               8.677431
## 17          6.4               6.613838
## 19          6.8               6.963113
## 26          6.6               6.313513
## 33          5.4               6.141338
## 34          7.3               6.158993
## 35          6.3               7.415742