Implementasi Structural Equation Modeling (SEM) dengan Pendekatan Confirmatory Factor Analysis (CFA) pada Customer Survey Report Dataset

packages <- c(
  "lavaan",
  "semPlot",
  "psych",
  "MVN",
  "car",
  "ggplot2",
  "dplyr",
  "corrplot",
  "reshape2",
  "FactoMineR",
  "factoextra"
)

installed <- packages %in% rownames(installed.packages())

if(any(!installed)){
  install.packages(
    packages[!installed],
    dependencies = TRUE
  )
}

library(lavaan)
library(semPlot)
library(psych)
library(MVN)
library(car)
library(ggplot2)
library(dplyr)
library(corrplot)
library(reshape2)
library(FactoMineR)
library(factoextra)

1. Load Dataset

data_customer <- read.csv(
  "D:/UAS tableau/customer_feedback_survey.csv"
)

cat("JUMLAH DATA:")

## JUMLAH DATA:

cat("Jumlah baris:", nrow(data_customer))

## Jumlah baris: 10000

cat("Jumlah kolom:", ncol(data_customer))

## Jumlah kolom: 12

2. Preprocessing

# NORMALISASI NAMA KOLOM
names(data_customer) <- gsub(
  "\\.",
  "_",
  names(data_customer)
)

names(data_customer) <- gsub(
  " ",
  "_",
  names(data_customer)
)

names(data_customer) <- gsub(
  "[^A-Za-z0-9_]",
  "",
  names(data_customer)
)

cat("NAMA KOLOM DATASET")

## NAMA KOLOM DATASET

print(names(data_customer))

##  [1] "Customer_ID"               "Overall_Satisfaction"     
##  [3] "Product_Quality"           "Service_Speed"            
##  [5] "Support_Helpfulness"       "Website_Ease_of_Use"      
##  [7] "Delivery_Speed"            "Price_Competitiveness"    
##  [9] "Recommendation_Likelihood" "Experience_with_Brand"    
## [11] "Feedback_Comments"         "Contact_Channel"

3. Konversi Data Kategorik

data_customer$Experience_with_Brand <-
  ifelse(
    data_customer$Experience_with_Brand == "Positive", 5,
    
    ifelse(
      data_customer$Experience_with_Brand == "Neutral", 3,
      
      ifelse(
        data_customer$Experience_with_Brand == "Negative", 1, NA
      )
    )
  )

data_customer$Experience_with_Brand <-
  as.numeric(data_customer$Experience_with_Brand)

4. Pemilihan Variabel

data <- data.frame(
  
  X1 = data_customer$Product_Quality,
  
  X2 = data_customer$Service_Speed,
  
  X3 = data_customer$Support_Helpfulness,
  
  X4 = data_customer$Website_Ease_of_Use
)

5. Cek Missing Value

cat("MISSING VALUE")

## MISSING VALUE

print(colSums(is.na(data)))

## X1 X2 X3 X4 
##  0  0  0  0

    # HAPUS MISSING VALUE
data <- na.omit(data)

cat("JUMLAH DATA SETELAH na.omit()")

## JUMLAH DATA SETELAH na.omit()

print(nrow(data))

## [1] 10000

6. Cek Struktur Data

cat("STRUKTUR DATA")

## STRUKTUR DATA

str(data)

## 'data.frame':    10000 obs. of  4 variables:
##  $ X1: int  2 5 3 4 1 1 5 5 2 1 ...
##  $ X2: int  4 1 5 1 4 4 5 2 5 2 ...
##  $ X3: int  1 3 5 1 4 2 4 2 1 2 ...
##  $ X4: int  1 5 5 4 4 3 1 5 3 4 ...

7. Standarisasi Data

data_scaled <- as.data.frame(
  scale(data)
)

8. Statistik Deskriptif

cat("STATISTIK DESKRIPTIF")

## STATISTIK DESKRIPTIF

desc <- describe(data_scaled)

print(round(
  desc[, c(
    "mean",
    "sd",
    "min",
    "max",
    "skew",
    "kurtosis"
  )], 3))

##    mean sd   min  max  skew kurtosis
## X1    0  1 -1.42 1.41  0.00    -1.30
## X2    0  1 -1.40 1.41  0.00    -1.31
## X3    0  1 -1.42 1.40 -0.01    -1.31
## X4    0  1 -1.42 1.42 -0.01    -1.28

9. Visualisasi

data_long <- melt(data_scaled)

p1 <- ggplot(
  data_long,
  aes(
    x = value,
    fill = variable
  )
) +
  
  geom_histogram(
    bins = 10,
    color = "white"
  ) +
  
  facet_wrap(
    ~variable,
    scales = "free"
  ) +
  
  theme_bw() +
  
  labs(
    title = "Distribusi Variabel",
    x = "Nilai",
    y = "Frekuensi"
  )

print(p1)

cor_matrix <- cor(data_scaled)

cat("MATRKS KORELASI")

## MATRKS KORELASI

print(round(cor_matrix, 3))

##       X1     X2     X3     X4
## X1 1.000  0.003  0.017  0.004
## X2 0.003  1.000  0.010 -0.014
## X3 0.017  0.010  1.000 -0.021
## X4 0.004 -0.014 -0.021  1.000

corrplot(
  cor_matrix,
  method = "color",
  type = "upper",
  addCoef.col = "yellow",
  tl.col = "black",
  tl.srt = 45
)

10. Uji Asumsi

# KMO
kmo_result <- KMO(cor_matrix)

cat("KMO TEST")

## KMO TEST

cat(
  "MSA :",
  round(kmo_result$MSA, 3),
  "\n"
)

## MSA : 0.503

cat("MSA PER VARIABEL")

## MSA PER VARIABEL

print(round(kmo_result$MSAi, 3))

##    X1    X2    X3    X4 
## 0.497 0.511 0.502 0.502

    # BARTLETT TEST
bartlett_result <- cortest.bartlett(
  cor_matrix,
  n = nrow(data_scaled)
)

cat("BARTLETT TEST")

## BARTLETT TEST

cat(
  "Chi-Square :",
  round(bartlett_result$chisq, 3),
  "\n"
)

## Chi-Square : 10.512

cat(
  "df :",
  bartlett_result$df,
  "\n"
)

## df : 6

cat(
  "p-value :",
  round(bartlett_result$p.value, 4),
  "\n"
)

## p-value : 0.1047

    # NORMALITAS MULTIVARIAT
cat("UJI NORMALITAS MULTIVARIAT")

## UJI NORMALITAS MULTIVARIAT

result_mardia <- psych::mardia(
  data_scaled
)

print(result_mardia)

## Call: psych::mardia(x = data_scaled)
## 
## Mardia tests of multivariate skew and kurtosis
## Use describe(x) the to get univariate tests
## n.obs = 10000   num.vars =  4 
## b1p =  0.01   skew =  10.04  with probability  <=  0.97
##  small sample skew =  10.05  with probability <=  0.97
## b2p =  18.8   kurtosis =  -37.51  with probability <=  0

    # MULTIKOLINIERITAS
cat("UJI MULTIKOLINIERITAS")

## UJI MULTIKOLINIERITAS

model_vif <- lm(
  X1 ~ X2 + X3 + X4,
  data = data_scaled
)

vif_result <- vif(model_vif)
print(round(vif_result, 3))

##    X2    X3    X4 
## 1.000 1.001 1.001

11. PCA

pca_result <- PCA(
  data_scaled,
  scale.unit = TRUE,
  graph = FALSE
)

cat("EIGENVALUE PCA")

## EIGENVALUE PCA

print(round(pca_result$eig, 3))

##        eigenvalue percentage of variance cumulative percentage of variance
## comp 1      1.034                 25.840                            25.840
## comp 2      1.005                 25.135                            50.975
## comp 3      0.990                 24.758                            75.733
## comp 4      0.971                 24.267                           100.000

    # SCREE PLOT
fviz_eig(
  pca_result,
  addlabels = TRUE,
  ylim = c(0, 80)
)

12. EFA

eig_value <- pca_result$eig[,1]

n_factor <- sum(eig_value > 1)

cat("JUMLAH FAKTOR")

## JUMLAH FAKTOR

print(n_factor)

## [1] 2

efa_result <- fa(
  data_scaled,
  nfactors = 1,
  rotate = "varimax",
  fm = "pa"
)

cat("HASIL EFA")

## HASIL EFA

print(efa_result)

## Factor Analysis using method =  pa
## Call: fa(r = data_scaled, nfactors = 1, rotate = "varimax", fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
##      PA1     h2   u2 com
## X1  0.05 0.0030 1.00   1
## X2  0.08 0.0059 0.99   1
## X3  0.17 0.0286 0.97   1
## X4 -0.12 0.0147 0.99   1
## 
##                 PA1
## SS loadings    0.05
## Proportion Var 0.01
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 factor is sufficient.
## 
## df null model =  6  with the objective function =  0 with Chi Square =  10.51
## df of  the model are 2  and the objective function was  0 
## 
## The root mean square of the residuals (RMSR) is  0.01 
## The df corrected root mean square of the residuals is  0.01 
## 
## The harmonic n.obs is  10000 with the empirical chi square  2  with prob <  0.37 
## The total n.obs was  10000  with Likelihood Chi Square =  2.04  with prob <  0.36 
## 
## Tucker Lewis Index of factoring reliability =  0.972
## RMSEA index =  0.001  and the 90 % confidence intervals are  0 0.02
## BIC =  -16.38
## Fit based upon off diagonal values = 0.81
## Measures of factor score adequacy             
##                                                     PA1
## Correlation of (regression) scores with factors    0.22
## Multiple R square of scores with factors           0.05
## Minimum correlation of possible factor scores     -0.90

13. FIRST ORDER CFA

model_first_order <- '

KP =~
X1 + X2 + X3 + X4
'

cat("MODEL FIRST ORDER CFA")

## MODEL FIRST ORDER CFA

cat(model_first_order)

## 
## 
## KP =~
## X1 + X2 + X3 + X4

fit_first <- cfa(
  model_first_order,
  data = data_scaled,
  estimator = "ML",
  std.lv = TRUE
)

inspect(fit_first, "converged")

## [1] TRUE

summary(
  fit_first,
  standardized = TRUE,
  fit.measures = TRUE
)

## lavaan 0.6-21 ended normally after 39 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                         8
## 
##   Number of observations                         10000
## 
## Model Test User Model:
##                                                       
##   Test statistic                                 1.882
##   Degrees of freedom                                 2
##   P-value (Chi-square)                           0.390
## 
## Model Test Baseline Model:
## 
##   Test statistic                                10.515
##   Degrees of freedom                                 6
##   P-value                                        0.105
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    1.000
##   Tucker-Lewis Index (TLI)                       1.078
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)             -56751.225
##   Loglikelihood unrestricted model (H1)     -56750.284
##                                                       
##   Akaike (AIC)                              113518.449
##   Bayesian (BIC)                            113576.132
##   Sample-size adjusted Bayesian (SABIC)     113550.709
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.000
##   90 Percent confidence interval - lower         0.000
##   90 Percent confidence interval - upper         0.019
##   P-value H_0: RMSEA <= 0.050                    1.000
##   P-value H_0: RMSEA >= 0.080                    0.000
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.004
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   KP =~                                                                 
##     X1                0.058    0.049    1.183    0.237    0.058    0.058
##     X2                0.055    0.048    1.146    0.252    0.055    0.055
##     X3                0.236    0.170    1.384    0.166    0.236    0.236
##     X4               -0.090    0.067   -1.335    0.182   -0.090   -0.090
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .X1                0.996    0.015   65.644    0.000    0.996    0.997
##    .X2                0.997    0.015   66.379    0.000    0.997    0.997
##    .X3                0.944    0.081   11.635    0.000    0.944    0.945
##    .X4                0.992    0.018   53.697    0.000    0.992    0.992
##     KP                1.000                               1.000    1.000

    # GOF 1
fit_first_indices <- fitMeasures(
  fit_first,
  c(
    "chisq",
    "df",
    "pvalue",
    "cfi",
    "tli",
    "rmsea",
    "srmr",
    "gfi",
    "agfi",
    "nfi"
  )
)

gof_first <- data.frame(
  Indeks = names(fit_first_indices),
  Nilai = round(
    as.numeric(fit_first_indices),
    3
  )
)

cat("GOODNESS OF FIT")

## GOODNESS OF FIT

print(gof_first)

##    Indeks Nilai
## 1   chisq 1.882
## 2      df 2.000
## 3  pvalue 0.390
## 4     cfi 1.000
## 5     tli 1.078
## 6   rmsea 0.000
## 7    srmr 0.004
## 8     gfi 1.000
## 9    agfi 1.000
## 10    nfi 0.821

    # CMIN/DF 1
cmin_df <- fitMeasures(
  fit_first,
  "chisq"
) /
  fitMeasures(
    fit_first,
    "df"
  )

cat(
  "\nCMIN/DF:",
  round(cmin_df, 3),
  "\n"
)

## 
## CMIN/DF: 0.941

    # LOADING FACTOR 1
std_solution <- standardizedSolution(
  fit_first
)

loading_factor <- std_solution[
  std_solution$op == "=~",
]

cat("LOADING FACTOR")

## LOADING FACTOR

print(
  loading_factor[, c(
    "lhs",
    "rhs",
    "est.std",
    "se",
    "z",
    "pvalue"
  )]
)

##   lhs rhs est.std    se      z pvalue
## 1  KP  X1   0.058 0.049  1.183  0.237
## 2  KP  X2   0.055 0.048  1.146  0.252
## 3  KP  X3   0.236 0.170  1.384  0.166
## 4  KP  X4  -0.090 0.067 -1.335  0.182

    # CR & AVE 1
cat("CR DAN AVE")

## CR DAN AVE

loading_data <- loading_factor

lambda <- loading_data$est.std

error_var <- 1 - lambda^2

    # CONSTRUCT RELIABILITY
CR <- (sum(lambda)^2) /
  (
    (sum(lambda)^2) +
      sum(error_var)
  )

    # AVERAGE VARIANCE EXTRACTED
AVE <- sum(lambda^2) /
  (
    sum(lambda^2) +
      sum(error_var)
  )

hasil_cr_ave <- data.frame(
  Konstruk = "KepuasanPelanggan",
  CR = round(CR, 3),
  AVE = round(AVE, 3)
)

print(hasil_cr_ave)

##            Konstruk    CR   AVE
## 1 KepuasanPelanggan 0.017 0.018

# --- MODIFICATION INDICES 1 ---
mi_first <- modificationIndices(
  fit_first,
  sort. = TRUE
)

cat("TOP MODIFICATION INDICES")

## TOP MODIFICATION INDICES

print(head(mi_first, 10))

##    lhs op rhs    mi    epc sepc.lv sepc.all sepc.nox
## 14  X2 ~~  X4 1.607 -0.017  -0.017   -0.017   -0.017
## 11  X1 ~~  X3 1.607  0.048   0.048    0.049    0.049
## 13  X2 ~~  X3 1.580 -0.044  -0.044   -0.045   -0.045
## 12  X1 ~~  X4 1.580  0.018   0.018    0.018    0.018
## 15  X3 ~~  X4 0.000 -0.001  -0.001   -0.001   -0.001
## 10  X1 ~~  X2 0.000  0.000   0.000    0.000    0.000

semPaths(
  fit_first,
  
  what = "std",
  whatLabels = "std",
  
  layout = "tree",
  
  style = "lisrel",
  
  residuals = FALSE,
  
  intercepts = FALSE,
  
  edge.label.cex = 1.8,
  
  sizeLat = 14,
  sizeMan = 10,
  
  label.cex = 2,
  
  nCharNodes = 0,
  
  fade = FALSE,
  
  mar = c(6,6,6,6)
)

title("First Order - CFA")

14. SECOND ORDER CFA

model_second_order <- '

    # FIRST ORDER LATENT
KlP =~
X1 + X2

KL =~
X3 + X4

    # SECOND ORDER LATENT
KP =~
KlP +
KL
'

cat("MODEL SECOND ORDER CFA")

## MODEL SECOND ORDER CFA

cat(model_second_order)

## 
## 
##     # FIRST ORDER LATENT
## KlP =~
## X1 + X2
## 
## KL =~
## X3 + X4
## 
##     # SECOND ORDER LATENT
## KP =~
## KlP +
## KL

fit_second <- cfa(
  model_second_order,
  data = data_scaled,
  estimator = "ML",
  std.lv = TRUE
)

inspect(fit_second, "converged")

## [1] TRUE

summary(
  fit_second,
  standardized = TRUE,
  fit.measures = TRUE
)

## lavaan 0.6-21 ended normally after 387 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        10
## 
##   Number of observations                         10000
## 
## Model Test User Model:
##                                                       
##   Test statistic                                 1.882
##   Degrees of freedom                                 0
## 
## Model Test Baseline Model:
## 
##   Test statistic                                10.515
##   Degrees of freedom                                 6
##   P-value                                        0.105
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.583
##   Tucker-Lewis Index (TLI)                       1.000
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)             -56751.225
##   Loglikelihood unrestricted model (H1)     -56750.284
##                                                       
##   Akaike (AIC)                              113522.449
##   Bayesian (BIC)                            113594.552
##   Sample-size adjusted Bayesian (SABIC)     113562.774
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.000
##   90 Percent confidence interval - lower         0.000
##   90 Percent confidence interval - upper         0.000
##   P-value H_0: RMSEA <= 0.050                       NA
##   P-value H_0: RMSEA >= 0.080                       NA
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.004
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   KlP =~                                                                
##     X1                0.007       NA                      0.059    0.059
##     X2                0.007       NA                      0.056    0.056
##   KL =~                                                                 
##     X3                0.031       NA                      0.237    0.237
##     X4               -0.012       NA                     -0.090   -0.090
##   KP =~                                                                 
##     KlP               8.321       NA                      0.993    0.993
##     KL                7.531       NA                      0.991    0.991
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .X1                0.996       NA                      0.996    0.996
##    .X2                0.997       NA                      0.997    0.997
##    .X3                0.944       NA                      0.944    0.944
##    .X4                0.992       NA                      0.992    0.992
##    .KlP               1.000                               0.014    0.014
##    .KL                1.000                               0.017    0.017
##     KP                1.000                               1.000    1.000

    # GOF 2 
fit_second_indices <- fitMeasures(
  fit_second,
  c(
    "chisq",
    "df",
    "pvalue",
    "cfi",
    "tli",
    "rmsea",
    "srmr",
    "gfi",
    "agfi",
    "nfi"
  )
)

gof_second <- data.frame(
  Indeks = names(fit_second_indices),
  Nilai = round(
    as.numeric(fit_second_indices),
    3
  )
)

cat("GOODNESS OF FIT SECOND ORDER")

## GOODNESS OF FIT SECOND ORDER

print(gof_second)

##    Indeks Nilai
## 1   chisq 1.882
## 2      df 0.000
## 3  pvalue    NA
## 4     cfi 0.583
## 5     tli 1.000
## 6   rmsea 0.000
## 7    srmr 0.004
## 8     gfi 1.000
## 9    agfi 1.000
## 10    nfi 1.000

    # CMIN/DF 2
cmin_df_second <- fitMeasures(
  fit_second,
  "chisq"
) /
  fitMeasures(
    fit_second,
    "df"
  )

cat(
  "\nCMIN/DF :",
  round(cmin_df_second, 3),
  "\n"
)

## 
## CMIN/DF : Inf

    # LOADING FACTOR 2
std_second <- standardizedSolution(
  fit_second
)

loading_second <- std_second[
  std_second$op == "=~",
]

cat("LOADING FACTOR SECOND ORDER")

## LOADING FACTOR SECOND ORDER

print(
  loading_second[, c(
    "lhs",
    "rhs",
    "est.std",
    "se",
    "z",
    "pvalue"
  )]
)

##   lhs rhs est.std se  z pvalue
## 1 KlP  X1   0.059 NA NA     NA
## 2 KlP  X2   0.056 NA NA     NA
## 3  KL  X3   0.237 NA NA     NA
## 4  KL  X4  -0.090 NA NA     NA
## 5  KP KlP   0.993 NA NA     NA
## 6  KP  KL   0.991 NA NA     NA

    # CR & AVE 2
cat("CR DAN AVE SECOND ORDER")

## CR DAN AVE SECOND ORDER

loading_data <- loading_second

constructs <- unique(
  loading_data$lhs
)

hasil_cr_ave_second <- data.frame()

for(k in constructs){
  
  lambda <- loading_data$est.std[
    loading_data$lhs == k
  ]
  
  error_var <- 1 - lambda^2
  
  # CONSTRUCT RELIABILITY
  CR <- (sum(lambda)^2) /
    (
      (sum(lambda)^2) +
        sum(error_var)
    )
  
  # AVERAGE VARIANCE EXTRACTED
  AVE <- sum(lambda^2) /
    (
      sum(lambda^2) +
        sum(error_var)
    )
  
  hasil_cr_ave_second <- rbind(
    hasil_cr_ave_second,
    
    data.frame(
      Konstruk = k,
      CR = round(CR, 3),
      AVE = round(AVE, 3)
    )
  )
}

print(hasil_cr_ave_second)

##   Konstruk    CR   AVE
## 1      KlP 0.007 0.003
## 2       KL 0.011 0.032
## 3       KP 0.992 0.984

cat("TOP MODIFICATION INDICES SECOND ORDER\n")

## TOP MODIFICATION INDICES SECOND ORDER

mi_second <- try(
  modificationIndices(
    fit_second,
    sort. = TRUE
  ),
  silent = TRUE
)

if(inherits(mi_second, "try-error")){

  cat(
    "Modification Indices tidak dapat dihitung karena information matrix singular.\n"
  )

} else {

  print(
    head(mi_second, 10)
  )

}

## Modification Indices tidak dapat dihitung karena information matrix singular.

semPaths(
  fit_second,
  
  what = "std",
  whatLabels = "std",
  
  layout = "tree",
  
  style = "lisrel",
  
  residuals = FALSE,
  
  intercepts = FALSE,
  
  edge.label.cex = 1.8,
  
  sizeLat = 14,
  sizeMan = 10,
  
  label.cex = 2,
  
  nCharNodes = 0,
  
  fade = FALSE,
  
  mar = c(6,6,6,6)
)
  
title("Second Order - CFA")

Implementasi Structural Equation Modeling (SEM) dengan Pendekatan Confirmatory Factor Analysis (CFA) pada Customer Survey Report Dataset

Analisis Multivariat_Modul 5

Kelompok 8_Zahra Brillianty P.

1. Load Dataset

2. Preprocessing

3. Konversi Data Kategorik

4. Pemilihan Variabel

5. Cek Missing Value

6. Cek Struktur Data

7. Standarisasi Data

8. Statistik Deskriptif

9. Visualisasi

10. Uji Asumsi

11. PCA

12. EFA

13. FIRST ORDER CFA

14. SECOND ORDER CFA