library(MASS)
## Warning: package 'MASS' was built under R version 4.0.5
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.7
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::select() masks MASS::select()
library(GGally)
## Warning: package 'GGally' was built under R version 4.0.5
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# 1. Generate Variable & EDA -------------------------------------------------------

# In this section, we will generate dummy dataset for our simulation
# We will assume that IFI consists of only two dimensions, namely dimension A and B
# Dimension A (first sub-index of IFI) also consists of three input indicators
# Dimension B (secind sub-index of IFI) consists of four input indicators

set.seed(21102021)

# create the variance covariance matrix
sigma_A<-rbind(c(1,0.8,0.7), c(0.8,1, 0.9), c(0.7,0.9,1))
sigma_B<-rbind(c(1,0.7,0.75,0.8), c(0.7,1,0.75,0.8), c(0.75,0.75,1,0.9),c(0.8,0.8,0.9,1))

# create the mean vector
mu_A<-c(10, 5, 2) 
mu_B <- c(8.1,7.5,11,5)

# generate the multivariate normal distribution
data_A<-as.data.frame(mvrnorm(n=34, #34 observasi
                              mu=mu_A, Sigma=sigma_A)) %>% 
  arrange(V1) #data diurutkan dari kecil ke besar

data_A
##           V1       V2          V3
## 1   8.060314 3.766230  0.62110750
## 2   8.349427 3.899478  0.40125260
## 3   8.801945 3.759563  1.49625976
## 4   8.841254 2.959679 -0.06285384
## 5   8.903298 4.762103  2.08006776
## 6   8.973170 4.346222  1.09117249
## 7   8.987169 4.184988  1.16776393
## 8   9.111682 4.706975  2.34477793
## 9   9.726779 5.192716  2.38876858
## 10  9.880644 5.683585  2.85084347
## 11  9.903996 3.845549  0.28860395
## 12  9.904142 4.521493  1.48171513
## 13  9.979606 4.861729  1.41177744
## 14  9.993027 5.023735  1.17678679
## 15 10.025249 5.278150  2.88734685
## 16 10.252577 4.248625  0.67272458
## 17 10.308330 4.390691  1.15688805
## 18 10.322231 4.784315  2.06228722
## 19 10.403886 5.682867  2.06884251
## 20 10.442857 6.076200  2.70715781
## 21 10.453826 4.527375  1.75529200
## 22 10.607456 5.975030  3.42520135
## 23 10.659481 4.253073  1.23570053
## 24 10.769230 4.985568  2.09222797
## 25 10.822491 5.655021  3.10350340
## 26 10.951511 5.131998  1.18064362
## 27 10.991703 4.742193  1.03097336
## 28 10.997492 5.772918  2.68863765
## 29 11.321746 5.066543  2.26418902
## 30 11.352267 5.644275  2.14751708
## 31 11.653416 5.647513  2.24260451
## 32 11.670272 6.196614  2.63570637
## 33 11.941981 5.615213  2.01506714
## 34 12.165017 7.145965  3.57822112
data_B<-as.data.frame(mvrnorm(n=34, mu=mu_B, Sigma=sigma_B)) %>% 
  arrange(V4)

data_B
##          V1       V2        V3       V4
## 1  6.746662 5.475464  8.667404 1.985876
## 2  6.398852 6.307941  9.030900 2.817570
## 3  6.149399 7.687241  9.364629 2.900476
## 4  6.012812 7.949263  9.014755 3.220847
## 5  8.169693 7.418500 10.404226 3.776148
## 6  8.108144 6.387218  9.180112 3.893488
## 7  6.936631 6.020874 10.452149 4.077512
## 8  8.830227 7.718388 10.309080 4.163707
## 9  8.567042 6.960023  9.858251 4.221691
## 10 7.119075 7.068528  9.433241 4.234232
## 11 7.773713 6.705047 10.295575 4.235153
## 12 7.320785 5.986648 10.377317 4.746006
## 13 7.982613 8.288166 10.061391 4.784432
## 14 7.751169 7.896337 10.708707 4.865225
## 15 7.946412 8.051586 11.234394 4.876658
## 16 7.908821 8.748182 10.515884 4.886957
## 17 8.702276 6.675432 10.924299 4.887201
## 18 8.736747 7.916392 11.379632 5.030690
## 19 7.844875 7.984728 11.788480 5.073620
## 20 9.836384 7.214103 11.609466 5.124974
## 21 7.946139 8.103361 11.335065 5.176824
## 22 8.615952 7.847164 11.653299 5.410091
## 23 9.058074 7.810235 11.727991 5.412414
## 24 8.741835 8.212784 11.034568 5.436669
## 25 8.061654 8.094959 11.329944 5.527612
## 26 7.737040 7.543972 11.314776 5.667719
## 27 8.938332 7.984169 11.789868 5.872118
## 28 8.117245 6.678297 11.689216 6.070793
## 29 8.830796 8.678087 11.643901 6.236033
## 30 8.967668 8.641723 11.519852 6.337825
## 31 8.734360 9.332130 12.129725 6.364449
## 32 9.641468 8.668167 12.269787 6.495610
## 33 9.497850 7.894292 12.513680 6.547198
## 34 8.369085 8.241846 11.692097 6.629042
ggpairs(data_A)

ggpairs(data_B)

#Standard Normal Transform
data_A_std <- data_A %>% 
  mutate(V1 = (V1-mean(V1))/sd(V1),
         V2 = (V2-mean(V2))/sd(V2),
         V3 = (V3-mean(V3))/sd(V3))

data_B_std <- data_B %>% 
  mutate(V1 = (V1-mean(V1))/sd(V1),
         V2 = (V2-mean(V2))/sd(V2),
         V3 = (V3-mean(V3))/sd(V3),
         V4 = (V4-mean(V4))/sd(V4))

#Transformation doesn't really change the pattern of the data: compare 2 graphs below
data_A %>% ggplot(aes(x=1:34,y=V1))+geom_line()

data_A_std %>% ggplot(aes(x=1:34,y=V1))+geom_line()

cov(data_A)
##           V1        V2        V3
## V1 1.0642278 0.6577953 0.5009833
## V2 0.6577953 0.7410407 0.6827315
## V3 0.5009833 0.6827315 0.8283401
cov(data_A_std)
##           V1        V2        V3
## V1 1.0000000 0.7407172 0.5335825
## V2 0.7407172 1.0000000 0.8714141
## V3 0.5335825 0.8714141 1.0000000
cor(data_A)
##           V1        V2        V3
## V1 1.0000000 0.7407172 0.5335825
## V2 0.7407172 1.0000000 0.8714141
## V3 0.5335825 0.8714141 1.0000000
cor(data_A_std)
##           V1        V2        V3
## V1 1.0000000 0.7407172 0.5335825
## V2 0.7407172 1.0000000 0.8714141
## V3 0.5335825 0.8714141 1.0000000
# First Stage -------------------------------------------------------------


#PCA Dimension A
Dimensi_A <- princomp(data_A_std,cor=F) #Since our data is already standardized, therefore we don't need to base our PCA calculation on correlation matrix
summary(Dimensi_A) #Eigen Value
## Importance of components:
##                           Comp.1    Comp.2     Comp.3
## Standard deviation     1.5386602 0.6814502 0.28269241
## Proportion of Variance 0.8130723 0.1594821 0.02744556
## Cumulative Proportion  0.8130723 0.9725544 1.00000000
Dimensi_A$sdev
##    Comp.1    Comp.2    Comp.3 
## 1.5386602 0.6814502 0.2826924
Dimensi_A$loadings
## 
## Loadings:
##    Comp.1 Comp.2 Comp.3
## V1  0.533  0.792  0.298
## V2  0.622 -0.128 -0.773
## V3  0.574 -0.597  0.561
## 
##                Comp.1 Comp.2 Comp.3
## SS loadings     1.000  1.000  1.000
## Proportion Var  0.333  0.333  0.333
## Cumulative Var  0.333  0.667  1.000
Dimensi_A
## Call:
## princomp(x = data_A_std, cor = F)
## 
## Standard deviations:
##    Comp.1    Comp.2    Comp.3 
## 1.5386602 0.6814502 0.2826924 
## 
##  3  variables and  34 observations.
score_A <- (Dimensi_A$scores %*% (matrix(Dimensi_A$sdev^2)))/sum(Dimensi_A$sdev^2)
weight_A <- (Dimensi_A$loadings %*% matrix(Dimensi_A$sdev^2))/sum(Dimensi_A$sdev^2)

#score_A = the score of dimension A, which is the first sub-index of IFI
#weight_A = the weight attached to each input indicators that build dimension A

#PCA Dimension B
Dimensi_B <- princomp(data_B_std,cor=F)
summary(Dimensi_B) #Eigen Value
## Importance of components:
##                           Comp.1    Comp.2     Comp.3     Comp.4
## Standard deviation     1.7239547 0.7571253 0.50909442 0.27913647
## Proportion of Variance 0.7655203 0.1476524 0.06675775 0.02006957
## Cumulative Proportion  0.7655203 0.9131727 0.97993043 1.00000000
Dimensi_B$sdev
##    Comp.1    Comp.2    Comp.3    Comp.4 
## 1.7239547 0.7571253 0.5090944 0.2791365
Dimensi_B$loadings
## 
## Loadings:
##    Comp.1 Comp.2 Comp.3 Comp.4
## V1  0.485  0.482  0.728       
## V2  0.421 -0.857  0.290       
## V3  0.540  0.171 -0.433 -0.701
## V4  0.543        -0.445  0.709
## 
##                Comp.1 Comp.2 Comp.3 Comp.4
## SS loadings      1.00   1.00   1.00   1.00
## Proportion Var   0.25   0.25   0.25   0.25
## Cumulative Var   0.25   0.50   0.75   1.00
Dimensi_B
## Call:
## princomp(x = data_B_std, cor = F)
## 
## Standard deviations:
##    Comp.1    Comp.2    Comp.3    Comp.4 
## 1.7239547 0.7571253 0.5090944 0.2791365 
## 
##  4  variables and  34 observations.
score_B <- (Dimensi_B$scores %*% (matrix(Dimensi_B$sdev^2)))/sum(Dimensi_B$sdev^2)
weight_B <- (Dimensi_B$loadings %*% matrix(Dimensi_B$sdev^2))/sum(Dimensi_B$sdev^2)


# Second Stage ------------------------------------------------------------

data_IFI <- data.frame(score_A,score_B)

data_IFI
##        score_A     score_B
## 1  -2.33516824 -3.10258009
## 2  -2.19771305 -2.64806683
## 3  -1.55857266 -2.29196662
## 4  -2.63086557 -2.32003084
## 5  -0.71427424 -0.58720166
## 6  -1.32037099 -1.29235102
## 7  -1.36700413 -1.43834072
## 8  -0.51677206 -0.06782368
## 9   0.10222908 -0.53817903
## 10  0.64790318 -1.42758104
## 11 -1.41914849 -0.84114981
## 12 -0.54753873 -1.03444706
## 13 -0.35236728 -0.24719595
## 14 -0.35756735 -0.18442923
## 15  0.52448850  0.16025524
## 16 -0.84677368  0.03491972
## 17 -0.53369322  0.11220465
## 18  0.07101375  0.65209427
## 19  0.60302095  0.37434702
## 20  1.10782356  1.18140456
## 21 -0.12555827  0.31878564
## 22  1.44913894  0.81354107
## 23 -0.38111065  1.06529989
## 24  0.43820481  0.73884774
## 25  1.25824358  0.50110744
## 26  0.22990015  0.24456392
## 27 -0.02169910  1.23289633
## 28  1.24172280  0.52515178
## 29  0.85901600  1.41676857
## 30  1.13758629  1.46862495
## 31  1.34548660  1.75460305
## 32  1.81782433  2.17123860
## 33  1.39015035  2.02370605
## 34  3.00244485  1.23098308
# Now that we already have the sub-index score which is score A and score B
# we will proceed to perform the second stage PCA using both score
# However, we need to standardize the obtained dimension scores first, just like in the first stage

data_IFI
##        score_A     score_B
## 1  -2.33516824 -3.10258009
## 2  -2.19771305 -2.64806683
## 3  -1.55857266 -2.29196662
## 4  -2.63086557 -2.32003084
## 5  -0.71427424 -0.58720166
## 6  -1.32037099 -1.29235102
## 7  -1.36700413 -1.43834072
## 8  -0.51677206 -0.06782368
## 9   0.10222908 -0.53817903
## 10  0.64790318 -1.42758104
## 11 -1.41914849 -0.84114981
## 12 -0.54753873 -1.03444706
## 13 -0.35236728 -0.24719595
## 14 -0.35756735 -0.18442923
## 15  0.52448850  0.16025524
## 16 -0.84677368  0.03491972
## 17 -0.53369322  0.11220465
## 18  0.07101375  0.65209427
## 19  0.60302095  0.37434702
## 20  1.10782356  1.18140456
## 21 -0.12555827  0.31878564
## 22  1.44913894  0.81354107
## 23 -0.38111065  1.06529989
## 24  0.43820481  0.73884774
## 25  1.25824358  0.50110744
## 26  0.22990015  0.24456392
## 27 -0.02169910  1.23289633
## 28  1.24172280  0.52515178
## 29  0.85901600  1.41676857
## 30  1.13758629  1.46862495
## 31  1.34548660  1.75460305
## 32  1.81782433  2.17123860
## 33  1.39015035  2.02370605
## 34  3.00244485  1.23098308
data_IFI_std <- data_IFI %>% 
  mutate(score_A = (score_A-mean(score_A))/sd(score_A),
         score_B = (score_B-mean(score_B))/sd(score_B))

IFI <- princomp(data_IFI_std,cor=F)

summary(IFI) #Eigen Value
## Importance of components:
##                           Comp.1    Comp.2
## Standard deviation     1.3359346 0.3955441
## Proportion of Variance 0.9194019 0.0805981
## Cumulative Proportion  0.9194019 1.0000000
IFI$sdev
##    Comp.1    Comp.2 
## 1.3359346 0.3955441
IFI$loadings
## 
## Loadings:
##         Comp.1 Comp.2
## score_A  0.707  0.707
## score_B  0.707 -0.707
## 
##                Comp.1 Comp.2
## SS loadings       1.0    1.0
## Proportion Var    0.5    0.5
## Cumulative Var    0.5    1.0
score_IFI <- (IFI$scores %*% (matrix(IFI$sdev^2)))/sum(IFI$sdev^2)
weight_IFI <- (IFI$loadings %*% matrix(IFI$sdev^2))/sum(IFI$sdev^2)

# Viz ---------------------------------------------------------------------

# Finally, now that we have obtained IFI from previous calculations,
# we will use line chart to visualize the result
# however, to make it easier to analyze the dataa, we will transform the obtained score of each dimension and IFI to its normal CDF distribution form
# so that the value will range from 0-1

data_Viz <- data.frame(dim_A = pnorm(score_A, mean=mean(score_A), sd=sd(score_A)),
                       dim_B = pnorm(score_B, mean=mean(score_B), sd=sd(score_B)),
                       IFI = pnorm(score_IFI, mean=mean(score_IFI), 
                       
sd=sd(score_IFI)))
ggplot(data_Viz, aes(x=1:34)) + 
  geom_line(aes(y = dim_A, col = "red")) + 
  geom_line(aes(y = dim_B, col = "green")) +
  geom_line(aes(y = IFI, col = "blue"))