#install.packages("readxl")
#install.packages("pastecs")
#install.packages("TSstudio")
library(readxl)
## Warning: package 'readxl' was built under R version 4.0.5
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.7
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(pastecs)
## Warning: package 'pastecs' was built under R version 4.0.5
##
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
##
## first, last
## The following object is masked from 'package:tidyr':
##
## extract
library(TSstudio)
## Warning: package 'TSstudio' was built under R version 4.0.5
library(xts)
## Warning: package 'xts' was built under R version 4.0.5
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.0.5
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:pastecs':
##
## first, last
## The following objects are masked from 'package:dplyr':
##
## first, last
library(MASS)
## Warning: package 'MASS' was built under R version 4.0.5
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(GGally)
## Warning: package 'GGally' was built under R version 4.0.5
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
#read excel file second worksheet
composite_index <- read_excel("Composite Index -1.xlsx", sheet=2)
## New names:
## * `` -> ...1
#create time series index from composit_index dataframe
composite_index_xts <- xts(composite_index[-1], composite_index[[1]])
# assuming you have a 1 column dataframe called "df"
cov_composite_index <- cov(composite_index_xts)
#plot the summary result for internet, mobile, credit, debit, emoney
summary_table <- stat.desc(cov_composite_index)
summary_table
## Internet Mobile Credit Debit
## nbr.val 5.000000e+00 5.000000e+00 5.000000e+00 5.000000e+00
## nbr.null 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## nbr.na 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## min -7.545116e+02 -6.550128e+05 -4.995375e+06 -2.078418e+05
## max 8.213714e+04 7.241868e+07 5.151259e+04 2.154378e+07
## range 8.289166e+04 7.307369e+07 5.046888e+06 2.175162e+07
## sum 9.557770e+04 8.438149e+07 -5.807472e+06 2.538406e+07
## median 3.234270e+03 2.760510e+06 -2.078418e+05 1.284374e+06
## mean 1.911554e+04 1.687630e+07 -1.161494e+06 5.076812e+06
## SE.mean 1.589086e+04 1.400967e+07 9.665227e+05 4.150769e+06
## CI.mean.0.95 4.412009e+04 3.889709e+07 2.683497e+06 1.152438e+07
## var 1.262597e+09 9.813546e+14 4.670830e+12 8.614443e+13
## std.dev 3.553304e+04 3.132658e+07 2.161210e+06 9.281402e+06
## coef.var 1.858856e+00 1.856247e+00 -1.860715e+00 1.828195e+00
## Emoney
## nbr.val 5.000000e+00
## nbr.null 0.000000e+00
## nbr.na 0.000000e+00
## min -4.995375e+06
## max 5.467837e+08
## range 5.517791e+08
## sum 6.358329e+08
## median 2.154378e+07
## mean 1.271666e+08
## SE.mean 1.057944e+08
## CI.mean.0.95 2.937325e+08
## var 5.596232e+16
## std.dev 2.365636e+08
## coef.var 1.860265e+00
#display stad dev with specific column
sd(composite_index$Internet)
## [1] 3.532211
#create the variance covariance matrix for variables (internet, mobile, credit, debit, emoney)
sigma_first <- cov_composite_index
#create the mean vector
mu_first <- c(1.9115,1.6876,-1.1614,5.0768,1.2716)
#generate the multivariate normal distribution
data_first <- as.data.frame(mvrnorm(n=40, #40 observation
mu=mu_first, Sigma=sigma_first))
data_first
## Internet Mobile Credit Debit Emoney
## 1 6.09649383 4305.12990 -352.962363 2010.60931 26960.9366
## 2 3.33776743 1218.16387 -208.421986 1024.46159 10018.9041
## 3 0.09768967 -1708.63003 15.301426 88.67557 -16910.0951
## 4 -3.34680196 -4664.09350 375.116611 -926.59700 -33222.6100
## 5 0.80538559 -753.20649 132.783341 -1124.03718 -8799.8642
## 6 2.07812685 -752.91947 -86.641849 190.94372 -441.6693
## 7 7.36782377 5250.39562 -381.278578 1198.40400 35481.4010
## 8 -3.10132575 -4262.23473 245.253433 -1859.53494 -31901.1687
## 9 -0.33051459 -2422.76810 179.210637 -1051.45517 -14195.5412
## 10 4.64256233 1969.34563 -209.117432 770.16539 18614.4849
## 11 2.19149526 -108.40658 9.914327 32.08888 -673.1266
## 12 -2.13942529 -4077.28695 307.005694 -1429.23438 -29380.8633
## 13 -2.33660596 -3394.28314 312.341489 -612.54752 -31819.3659
## 14 5.84415946 3570.62669 -173.378728 1849.56435 29683.7886
## 15 -1.91516814 -3307.05563 108.455792 -213.37917 -19512.8654
## 16 2.08229659 -97.00085 -164.968873 496.19460 4084.3904
## 17 2.68994989 1471.29239 -75.365198 -63.10560 7180.8927
## 18 6.66608020 4119.21482 -238.435937 1057.67501 31901.6251
## 19 0.39072062 -1369.72765 148.040414 1208.37161 -10399.7393
## 20 3.99096934 1972.66289 -97.637474 -1145.14755 14359.2678
## 21 3.95310094 1802.33676 -167.786843 -87.57776 12761.6088
## 22 0.44028300 -1518.86263 64.551172 136.56845 -6464.3309
## 23 1.46945639 -1228.46082 -7.602836 1398.49557 -6250.0513
## 24 4.21542063 1777.21921 -102.733118 859.47284 19481.6884
## 25 -3.92108248 -5523.93651 289.771453 -914.53700 -39896.5386
## 26 -1.66161361 -4256.79955 199.929729 -939.26050 -25898.5150
## 27 -0.09302934 -1223.29858 18.822623 -470.50327 -12584.3868
## 28 4.20117254 2412.15590 -102.295727 -157.50566 10457.9964
## 29 -3.61090145 -4921.40408 362.795177 -608.68465 -37940.0293
## 30 6.85897439 4738.89181 -430.124772 2790.90949 31579.6019
## 31 8.15735054 6034.85941 -532.024957 3015.34230 44690.8397
## 32 -4.59684435 -5910.89742 367.727154 -1033.44582 -43586.5592
## 33 -4.56362575 -6297.81182 287.397220 -2290.39865 -44124.6226
## 34 -5.05686093 -5639.01106 493.668224 -1852.92294 -43642.4062
## 35 2.54728793 -519.62374 27.913840 152.21576 -1818.1710
## 36 -4.19117659 -5233.59218 421.983946 -1503.55076 -43939.8262
## 37 3.29441876 866.24075 -127.977530 -145.89308 10874.0921
## 38 2.63964829 520.38411 11.861802 -35.51122 5154.2225
## 39 1.36005661 -593.35894 41.874328 -398.16648 -2250.1586
## 40 5.30882826 2420.94137 -255.135283 2519.60890 18399.1714
ggpairs(data_first)

#Standard Normal Transform
data_first_std <- data_first %>%
mutate(Internet = (Internet-mean(Internet))/sd(Internet),
Mobile = (Mobile-mean(Mobile))/sd(Mobile),
Credit = (Credit-mean(Credit))/sd(Credit),
Debit = (Debit-mean(Debit))/sd(Debit),
Emoney = (Emoney-mean(Emoney))/sd(Emoney))
#Transformation doesn't really change the pattern of the data: compare 2 graphs below
data_first %>% ggplot(aes(x=1:40,y=Internet))+geom_line()

data_first_std %>% ggplot(aes(x=1:40,y=Internet))+geom_line()

cov(data_first)
## Internet Mobile Credit Debit Emoney
## Internet 13.97867 12625.29 -902.78 3929.779 93485.84
## Mobile 12625.29189 11604528.83 -816903.86 3535168.292 84812996.27
## Credit -902.77999 -816903.86 63051.00 -269618.240 -6071923.33
## Debit 3929.77871 3535168.29 -269618.24 1648239.020 26288392.50
## Emoney 93485.83667 84812996.27 -6071923.33 26288392.497 632364994.81
cov(data_first_std)
## Internet Mobile Credit Debit Emoney
## Internet 1.0000000 0.9912763 -0.9616189 0.8187005 0.9943264
## Mobile 0.9912763 1.0000000 -0.9550172 0.8083254 0.9900674
## Credit -0.9616189 -0.9550172 1.0000000 -0.8363599 -0.9616043
## Debit 0.8187005 0.8083254 -0.8363599 1.0000000 0.8142730
## Emoney 0.9943264 0.9900674 -0.9616043 0.8142730 1.0000000
# First Stage -------------------------------------------------------------
#PCA Dimension A
Dimension_First <- princomp(data_first_std,cor=F) #Since our data is already standardized, therefore we don't need to base our PCA calculation on correlation matrix
summary(Dimension_First) #Eigen Value
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 2.1317132 0.5135192 0.22726282 0.100270821 0.073447104
## Proportion of Variance 0.9321438 0.0540927 0.01059454 0.002062408 0.001106559
## Cumulative Proportion 0.9321438 0.9862365 0.99683103 0.998893441 1.000000000
Dimension_First$sdev
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## 2.1317132 0.5135192 0.2272628 0.1002708 0.0734471
Dimension_First$loadings
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Internet 0.458 0.227 0.234 0.320 0.762
## Mobile 0.456 0.257 0.317 -0.782 -0.120
## Credit -0.453 0.885
## Debit 0.409 -0.904 0.124
## Emoney 0.458 0.241 0.215 0.531 -0.636
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## SS loadings 1.0 1.0 1.0 1.0 1.0
## Proportion Var 0.2 0.2 0.2 0.2 0.2
## Cumulative Var 0.2 0.4 0.6 0.8 1.0
score_A <- (Dimension_First$scores %*% (matrix(Dimension_First$sdev^2)))/sum(Dimension_First$sdev^2)
weight_A <- (Dimension_First$loadings %*% matrix(Dimension_First$sdev^2))/sum(Dimension_First$sdev^2)
# Second Stage ------------------------------------------------------------
data_IFI <- data.frame(score_A)
# Now that we already have the sub-index score which is score A and score B
# we will proceed to perform the second stage PCA using both score
# However, we need to standardize the obtained dimension scores first, just like in the first stage
data_IFI_std <- data_IFI %>%
mutate(score_A = (score_A-mean(score_A))/sd(score_A))
IFI <- princomp(data_IFI_std,cor=F)
summary(IFI) #Eigen Value
## Importance of components:
## Comp.1
## Standard deviation 0.9874209
## Proportion of Variance 1.0000000
## Cumulative Proportion 1.0000000
IFI$sdev
## Comp.1
## 0.9874209
IFI$loadings
##
## Loadings:
## Comp.1
## score_A 1
##
## Comp.1
## SS loadings 1
## Proportion Var 1
score_IFI <- (IFI$scores %*% (matrix(IFI$sdev^2)))/sum(IFI$sdev^2)
weight_IFI <- (IFI$loadings %*% matrix(IFI$sdev^2))/sum(IFI$sdev^2)
# Viz ---------------------------------------------------------------------
# Finally, now that we have obtained IFI from previous calculations,
# we will use line chart to visualize the result
# however, to make it easier to analyze the dataa, we will transform the obtained score of each dimension and IFI to its normal CDF distribution form
# so that the value will range from 0-1
data_Viz <- data.frame(dim_A = pnorm(score_A, mean=mean(score_A), sd=sd(score_A)),
IFI = pnorm(score_IFI, mean=mean(score_IFI), sd=sd(score_IFI)))
ggplot(data_Viz, aes(x=1:40)) +
geom_line(aes(y = dim_A, col = "green")) +
geom_line(aes(y = IFI, col = "black"))
