Step 1: Data

How many dimensions? 14401 x 10. What are the variable types? All ten columns are parsed as numeric; StockCode is a stock identifier rather than a measurement, so it will be removed.

library(readr)
library(psych)
library(tidyverse)
library(Hmisc)
library(car)
# Load the raw data set; readr prints the column types it guessed.
data <- read_csv(file = "data copy.csv")
## Parsed with column specification:
## cols(
##   StockCode = col_double(),
##   DSRI = col_double(),
##   GMI = col_double(),
##   AQI = col_double(),
##   SGI = col_double(),
##   DEPI = col_double(),
##   SGAI = col_double(),
##   LVGI = col_double(),
##   TATA = col_double(),
##   M_score = col_double()
## )
# Number of rows and columns in the loaded table.
c(nrow(data), ncol(data))
## [1] 14401    10
# Compact listing of each column's type and first values.
str(object = data)
## tibble [14,401 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ StockCode: num [1:14401] 1 1 1 1 2 2 2 2 4 4 ...
##  $ DSRI     : num [1:14401] 1.186 1.008 0 0 0.672 ...
##  $ GMI      : num [1:14401] 1.082 0.907 1.361 0.307 1.069 ...
##  $ AQI      : num [1:14401] 1.474 0.879 1.289 0.883 0.971 ...
##  $ SGI      : num [1:14401] 1.13 1 1.01 1.19 1.23 ...
##  $ DEPI     : num [1:14401] 0.576 1.035 0.737 0.983 0.722 ...
##  $ SGAI     : num [1:14401] 0 0 0 0 1.09 ...
##  $ LVGI     : num [1:14401] 1.05 1.18 1.06 1.17 1.04 ...
##  $ TATA     : num [1:14401] 0.0316 0.0688 0.0478 0.0424 0.2419 ...
##  $ M_score  : num [1:14401] -1.71 -2.13 -2.73 -3.33 -1.48 -2.3 -1.68 -1.35 1.32 6.29 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   StockCode = col_double(),
##   ..   DSRI = col_double(),
##   ..   GMI = col_double(),
##   ..   AQI = col_double(),
##   ..   SGI = col_double(),
##   ..   DEPI = col_double(),
##   ..   SGAI = col_double(),
##   ..   LVGI = col_double(),
##   ..   TATA = col_double(),
##   ..   M_score = col_double()
##   .. )
# Column names of the loaded table.
colnames(data)
##  [1] "StockCode" "DSRI"      "GMI"       "AQI"       "SGI"       "DEPI"     
##  [7] "SGAI"      "LVGI"      "TATA"      "M_score"
# Drop StockCode by name rather than by position, so the step keeps working
# if the column order of the input file changes.
data_X <- select(data, -StockCode)

Step 2: Correlation Matrix

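It seems that only DSRI and M_score are highly correlated (r ≈ 0.72). GMI and DEPI show moderate correlations with M_score (r ≈ 0.46 and 0.53), while the predictors are almost uncorrelated with one another.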

# Correlation matrix of every retained column (M_score included).
datamatrix <- cor(data_X)
# Namespace-qualified call: corrplot is not attached by the library() calls
# visible at the top of this script.
corrplot::corrplot(datamatrix, order = "hclust", type = "upper", tl.srt = 45)

# Pearson correlations together with their p-values (Hmisc::rcorr needs a
# matrix, hence the conversion).
res2 <- Hmisc::rcorr(x = as.matrix(data_X), type = "pearson")

Extract the correlation coefficients

# Matrix of correlation coefficients.
res2[["r"]]
##                  DSRI           GMI           AQI           SGI          DEPI
## DSRI     1.0000000000 -2.160700e-04 -2.493442e-04  0.0226318502 -1.966322e-04
## GMI     -0.0002160700  1.000000e+00  3.728691e-05  0.0004723872  8.619139e-04
## AQI     -0.0002493442  3.728691e-05  1.000000e+00 -0.0021475199  3.120418e-05
## SGI      0.0226318502  4.723872e-04 -2.147520e-03  1.0000000000 -1.886086e-03
## DEPI    -0.0001966322  8.619139e-04  3.120418e-05 -0.0018860864  1.000000e+00
## SGAI     0.0011953471  1.085463e-04 -4.510580e-03 -0.0261002265  8.510471e-03
## LVGI     0.0169863131 -2.486182e-03 -1.427836e-02  0.1368746151 -1.141411e-02
## TATA    -0.0004119543  1.403282e-02 -9.061456e-03  0.0216623010  1.118676e-02
## M_score  0.7154722495  4.556671e-01  1.754519e-02  0.0328338683  5.290030e-01
##                  SGAI         LVGI          TATA     M_score
## DSRI     0.0011953471  0.016986313 -0.0004119543 0.715472249
## GMI      0.0001085463 -0.002486182  0.0140328172 0.455667081
## AQI     -0.0045105803 -0.014278361 -0.0090614557 0.017545193
## SGI     -0.0261002265  0.136874615  0.0216623010 0.032833868
## DEPI     0.0085104715 -0.011414114  0.0111867625 0.529003027
## SGAI     1.0000000000  0.050230731 -0.0261440826 0.002859844
## LVGI     0.0502307306  1.000000000 -0.0100029978 0.005943361
## TATA    -0.0261440826 -0.010002998  1.0000000000 0.023478034
## M_score  0.0028598438  0.005943361  0.0234780335 1.000000000

Extract p-values

# Matrix of p-values (the diagonal is NA).
res2[["P"]]
##                DSRI        GMI        AQI          SGI      DEPI         SGAI
## DSRI             NA 0.97931553 0.97613106 0.0066070246 0.9811760 8.859472e-01
## GMI     0.979315529         NA 0.99643012 0.9547973093 0.9176262 9.896079e-01
## AQI     0.976131056 0.99643012         NA 0.7966467371 0.9970125 5.883393e-01
## SGI     0.006607025 0.95479731 0.79664674           NA 0.8209535 1.733849e-03
## DEPI    0.981175954 0.91762623 0.99701248 0.8209534945        NA 3.071493e-01
## SGAI    0.885947245 0.98960794 0.58833928 0.0017338488 0.3071493           NA
## LVGI    0.041510100 0.76545374 0.08663871 0.0000000000 0.1707909 1.627658e-09
## TATA    0.960575077 0.09219457 0.27688642 0.0093320043 0.1794717 1.703007e-03
## M_score 0.000000000 0.00000000 0.03524993 0.0000811692 0.0000000 7.314750e-01
##                 LVGI        TATA      M_score
## DSRI    4.151010e-02 0.960575077 0.0000000000
## GMI     7.654537e-01 0.092194568 0.0000000000
## AQI     8.663871e-02 0.276886416 0.0352499289
## SGI     0.000000e+00 0.009332004 0.0000811692
## DEPI    1.707909e-01 0.179471654 0.0000000000
## SGAI    1.627658e-09 0.001703007 0.7314750226
## LVGI              NA 0.230012273 0.4757390564
## TATA    2.300123e-01          NA 0.0048383198
## M_score 4.757391e-01 0.004838320           NA

Insignificant correlations are left blank

# Redraw the correlogram, blanking cells whose p-value exceeds 0.01.
# Namespace-qualified because corrplot is not attached by the library() calls
# visible at the top of this script.
corrplot::corrplot(
  res2$r,
  type = "upper",
  order = "hclust",
  p.mat = res2$P,
  sig.level = 0.01,
  insig = "blank"
)

# Regress M_score on all other columns and report the variance inflation
# factor of each predictor as a multicollinearity check.
model <- lm(formula = M_score ~ ., data = data_X)
car::vif(model)
##     DSRI      GMI      AQI      SGI     DEPI     SGAI     LVGI     TATA 
## 1.000711 1.000204 1.000305 1.021170 1.000342 1.004374 1.022793 1.001674

Step 3: Testing for FA - Kaiser-Meyer-Olkin (KMO)

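Remove the dependent variable (M_score) before testing. The overall MSA is 0.49, just below the 0.5 level usually taken as the minimum for factor analysis, so the data are at best marginally suitable.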

# Exclude the dependent variable by name so the factor-analysis input does not
# rely on M_score sitting in column 9.
data_fa <- data_X[, names(data_X) != "M_score"]
# Correlation matrix of the remaining indicators (this overwrites the earlier
# datamatrix, which still contained M_score).
datamatrix <- cor(data_fa)
# Kaiser-Meyer-Olkin measure of sampling adequacy.
KMO(r = datamatrix)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = datamatrix)
## Overall MSA =  0.49
## MSA for each item = 
## DSRI  GMI  AQI  SGI DEPI SGAI LVGI TATA 
## 0.57 0.50 0.51 0.49 0.49 0.46 0.49 0.50

Step 4: Number of Factors

# Eigen-decomposition of the indicator correlation matrix; the eigenvalues are
# printed to help choose the number of factors.
ev <- eigen(x = cor(data_fa))
ev[["values"]]
## [1] 1.1461176 1.0446482 1.0109643 1.0005705 0.9983967 0.9933372 0.9635860
## [8] 0.8423796

Step 5: Run Factor Analysis

# Factor analysis via factanal() with varimax rotation and regression scores;
# `lower` bounds the uniquenesses from below during optimisation.
nfactors <- 4
fit1 <- factanal(
  x = data_fa,
  factors = nfactors,
  scores = c("regression"),
  rotation = "varimax",
  lower = 0.01
)
print(fit1)
## 
## Call:
## factanal(x = data_fa, factors = nfactors, scores = c("regression"),     rotation = "varimax", lower = 0.01)
## 
## Uniquenesses:
##  DSRI   GMI   AQI   SGI  DEPI  SGAI  LVGI  TATA 
## 0.998 1.000 0.763 0.771 0.795 0.929 0.854 0.988 
## 
## Loadings:
##      Factor1 Factor2 Factor3 Factor4
## DSRI                                
## GMI                                 
## AQI           0.486                 
## SGI   0.459                  -0.137 
## DEPI                  0.452         
## SGAI                          0.262 
## LVGI  0.346                   0.160 
## TATA                         -0.105 
## 
##                Factor1 Factor2 Factor3 Factor4
## SS loadings      0.334   0.238   0.206   0.126
## Proportion Var   0.042   0.030   0.026   0.016
## Cumulative Var   0.042   0.071   0.097   0.113
## 
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 2.56 on 2 degrees of freedom.
## The p-value is 0.278
# Principal-axis factoring with varimax rotation. The factor count is taken
# from nfactors so this fit stays in step with the factanal() fit above
# instead of repeating the literal 4.
fa_var <- fa(r = data_fa, nfactors = nfactors, rotate = "varimax", fm = "pa")
# Path diagram of the loadings.
fa.diagram(fa_var)

Step 6: Regression

# First rows of the estimated factor scores (one column per factor).
head(fa_var[["scores"]])
##              PA1         PA2        PA3         PA4
## [1,] -0.04318481 -0.08435443 -0.2076681 -0.08329607
## [2,]  0.00709331 -0.04552777 -0.1799061 -0.11300150
## [3,] -0.04873556 -0.08077195 -0.1999917 -0.08423241
## [4,]  0.01621731 -0.05052543 -0.1905736 -0.11271881
## [5,] -0.04056960  0.02523135 -0.1287905 -0.02714779
## [6,] -0.05818161  0.06708525 -0.1735847 -0.03891819
# Pair the response (selected by name, not by position) with the factor scores.
regdata <- cbind(data_X["M_score"], fa_var$scores)

Labeling the data

# NOTE(review): these labels give the four factor-score columns (PA1-PA4) the
# names of original indicators; confirm the mapping against the loadings.
names(regdata) <- c("M_score", "DSRI", "DEPI", "GMI", "LVGI")
# Fixed seed so the 70/30 train/test partition is reproducible.
set.seed(100)
# seq_len() is safe for zero rows, and floor() makes explicit the truncation
# that sample() would otherwise apply silently to a fractional size.
indices <- sample(seq_len(nrow(regdata)), floor(0.7 * nrow(regdata)))
train <- regdata[indices, ]
test <- regdata[-indices, ]

Regression Model using train data

# Fit M_score on the four factor-score columns using the training rows only.
model1 <- lm(formula = M_score ~ ., data = train)
summary(model1)
## 
## Call:
## lm(formula = M_score ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4424.0   -16.2     1.0    16.3 15900.1 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.043      2.227   2.264   0.0236 *  
## DSRI         151.809      4.816  31.521   <2e-16 ***
## DEPI         145.284      7.042  20.630   <2e-16 ***
## GMI          -21.034      9.994  -2.105   0.0354 *  
## LVGI         712.543     12.879  55.325   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 223.6 on 10075 degrees of freedom
## Multiple R-squared:  0.2501, Adjusted R-squared:  0.2498 
## F-statistic: 839.8 on 4 and 10075 DF,  p-value: < 2.2e-16
# Variance inflation factors for the factor-score regression.
car::vif(model1)
##     DSRI     DEPI      GMI     LVGI 
## 1.387226 1.140706 1.200301 1.451095

Step 7: Prediction

library(Metrics)
## Warning: package 'Metrics' was built under R version 4.0.2
## 
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
# Predict the held-out rows and store the fitted values next to the actuals.
pred_test1 <- predict(model1, newdata = test, type = "response")
test$M_score_Predicted <- pred_test1
# Select the comparison columns by name instead of by position (1 and 6), so
# the display does not depend on how many columns `test` carries.
head(test[c("M_score", "M_score_Predicted")], 100)
##      M_score M_score_Predicted
## 3      -2.73       -69.9030252
## 11      7.67        23.6679019
## 13   -328.76       -30.5190609
## 17     -2.22       -22.7136786
## 32     -0.73        -6.2360482
## 34     -2.88       -19.6734147
## 36    -10.42         8.5820273
## 39     -0.81       -13.3013313
## 41     -0.62       -10.0057123
## 42     -0.30        -8.0696696
## 43      0.04         7.2960256
## 46    -35.39       -14.4251186
## 47     -2.83       -12.9537122
## 48     -2.03       -26.6683334
## 52      3.47        47.9698308
## 54      8.39        87.5619285
## 56      4.34        72.6987569
## 59     -2.31        16.7829487
## 69      0.71        11.1132930
## 70      2.35        41.4459595
## 71      0.88        20.8991277
## 77      0.64        10.4636830
## 81     -1.10        10.7946453
## 84      1.55        29.6912293
## 88     -1.32       -18.7311237
## 98      0.13        -3.3144546
## 100     1.39         7.0343364
## 107     1.93        30.3169880
## 108    -0.10       -25.0648958
## 110     2.47        35.9921969
## 119     0.45       -14.9653130
## 121    -2.30       -32.0761760
## 124   -31.13        26.9105394
## 127   205.75        69.0532580
## 132    -0.01         5.5319307
## 134    -0.91         5.2700092
## 142    -1.26        -4.6593710
## 145     4.77         0.1444342
## 146    -0.86        13.9792876
## 150    -0.92       -15.0894730
## 156    -0.26        -4.9896147
## 161     6.38       100.1250868
## 162    -0.95         2.8991603
## 168     6.84        -3.7013952
## 170    -2.19       -77.1759896
## 173    -2.53       -37.9158610
## 175    -0.83       -72.8940074
## 178     1.21        27.2867608
## 179     3.19        34.7052151
## 180     2.82        36.5229698
## 183     1.39        35.9263264
## 188    -2.83       -28.9320063
## 189    -2.57       -31.7285603
## 190     2.97        26.0425435
## 198     2.01        59.9861427
## 199     0.90        20.5509565
## 201     2.13        17.4842831
## 204     0.72        -3.8456153
## 210    -1.70       -26.9800164
## 216     1.29        11.1365188
## 218     1.84        37.6242311
## 221     2.01        30.6469060
## 224    -2.20       -34.8164461
## 229    -1.21       -20.4250261
## 232    -0.45        -7.4880175
## 238 -6095.49      -704.7916981
## 240    -1.21       -13.1284185
## 242     0.16        11.5376589
## 258  1020.07       294.8276533
## 266    -1.89       -20.0920112
## 270     1.14        18.0360382
## 273     0.15        12.8732533
## 276    -0.12        -5.3008073
## 278    -1.67      -185.5806184
## 279    -1.37       -61.6657932
## 280    -1.76       -74.1768503
## 283    -0.94        -6.0008962
## 284    15.11        71.4432044
## 289     1.22        20.9493185
## 295     1.08         9.5325512
## 301    -0.72         8.5938233
## 310     1.28        14.7739748
## 311     2.04        20.5420263
## 312     2.33        30.1887326
## 313     2.22        23.7623157
## 316     0.57         0.4406371
## 318    15.38       299.1671699
## 320    -1.21       -41.0546969
## 324     0.91        13.0201740
## 328    -0.73       -31.4026975
## 331     7.30       105.6234163
## 336     0.06         0.5640162
## 340    -1.06       -26.0779492
## 342     0.73        86.3523232
## 347     2.44        29.1804402
## 353     2.07        32.4064959
## 354     1.70        27.3490491
## 356     0.94        17.7979451
## 362    -1.94       -57.3650459
## 363    -1.45       -18.4560378