How many dimensions? 14401 x 10. What are the variable types? All ten columns are parsed as numeric; StockCode is an identifier rather than a measurement, so it will be removed before the analysis.
library(readr)
library(psych)
library(tidyverse)
library(Hmisc)
library(car)
# Load the raw data set from the CSV file in the working directory.
data <- readr::read_csv(file = "data copy.csv")
## Parsed with column specification:
## cols(
## StockCode = col_double(),
## DSRI = col_double(),
## GMI = col_double(),
## AQI = col_double(),
## SGI = col_double(),
## DEPI = col_double(),
## SGAI = col_double(),
## LVGI = col_double(),
## TATA = col_double(),
## M_score = col_double()
## )
# Number of rows and columns in the loaded data.
c(nrow(data), ncol(data))
## [1] 14401 10
# Compact overview of every column: type and first few values.
utils::str(data)
## tibble [14,401 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ StockCode: num [1:14401] 1 1 1 1 2 2 2 2 4 4 ...
## $ DSRI : num [1:14401] 1.186 1.008 0 0 0.672 ...
## $ GMI : num [1:14401] 1.082 0.907 1.361 0.307 1.069 ...
## $ AQI : num [1:14401] 1.474 0.879 1.289 0.883 0.971 ...
## $ SGI : num [1:14401] 1.13 1 1.01 1.19 1.23 ...
## $ DEPI : num [1:14401] 0.576 1.035 0.737 0.983 0.722 ...
## $ SGAI : num [1:14401] 0 0 0 0 1.09 ...
## $ LVGI : num [1:14401] 1.05 1.18 1.06 1.17 1.04 ...
## $ TATA : num [1:14401] 0.0316 0.0688 0.0478 0.0424 0.2419 ...
## $ M_score : num [1:14401] -1.71 -2.13 -2.73 -3.33 -1.48 -2.3 -1.68 -1.35 1.32 6.29 ...
## - attr(*, "spec")=
## .. cols(
## .. StockCode = col_double(),
## .. DSRI = col_double(),
## .. GMI = col_double(),
## .. AQI = col_double(),
## .. SGI = col_double(),
## .. DEPI = col_double(),
## .. SGAI = col_double(),
## .. LVGI = col_double(),
## .. TATA = col_double(),
## .. M_score = col_double()
## .. )
# List the column names of the loaded data.
colnames(data)
## [1] "StockCode" "DSRI" "GMI" "AQI" "SGI" "DEPI"
## [7] "SGAI" "LVGI" "TATA" "M_score"
# Drop the StockCode identifier so only the eight index variables and M_score
# remain. Selecting by name (instead of by position 1) keeps this correct even
# if the column order of the CSV changes.
data_X <- dplyr::select(data, -StockCode)
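It seems only DSRI and M_score are highly correlated (r ≈ 0.72); DEPI (r ≈ 0.53) and GMI (r ≈ 0.46) are moderately correlated with M_score, and the predictors are almost uncorrelated with one another.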
# Pearson correlation matrix of all analysis variables (including M_score).
datamatrix <- cor(data_X)

# Upper-triangle correlation plot, variables ordered by hierarchical
# clustering. The call is namespace-qualified because no library(corrplot)
# call is visible in this script.
corrplot::corrplot(datamatrix, order = "hclust", type = "upper", tl.srt = 45)

# Correlations together with their p-values; rcorr() needs a matrix input.
res2 <- rcorr(as.matrix(data_X), type = "pearson")
Extract the correlation coefficients
# Correlation coefficient matrix stored in the rcorr() result.
res2[["r"]]
## DSRI GMI AQI SGI DEPI
## DSRI 1.0000000000 -2.160700e-04 -2.493442e-04 0.0226318502 -1.966322e-04
## GMI -0.0002160700 1.000000e+00 3.728691e-05 0.0004723872 8.619139e-04
## AQI -0.0002493442 3.728691e-05 1.000000e+00 -0.0021475199 3.120418e-05
## SGI 0.0226318502 4.723872e-04 -2.147520e-03 1.0000000000 -1.886086e-03
## DEPI -0.0001966322 8.619139e-04 3.120418e-05 -0.0018860864 1.000000e+00
## SGAI 0.0011953471 1.085463e-04 -4.510580e-03 -0.0261002265 8.510471e-03
## LVGI 0.0169863131 -2.486182e-03 -1.427836e-02 0.1368746151 -1.141411e-02
## TATA -0.0004119543 1.403282e-02 -9.061456e-03 0.0216623010 1.118676e-02
## M_score 0.7154722495 4.556671e-01 1.754519e-02 0.0328338683 5.290030e-01
## SGAI LVGI TATA M_score
## DSRI 0.0011953471 0.016986313 -0.0004119543 0.715472249
## GMI 0.0001085463 -0.002486182 0.0140328172 0.455667081
## AQI -0.0045105803 -0.014278361 -0.0090614557 0.017545193
## SGI -0.0261002265 0.136874615 0.0216623010 0.032833868
## DEPI 0.0085104715 -0.011414114 0.0111867625 0.529003027
## SGAI 1.0000000000 0.050230731 -0.0261440826 0.002859844
## LVGI 0.0502307306 1.000000000 -0.0100029978 0.005943361
## TATA -0.0261440826 -0.010002998 1.0000000000 0.023478034
## M_score 0.0028598438 0.005943361 0.0234780335 1.000000000
Extract p-values
# Matrix of p-values stored in the rcorr() result (NA on the diagonal).
res2[["P"]]
## DSRI GMI AQI SGI DEPI SGAI
## DSRI NA 0.97931553 0.97613106 0.0066070246 0.9811760 8.859472e-01
## GMI 0.979315529 NA 0.99643012 0.9547973093 0.9176262 9.896079e-01
## AQI 0.976131056 0.99643012 NA 0.7966467371 0.9970125 5.883393e-01
## SGI 0.006607025 0.95479731 0.79664674 NA 0.8209535 1.733849e-03
## DEPI 0.981175954 0.91762623 0.99701248 0.8209534945 NA 3.071493e-01
## SGAI 0.885947245 0.98960794 0.58833928 0.0017338488 0.3071493 NA
## LVGI 0.041510100 0.76545374 0.08663871 0.0000000000 0.1707909 1.627658e-09
## TATA 0.960575077 0.09219457 0.27688642 0.0093320043 0.1794717 1.703007e-03
## M_score 0.000000000 0.00000000 0.03524993 0.0000811692 0.0000000 7.314750e-01
## LVGI TATA M_score
## DSRI 4.151010e-02 0.960575077 0.0000000000
## GMI 7.654537e-01 0.092194568 0.0000000000
## AQI 8.663871e-02 0.276886416 0.0352499289
## SGI 0.000000e+00 0.009332004 0.0000811692
## DEPI 1.707909e-01 0.179471654 0.0000000000
## SGAI 1.627658e-09 0.001703007 0.7314750226
## LVGI NA 0.230012273 0.4757390564
## TATA 2.300123e-01 NA 0.0048383198
## M_score 4.757391e-01 0.004838320 NA
Insignificant correlations are left blank
# Correlation plot that blanks out cells whose p-value exceeds 0.01.
# Namespace-qualified because no library(corrplot) call is visible in this
# script.
corrplot::corrplot(
  res2$r,
  type = "upper",
  order = "hclust",
  p.mat = res2$P,
  sig.level = 0.01,
  insig = "blank"
)

# Regress M_score on all remaining columns and inspect the variance
# inflation factors of the predictors.
model <- lm(M_score ~ ., data = data_X)
vif(model)
## DSRI GMI AQI SGI DEPI SGAI LVGI TATA
## 1.000711 1.000204 1.000305 1.021170 1.000342 1.004374 1.022793 1.001674
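Remove the dependent variable (M_score) before the factor analysis. The overall MSA is 0.49, i.e. close to 0.5, which is at the low end of what is usually considered adequate for factor analysis.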
# Keep only the predictors for the factor analysis. M_score is removed by
# name rather than by position 9, so the step does not silently drop the
# wrong column if the column layout changes.
data_fa <- dplyr::select(data_X, -M_score)

# Correlation matrix of the predictors and its Kaiser-Meyer-Olkin measure of
# sampling adequacy.
datamatrix <- cor(data_fa)
KMO(r = datamatrix)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = datamatrix)
## Overall MSA = 0.49
## MSA for each item =
## DSRI GMI AQI SGI DEPI SGAI LVGI TATA
## 0.57 0.50 0.51 0.49 0.49 0.46 0.49 0.50
# Eigen-decomposition of the predictor correlation matrix; the eigenvalues
# are printed to help judge how many factors to retain.
ev <- data_fa %>%
  cor() %>%
  eigen()
ev[["values"]]
## [1] 1.1461176 1.0446482 1.0109643 1.0005705 0.9983967 0.9933372 0.9635860
## [8] 0.8423796
# Step 5: Run Factor Analysis
# Number of factors to extract.
nfactors <- 4

# Factor analysis via factanal() with varimax rotation and regression factor
# scores; `lower` is the lower bound placed on the uniquenesses.
fit1 <- factanal(
  x = data_fa,
  factors = nfactors,
  scores = c("regression"),
  rotation = "varimax",
  lower = 0.01
)
print(fit1)
##
## Call:
## factanal(x = data_fa, factors = nfactors, scores = c("regression"), rotation = "varimax", lower = 0.01)
##
## Uniquenesses:
## DSRI GMI AQI SGI DEPI SGAI LVGI TATA
## 0.998 1.000 0.763 0.771 0.795 0.929 0.854 0.988
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4
## DSRI
## GMI
## AQI 0.486
## SGI 0.459 -0.137
## DEPI 0.452
## SGAI 0.262
## LVGI 0.346 0.160
## TATA -0.105
##
## Factor1 Factor2 Factor3 Factor4
## SS loadings 0.334 0.238 0.206 0.126
## Proportion Var 0.042 0.030 0.026 0.016
## Cumulative Var 0.042 0.071 0.097 0.113
##
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 2.56 on 2 degrees of freedom.
## The p-value is 0.278
# Principal-axis factor analysis (fm = "pa") with varimax rotation on the raw
# predictor data. The factor count reuses `nfactors` instead of repeating the
# literal 4, so this fit cannot drift out of step with the factanal() fit.
fa_var <- fa(r = data_fa, nfactors = nfactors, rotate = "varimax", fm = "pa")

# Diagram of the factor structure.
fa.diagram(fa_var)

# First rows of the per-observation factor scores.
head(fa_var$scores)
## PA1 PA2 PA3 PA4
## [1,] -0.04318481 -0.08435443 -0.2076681 -0.08329607
## [2,] 0.00709331 -0.04552777 -0.1799061 -0.11300150
## [3,] -0.04873556 -0.08077195 -0.1999917 -0.08423241
## [4,] 0.01621731 -0.05052543 -0.1905736 -0.11271881
## [5,] -0.04056960 0.02523135 -0.1287905 -0.02714779
## [6,] -0.05818161 0.06708525 -0.1735847 -0.03891819
# Combine the response with the factor scores into one regression data set.
# M_score is picked by name rather than by position 9 so the right column is
# used even if the layout of data_X changes.
regdata <- cbind(data_X["M_score"], fa_var$scores)
Labeling the data
# NOTE(review): columns 2-5 are factor scores, not the raw variables of the
# same names; the labels presumably follow the dominant loading of each
# factor -- confirm against the fa.diagram() output.
names(regdata) <- c("M_score", "DSRI", "DEPI", "GMI", "LVGI")

# Reproducible 70/30 train/test split. seq_len() is safe for an empty data
# set, and floor() makes explicit the truncation that sample() would
# otherwise apply silently to a fractional size.
set.seed(100)
indices <- sample(seq_len(nrow(regdata)), floor(0.7 * nrow(regdata)))
train <- regdata[indices, ]
test <- regdata[-indices, ]
Regression Model using train data
# Fit a linear model of M_score on every other column of the training set
# and print the coefficient table and fit statistics.
model1 <- lm(formula = M_score ~ ., data = train)
summary(model1)
##
## Call:
## lm(formula = M_score ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4424.0 -16.2 1.0 16.3 15900.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.043 2.227 2.264 0.0236 *
## DSRI 151.809 4.816 31.521 <2e-16 ***
## DEPI 145.284 7.042 20.630 <2e-16 ***
## GMI -21.034 9.994 -2.105 0.0354 *
## LVGI 712.543 12.879 55.325 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 223.6 on 10075 degrees of freedom
## Multiple R-squared: 0.2501, Adjusted R-squared: 0.2498
## F-statistic: 839.8 on 4 and 10075 DF, p-value: < 2.2e-16
# Variance inflation factors of the predictors in the training model.
car::vif(mod = model1)
## DSRI DEPI GMI LVGI
## 1.387226 1.140706 1.200301 1.451095
#Step 7: Prediction
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.0.2
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
# Predict M_score for the held-out rows with the training model.
pred_test1 <- predict(model1, newdata = test, type = "response")

# Store the predictions next to the observed values.
test$M_score_Predicted <- pred_test1

# Show observed vs. predicted for the first 100 test rows. Columns are chosen
# by name instead of positions 1 and 6, so the comparison stays correct if
# columns are added to or reordered in `test`.
head(test[c("M_score", "M_score_Predicted")], 100)
## M_score M_score_Predicted
## 3 -2.73 -69.9030252
## 11 7.67 23.6679019
## 13 -328.76 -30.5190609
## 17 -2.22 -22.7136786
## 32 -0.73 -6.2360482
## 34 -2.88 -19.6734147
## 36 -10.42 8.5820273
## 39 -0.81 -13.3013313
## 41 -0.62 -10.0057123
## 42 -0.30 -8.0696696
## 43 0.04 7.2960256
## 46 -35.39 -14.4251186
## 47 -2.83 -12.9537122
## 48 -2.03 -26.6683334
## 52 3.47 47.9698308
## 54 8.39 87.5619285
## 56 4.34 72.6987569
## 59 -2.31 16.7829487
## 69 0.71 11.1132930
## 70 2.35 41.4459595
## 71 0.88 20.8991277
## 77 0.64 10.4636830
## 81 -1.10 10.7946453
## 84 1.55 29.6912293
## 88 -1.32 -18.7311237
## 98 0.13 -3.3144546
## 100 1.39 7.0343364
## 107 1.93 30.3169880
## 108 -0.10 -25.0648958
## 110 2.47 35.9921969
## 119 0.45 -14.9653130
## 121 -2.30 -32.0761760
## 124 -31.13 26.9105394
## 127 205.75 69.0532580
## 132 -0.01 5.5319307
## 134 -0.91 5.2700092
## 142 -1.26 -4.6593710
## 145 4.77 0.1444342
## 146 -0.86 13.9792876
## 150 -0.92 -15.0894730
## 156 -0.26 -4.9896147
## 161 6.38 100.1250868
## 162 -0.95 2.8991603
## 168 6.84 -3.7013952
## 170 -2.19 -77.1759896
## 173 -2.53 -37.9158610
## 175 -0.83 -72.8940074
## 178 1.21 27.2867608
## 179 3.19 34.7052151
## 180 2.82 36.5229698
## 183 1.39 35.9263264
## 188 -2.83 -28.9320063
## 189 -2.57 -31.7285603
## 190 2.97 26.0425435
## 198 2.01 59.9861427
## 199 0.90 20.5509565
## 201 2.13 17.4842831
## 204 0.72 -3.8456153
## 210 -1.70 -26.9800164
## 216 1.29 11.1365188
## 218 1.84 37.6242311
## 221 2.01 30.6469060
## 224 -2.20 -34.8164461
## 229 -1.21 -20.4250261
## 232 -0.45 -7.4880175
## 238 -6095.49 -704.7916981
## 240 -1.21 -13.1284185
## 242 0.16 11.5376589
## 258 1020.07 294.8276533
## 266 -1.89 -20.0920112
## 270 1.14 18.0360382
## 273 0.15 12.8732533
## 276 -0.12 -5.3008073
## 278 -1.67 -185.5806184
## 279 -1.37 -61.6657932
## 280 -1.76 -74.1768503
## 283 -0.94 -6.0008962
## 284 15.11 71.4432044
## 289 1.22 20.9493185
## 295 1.08 9.5325512
## 301 -0.72 8.5938233
## 310 1.28 14.7739748
## 311 2.04 20.5420263
## 312 2.33 30.1887326
## 313 2.22 23.7623157
## 316 0.57 0.4406371
## 318 15.38 299.1671699
## 320 -1.21 -41.0546969
## 324 0.91 13.0201740
## 328 -0.73 -31.4026975
## 331 7.30 105.6234163
## 336 0.06 0.5640162
## 340 -1.06 -26.0779492
## 342 0.73 86.3523232
## 347 2.44 29.1804402
## 353 2.07 32.4064959
## 354 1.70 27.3490491
## 356 0.94 17.7979451
## 362 -1.94 -57.3650459
## 363 -1.45 -18.4560378