Set the working directory and load the data

# Point the session at the project folder so the relative workbook path
# used by the import resolves.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs unchanged where this directory exists.
setwd("C:/Users/rooho/Documents/R/mproj")
# readxl supplies read_xlsx() for importing the workbook.
library(readxl)
## Warning: package 'readxl' was built under R version 3.6.3
# Import the workbook: the first row supplies the column names, column
# types are guessed by readxl, and only empty cells are read as NA.
overall <- read_xlsx(
  path = "overall.xlsx",
  col_names = TRUE,
  col_types = NULL,
  na = ""
)

Finding out more about the dataset

Number of rows and columns:

# Row count followed by column count of the imported table.
c(nrow(overall), ncol(overall))
## [1] 10348    56

column classes:

# Report the class readxl assigned to every column.
sapply(X = overall, FUN = class)
##                 CompanyId                      Year                    Number 
##               "character"                 "numeric"                 "numeric" 
## <U+0438><U+043D><U+0442><U+0435><U+0440><U+043D><U+0435><U+0442> <U+0432><U+0435><U+0449><U+0435><U+0439> <U+0430><U+0434><U+0434><U+0438><U+0442><U+0438><U+0432><U+043D><U+0430><U+044F> <U+0442><U+0435><U+0445><U+043D><U+043E><U+043B><U+043E><U+0433><U+0438><U+044F> <U+0431><U+043E><U+043B><U+044C><U+0448><U+0438><U+0435> <U+0434><U+0430><U+043D><U+043D><U+044B><U+0435> 
##                 "numeric"                 "numeric"                 "numeric" 
## <U+0438><U+0441><U+043A><U+0443><U+0441><U+0441><U+0442><U+0432><U+0435><U+043D><U+043D><U+044B><U+0439> <U+0438><U+043D><U+0442><U+0435><U+0440><U+0435><U+043B><U+043B><U+0435><U+043A><U+0442> <U+043E><U+0431><U+043B><U+0430><U+0447><U+043D><U+044B><U+0435> <U+0442><U+0435><U+0445><U+043D><U+043E><U+043B><U+043E><U+0433><U+0438><U+0438> <U+043E><U+0431><U+043B><U+0430><U+0447><U+043D><U+044B><U+0435> <U+0440><U+0435><U+0448><U+0435><U+043D><U+0438><U+044F> 
##                 "numeric"                 "numeric"                 "numeric" 
## <U+0434><U+043E><U+043F><U+043E><U+043B><U+043D><U+0435><U+043D><U+043D><U+0430><U+044F> <U+0440><U+0435><U+0430><U+043B><U+044C><U+043D><U+043E><U+0441><U+0442><U+044C> <U+0432><U+0438><U+0440><U+0442><U+0443><U+0430><U+043B><U+044C><U+043D><U+0430><U+044F> <U+0440><U+0435><U+0430><U+043B><U+044C><U+043D><U+043E><U+0441><U+0442><U+044C> <U+0446><U+0438><U+0444><U+0440><U+043E><U+0432><U+043E><U+0435> <U+043F><U+0440><U+043E><U+0438><U+0437><U+0432><U+043E><U+0434><U+0441><U+0442><U+0432><U+043E> 
##                 "numeric"                 "numeric"                 "numeric" 
## <U+0446><U+0438><U+0444><U+0440><U+043E><U+0432><U+0438><U+0437><U+0430><U+0446><U+0438><U+044F> <U+0434><U+0438><U+0434><U+0436><U+0438><U+0442><U+0430><U+043B><U+0438><U+0437><U+0430><U+0446><U+0438><U+044F>        Internet of Things 
##                 "numeric"                 "numeric"                 "numeric" 
##    Additive Manufacturing                  Big Data   Artificial Intelligence 
##                 "numeric"                 "numeric"                 "numeric" 
##           Cloud Computing           Cloud Solutions         Augmented Reality 
##                 "numeric"                 "numeric"                 "numeric" 
##           Virtual Reality            Digitalization     Digital Manufacturing 
##                 "numeric"                 "numeric"                 "numeric" 
##                   iot_sum              additive_sum               bigdata_sum 
##                 "numeric"                 "numeric"                 "numeric" 
##                    ai_sum             cloudcomp_sum              cloudsol_sum 
##                 "numeric"                 "numeric"                 "numeric" 
##                    ar_sum                    vr_sum               digital_sum 
##                 "numeric"                 "numeric"                 "numeric" 
##            digitalman_sum                       vol                     index 
##                 "numeric"                 "numeric"                 "numeric" 
##             Industry Name      Industry Sensitivity                 marketcap 
##               "character"                 "numeric"               "character" 
##          enterprise_value                   ni_cont                        gp 
##               "character"               "character"               "character" 
##                 employees    Workforce Productivity              total_assets 
##               "character"                 "numeric"               "character" 
##                      Size                    ebitda                    Ticker 
##                 "numeric"                 "numeric"               "character" 
##                    Period                   vol_rus                 index_rus 
##               "character"                 "numeric"                 "numeric" 
##                   vol_eng                 index_eng                    volind 
##                 "numeric"                 "numeric"                 "numeric" 
##                volind_rus                volind_eng 
##                 "numeric"                 "numeric"

Cleaning up the column classes:

# Coerce columns to the classes needed by the analysis.
overall[["Number"]] <- as.integer(overall[["Number"]])

# Industry descriptors are categorical, so store them as factors.
for (factor_col in c("Industry Sensitivity", "Industry Name")) {
  overall[[factor_col]] <- as.factor(overall[[factor_col]])
}

# These columns were imported as character; convert them to numbers
# (entries that cannot be parsed become NA).
for (numeric_col in c("marketcap", "enterprise_value", "ni_cont", "gp")) {
  overall[[numeric_col]] <- as.numeric(overall[[numeric_col]])
}

overall[["employees"]] <- as.integer(overall[["employees"]])
overall[["total_assets"]] <- as.numeric(overall[["total_assets"]])

a little fact checking :

Since we care about workforce productivity, I am going to check how many observations we have for it. Then I will check how many of those rows also have a market capitalization (marketcap) value.

# Count the rows that have a Workforce Productivity value, then the rows
# that have both Workforce Productivity and marketcap.

# TRUE where Workforce Productivity is present.
workforce_indices <- !is.na(overall$`Workforce Productivity`)

print("workforce data :")
sum(workforce_indices)

# TRUE where marketcap is present.
marketcap_indices <- !is.na(overall$marketcap)

# TRUE only where both values are present.
both_True_indices <- workforce_indices & marketcap_indices

# Row numbers of the observations that have both values.
final_rows <- which(both_True_indices)

print("matched marketcap and workforce number of rows:")
length(final_rows)

starting analysis

hypothesis 1 :

  1. We want to prove that disclosure index (iv) is positively related to market capitalization (dv), so it predicts it;

Extract a data frame from `overall` that keeps only the rows in which index and marketcap (as well as the company id and industry name) are all filled with data:

# Working set for hypothesis 1: company id, market capitalization,
# disclosure index and industry name, restricted to rows where all four
# are present.
db.h1 <- na.omit(
  data.frame(
    name = overall[["CompanyId"]],
    marketcap = overall[["marketcap"]],
    index = overall[["index"]],
    industryname = overall[["Industry Name"]]
  )
)
dim(db.h1)
## [1] 846   4
library(fitdistrplus)
## Warning: package 'fitdistrplus' was built under R version 3.6.3
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: survival
## Loading required package: npsurv
## Loading required package: lsei
# Fit a normal distribution to the index and show the diagnostic plots.
index_norm_fit <- fitdist(db.h1$index, distr = "norm")
plot(index_norm_fit)

# Scatter of marketcap against index with the least-squares line on top.
plot(db.h1$index, db.h1$marketcap)
abline(lm(marketcap ~ index, data = db.h1))

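It can be seen that removing the rows with a zero index does not help. Now let's standardize the index feature using the z-score formula $z = (x - \bar{x}) / s$, where $\bar{x}$ is the mean and $s$ the standard deviation of the index: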

# Keep only the rows whose index is strictly positive, then redraw the
# scatter plot and its least-squares line on that subset.
indNo0 <- db.h1[["index"]] > 0
db.h1.no0ind <- db.h1[indNo0, , drop = FALSE]
plot(db.h1.no0ind$index, db.h1.no0ind$marketcap)
abline(lm(marketcap ~ index, data = db.h1.no0ind))

# Copy the working set and replace index with its z-score
# (subtract the mean, divide by the standard deviation).
db.h1.norm <- db.h1
db.h1.norm$index <- (db.h1$index - mean(db.h1$index)) / sd(db.h1$index)

# Scatter of marketcap against the standardized index with the fitted line.
plot(db.h1.norm$index, db.h1.norm$marketcap)
abline(lm(marketcap ~ index, data = db.h1.norm))

# Regress marketcap on the standardized index and print the fit summary.
fit.h1.norm <- lm(db.h1.norm$marketcap ~ db.h1.norm$index)
summary(fit.h1.norm)
## 
## Call:
## lm(formula = db.h1.norm$marketcap ~ db.h1.norm$index)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -538389 -251094  -82125   -7019 4312667 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        254492      19088  13.332   <2e-16 ***
## db.h1.norm$index   180034      19100   9.426   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 555200 on 844 degrees of freedom
## Multiple R-squared:  0.09525,    Adjusted R-squared:  0.09417 
## F-statistic: 88.85 on 1 and 844 DF,  p-value: < 2.2e-16

I think we should leave the index analysis here and instead look at which features are more important and which are more strongly related to each other, using Pearson correlation. Here it is:

# Build the data set for the correlation matrix.
# Columns are picked by name instead of by position (previously columns
# 34:47 with the 1st, 4th and 5th of that range dropped), so the selection
# does not silently change if the workbook's column layout changes.
crdb_cols <- c(
  "vol", "index", "marketcap", "enterprise_value", "ni_cont", "gp",
  "employees", "Workforce Productivity", "total_assets", "Size", "ebitda"
)

# Keep only the rows where every selected column is present.
crdb <- na.omit(overall[, crdb_cols])

# Store employees as double, like the other columns.
crdb$employees <- as.numeric(crdb$employees)

## 160 records are full

# Pairwise Pearson correlations between all columns of crdb, drawn as a
# mixed correlation plot.
Mcrdb <- cor(x = crdb, method = "pearson")
corrplot.mixed(corr = Mcrdb)