# Point the session at the project folder and load the Excel reader.
setwd("C:/Users/rooho/Documents/R/mproj")
library(readxl)
## Warning: package 'readxl' was built under R version 3.6.3
# Read the workbook: first row is the header, column types are guessed,
# and empty cells become NA.
overall <- read_xlsx(
  path = "overall.xlsx",
  col_names = TRUE,
  col_types = NULL,
  na = ""
)
Row and column sizes:
# Number of rows, then number of columns, of the loaded table.
c(nrow(overall), ncol(overall))
## [1] 10348 56
Column classes:
# Report the class of every column of the loaded table.
sapply(X = overall, FUN = class)
## CompanyId Year Number
## "character" "numeric" "numeric"
## интернет вещей аддитивная технология большие данные
## "numeric" "numeric" "numeric"
## искусственный интереллект облачные технологии облачные решения
## "numeric" "numeric" "numeric"
## дополненная реальность виртуальная реальность цифровое производство
## "numeric" "numeric" "numeric"
## цифровизация диджитализация Internet of Things
## "numeric" "numeric" "numeric"
## Additive Manufacturing Big Data Artificial Intelligence
## "numeric" "numeric" "numeric"
## Cloud Computing Cloud Solutions Augmented Reality
## "numeric" "numeric" "numeric"
## Virtual Reality Digitalization Digital Manufacturing
## "numeric" "numeric" "numeric"
## iot_sum additive_sum bigdata_sum
## "numeric" "numeric" "numeric"
## ai_sum cloudcomp_sum cloudsol_sum
## "numeric" "numeric" "numeric"
## ar_sum vr_sum digital_sum
## "numeric" "numeric" "numeric"
## digitalman_sum vol index
## "numeric" "numeric" "numeric"
## Industry Name Industry Sensitivity marketcap
## "character" "numeric" "character"
## enterprise_value ni_cont gp
## "character" "character" "character"
## employees Workforce Productivity total_assets
## "character" "numeric" "character"
## Size ebitda Ticker
## "numeric" "numeric" "character"
## Period vol_rus index_rus
## "character" "numeric" "numeric"
## vol_eng index_eng volind
## "numeric" "numeric" "numeric"
## volind_rus volind_eng
## "numeric" "numeric"
# Coerce columns to the types the analysis expects, grouped by target type.
integer_cols <- c("Number", "employees")
factor_cols <- c("Industry Sensitivity", "Industry Name")
numeric_cols <- c(
  "marketcap", "enterprise_value", "ni_cont", "gp", "total_assets"
)

overall[integer_cols] <- lapply(overall[integer_cols], as.integer)
overall[factor_cols] <- lapply(overall[factor_cols], as.factor)
# These columns are character in the class listing, so convert them to numbers.
overall[numeric_cols] <- lapply(overall[numeric_cols], as.numeric)
Since we care about workforce productivity, I’m going to check how much data we have on it. Then I will check whether those rows can be matched with marketcap data or not.
# Count the rows that carry a Workforce Productivity value, then count how
# many of those rows also carry a marketcap value.
# Flag rows where Workforce Productivity is present.
workforce_indices <- !is.na(overall$`Workforce Productivity`)
print("workforce data :")
sum(workforce_indices)

# Flag rows where marketcap is present.
marketcap_indices <- !is.na(overall$marketcap)

# Rows where both fields are present; which() turns the mask into positions.
both_True_indices <- workforce_indices & marketcap_indices
final_rows <- which(both_True_indices)
print("matched marketcap and workforce number of rows:")
length(final_rows)
Extract a data frame from `overall` in which index and marketcap are both filled with data:
# Assemble the four fields of interest and drop every row that has a missing
# value in any of them, then report the resulting dimensions.
db.h1 <- na.omit(
  data.frame(
    name = overall[["CompanyId"]],
    marketcap = overall[["marketcap"]],
    index = overall[["index"]],
    industryname = overall[["Industry Name"]]
  )
)
dim(db.h1)
## [1] 846 4
library(fitdistrplus)
## Warning: package 'fitdistrplus' was built under R version 3.6.3
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: survival
## Loading required package: npsurv
## Loading required package: lsei
# Fit a normal distribution to the index and plot the fitted object.
index_fit <- fitdist(db.h1$index, distr = "norm")
plot(index_fit)

# Scatter of marketcap against index, with the linear-regression line on top.
plot(db.h1$index, db.h1$marketcap)
cap_on_index <- lm(db.h1$marketcap ~ db.h1$index)
abline(cap_on_index)
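It can be seen that removing zero indexes does not help. Now let’s normalize (standardize) our index feature, using the formula $z = \frac{x - \bar{x}}{s}$, where $\bar{x}$ is the mean and $s$ is the standard deviation of the index: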
# Keep only the rows whose index is strictly positive.
indNo0 <- db.h1[["index"]] > 0
db.h1.no0ind <- db.h1[indNo0, ]

# Same scatter and regression line as before, on the filtered rows.
plot(db.h1.no0ind$index, db.h1.no0ind$marketcap)
no0_fit <- lm(db.h1.no0ind$marketcap ~ db.h1.no0ind$index)
abline(no0_fit)
# Work on a copy so db.h1 keeps the raw index values.
db.h1.norm <- db.h1

# Centre the index on its mean and divide by its standard deviation.
raw_index <- db.h1.norm$index
db.h1.norm$index <- (raw_index - mean(raw_index)) / sd(raw_index)

# Scatter, regression line and model summary for the standardised index.
plot(db.h1.norm$index, db.h1.norm$marketcap)
norm_fit <- lm(db.h1.norm$marketcap ~ db.h1.norm$index)
abline(norm_fit)
summary(norm_fit)
##
## Call:
## lm(formula = db.h1.norm$marketcap ~ db.h1.norm$index)
##
## Residuals:
## Min 1Q Median 3Q Max
## -538389 -251094 -82125 -7019 4312667
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 254492 19088 13.332 <2e-16 ***
## db.h1.norm$index 180034 19100 9.426 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 555200 on 844 degrees of freedom
## Multiple R-squared: 0.09525, Adjusted R-squared: 0.09417
## F-statistic: 88.85 on 1 and 844 DF, p-value: < 2.2e-16
I think we should set the index aside here and look at which features are more important and which are more strongly related to each other, using Pearson correlation. Here it is:
# Take columns 34 to 47 of `overall`, leave out the 1st, 4th and 5th column of
# that slice, and keep only the rows with no missing value.
crdb <- na.omit(overall[, (34:47)[-c(1, 4, 5)]])
crdb[["employees"]] <- as.numeric(crdb[["employees"]])
## 160 records are full

# Pairwise correlation matrix of the remaining columns (cor() defaults to
# Pearson), followed by the mixed correlation plot.
Mcrdb <- cor(crdb)
corrplot.mixed(Mcrdb)