library("readxl")
library(tidyverse)
library(coin)
library("ggpubr")
library("Hmisc")
library(corrplot)
Combining datasets:
# Load the five SPARK Excel exports from a single data directory.
# The directory defaults to the original author's local folder; set the
# SPARK_DATA_DIR environment variable to run the script on another machine
# without editing the code. An unset or empty variable falls back to the
# default, so the files read are unchanged unless the variable is provided.
data_dir <- Sys.getenv("SPARK_DATA_DIR")
if (!nzchar(data_dir)) {
  data_dir <- "/Users/nikolajdolgih/Desktop/Эконометрика/Проект"
}
spark.1709 <- read_excel(file.path(data_dir, "SPARK_1709.xlsx"))
spark.1714 <- read_excel(file.path(data_dir, "SPARK_1714.xlsx"))
spark.1554 <- read_excel(file.path(data_dir, "SPARK_1554.xlsx"))
spark.1548 <- read_excel(file.path(data_dir, "SPARK_1548.xlsx"))
spark.1541 <- read_excel(file.path(data_dir, "SPARK_1541.xlsx"))
#
# Turn a raw sheet into a table whose column names come from the sheet itself.
#
# Steps (identical to the recipe previously repeated for every dataset):
#   1. keep rows 2..last_row of the raw sheet (drops the first row and
#      everything after last_row);
#   2. use the first remaining row as the column names;
#   3. drop that header row, keeping the remaining last_row - 2 rows.
#
# NOTE(review): the row bounds are hard-coded per file; presumably they cut
# off trailing non-data rows of the export -- confirm before replacing them
# with nrow().
#
# raw      - data frame as returned by read_excel().
# last_row - index (in the raw sheet) of the last row to keep.
# Returns the trimmed data frame with promoted column names.
promote_header <- function(raw, last_row) {
  trimmed <- raw[2:last_row, ]
  colnames(trimmed) <- as.character(as.vector(trimmed[1, ]))
  trimmed[2:(last_row - 1), ]
}

spark.1709 <- promote_header(spark.1709, 3698)
spark.1714 <- promote_header(spark.1714, 3698)
spark.1554 <- promote_header(spark.1554, 3698)
spark.1548 <- promote_header(spark.1548, 3698)
# This export is 20 rows longer than the others.
spark.1541 <- promote_header(spark.1541, 3718)
#
# Drop columns by position from every dataset except spark.1709.
# Dropping by index replaces the previous "rename to a/b/c/d, then
# select(-a, -b, -c, -d)" workaround and does not depend on what the dropped
# columns are called (including missing or duplicated header cells).
# NOTE(review): presumably these are descriptive columns already present in
# spark.1709; column 3 is kept in every table -- confirm it is the join key.
shared_extra_cols <- c(1, 2, 4, 5)
spark.1714 <- spark.1714[, -shared_extra_cols]
spark.1554 <- spark.1554[, -shared_extra_cols]
spark.1548 <- spark.1548[, -shared_extra_cols]
# spark.1541 has a different layout: the extra columns sit at 1, 2, 5 and 7.
spark.1541 <- spark.1541[, -c(1, 2, 5, 7)]
#
# Merge the five datasets on the registration number, keeping only companies
# that appear in every one of them. The tables are folded left-to-right in
# the same order as a chain of pairwise inner joins.
Spark <- Reduce(
  function(joined, nxt) inner_join(joined, nxt, by = "Регистрационный номер"),
  list(spark.1709, spark.1714, spark.1554, spark.1548, spark.1541)
)
Variable selection:
# Keep only the columns used in the analysis, in this exact order
# (the order matters for the positional renaming that follows).
kept_columns <- c(
  "2020, Рентабельность активов (ROA), %",
  "2020, Уставный капитал , RUB",
  "2020, Основные средства , RUB",
  "Возраст компании, лет",
  "Размер компании",
  "2020, Среднесписочная численность работников",
  "2020, Коэффициент текущей ликвидности, %",
  "2020, Коэффициент концентрации собственного капитала (автономии), %",
  "2020, Доля рабочего капитала в активах компании, %",
  "2020, Коэффициент соотношения заемных и собственных средств, %",
  "2020, Доля краткосрочной в общем объеме задолженности, %"
)
Spark <- Spark %>% select(all_of(kept_columns))
Removing missing values and renaming variables:
# Drop every row that has a missing value, then replace the long source
# column names with short analysis names (assigned by position).
short_names <- c(
  "ROA", "Capital", "Pr.means", "Age", "Size", "Avg.empl",
  "C.R", "Auto", "W.capital", "KSIZ", "Short.debt"
)
Spark <- setNames(na.omit(Spark), short_names)
Conversion of variables:
# Convert the text columns to their analysis types and add derived columns.

# Employee counts may use whitespace as a thousands separator (e.g. "1 034"),
# which as.numeric() would turn into NA. Strip all whitespace, including
# non-breaking spaces, instead of patching individual values by hand.
Spark$Avg.empl <- gsub("[[:space:]\u00a0]", "", Spark$Avg.empl)

numeric_cols <- c(
  "ROA", "Capital", "Pr.means", "Age", "Avg.empl",
  "C.R", "Auto", "W.capital", "KSIZ", "Short.debt"
)
Spark[numeric_cols] <- lapply(Spark[numeric_cols], as.numeric)

# Natural-log versions of the two monetary variables.
Spark$Capital.log <- log(Spark$Capital)
Spark$Pr.means.log <- log(Spark$Pr.means)

# Company size is a category, not a number.
Spark$Size <- as.factor(Spark$Size)

# Product of the debt-to-equity ratio and the short-term debt share.
Spark$KSIZShort.debt <- Spark$KSIZ * Spark$Short.debt
Do the data follow a normal distribution?
# Shapiro-Wilk normality tests, one per numeric variable.
# The trailing "# => p ..." comments record the p-values from the original
# run; every recorded value is below 0.05.
# Shapiro-Wilk normality test for ROA
shapiro.test(Spark$ROA) # => p < 2.2e-16
# Shapiro-Wilk normality test for Capital.log (log of Capital)
shapiro.test(Spark$Capital.log) # => p < 2.2e-16
# Shapiro-Wilk normality test for Pr.means.log (log of Pr.means)
shapiro.test(Spark$Pr.means.log) # => p = 0.005059
# Shapiro-Wilk normality test for Age
shapiro.test(Spark$Age) # => p = 3.494e-09
# Shapiro-Wilk normality test for Avg.empl
shapiro.test(Spark$Avg.empl) # => p < 2.2e-16
# Shapiro-Wilk normality test for C.R
shapiro.test(Spark$C.R) # => p < 2.2e-16
# Shapiro-Wilk normality test for Auto
shapiro.test(Spark$Auto) # => p = 1.447e-12
# Shapiro-Wilk normality test for W.capital
shapiro.test(Spark$W.capital) # => p = 0.001162
# Shapiro-Wilk normality test for KSIZ
shapiro.test(Spark$KSIZ) # => p < 2.2e-16
# Shapiro-Wilk normality test for Short.debt
shapiro.test(Spark$Short.debt) # => p < 2.2e-16
# Shapiro-Wilk normality test for KSIZShort.debt (KSIZ * Short.debt)
shapiro.test(Spark$KSIZShort.debt) # => p < 2.2e-16
From the output, all p-values are below the significance level 0.05, implying that the distributions of the variables are significantly different from the normal distribution. In other words, we cannot assume normality.
Visualize association:
# Box plot of ROA for each company size category, with the categories shown
# in a fixed order from smallest to largest.
# NOTE(review): ylim(0, 0.3) discards observations outside [0, 0.3] before
# the box statistics are computed; if only zooming was intended,
# coord_cartesian(ylim = c(0, 0.3)) would keep all data -- confirm.
ggplot(Spark, aes(x = Size, y = ROA, color = Size)) +
  geom_boxplot() +
  xlim(
    "Микропредприятия", "Малые предприятия",
    "Средние предприятия", "Крупные предприятия"
  ) +
  ylim(0, 0.3) +
  labs(
    title = "Диаграмма распределения ROA для каждого Size",
    x = "Размер фирмы",
    y = "ROA"
  ) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

# Independence test of ROA against company size; the p-value is printed as a
# percentage rounded to two decimals.
ROA.Size <- independence_test(ROA ~ Size, data = Spark)
cat(
  "Вероятность случайности таких результатов:",
  100 * round(pvalue(ROA.Size), 4),
  "%"
)
# Recorded output of the original run: the reported p-value was 89.82 %.
# Scatter plot of ROA against one predictor, with a fitted regression line,
# its confidence band and the Pearson correlation coefficient.
#
# data - data frame holding the predictor column and a "ROA" column.
# x    - name of the predictor column (character scalar).
# xlab - x-axis label.
# Returns the plot object produced by ggscatter().
plot_roa_scatter <- function(data, x, xlab) {
  ggscatter(
    data,
    x = x, y = "ROA",
    add = "reg.line", conf.int = TRUE,
    cor.coef = TRUE, cor.method = "pearson",
    xlab = xlab, ylab = "ROA"
  )
}

# Predictor column -> axis label, in the order the plots are drawn.
roa_predictors <- c(
  Capital.log = "Уставный капитал (логарифмическая шкала)",
  Pr.means.log = "Основные средства (логарифмическая шкала)",
  Age = "Возраст компании",
  Avg.empl = "Среднесписочная численность работников",
  C.R = "Коэффициент текущей ликвидности",
  Auto = "Коэффициент автономии",
  W.capital = "Доля рабочего капитала в активах компании",
  KSIZ = "Коэффициент соотношения заемных и собственных средств",
  Short.debt = "Доля краткосрочной в объеме задолженности",
  KSIZShort.debt = "KSIZ*Short.debt"
)

# Plots are not auto-printed inside a loop, so print each one explicitly.
for (predictor in names(roa_predictors)) {
  print(plot_roa_scatter(Spark, predictor, roa_predictors[[predictor]]))
}
Correlation matrix with significance levels
# Pearson correlations (res$r) and their p-values (res$P) for every column
# except ROA, the raw (non-log) Capital and Pr.means, and the Size factor.
predictors <- Spark %>% select(-c(ROA, Capital, Pr.means, Size))
res <- rcorr(as.matrix(predictors), type = "pearson")

# Overwrite the diagonal of the p-value matrix with 0 before it is handed to
# corrplot() as p.mat.
diag(res$P) <- 0

# Diverging red-white-blue palette generator.
col <- colorRampPalette(
  c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA")
)

corrplot(
  res$r,
  method = "color",
  col = col(200),
  type = "upper",
  order = "hclust",
  addCoef.col = "black", # print the correlation coefficient in each cell
  tl.col = "black", # text label colour
  tl.srt = 45, # text label rotation
  p.mat = res$P, # combine with significance:
  sig.level = 0.05, # cells with p > 0.05 are marked as insignificant
  diag = FALSE # hide the principal diagonal
)