library(tidyverse)
library(caret)
library(ggplot2)
library(ggpubr)
library(dplyr)
library(plyr)
library(car)
library(labelled)
library(plm)
library(Rxtsum)
library(brotools)
library(nortest)
library(lmtest)
library(lawstat)
library(tseries)
library(pastecs)
library(tsoutliers)
library(readxl)
data <- read_excel("data orig.xlsx",
sheet = "all", na = "9999999999")
I filtered the dataset to keep only UK firms belonging to the Pavitt category “Traditional industries”.
The variables we are going to analyze are turnov, age, empl, group and c13aperc where:
data=data%>%dplyr::rename(export_turnov_p =`exp/turnov` )%>%dplyr::rename(bonus = bonus...35)
Data<-data %>% dplyr::select( age,country,turnov,group, pavitt ,inn_sales_p, empl,c13aperc,family_tmt, d27)
Data=Data %>% dplyr::filter(country == "UK")%>%
dplyr::filter(pavitt=="Traditional industries")
Data$turnov=as.factor(Data$turnov)
Data$group=as.factor(Data$group)
Data$age=as.factor(Data$age)
Data$empl=as.numeric(Data$empl)
Data$c13aperc=as.numeric(Data$c13aperc)
Data$family_tmt=as.numeric(Data$family_tmt)
df4=Data %>% dplyr::select(turnov,age,empl, group,inn_sales_p,c13aperc,family_tmt)
df4<-na.omit(df4)
df=Data %>% dplyr::select(turnov,age,empl, group,c13aperc)
df<-na.omit(df)
df
empl (2008): Total number of employees of your company in your home country.
turnov (2008): 1 “less than €1 million” 2 “1-2 million euros” 3 “2-10 million euros” 4 “10-15 million euros” 5 “15-50 million euros” 6 “50-250 million euros” 7 “more than 250 million euros”
age: “Less than 6 yrs”, “Between 20 and 6 yrs” (i.e. between 6 and 20 years), “More than 20 yrs”
group: 1 “Yes, Domestic” 2 “Yes, Foreign” 3 “No”
c13aperc: Reduction in planned investments in machinery, equipment or ICT, expressed as a percentage from 1 to 100
df5=df%>%dplyr::select(empl, c13aperc)
brotools::describe(df, empl, c13aperc )
df6=df%>%dplyr::select(age,turnov,group)
df6 %>% group_by(age)%>%
dplyr::summarise(num=n())
df6 %>% group_by(turnov)%>%
dplyr::summarise(num=n())
df6 %>% group_by(group)%>%
dplyr::summarise(num=n())
df4=Data %>% dplyr::select(turnov,age,empl, group, c13aperc,family_tmt)
df4<-na.omit(df4)
# remove outliers: keep only observations within 1.5*IQR of the quartiles of empl and family_tmt
df4 <- df4[df4$empl > quantile(df4$empl, .25) - 1.5*IQR(df4$empl) &
df4$empl < quantile(df4$empl, .75) + 1.5*IQR(df4$empl), ]
df4 <- df4[df4$family_tmt > quantile(df4$family_tmt, .25) - 1.5*IQR(df4$family_tmt) &
df4$family_tmt < quantile(df4$family_tmt, .75) + 1.5*IQR(df4$family_tmt), ]
# drop a few additional observations selected by row index
df4=df4[-c(30,250,23,38,136,165,650),]
mod1 <- lm(empl ~ group+ turnov + age , data = df4 )
summary(mod1)
##
## Call:
## lm(formula = empl ~ group + turnov + age, data = df4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.302 -7.062 -2.144 5.856 59.620
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.016 3.821 6.548 3.30e-10 ***
## group2 -10.713 5.010 -2.138 0.033450 *
## group3 -9.258 3.233 -2.863 0.004550 **
## turnov2 7.722 2.306 3.349 0.000937 ***
## turnov3 32.622 2.342 13.930 < 2e-16 ***
## turnov4 69.591 7.516 9.259 < 2e-16 ***
## turnov5 82.008 10.853 7.556 7.84e-13 ***
## turnov6 83.033 15.355 5.407 1.49e-07 ***
## ageLess than 6 yrs 3.277 3.510 0.933 0.351472
## ageMore than 20 yrs 2.664 1.987 1.341 0.181094
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.68 on 250 degrees of freedom
## Multiple R-squared: 0.6053, Adjusted R-squared: 0.5911
## F-statistic: 42.6 on 9 and 250 DF, p-value: < 2.2e-16
mod1_1 <- lm(log(empl) ~ group+ turnov + age , data = df4 )
summary(mod1_1)
##
## Call:
## lm(formula = log(empl) ~ group + turnov + age, data = df4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.03667 -0.28045 0.00223 0.25891 1.00776
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.9759036 0.1047308 28.415 < 2e-16 ***
## group2 -0.3385011 0.1373250 -2.465 0.0144 *
## group3 -0.2168446 0.0886347 -2.446 0.0151 *
## turnov2 0.3275275 0.0632091 5.182 4.53e-07 ***
## turnov3 1.0093103 0.0641928 15.723 < 2e-16 ***
## turnov4 1.6543335 0.2060355 8.029 3.85e-14 ***
## turnov5 1.7790231 0.2975096 5.980 7.68e-09 ***
## turnov6 1.8697563 0.4209173 4.442 1.34e-05 ***
## ageLess than 6 yrs 0.0006009 0.0962182 0.006 0.9950
## ageMore than 20 yrs 0.0980114 0.0544560 1.800 0.0731 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4025 on 250 degrees of freedom
## Multiple R-squared: 0.6019, Adjusted R-squared: 0.5875
## F-statistic: 41.99 on 9 and 250 DF, p-value: < 2.2e-16
extractAIC(mod1)
## [1] 10.000 1406.863
extractAIC(mod1_1)
## [1] 10.0000 -463.4605
As extractAIC shows, using empl in log form leaves R^2 essentially unchanged (0.605 vs 0.602) while greatly decreasing the AIC value, so we will proceed by using log(empl) instead of empl.
mod3 <- lm(log(empl) ~ group+ turnov + age +c13aperc , data = df4 )
summary(mod3)
##
## Call:
## lm(formula = log(empl) ~ group + turnov + age + c13aperc, data = df4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0420 -0.2947 -0.0101 0.2636 0.9984
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.9175275 0.1069111 27.289 < 2e-16 ***
## group2 -0.3490048 0.1362422 -2.562 0.0110 *
## group3 -0.2250724 0.0879592 -2.559 0.0111 *
## turnov2 0.2984358 0.0639422 4.667 4.99e-06 ***
## turnov3 0.9886963 0.0642804 15.381 < 2e-16 ***
## turnov4 1.6633443 0.2043335 8.140 1.89e-14 ***
## turnov5 1.7883430 0.2950254 6.062 4.96e-09 ***
## turnov6 1.9294630 0.4181719 4.614 6.32e-06 ***
## ageLess than 6 yrs -0.0092071 0.0955013 -0.096 0.9233
## ageMore than 20 yrs 0.0918856 0.0540620 1.700 0.0904 .
## c13aperc 0.0015299 0.0006661 2.297 0.0225 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3991 on 249 degrees of freedom
## Multiple R-squared: 0.6101, Adjusted R-squared: 0.5945
## F-statistic: 38.97 on 10 and 249 DF, p-value: < 2.2e-16
According to the table shown above we obtain the following results:
c13aperc is significant at the 0.05 significance level: holding all other terms constant, a one-unit increase in c13aperc is associated on average with an increase of about 0.15% in empl.
group2 is significant at the 0.05 significance level: holding all other terms constant, a company belonging to group2 (foreign group) has on average about 29% fewer employees than a company in group1 (domestic group).
group3 is significant at the 0.05 significance level: holding all other terms constant, a company belonging to group3 (not part of a group) has on average about 20% fewer employees than a company in group1 (domestic group).
turnov2 is significant at the 0.001 significance level: holding all other terms constant, a company in turnov2 (1-2 million euros) has on average about 35% more employees than a company in turnov1 (less than 1 million euros).
turnov3 is significant at the 0.001 significance level: holding all other terms constant, a company in turnov3 (2-10 million euros) has on average about 169% more employees (roughly 2.7 times as many) than a company in turnov1.
turnov4 is significant at the 0.001 significance level: holding all other terms constant, a company in turnov4 (10-15 million euros) has on average about 428% more employees (roughly 5.3 times as many) than a company in turnov1.
turnov5 is significant at the 0.001 significance level: holding all other terms constant, a company in turnov5 (15-50 million euros) has on average about 498% more employees (roughly 6 times as many) than a company in turnov1.
turnov6 is significant at the 0.001 significance level: holding all other terms constant, a company in turnov6 (50-250 million euros) has on average about 589% more employees (roughly 6.9 times as many) than a company in turnov1.
Since the response is log(empl), a coefficient beta corresponds to a percentage change of 100*(exp(beta)-1); only for small coefficients can beta itself be read directly as a percentage.
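These exact percentage effects can be computed directly from the fitted model; a minimal sketch (not part of the original output):
# percentage change in empl implied by each coefficient of mod3: 100 * (exp(beta) - 1)
round((exp(coef(mod3)) - 1) * 100, 1)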
par(mfrow=c(2,2))
plot(mod3)
The linearity assumption applies only to the numerical predictors; we check it visually with the component + residual plots below.
crPlots(mod3)
c13aperc appears to be linear.
mean(mod3$residuals)
## [1] 3.612374e-18
shapiro.test(mod3$residuals)
##
## Shapiro-Wilk normality test
##
## data: mod3$residuals
## W = 0.99195, p-value = 0.1677
jarque.bera.test(mod3$residuals)
##
## Jarque Bera Test
##
## data: mod3$residuals
## X-squared = 2.7896, df = 2, p-value = 0.2479
The mean of the residuals is essentially zero, and both tests indicate that the residuals are normally distributed since their p-values are greater than 0.05.
lmtest::bptest(mod3)
##
## studentized Breusch-Pagan test
##
## data: mod3
## BP = 14.504, df = 10, p-value = 0.1512
In the Breusch-Pagan test the null hypothesis is that the variance of the residuals is constant; since the p-value is greater than 0.05, we conclude that the residuals are homoscedastic.
acf(mod3$residuals)
lmtest::dwtest(mod3)
##
## Durbin-Watson test
##
## data: mod3
## DW = 2.0465, p-value = 0.6341
## alternative hypothesis: true autocorrelation is greater than 0
From the ACF plot we can see that all autocorrelations lie within the dashed confidence bands. For the Durbin-Watson test H0 is that the residuals are not autocorrelated, so since the p-value is greater than 0.05 we conclude that there is no autocorrelation of the residuals.
dim(df)
## [1] 303 5
The number of observations is greater than the number of predictors.
car::vif(mod3)
## GVIF Df GVIF^(1/(2*Df))
## group 1.279091 2 1.063470
## turnov 1.381903 5 1.032875
## age 1.100678 2 1.024271
## c13aperc 1.057212 1 1.028208
Since all GVIF values are well below the usual thresholds of 5 and 10, there is no multicollinearity among our predictors.
library(interactions)
Slope <- lm(log(empl) ~ age * turnov, data = df)
summary(Slope)
##
## Call:
## lm(formula = log(empl) ~ age * turnov, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8350 -0.2943 -0.0235 0.2649 1.6669
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.77895 0.08386 33.137 < 2e-16 ***
## ageLess than 6 yrs -0.10095 0.14363 -0.703 0.4827
## ageMore than 20 yrs 0.10797 0.10754 1.004 0.3162
## turnov2 0.28907 0.11269 2.565 0.0108 *
## turnov3 0.99582 0.11761 8.467 1.35e-15 ***
## turnov4 2.05997 0.33017 6.239 1.59e-09 ***
## turnov5 2.18846 0.24088 9.085 < 2e-16 ***
## turnov6 3.43565 0.33017 10.406 < 2e-16 ***
## turnov7 3.32769 0.32636 10.196 < 2e-16 ***
## ageLess than 6 yrs:turnov2 0.27142 0.35815 0.758 0.4492
## ageMore than 20 yrs:turnov2 0.05108 0.14656 0.349 0.7277
## ageLess than 6 yrs:turnov3 0.33628 0.26119 1.287 0.1990
## ageMore than 20 yrs:turnov3 0.10288 0.14904 0.690 0.4906
## ageLess than 6 yrs:turnov4 -0.29532 0.57146 -0.517 0.6057
## ageMore than 20 yrs:turnov4 -0.40388 0.36906 -1.094 0.2747
## ageLess than 6 yrs:turnov5 0.35270 0.37364 0.944 0.3460
## ageMore than 20 yrs:turnov5 0.28038 0.27771 1.010 0.3135
## ageLess than 6 yrs:turnov6 NA NA NA NA
## ageMore than 20 yrs:turnov6 -1.03700 0.42606 -2.434 0.0156 *
## ageLess than 6 yrs:turnov7 NA NA NA NA
## ageMore than 20 yrs:turnov7 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4516 on 285 degrees of freedom
## Multiple R-squared: 0.763, Adjusted R-squared: 0.7488
## F-statistic: 53.96 on 17 and 285 DF, p-value: < 2.2e-16
cat_plot(Slope, pred = turnov, modx = age, geom = "line")
From the line plot we can see how the (log) number of employees changes with the turnover class and the age of the companies.
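To check whether the age × turnov interaction actually adds explanatory power, the additive and interaction models fitted on the same data can be compared with an F-test; a minimal sketch (not part of the original output):
# F-test of the additive model against the interaction model (both fitted on df)
base <- lm(log(empl) ~ age + turnov, data = df)
anova(base, Slope)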
library(pscl)
library(broom)
library(tidyverse)
library(caret)
library(ggplot2)
library(ggpubr)
library(dplyr)
library(plyr)
library(car)
library(labelled)
library(plm)
library(Rxtsum)
library(brotools)
library(tsoutliers)
library(lawstat)
library(readxl)
data <- read_excel("data orig.xlsx",
sheet = "all", na = "9999999999")
I filtered the dataset to keep only German and UK firms belonging to the Pavitt category “High-tech industries”.
The variables we will analyze are age, turnov, family_p, family_tmt, and benefit_rd where:
data=data%>%dplyr::rename(export_turnov_p =`exp/turnov` )%>%dplyr::rename(bonus = bonus...35)
Data<-data %>% dplyr::select( age,country, turnov, benefit_rd, family_tmt, pavitt,benefit_rd, family_p )
Data=Data%>% dplyr::filter(pavitt =="High-tech industries")%>% dplyr::filter(country %in% c("UK", "GER"))%>% dplyr::filter(benefit_rd !=3)
Data$turnov=as.factor(Data$turnov)
Data$age=as.factor(Data$age)
Data$benefit_rd=as.factor(Data$benefit_rd)
Data$family_p=as.numeric(Data$family_p)
Data$family_tmt=as.numeric(Data$family_tmt)
Data=na.omit(Data)
Data1=Data[-c(4,138,25,124,80,30,170,32,25,167,120,127),]
Data1=Data1%>%dplyr::select(-c("pavitt","country"))
Data1
Age: “Less than 6 yrs”, “Between 20 and 6 yrs” (i.e. between 6 and 20 years), “More than 20 yrs”
turnov (2008): 1 “less than 1 million euros” 2 “1-2 million euros” 3 “2-10 million euros” 4 “10-15 million euros” 5 “15-50 million euros” 6 “50-250 million euros” 7 “more than 250 million euros”
family_p: Percentage of entrepreneurs/executives (including middle management) who are related to the family that owns the business.
family_tmt: Percentage of family members in the company’s top management (100 = entirely family, 0 = no family members).
benefit_rd: Did the company benefit from tax breaks and financial incentives for these R&D activities? 1 “Yes” 2 “No”
For numerical variables:
df5=Data1%>%dplyr::select(family_tmt, family_p)
brotools::describe(Data1, family_tmt, family_p )
df6=Data1%>%dplyr::select(age,turnov,benefit_rd)
df6 %>% group_by(age)%>%
dplyr::summarise(num=n())
df6 %>% group_by(turnov)%>%
dplyr::summarise(num=n())
df6 %>% group_by(benefit_rd)%>%
dplyr::summarise(num=n())
mod1 <- glm(benefit_rd ~ age +turnov, data=Data1, family="binomial")
summary(mod1)
##
## Call:
## glm(formula = benefit_rd ~ age + turnov, family = "binomial",
## data = Data1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1417 -1.1857 0.7003 0.7477 1.2069
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.0218 0.7659 2.640 0.00829 **
## ageLess than 6 yrs -0.1455 0.5926 -0.246 0.80603
## ageMore than 20 yrs 1.0753 0.3904 2.755 0.00588 **
## turnov2 -1.9453 0.8560 -2.273 0.02305 *
## turnov3 -1.8609 0.7971 -2.335 0.01957 *
## turnov4 -1.9981 0.9185 -2.175 0.02960 *
## turnov5 -1.8167 0.9130 -1.990 0.04661 *
## turnov6 -0.9101 0.9890 -0.920 0.35746
## turnov7 11.4690 882.7437 0.013 0.98963
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 212.05 on 178 degrees of freedom
## Residual deviance: 192.46 on 170 degrees of freedom
## AIC: 210.46
##
## Number of Fisher Scoring iterations: 13
pR2(mod1)[4]
## fitting null model for pseudo-r2
## McFadden
## 0.09236815
mod2 <- glm(benefit_rd ~ family_p+age +turnov+family_tmt, data=Data1, family="binomial")
summary(mod2)
##
## Call:
## glm(formula = benefit_rd ~ family_p + age + turnov + family_tmt,
## family = "binomial", data = Data1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0560 -1.0001 0.4377 0.8329 1.4290
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.51185 0.84956 1.780 0.07515 .
## family_p -0.12726 0.08709 -1.461 0.14394
## ageLess than 6 yrs 0.14134 0.62398 0.227 0.82081
## ageMore than 20 yrs 1.04187 0.41588 2.505 0.01224 *
## turnov2 -2.08631 0.89999 -2.318 0.02044 *
## turnov3 -1.87046 0.84300 -2.219 0.02650 *
## turnov4 -2.08531 0.98363 -2.120 0.03401 *
## turnov5 -1.67332 0.96147 -1.740 0.08179 .
## turnov6 -0.68575 1.04198 -0.658 0.51046
## turnov7 12.01234 882.74379 0.014 0.98914
## family_tmt 0.04451 0.01642 2.711 0.00671 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 212.05 on 178 degrees of freedom
## Residual deviance: 175.16 on 168 degrees of freedom
## AIC: 197.16
##
## Number of Fisher Scoring iterations: 13
pR2(mod2)[4]
## fitting null model for pseudo-r2
## McFadden
## 0.1739605
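Since mod1 is nested in mod2, a likelihood-ratio test can confirm that adding family_p and family_tmt significantly improves the fit; a minimal sketch (not part of the original output):
# likelihood-ratio (chi-squared) test of the nested logistic models mod1 and mod2
anova(mod1, mod2, test = "Chisq")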
coefs <- coef(mod2)
exp(coefs)
## (Intercept) family_p ageLess than 6 yrs ageMore than 20 yrs
## 4.535128e+00 8.805088e-01 1.151811e+00 2.834519e+00
## turnov2 turnov3 turnov4 turnov5
## 1.241442e-01 1.540524e-01 1.242681e-01 1.876230e-01
## turnov6 turnov7 family_tmt
## 5.037115e-01 1.647760e+05 1.045513e+00
ageMore than 20 yrs is significant at the 0.05 level with an odds ratio of 2.83: holding all other terms constant, a company that is more than 20 years old has on average 2.83 times the odds of having benefit_rd compared to a company between 6 and 20 years old.
turnov2 is significant at the 0.05 level with an odds ratio of 0.124: holding all other terms constant, a company in turnov2 has on average about 88% lower odds of having benefit_rd than a company in turnov1.
turnov3 is significant at the 0.05 level with an odds ratio of 0.154: a company in turnov3 has on average about 85% lower odds of having benefit_rd than a company in turnov1.
turnov4 is significant at the 0.05 level with an odds ratio of 0.124: a company in turnov4 has on average about 88% lower odds of having benefit_rd than a company in turnov1.
(exp(coefs)-1)*100
## (Intercept) family_p ageLess than 6 yrs ageMore than 20 yrs
## 3.535128e+02 -1.194912e+01 1.518106e+01 1.834519e+02
## turnov2 turnov3 turnov4 turnov5
## -8.758558e+01 -8.459476e+01 -8.757319e+01 -8.123770e+01
## turnov6 turnov7 family_tmt
## -4.962885e+01 1.647750e+07 4.551284e+00
family_tmt is significant at the 0.01 level: for each unit added to family_tmt, the odds of having benefit_rd = 1 increase on average by about 4.55% (100*(exp(0.0445)-1) ≈ 4.55).
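To accompany these odds ratios, Wald-type 95% confidence intervals can also be reported on the odds scale; a minimal sketch (not part of the original output):
# odds ratios with Wald-type 95% confidence intervals for mod2
round(exp(cbind(OR = coef(mod2), confint.default(mod2))), 3)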
par(mfrow=c(2,2))
plot(mod2)
The linearity (in the logit) assumption applies only to the numerical predictors; we check it by visually inspecting the scatter plot between each numeric predictor and the logit of the fitted probabilities.
Data1=na.omit(Data1)
probabilities <- predict(mod2, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
mydata <- Data1 %>%dplyr::select_if(is.numeric)
predictors <- colnames(mydata)
mydata <- mydata %>%dplyr::mutate(logit = log(probabilities/(1-probabilities))) %>%
tidyr::gather(key = "predictors", value = "predictor.value", -logit)
ggplot(mydata, aes(logit, predictor.value))+
geom_point(size = 0.5, alpha = 0.5) +
theme_bw() +
facet_wrap(~predictors, scales = "free_y")
theme_set(theme_classic())
family_p and family_tmt appear approximately linear in the logit.
acf(mod2$residuals)
lawstat::runs.test(mod2$residuals)
##
## Runs Test - Two sided
##
## data: mod2$residuals
## Standardized Runs Statistic = -1.4238, p-value = 0.1545
For the runs test H0 is that the residuals are random, so since the p-value is greater than 0.05 we conclude that there is no autocorrelation of the residuals.
car::vif(mod2)
## GVIF Df GVIF^(1/(2*Df))
## family_p 4.637579 1 2.153504
## age 1.135171 2 1.032204
## turnov 1.232654 6 1.017584
## family_tmt 4.444157 1 2.108117
Since all GVIF values are well below the usual thresholds of 5 and 10, there is no multicollinearity among our predictors.
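Since the predicted classes were already computed above with a 0.5 cut-off, a quick cross-tabulation against the observed benefit_rd (a minimal sketch, not part of the original output) shows how the fitted model classifies the firms:
# confusion table of predicted vs observed classes for mod2
table(predicted = predicted.classes, observed = Data1$benefit_rd)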
library(tidyverse)
library(caret)
library(ggplot2)
library(ggpubr)
library(dplyr)
library(plyr)
library(car)
library(labelled)
library(plm)
library(Rxtsum)
library(brotools)
library(clValid)
library(cluster)
library(tsoutliers)
library(factoextra )
library(FactoMineR)
library(e1071)
library(MASS)
library(lda)
library(readxl)
data <- read_excel("data orig.xlsx",
sheet = "all", na = "9999999999")
I filtered the dataset to keep only German firms belonging to the Pavitt categories “Traditional industries” or “High-tech industries”.
The variables we will analyze are grad_n, rd_p, and family_p where:
data=data%>%dplyr::rename(export_turnov_p =`exp/turnov` )%>%dplyr::rename(bonus = bonus...35)
Data1<-data %>% dplyr::select( involv_n,no_family_p, export_turnov_p,family_n,empl,skilled_n,family_tmt,grad_n,rd_p,inn_sales_p,family_p, white_n, rd_p,invest, ceo_type, pavitt, turnov, group, country,exp_story_dummy, strategy)
Data1<-na.omit(Data1)
Data=Data1 %>%
dplyr::filter(country %in% c("GER"))%>% dplyr::filter(pavitt %in% c("Traditional industries","High-tech industries"))
Data<-Data %>% dplyr::select(grad_n, rd_p, family_p)
Data$grad_n=as.numeric(Data$grad_n)
Data$rd_p=as.numeric(Data$rd_p)
Data$family_p=as.numeric(Data$family_p)
Data= na.omit(Data)
Data
grad_n: The number of college graduates in the labor force in the country of origin.
rd_p (2007-2009): Percentage of total revenues that the firm has invested in R&D on average over the past three years.
family_p: Percentage of entrepreneurs/executives (including middle management) who are related to the family that owns the company.
For numerical variables:
brotools::describe(Data, grad_n, rd_p, family_p )
Since the variables we will be using have different units of measurement, we will normalize the data.
# min-max normalization: rescale a numeric vector to the [0, 1] range
normalize=function(x){
return ((x-min(x))/(max(x)-min(x)))
}
norm_Data1=as.data.frame(lapply(Data[,c(1:3)], normalize))
df4=norm_Data1 %>% dplyr::select(grad_n,rd_p, family_p)
df4<-na.omit(df4)
To choose the best clustering method for the 3 chosen variables (grad_n, rd_p, family_p) and the optimal number of groups to divide them into, we look at internal validation and stability measures.
The internal measures are Connectivity, the Dunn index and the Silhouette Width.
Since compactness improves as the number of clusters increases while separation worsens, we rely in particular on the Silhouette Width and the Dunn index, which balance the two.
express <- df4[, c("grad_n","rd_p", "family_p")]
intern <- clValid(express, 2:6,
clMethods = c("hierarchical", "kmeans", "diana", "pam"),
validation = "internal") # we try from 2 to 6 groups
op <- par(no.readonly=TRUE)
par(mfrow=c(2,2),mar=c(4,4,3,1))
plot(intern, legend=FALSE)
plot(nClusters(intern),measures(intern,"Dunn")[,,1],type="n",axes=F,
xlab="",ylab="")
legend("center", clusterMethods(intern), col=1:9, lty=1:9, pch=paste(1:9))
par(op)
Where:
Connectivity: should be minimized.
Dunn: should be maximized.
Silhouette Width: should be maximized.
The stability measures are: the Average Proportion of Non-overlap (APN), the Average Distance (AD), the Average Distance between Means (ADM) and the Figure of Merit (FOM).
stab <- clValid(express, 2:6,
clMethods=c("hierarchical","kmeans","diana","pam"),
validation="stability")
par(mfrow=c(2,2),mar=c(4,4,3,1))
plot(stab, measure=c("APN","AD","ADM"),legend=FALSE)
plot(nClusters(stab),measures(stab,"APN")[,,1],type="n",axes=F,
xlab="",ylab="")
legend("center", clusterMethods(stab), col=1:9, lty=1:9, pch=paste(1:9))
par(op)
Stability measures should be minimized in each case.
library(scater)
result <- clValid(express, 2:6, clMethods=c("hierarchical","kmeans","pam"),
validation=c("internal","stability"))
res <- getRanksWeights(result)
res$ranks[,1]
## APN AD ADM FOM
## "hierarchical-2" "pam-6" "hierarchical-2" "kmeans-6"
## Connectivity Dunn Silhouette
## "hierarchical-2" "hierarchical-2" "hierarchical-2"
From these rankings we see that the best way to cluster our data is hierarchical clustering with 2 groups, even though the AD and FOM measures favour other methods.
To confirm the choice of dividing the data into two groups we check the plot of the Elbow method, the Average Silhouette method and the Gap Statistic method.
data=df4[, 1:3]
p1 <- fviz_nbclust(data, FUN = hcut, method = "wss",
k.max = 10) +
ggtitle("(A) Elbow method")
p2 <- fviz_nbclust(data, FUN = hcut, method = "silhouette",
k.max = 10) +
ggtitle("(B) Silhouette method")
p3 <- fviz_nbclust(data, FUN = hcut, method = "gap_stat",
k.max = 10) +
ggtitle("(C) Gap statistic")
gridExtra::grid.arrange(p1, p2, p3, nrow = 1)
According to the graphs shown above, the Elbow plot and the Gap statistic confirm the choice of two groups, while the Average Silhouette suggests four; even with two groups, however, the silhouette performance remains very good.
Using the Euclidean distance and the complete-linkage method we obtain a division of our data into two clusters, which we can visualize with the dendrogram.
dd=dist(df4[, 1:3], method = "euclidean")
clusters = hclust(dd,method = "complete")
plot(clusters)
cut = cutree(clusters,2)
rect.hclust(clusters , k = 2, border = 2:3)
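To back up the remark above on the silhouette performance with two groups, the average silhouette width of this 2-cluster solution can be computed directly; a minimal sketch (not part of the original output):
# average silhouette width of the 2-cluster complete-linkage solution (cut and dd from above)
sil <- cluster::silhouette(cut, dd)
mean(sil[, "sil_width"])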
clusters <- diana(df4[, 1:3])
plot(clusters, which = 1, nmax.lab = 100)
The divisive coefficient is equal to 0.95, which is very good: it measures the strength of the clustering structure found, with values close to 1 indicating a clear distinction between groups.
fviz_cluster(list(data = df4, cluster = cut))
The same partition is obtained with the k-means method, which lets us project the two clusters onto two dimensions capturing about 80% of the total variability, 47% on the first dimension and 35% on the second.
kk=kmeans(df4, centers = 2, iter.max = 50, nstart = 1)
fviz_cluster(kk, df4,ellipse.type = "norm")
As we can see from the k-means chart, some statistical units lie in the middle between the two clusters, a situation in which this method performs poorly.
The hierarchical method with complete linkage handles this better: the distance between two clusters is defined as the largest distance between a point of one cluster and a point of the other, which yields more compact classes.
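A simple cross-tabulation of the two partitions (a minimal sketch, not part of the original output) makes the agreement between the hierarchical and k-means assignments explicit:
# agreement between the 2-cluster hierarchical cut and the k-means partition
table(hierarchical = cut, kmeans = kk$cluster)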
fit <- princomp(df4, cor=TRUE)
fviz_pca_biplot(fit)
From the plot of the variables in two dimensions we can see that:
the first dimension is mainly associated with the variable grad_n;
the second dimension lies between the variables family_p and rd_p;
the first cluster groups the companies with high values of family_p and low values of grad_n;
the second cluster groups the companies with high values of rd_p and grad_n.
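The loadings and the variance explained by each principal component can be inspected to support this reading of the biplot; a minimal sketch (not part of the original output):
# variance explained by each component and the variable loadings of the princomp fit
summary(fit)
unclass(fit$loadings)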
library(tidyverse)
library(caret)
library(ggplot2)
library(ggpubr)
library(dplyr)
library(plyr)
library(car)
library(labelled)
library(plm)
library(Rxtsum)
library(brotools)
library(tsoutliers)
library(factoextra )
library(class)
library(mlbench)
library(cutpointr)
library(pROC)
library(Epi)
library(psych)
library(cluster)
library(readxl)
data <- read_excel("data orig.xlsx",
sheet = "all", na = "9999999999")
I filtered the dataset to keep only Italian firms whose age class is not “Between 20 and 6 yrs”, i.e. firms younger than 6 or older than 20 years.
The variables we will analyze are family_p, family_tmt, export_turnov_p, involv_n, and strategy where:
data=data%>%dplyr::rename(export_turnov_p =`exp/turnov` )%>%dplyr::rename(bonus = bonus...35)
Data_1<-data %>% dplyr::select(involv_n,no_family_p, export_turnov_p,family_n,empl,skilled_n,family_tmt,grad_n,rd_p,inn_sales_p,family_p, white_n, rd_p,invest, ceo_type, pavitt, turnov, group, country,exp_story_dummy,benefit_rd, strategy,rd_dummy,age)
Data1<-na.omit(Data_1)
Data=Data1 %>%
dplyr::filter(country %in% c("ITA"))%>%
dplyr::filter(age!= "Between 20 and 6 yrs")
Data$strategy=as.factor(Data$strategy)
Data$family_tmt=as.numeric(Data$family_tmt)
Data$involv=as.numeric(Data$involv_n)
Data$family_p=as.numeric(Data$family_p)
Data$export_turnov_p=as.numeric(Data$export_turnov_p)
Data=Data%>%dplyr::select(export_turnov_p, family_p, family_tmt, involv_n, strategy)
Data
family_p: Percentage of entrepreneurs/executives (including middle management) who are related to the family that owns the business.
family_tmt: Percentage of family members in the top management of the company (100 = entirely family, 0 = no family members).
export_turnov_p: Percentage of 2008 annual sales represented by export activity.
involv_n: Number of employees who were involved in R&D activities in 2008.
strategy: How decisions are made in the company:
1 “centralized: the CEO/owner makes most of the decisions in each area” 2 “decentralized: managers can make their own decisions in certain business areas”.
For numerical variables:
df5=Data%>%dplyr::select(export_turnov_p, family_p, family_tmt, involv_n)
brotools::describe(df5, export_turnov_p, family_p, family_tmt, involv_n )
For factor variables:
df6=Data%>%dplyr::select(strategy)
df6 %>% group_by(strategy)%>%
dplyr::summarise(num=n())
Since the variables we will be using have different units of measurement, we will normalize the data.
normalize=function(x){
return ((x-min(x))/(max(x)-min(x)))
}
norm_Data1=as.data.frame(lapply(Data[,c(1:4)], normalize))
data2=norm_Data1 %>% dplyr::select(export_turnov_p, family_p,family_tmt, involv_n)
data3=Data %>% dplyr::select(strategy)
df4 <- cbind(data2, data3)
df4<-na.omit(df4)
TrainData <- df4[,1:4]
TrainClasses <- df4[,5]
We use the glm method and estimate its Accuracy with a 2-fold cross-validation procedure.
set.seed(1)
control <- trainControl(method="cv", number=2)
glm_cv <- train(TrainData, TrainClasses,
method = "glm",
preProcess = c("center", "scale"),
tuneLength = 3,
trControl = control)
glm_cv
## Generalized Linear Model
##
## 449 samples
## 4 predictor
## 2 classes: '1', '2'
##
## Pre-processing: centered (4), scaled (4)
## Resampling: Cross-Validated (2 fold)
## Summary of sample sizes: 224, 225
## Resampling results:
##
## Accuracy Kappa
## 0.817371 0.01908039
We obtain an Accuracy equal to 0.817371.
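Because the Kappa is close to zero, it is worth comparing this accuracy with the share of the majority class of strategy; a minimal sketch (not part of the original output):
# proportion of the majority class, i.e. the accuracy of always predicting that class
max(prop.table(table(df4$strategy)))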
We use the knn method and estimate its Accuracy with a 2-fold cross-validation procedure.
set.seed(1)
control <- trainControl(method="cv", , number=2)
cv <- train(TrainData, TrainClasses,
method = "knn",
preProcess = c("center", "scale"),
tuneLength = 3,
trControl = control)
cv
## k-Nearest Neighbors
##
## 449 samples
## 4 predictor
## 2 classes: '1', '2'
##
## Pre-processing: centered (4), scaled (4)
## Resampling: Cross-Validated (2 fold)
## Summary of sample sizes: 224, 225
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.8039782 0.07331957
## 7 0.8129365 0.07446937
## 9 0.8129266 0.05065965
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 7.
cv$results$Accuracy
## [1] 0.8039782 0.8129365 0.8129266
We obtain an Accuracy equal to 0.8129365 for k = 7.
As we saw, the two procedures achieve almost the same result. Now we check graphically how the Accuracy of both models behaves as the number of folds varies.
set.seed(1)
control <- trainControl(method="cv", number=10)
mod2 <- train(TrainData, TrainClasses,
method = "glm",
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = control)
appo=rep(NA, length(5:20))
for (i in 5:20) {control <- trainControl(method="cv", number=i)
mod3 <- train(TrainData, TrainClasses,
method = "glm",
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = control)
appo[i-4]=as.numeric(mod3$results[2])
}
mod4 <- train(TrainData, TrainClasses,
method = "knn",
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = control)
rr=length(5:20)
cc=length(mod4$results$Accuracy)
appo2=matrix(NA, rr,cc)
for (i in 5:20) {control <- trainControl(method="cv", number=i)
mod5 <- train(TrainData, TrainClasses,
method = "knn",
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = control)
appo2[i-4,]=mod5$results$Accuracy
}
plot(5:20, appo, type="l", xlab="Folds", ylab="Accuracy", col="Red",ylim=c(0.79,0.83))
par(new=TRUE)
plot(5:20, appo2[,1], type="l", xlab="Folds", ylab="Accuracy ", col="Blue",ylim=c(0.79,0.83))
par(new=TRUE)
plot(5:20, appo2[,2], type="l", xlab="Folds", ylab="Accuracy ", col="Yellow",ylim=c(0.79,0.83))
par(new=TRUE)
plot(5:20, appo2[,3], type="l", xlab="Folds", ylab="Accuracy", col="Grey",ylim=c(0.79,0.83))
par(new=TRUE)
plot(5:20, appo2[,4], type="l", xlab="Folds", ylab="Accuracy ", col="Pink",ylim=c(0.79,0.83))
par(new=TRUE)
plot(5:20, appo2[,5], type="l", xlab="Folds", ylab="Accuracy", col="Green",ylim=c(0.79,0.83))
par(new=TRUE)
plot(5:20, appo2[,6], type="l", xlab="Folds", ylab="Accuracy ", col="Violet",ylim=c(0.79,0.83))
par(new=TRUE)
plot(5:20, appo2[,7], type="l", xlab="Folds", ylab="Accuracy ", col="Brown",ylim=c(0.79,0.83))
par(new=TRUE)
plot(5:20, appo2[,8], type="l", xlab="Folds", ylab="Accuracy ", col="Purple",ylim=c(0.79,0.83))
par(new=TRUE)
plot(5:20, appo2[,9], type="l", xlab="Folds", ylab="Accuracy", col="Orange",ylim=c(0.79,0.83))
par(new=TRUE)
plot(5:20, appo2[,10], type="l", xlab="Folds", ylab="Accuracy", col="Black",ylim=c(0.79,0.83)
, main="Accuracy of LR and k-nn for k=5,7,9,11,13,15,17,19,21,23 using K-fold cross validation")
legend("topright",c("Logistic Regression","k-nn with k=5","k-nn with k=7","k-nn with k=9","k-nn with k=11","k-nn with k=13","k-nn with k=15","k-nn with k=17","k-nn with k=19","k-nn with k=21","k-nn with k=23"),lty = 1, ncol = 1, lwd = 1,
col = c("Red","Blue","Yellow","Grey","Pink","Green","Violet","Brown","Purple","Orange","Black"), xjust = 1, merge = TRUE)
As we can see, the Accuracy of the k-nn models tends to peak when the number of folds is around 13, while the Accuracy of the LR model remains fairly stable. Now let's check whether, with 13 folds, the knn model can improve on the previous result.
set.seed(1)
control <- trainControl(method="cv", , number=13)
cv <- train(TrainData, TrainClasses,
method = "knn",
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = control)
cv
## k-Nearest Neighbors
##
## 449 samples
## 4 predictor
## 2 classes: '1', '2'
##
## Pre-processing: centered (4), scaled (4)
## Resampling: Cross-Validated (13 fold)
## Summary of sample sizes: 415, 415, 415, 414, 414, 415, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.7974790 0.042303700
## 7 0.8020039 0.041251013
## 9 0.8241758 0.070509835
## 11 0.8219780 0.054315503
## 13 0.8197156 0.035254917
## 15 0.8107304 -0.008168822
## 17 0.8107304 -0.008168822
## 19 0.8107304 -0.008168822
## 21 0.8152553 0.000000000
## 23 0.8152553 0.000000000
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
We now obtain an Accuracy equal to 0.8241758 for k = 9, an increase of about 0.01 over the initial value.
To understand which model is really the best we calculate the ROC for both.
set.seed(1)
levels(df4$strategy) <- c("one","two")
control <- trainControl(method="cv", number=13, classProbs=TRUE, summaryFunction=twoClassSummary)
mod5 <- train(TrainData, df4[,5],
method = "knn",
metric="ROC",
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = control)
mod5
## k-Nearest Neighbors
##
## 449 samples
## 4 predictor
## 2 classes: 'one', 'two'
##
## Pre-processing: centered (4), scaled (4)
## Resampling: Cross-Validated (13 fold)
## Summary of sample sizes: 415, 415, 415, 414, 414, 415, ...
## Resampling results across tuning parameters:
##
## k ROC Sens Spec
## 5 0.5867933 0.9617279 0.06959707
## 7 0.6024897 0.9698750 0.06043956
## 9 0.5646146 1.0000000 0.04761905
## 11 0.5394968 1.0000000 0.03663004
## 13 0.5351290 1.0000000 0.02380952
## 15 0.5569502 0.9945055 0.00000000
## 17 0.5882391 0.9945055 0.00000000
## 19 0.5911578 0.9945055 0.00000000
## 21 0.6022878 1.0000000 0.00000000
## 23 0.6041283 1.0000000 0.00000000
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 23.
control <- trainControl(method="cv", number=5, classProbs=TRUE, summaryFunction=twoClassSummary)
mod6 <- train(TrainData, df4[,5],
method = "glm",
metric="ROC",
preProcess = c("center", "scale"),
tuneLength = 10,
trControl = control)
mod6
## Generalized Linear Model
##
## 449 samples
## 4 predictor
## 2 classes: 'one', 'two'
##
## Pre-processing: centered (4), scaled (4)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 360, 359, 360, 358, 359
## Resampling results:
##
## ROC Sens Spec
## 0.6748397 0.9972603 0
The highest ROC belongs to the LR k-fold Cross-validation classifier and is equal to 0.67.
The multi_cutpointr function gives us, for each variable used in the classifier, the AUC value, i.e. the probability that a randomly drawn subject is correctly classified.
mcp <- multi_cutpointr(df4, x = c("family_p", "export_turnov_p", "family_tmt", "involv_n"), class = strategy, pos_class = "two")
summary(mcp)
We therefore find that family_p, export_turnov_p and family_tmt have an acceptable AUC, between 0.5 and 0.7, while the variable involv_n has an AUC equal to 0.45 and thus does not contribute positively to correct classification.
How well the test distinguishes the two levels is proportional to the area under the ROC curve (AUC), which is equivalent to the probability that a randomly extracted subject is correctly classified.
par(mfrow=c(1,2))
ROC(form=strategy~family_p+export_turnov_p+family_tmt+involv_n , data=df4)
The graphs shown above display the trade-off between sensitivity and 1-specificity (the probability of a false positive) and let us visualize the point at which the classifier reaches its maximum efficiency.
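That cut-off can also be computed explicitly, for example by maximizing Youden's index; a minimal sketch (not part of the original output), refitting the same four predictors in a logistic model:
# fit the logistic model on df4, build the ROC curve of its fitted probabilities,
# and locate the cut-off maximizing Youden's index
glm_roc <- glm(strategy ~ family_p + export_turnov_p + family_tmt + involv_n,
data = df4, family = "binomial")
roc_obj <- pROC::roc(df4$strategy, fitted(glm_roc))
pROC::coords(roc_obj, "best", best.method = "youden")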