Multiple Linear Regression

library(tidyverse)
library(caret)
library(ggplot2)
library(ggpubr)
library(dplyr)
library(plyr)
library(car)
library(labelled)
library(plm)
library(Rxtsum)
library(brotools)
library(nortest)
library(lmtest)
library(lawstat)
library(tseries)
library(pastecs)

library(tsoutliers)
library(readxl)
data <- read_excel("data orig.xlsx", 
    sheet = "all", na = "9999999999")

I filtered the dataset in order to obtain only UK firms belonging to the Pavitt category “Traditional industries”.

The variables we are going to analyze are turnov, age, empl, group and c13aperc where:

data=data%>%dplyr::rename(export_turnov_p =`exp/turnov` )%>%dplyr::rename(bonus = bonus...35)

Data<-data %>% dplyr::select( age,country,turnov,group, pavitt ,inn_sales_p, empl,c13aperc,family_tmt, d27)


Data=Data %>% dplyr::filter(country == "UK")%>%
  dplyr::filter(pavitt=="Traditional industries")


Data$turnov=as.factor(Data$turnov)
Data$group=as.factor(Data$group)
Data$age=as.factor(Data$age)

Data$empl=as.numeric(Data$empl)
Data$c13aperc=as.numeric(Data$c13aperc)
Data$family_tmt=as.numeric(Data$family_tmt)


df4=Data %>% dplyr::select(turnov,age,empl, group,inn_sales_p,c13aperc,family_tmt)
df4<-na.omit(df4)

df=Data %>% dplyr::select(turnov,age,empl, group,c13aperc)
df<-na.omit(df)

df

Descriptive statistics

For numerical variables:

df5=df%>%dplyr::select(empl, c13aperc)
brotools::describe(df, empl, c13aperc )

For factor variables:

age

df6=df%>%dplyr::select(age,turnov,group)

df6 %>% group_by(age)%>%
  dplyr::summarise(num=n())

turnov

df6 %>% group_by(turnov)%>%
  dplyr::summarise(num=n())

group

df6 %>% group_by(group)%>%
  dplyr::summarise(num=n())

df4=Data %>% dplyr::select(turnov,age,empl, group, c13aperc,family_tmt)
df4<-na.omit(df4)

# Keep only observations inside the 1.5*IQR fences for empl
df4 <- df4[df4$empl > quantile(df4$empl, .25) - 1.5*IQR(df4$empl) & 
        df4$empl < quantile(df4$empl, .75) + 1.5*IQR(df4$empl), ] 

# Keep only observations inside the 1.5*IQR fences for family_tmt
df4 <- df4[df4$family_tmt > quantile(df4$family_tmt, .25) - 1.5*IQR(df4$family_tmt) & 
        df4$family_tmt < quantile(df4$family_tmt, .75) + 1.5*IQR(df4$family_tmt), ] 

# Drop a few additional observations by row index
df4=df4[-c(30,250,23,38,136,165,650),]
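
Since the same 1.5*IQR rule is applied to both variables, it can be wrapped in a small helper (a minimal sketch; the function name filter_iqr is ours for illustration and not part of any package):

# Keep only the rows of `data` whose column `col` lies within the 1.5*IQR fences
filter_iqr <- function(data, col, k = 1.5) {
  x <- data[[col]]
  lower <- quantile(x, .25) - k * IQR(x)
  upper <- quantile(x, .75) + k * IQR(x)
  data[x > lower & x < upper, ]
}

# Equivalent to the two filters above:
# df4 <- filter_iqr(filter_iqr(df4, "empl"), "family_tmt")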

Only control variables

mod1 <- lm(empl ~  group+ turnov  + age , data = df4  ) 
summary(mod1)
## 
## Call:
## lm(formula = empl ~ group + turnov + age, data = df4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.302  -7.062  -2.144   5.856  59.620 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           25.016      3.821   6.548 3.30e-10 ***
## group2               -10.713      5.010  -2.138 0.033450 *  
## group3                -9.258      3.233  -2.863 0.004550 ** 
## turnov2                7.722      2.306   3.349 0.000937 ***
## turnov3               32.622      2.342  13.930  < 2e-16 ***
## turnov4               69.591      7.516   9.259  < 2e-16 ***
## turnov5               82.008     10.853   7.556 7.84e-13 ***
## turnov6               83.033     15.355   5.407 1.49e-07 ***
## ageLess than 6 yrs     3.277      3.510   0.933 0.351472    
## ageMore than 20 yrs    2.664      1.987   1.341 0.181094    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.68 on 250 degrees of freedom
## Multiple R-squared:  0.6053, Adjusted R-squared:  0.5911 
## F-statistic:  42.6 on 9 and 250 DF,  p-value: < 2.2e-16

Only control variables and empl in log form

mod1_1 <- lm(log(empl) ~  group+ turnov  + age , data = df4  ) 
summary(mod1_1)
## 
## Call:
## lm(formula = log(empl) ~ group + turnov + age, data = df4)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.03667 -0.28045  0.00223  0.25891  1.00776 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          2.9759036  0.1047308  28.415  < 2e-16 ***
## group2              -0.3385011  0.1373250  -2.465   0.0144 *  
## group3              -0.2168446  0.0886347  -2.446   0.0151 *  
## turnov2              0.3275275  0.0632091   5.182 4.53e-07 ***
## turnov3              1.0093103  0.0641928  15.723  < 2e-16 ***
## turnov4              1.6543335  0.2060355   8.029 3.85e-14 ***
## turnov5              1.7790231  0.2975096   5.980 7.68e-09 ***
## turnov6              1.8697563  0.4209173   4.442 1.34e-05 ***
## ageLess than 6 yrs   0.0006009  0.0962182   0.006   0.9950    
## ageMore than 20 yrs  0.0980114  0.0544560   1.800   0.0731 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4025 on 250 degrees of freedom
## Multiple R-squared:  0.6019, Adjusted R-squared:  0.5875 
## F-statistic: 41.99 on 9 and 250 DF,  p-value: < 2.2e-16
extractAIC(mod1)
## [1]   10.000 1406.863
extractAIC(mod1_1)
## [1]   10.0000 -463.4605

As we have seen, using empl in log form leaves R^2 essentially unchanged (0.602 vs 0.605) while greatly decreasing the AIC, so we will proceed by using log(empl) instead of empl. Strictly speaking, AIC and R^2 are not directly comparable across models with different response scales; a comparison on the original scale of empl is sketched below.
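
As a rough check on the original scale of empl, one can compare the in-sample RMSE of the two models after back-transforming the log-model predictions (a minimal sketch, assuming mod1, mod1_1 and df4 as above; the naive exp() back-transform ignores the retransformation bias):

# In-sample RMSE of the level model
sqrt(mean((df4$empl - predict(mod1))^2))

# In-sample RMSE of the log model, with predictions back-transformed by exp()
sqrt(mean((df4$empl - exp(predict(mod1_1)))^2))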

With c13aperc

mod3 <- lm(log(empl) ~  group+ turnov  + age +c13aperc  , data = df4  ) 
summary(mod3)
## 
## Call:
## lm(formula = log(empl) ~ group + turnov + age + c13aperc, data = df4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0420 -0.2947 -0.0101  0.2636  0.9984 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          2.9175275  0.1069111  27.289  < 2e-16 ***
## group2              -0.3490048  0.1362422  -2.562   0.0110 *  
## group3              -0.2250724  0.0879592  -2.559   0.0111 *  
## turnov2              0.2984358  0.0639422   4.667 4.99e-06 ***
## turnov3              0.9886963  0.0642804  15.381  < 2e-16 ***
## turnov4              1.6633443  0.2043335   8.140 1.89e-14 ***
## turnov5              1.7883430  0.2950254   6.062 4.96e-09 ***
## turnov6              1.9294630  0.4181719   4.614 6.32e-06 ***
## ageLess than 6 yrs  -0.0092071  0.0955013  -0.096   0.9233    
## ageMore than 20 yrs  0.0918856  0.0540620   1.700   0.0904 .  
## c13aperc             0.0015299  0.0006661   2.297   0.0225 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3991 on 249 degrees of freedom
## Multiple R-squared:  0.6101, Adjusted R-squared:  0.5945 
## F-statistic: 38.97 on 10 and 249 DF,  p-value: < 2.2e-16

According to the table shown above we obtain the following results (the percentage effects below read the coefficients of the log-level model directly as 100*beta, which is only an approximation of the exact effect (exp(beta) - 1)*100; the exact effects are computed in the sketch after this list):

  • c13aperc is significant at the 0.05 significance level: holding all other terms constant, a one-unit increase in c13aperc is associated, on average, with an increase in empl of about 0.15%.

  • group2 is significant at the 0.05 significance level: holding all other terms constant, a company belonging to group2 (Foreign group) tends, on average, to have about 35% fewer empl than a company in group1 (National group).

  • group3 is significant at the 0.05 significance level: holding all other terms constant, a company belonging to group3 tends, on average, to have about 22% fewer empl than a company in group1 (National group).

  • turnov2 is significant at the 0.001 significance level: holding all other terms constant, a company in turnov2 (1-2 million euros) tends, on average, to have about 29% more empl than a company in turnov1 (less than 1 million euros).

  • turnov3 is significant at the 0.001 significance level: holding all other terms constant, a company in turnov3 (2-10 million euros) tends, on average, to have about 99% more empl than a company in turnov1 (less than 1 million euros).

  • turnov4 is significant at the 0.001 significance level: holding all other terms constant, a company in turnov4 (10-15 million euros) tends, on average, to have about 166% more empl than a company in turnov1 (less than 1 million euros).

  • turnov5 is significant at the 0.001 significance level: holding all other terms constant, a company in turnov5 (15-50 million euros) tends, on average, to have about 179% more empl than a company in turnov1 (less than 1 million euros).

  • turnov6 is significant at the 0.001 significance level: holding all other terms constant, a company in turnov6 (50-250 million euros) tends, on average, to have about 192% more empl than a company in turnov1 (less than 1 million euros).
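
The exact percentage effects implied by the log-level specification can be read off the fitted coefficients directly (a minimal sketch, assuming mod3 as fitted above):

# Exact percentage change in empl associated with each term: (exp(beta) - 1) * 100
round((exp(coef(mod3)) - 1) * 100, 1)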

Testing The Assumptions

par(mfrow=c(2,2))
plot(mod3) 

1 The regression model is linear in parameters

The assumption of linearity applies only to numerical predictors; we check it visually with the component + residual plots (crPlots) for each numerical predictor.

crPlots(mod3) 

c13aperc appears to be linear.

2 The mean of the residual is equal to 0 and residuals are normally distributed

mean(mod3$residuals)
## [1] 3.612374e-18
shapiro.test(mod3$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  mod3$residuals
## W = 0.99195, p-value = 0.1677
jarque.bera.test(mod3$residuals)
## 
##  Jarque Bera Test
## 
## data:  mod3$residuals
## X-squared = 2.7896, df = 2, p-value = 0.2479

The mean of the residuals is essentially zero. The residuals are normally distributed, since both p-values are greater than 0.05.

3 Homoscedasticity of residuals

lmtest::bptest(mod3)
## 
##  studentized Breusch-Pagan test
## 
## data:  mod3
## BP = 14.504, df = 10, p-value = 0.1512

In the Breusch-Pagan test the null hypothesis is that the variance of the residuals is constant; since the p-value is greater than 0.05, we conclude that the residuals are homoscedastic.

4 Independence of the errors -> No autocorrelation of residuals

acf(mod3$residuals)  

lmtest::dwtest(mod3) 
## 
##  Durbin-Watson test
## 
## data:  mod3
## DW = 2.0465, p-value = 0.6341
## alternative hypothesis: true autocorrelation is greater than 0

From the ACF plot we can see that all autocorrelations beyond lag 0 fall within the blue confidence bands. For the Durbin-Watson test, H0 is that there is no (positive) autocorrelation; since the p-value is greater than 0.05, we conclude that there is no autocorrelation of the residuals.

5 The predictors and residuals are uncorrelated

cor.test(df4$c13aperc, mod3$residuals)
## 
##  Pearson's product-moment correlation
## 
## data:  df4$c13aperc and mod3$residuals
## t = -4.3666e-15, df = 258, p-value = 1
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1216537  0.1216537
## sample estimates:
##           cor 
## -2.718553e-16

In this case H0 is that the correlation is 0 (no correlation); since the p-value is greater than 0.05, we conclude that the predictor and the residuals are uncorrelated.

6 The number of observations must be greater than number of predictors

dim(df)
## [1] 303   5

The number of observations is greater than the number of predictors.

7 Absence of perfect multicollinearity

car::vif(mod3)
##              GVIF Df GVIF^(1/(2*Df))
## group    1.279091  2        1.063470
## turnov   1.381903  5        1.032875
## age      1.100678  2        1.024271
## c13aperc 1.057212  1        1.028208

Since all GVIF values are well below the usual thresholds of 5-10, there is no multicollinearity among our predictors.

Simple Slope analysis between empl-turnov-age

library(interactions)
Slope <- lm(log(empl) ~ age * turnov, data = df)
summary(Slope)
## 
## Call:
## lm(formula = log(empl) ~ age * turnov, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8350 -0.2943 -0.0235  0.2649  1.6669 
## 
## Coefficients: (3 not defined because of singularities)
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  2.77895    0.08386  33.137  < 2e-16 ***
## ageLess than 6 yrs          -0.10095    0.14363  -0.703   0.4827    
## ageMore than 20 yrs          0.10797    0.10754   1.004   0.3162    
## turnov2                      0.28907    0.11269   2.565   0.0108 *  
## turnov3                      0.99582    0.11761   8.467 1.35e-15 ***
## turnov4                      2.05997    0.33017   6.239 1.59e-09 ***
## turnov5                      2.18846    0.24088   9.085  < 2e-16 ***
## turnov6                      3.43565    0.33017  10.406  < 2e-16 ***
## turnov7                      3.32769    0.32636  10.196  < 2e-16 ***
## ageLess than 6 yrs:turnov2   0.27142    0.35815   0.758   0.4492    
## ageMore than 20 yrs:turnov2  0.05108    0.14656   0.349   0.7277    
## ageLess than 6 yrs:turnov3   0.33628    0.26119   1.287   0.1990    
## ageMore than 20 yrs:turnov3  0.10288    0.14904   0.690   0.4906    
## ageLess than 6 yrs:turnov4  -0.29532    0.57146  -0.517   0.6057    
## ageMore than 20 yrs:turnov4 -0.40388    0.36906  -1.094   0.2747    
## ageLess than 6 yrs:turnov5   0.35270    0.37364   0.944   0.3460    
## ageMore than 20 yrs:turnov5  0.28038    0.27771   1.010   0.3135    
## ageLess than 6 yrs:turnov6        NA         NA      NA       NA    
## ageMore than 20 yrs:turnov6 -1.03700    0.42606  -2.434   0.0156 *  
## ageLess than 6 yrs:turnov7        NA         NA      NA       NA    
## ageMore than 20 yrs:turnov7       NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4516 on 285 degrees of freedom
## Multiple R-squared:  0.763,  Adjusted R-squared:  0.7488 
## F-statistic: 53.96 on 17 and 285 DF,  p-value: < 2.2e-16
cat_plot(Slope, pred = turnov, modx = age, geom = "line")

From the line plot we can see how the number of employees changes with respect to turnover and the age of the companies:

  • Firms more than 20 years old have, on average, roughly the same number of empl as firms between 6 and 20 years old and firms less than 6 years old. The exception is turnov=6 (50-250 million euros), where firms older than 20 years tend to have fewer empl (the negative interaction is significant); they then overtake the other firms at turnov=7 (more than 250 million euros).

Binary Logistic Regression

library(pscl)
library(broom)
library(tidyverse)
library(caret)
library(ggplot2)
library(ggpubr)
library(dplyr)
library(plyr)
library(car)
library(labelled)
library(plm)
library(Rxtsum)
library(brotools)
library(tsoutliers)
library(lawstat)
library(readxl)
data <- read_excel("data orig.xlsx", 
    sheet = "all", na = "9999999999")

I filtered the dataset to get only German and British firms belonging to the Pavitt category “High-tech industries”.

The variables we will analyze are age, turnov, family_p, family_tmt, and benefit_rd where:

data=data%>%dplyr::rename(export_turnov_p =`exp/turnov` )%>%dplyr::rename(bonus = bonus...35)

Data<-data %>% dplyr::select( age,country, turnov, benefit_rd, family_tmt, pavitt,benefit_rd, family_p )


Data=Data %>%
  dplyr::filter(pavitt == "High-tech industries") %>%
  dplyr::filter(country %in% c("UK", "GER")) %>%
  dplyr::filter(benefit_rd != 3)
Data$turnov=as.factor(Data$turnov)
Data$age=as.factor(Data$age)
Data$benefit_rd=as.factor(Data$benefit_rd)

Data$family_p=as.numeric(Data$family_p)
Data$family_tmt=as.numeric(Data$family_tmt)
Data=na.omit(Data)

Data1=Data[-c(4,138,25,124,80,30,170,32,25,167,120,127),]
Data1=Data1%>%dplyr::select(-c("pavitt","country"))
Data1

Descriptive statistics

For numerical variables:

df5=Data1%>%dplyr::select(family_tmt, family_p)
brotools::describe(Data1, family_tmt, family_p )

For factor variables:

age

df6=Data1%>%dplyr::select(age,turnov,benefit_rd)

df6 %>% group_by(age)%>%
  dplyr::summarise(num=n())

turnov

df6 %>% group_by(turnov)%>%
  dplyr::summarise(num=n())

benefit_rd

df6 %>% group_by(benefit_rd)%>%
  dplyr::summarise(num=n())

Only control variables

mod1 <- glm(benefit_rd ~  age +turnov, data=Data1, family="binomial")
summary(mod1)
## 
## Call:
## glm(formula = benefit_rd ~ age + turnov, family = "binomial", 
##     data = Data1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1417  -1.1857   0.7003   0.7477   1.2069  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)   
## (Intercept)           2.0218     0.7659   2.640  0.00829 **
## ageLess than 6 yrs   -0.1455     0.5926  -0.246  0.80603   
## ageMore than 20 yrs   1.0753     0.3904   2.755  0.00588 **
## turnov2              -1.9453     0.8560  -2.273  0.02305 * 
## turnov3              -1.8609     0.7971  -2.335  0.01957 * 
## turnov4              -1.9981     0.9185  -2.175  0.02960 * 
## turnov5              -1.8167     0.9130  -1.990  0.04661 * 
## turnov6              -0.9101     0.9890  -0.920  0.35746   
## turnov7              11.4690   882.7437   0.013  0.98963   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 212.05  on 178  degrees of freedom
## Residual deviance: 192.46  on 170  degrees of freedom
## AIC: 210.46
## 
## Number of Fisher Scoring iterations: 13
pR2(mod1)[4]
## fitting null model for pseudo-r2
##   McFadden 
## 0.09236815

With family_p and family_tmt

mod2 <- glm(benefit_rd ~  family_p+age +turnov+family_tmt, data=Data1, family="binomial")
summary(mod2) 
## 
## Call:
## glm(formula = benefit_rd ~ family_p + age + turnov + family_tmt, 
##     family = "binomial", data = Data1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0560  -1.0001   0.4377   0.8329   1.4290  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)   
## (Intercept)           1.51185    0.84956   1.780  0.07515 . 
## family_p             -0.12726    0.08709  -1.461  0.14394   
## ageLess than 6 yrs    0.14134    0.62398   0.227  0.82081   
## ageMore than 20 yrs   1.04187    0.41588   2.505  0.01224 * 
## turnov2              -2.08631    0.89999  -2.318  0.02044 * 
## turnov3              -1.87046    0.84300  -2.219  0.02650 * 
## turnov4              -2.08531    0.98363  -2.120  0.03401 * 
## turnov5              -1.67332    0.96147  -1.740  0.08179 . 
## turnov6              -0.68575    1.04198  -0.658  0.51046   
## turnov7              12.01234  882.74379   0.014  0.98914   
## family_tmt            0.04451    0.01642   2.711  0.00671 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 212.05  on 178  degrees of freedom
## Residual deviance: 175.16  on 168  degrees of freedom
## AIC: 197.16
## 
## Number of Fisher Scoring iterations: 13
pR2(mod2)[4]
## fitting null model for pseudo-r2
##  McFadden 
## 0.1739605

Interpretation of Factor coefficients

coefs <- coef(mod2) 
exp(coefs)         
##         (Intercept)            family_p  ageLess than 6 yrs ageMore than 20 yrs 
##        4.535128e+00        8.805088e-01        1.151811e+00        2.834519e+00 
##             turnov2             turnov3             turnov4             turnov5 
##        1.241442e-01        1.540524e-01        1.242681e-01        1.876230e-01 
##             turnov6             turnov7          family_tmt 
##        5.037115e-01        1.647760e+05        1.045513e+00

ageMore than 20 yrs is significant at the 0.05 level with an odds ratio of 2.83, which means that, holding the other terms constant, a company more than 20 years old on average has odds of having benefit_rd about 2.83 times those of a company between 6 and 20 years old (the reference category).

turnov2 is significant at the 0.05 level with an odds ratio of 0.124, which means that, holding the other terms constant, a company in the turnov2 group on average has odds of having benefit_rd about 0.124 times (roughly 88% lower than) those of a company in turnov1.

turnov3 is significant at the 0.05 level with an odds ratio of 0.154, which means that, holding the other terms constant, a company in the turnov3 group on average has odds of having benefit_rd about 0.154 times (roughly 85% lower than) those of a company in turnov1.

turnov4 is significant at the 0.05 level with an odds ratio of 0.124, which means that, holding the other terms constant, a company in the turnov4 group on average has odds of having benefit_rd about 0.124 times (roughly 88% lower than) those of a company in turnov1.

Interpretation of Numerical coefficients

(exp(coefs)-1)*100  
##         (Intercept)            family_p  ageLess than 6 yrs ageMore than 20 yrs 
##        3.535128e+02       -1.194912e+01        1.518106e+01        1.834519e+02 
##             turnov2             turnov3             turnov4             turnov5 
##       -8.758558e+01       -8.459476e+01       -8.757319e+01       -8.123770e+01 
##             turnov6             turnov7          family_tmt 
##       -4.962885e+01        1.647750e+07        4.551284e+00

family_tmt is significant at the 0.01 level; its transformed value (exp(coef) - 1)*100 equals 4.55, which means that for each unit added to family_tmt the odds of having benefit_rd = 1 increase, on average, by about 4.55%.
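
Confidence intervals for the odds ratios can be obtained by exponentiating the coefficient intervals (a minimal sketch, assuming mod2 as fitted above; confint() may warn because of the near-separation visible in the turnov7 coefficient):

# Odds ratios with 95% confidence intervals
exp(cbind(OR = coef(mod2), confint(mod2)))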

Assumptions

par(mfrow=c(2,2))
plot(mod2)

1 The regression model is linear in parameters

The assumption of linearity applies only to numerical predictors; we check it by visually inspecting the scatter plot between each numerical predictor and the logit of the fitted probabilities.

Data1=na.omit(Data1)

 probabilities <- predict(mod2, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")

 mydata <- Data1 %>%dplyr::select_if(is.numeric) 
predictors <- colnames(mydata)


mydata <- mydata %>%dplyr::mutate(logit = log(probabilities/(1-probabilities))) %>%
  tidyr::gather(key = "predictors", value = "predictor.value", -logit)
ggplot(mydata, aes(logit, predictor.value))+
  geom_point(size = 0.5, alpha = 0.5) +
  theme_bw() + 
  facet_wrap(~predictors, scales = "free_y")

theme_set(theme_classic())

family_p and family_tmt seem approximately linear.

2 Independence of the errors -> No autocorrelation of residuals

acf(mod2$residuals)  

lawstat::runs.test(mod2$residuals) 
## 
##  Runs Test - Two sided
## 
## data:  mod2$residuals
## Standardized Runs Statistic = -1.4238, p-value = 0.1545

In this case H0 is that the residuals are random; since the p-value is greater than 0.05, we conclude that there is no autocorrelation of the residuals.

3 Multicollinearity

car::vif(mod2)
##                GVIF Df GVIF^(1/(2*Df))
## family_p   4.637579  1        2.153504
## age        1.135171  2        1.032204
## turnov     1.232654  6        1.017584
## family_tmt 4.444157  1        2.108117

Since all GVIF values are well below the usual thresholds of 5-10, there is no multicollinearity among our predictors.

Hierarchical Clustering

library(tidyverse)
library(caret)
library(ggplot2)
library(ggpubr)
library(dplyr)
library(plyr)
library(car)
library(labelled)
library(plm)
library(Rxtsum)
library(brotools)
library(clValid)  
library(cluster)
library(tsoutliers)
library(factoextra)
library(FactoMineR)
library(e1071)
library(MASS)
library(lda)
library(readxl)
data <- read_excel("data orig.xlsx", 
    sheet = "all", na = "9999999999")

I filtered the dataset to obtain only German firms belonging to the Pavitt categories “Traditional industries” or “High-tech industries”.

The variables we will analyze are grad_n, rd_p, and family_p where:

data=data%>%dplyr::rename(export_turnov_p =`exp/turnov` )%>%dplyr::rename(bonus = bonus...35)


Data1<-data %>% dplyr::select( involv_n,no_family_p, export_turnov_p,family_n,empl,skilled_n,family_tmt,grad_n,rd_p,inn_sales_p,family_p, white_n, rd_p,invest, ceo_type, pavitt, turnov, group, country,exp_story_dummy, strategy)
Data1<-na.omit(Data1)

Data=Data1 %>%
   dplyr::filter(country %in% c("GER"))%>% dplyr::filter(pavitt %in% c("Traditional industries","High-tech industries"))
Data<-Data %>% dplyr::select(grad_n, rd_p, family_p)
                               
Data$grad_n=as.numeric(Data$grad_n)
Data$rd_p=as.numeric(Data$rd_p)
Data$family_p=as.numeric(Data$family_p)
Data= na.omit(Data)
Data

Descriptive statistics

For numerical variables:

brotools::describe(Data, grad_n, rd_p, family_p )

Since the variables we will be using have different units of measurement, we will normalize the data.

normalize=function(x){
  return ((x-min(x))/(max(x)-min(x)))
}

norm_Data1=as.data.frame(lapply(Data[,c(1:3)], normalize))
df4=norm_Data1 %>% dplyr::select(grad_n,rd_p, family_p)
df4<-na.omit(df4)

Cluster Validity Analysis:

To choose the best clustering method for the 3 selected variables (grad_n, rd_p, family_p) and the optimal number of groups to divide them into, we evaluate internal validation measures and stability measures.

Internal Measures

Internal measurements are:

  • Compactness;
  • Separation;
  • Connectivity.

Since compactness increases with the number of clusters while separation decreases, we use the Silhouette Width and the Dunn Index, which combine both aspects, to find the best trade-off; a direct computation of both measures for a single partition is sketched after the list of measures below.

express <- df4[, c("grad_n","rd_p", "family_p")]

intern <- clValid(express, 2:6, 
                  clMethods = c("hierarchical", "kmeans", "diana", "pam"),
                  validation = "internal")    # we try from 2 to 6 groups

op <- par(no.readonly=TRUE)
par(mfrow=c(2,2),mar=c(4,4,3,1))
plot(intern, legend=FALSE)
plot(nClusters(intern),measures(intern,"Dunn")[,,1],type="n",axes=F,
     xlab="",ylab="")
legend("center", clusterMethods(intern), col=1:9, lty=1:9, pch=paste(1:9))

par(op)

Where:

  • Connectivity: should be minimized.

  • Dunn: should be maximized.

  • Silhouette Width: should be maximized.
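
For reference, the Dunn index and the average silhouette width discussed above can also be computed directly for a single partition outside clValid (a minimal sketch on a 2-group complete-linkage cut of the normalized data in express; the object names d, hc and grp are ours for illustration):

# 2-group hierarchical clustering on the normalized data
d   <- dist(express, method = "euclidean")
hc  <- hclust(d, method = "complete")
grp <- cutree(hc, k = 2)

# Dunn index (to be maximized)
clValid::dunn(distance = as.matrix(d), clusters = grp)

# Average silhouette width (to be maximized)
mean(cluster::silhouette(grp, d)[, "sil_width"])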

Stability Measures

Stability measures are:

  • The average proportion of non-overlap (APN);
  • The average distance (AD);
  • The average distance between means (ADM);
  • The figure of merit (FOM).

stab <- clValid(express, 2:6, 
                clMethods=c("hierarchical","kmeans","diana","pam"),
                validation="stability")

par(mfrow=c(2,2),mar=c(4,4,3,1))
plot(stab, measure=c("APN","AD","ADM"),legend=FALSE)
plot(nClusters(stab),measures(stab,"APN")[,,1],type="n",axes=F,
     xlab="",ylab="")
legend("center", clusterMethods(stab), col=1:9, lty=1:9, pch=paste(1:9))

par(op)

Stability measures should be minimized in each case.

library(scater)



result <- clValid(express, 2:6, clMethods=c("hierarchical","kmeans","pam"),
                  validation=c("internal","stability"))

res <- getRanksWeights(result)
res$ranks[,1]
##              APN               AD              ADM              FOM 
## "hierarchical-2"          "pam-6" "hierarchical-2"       "kmeans-6" 
##     Connectivity             Dunn       Silhouette 
## "hierarchical-2" "hierarchical-2" "hierarchical-2"

From the results we understand that the best way to cluster our data is hierarchical clustering with 2 groups, even though AD and FOM suggest other methods.

Selecting the number of groups

To confirm the choice of dividing the data into two groups we check the plot of the Elbow method, the Average Silhouette method and the Gap Statistic method.

data=df4[, 1:3] 

p1 <- fviz_nbclust(data, FUN = hcut, method = "wss", 
                   k.max = 10) +
  ggtitle("(A) Elbow method")
p2 <- fviz_nbclust(data, FUN = hcut, method = "silhouette", 
                   k.max = 10) +
  ggtitle("(B) Silhouette method")
p3 <- fviz_nbclust(data, FUN = hcut, method = "gap_stat", 
                   k.max = 10) +
  ggtitle("(C) Gap statistic")

gridExtra::grid.arrange(p1, p2, p3, nrow = 1)

In agreement with the graphs shown above, the Elbow plot and the Gap statistic confirm the choice of two groups, while the Average Silhouette suggests four. Even with two groups, however, we still obtain a good Silhouette value.

Plot Clusters:

Dendrogram

Using the Euclidean distance and the complete-linkage method we obtain a division of our data into two clusters, which we can visualize with a dendrogram.

dd=dist(df4[, 1:3], method = "euclidean")


clusters = hclust(dd,method = "complete")
plot(clusters)

cut = cutree(clusters,2)

rect.hclust(clusters , k = 2, border = 2:3)
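
How faithfully the dendrogram preserves the original pairwise distances can be checked with the cophenetic correlation (a minimal sketch, assuming dd and clusters as defined above; values close to 1 indicate a good fit):

# Correlation between the original distances and the cophenetic distances of the dendrogram
cor(dd, cophenetic(clusters))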

Diana

clusters <- diana(df4[, 1:3])

plot(clusters, which = 1, nmax.lab = 100)

The divisive coefficient, which measures the strength of the clustering structure found (values close to 1 indicate well-separated groups), is equal to 0.95, which is very good.

Difference between Hierarchical and Non-Hierarchical Clustering:

Hierarchical Clustering

fviz_cluster(list(data = df4, cluster = cut))

Kmeans

The same result is achieved by the k-means method, which allows us to project the two clusters onto two dimensions, capturing about 82% of the total variability: 47% on the first dimension and 35% on the second.

kk=kmeans(df4, centers = 2, iter.max = 50, nstart = 1)
fviz_cluster(kk, df4,ellipse.type = "norm")
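
The dimension percentages quoted above can be verified from a PCA on the same normalized variables (a minimal sketch, assuming df4 as above; pca is our name for the fitted object):

pca <- princomp(df4, cor = TRUE)
summary(pca)  # proportion of variance explained by each component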

As we can see from the k-means chart, some statistical units lie between the two clusters; this method works poorly in such situations.

Using the hierarchical method with complete linkage we mitigate this problem, because the distance between two clusters is defined as the largest distance between a point in one cluster and a point in the other, which favours more compact classes.

Interpretation of dimensions and clusters

fit <- princomp(df4, cor=TRUE)
fviz_pca_biplot(fit)

As we can see from the biplot of the variables in two dimensions:

  • the first dimension is associated with the variable grad_n;

  • the second dimension is located between the variables family_p and rd_p;

  • The first cluster contains companies with high values of family_p and low values of grad_n;

  • The second cluster contains firms with high values of rd_p and grad_n.

K-NN Classification

library(tidyverse)
library(caret)
library(ggplot2)
library(ggpubr)
library(dplyr)
library(plyr)
library(car)
library(labelled)
library(plm)
library(Rxtsum)
library(brotools)
library(tsoutliers)
library(factoextra)
library(class)
library(mlbench)
library(cutpointr)
library(pROC)
library(Epi)
library(psych)
library(cluster)
library(readxl)
data <- read_excel("data orig.xlsx", 
    sheet = "all", na = "9999999999")

I filtered the dataset to keep only Italian firms that are not between 6 and 20 years old (the age filter excludes the “Between 20 and 6 yrs” class).

The variables we will analyze are family_p, family_tmt, export_turnov_p, involv_n, and strategy where:

data=data%>%dplyr::rename(export_turnov_p =`exp/turnov` )%>%dplyr::rename(bonus = bonus...35)


Data_1<-data %>% dplyr::select(involv_n,no_family_p, export_turnov_p,family_n,empl,skilled_n,family_tmt,grad_n,rd_p,inn_sales_p,family_p, white_n, rd_p,invest, ceo_type, pavitt, turnov, group, country,exp_story_dummy,benefit_rd, strategy,rd_dummy,age)
Data1<-na.omit(Data_1)

Data=Data1 %>%
   dplyr::filter(country %in% c("ITA"))%>%
 dplyr::filter(age!= "Between 20 and 6 yrs")

Data$strategy=as.factor(Data$strategy)

Data$family_tmt=as.numeric(Data$family_tmt)
Data$involv_n=as.numeric(Data$involv_n)
Data$family_p=as.numeric(Data$family_p)
Data$export_turnov_p=as.numeric(Data$export_turnov_p)
Data=Data%>%dplyr::select(export_turnov_p, family_p, family_tmt, involv_n, strategy)
Data

The strategy variable takes two values: 1 = “centralized: the CEO/owner makes most of the decisions in each area”; 2 = “decentralized: managers can make their own decisions in certain business areas”.

Descriptive statistics

For numerical variables:

df5=Data%>%dplyr::select(export_turnov_p, family_p, family_tmt, involv_n)
brotools::describe(df5, export_turnov_p, family_p, family_tmt, involv_n )

For factor variables:

df6=Data%>%dplyr::select(strategy)

df6 %>% group_by(strategy)%>%
  dplyr::summarise(num=n())

Since the variables we will be using have different units of measurement, we will normalize the data.

normalize=function(x){
  return ((x-min(x))/(max(x)-min(x)))
}

norm_Data1=as.data.frame(lapply(Data[,c(1:4)], normalize))
data2=norm_Data1 %>% dplyr::select(export_turnov_p, family_p,family_tmt, involv_n)
data3=Data %>% dplyr::select(strategy)
df4 <- cbind(data2, data3)
df4<-na.omit(df4)

TrainData <- df4[,1:4]
TrainClasses <- df4[,5]

Logistic regression with k-fold cross-validation

We use the glm method and look for Accuracy using the Cross-Validation procedure with 2 folds.

set.seed(1)
control <- trainControl(method="cv", number=2)

glm_cv <- train(TrainData, TrainClasses,
              method = "glm",
              preProcess = c("center", "scale"),
              tuneLength = 3,
              trControl = control)
glm_cv
## Generalized Linear Model 
## 
## 449 samples
##   4 predictor
##   2 classes: '1', '2' 
## 
## Pre-processing: centered (4), scaled (4) 
## Resampling: Cross-Validated (2 fold) 
## Summary of sample sizes: 224, 225 
## Resampling results:
## 
##   Accuracy  Kappa     
##   0.817371  0.01908039

We obtain an Accuracy equal to 0.817371.
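
Because the Kappa is close to zero, it is worth checking how the predictions are distributed over the two classes (a minimal sketch, assuming glm_cv, TrainData and TrainClasses as above; note these are in-sample predictions, not cross-validated ones):

# In-sample confusion matrix of the fitted logistic model
confusionMatrix(predict(glm_cv, newdata = TrainData), TrainClasses)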

k-NN with k-fold cross-validation

We use the knn method and look for Accuracy using the Cross-Validation procedure with 2 folds.

set.seed(1)
control <- trainControl(method="cv", number=2)

cv <- train(TrainData, TrainClasses,
              method = "knn",
              preProcess = c("center", "scale"),
              tuneLength = 3,
              trControl = control)

cv    
## k-Nearest Neighbors 
## 
## 449 samples
##   4 predictor
##   2 classes: '1', '2' 
## 
## Pre-processing: centered (4), scaled (4) 
## Resampling: Cross-Validated (2 fold) 
## Summary of sample sizes: 224, 225 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa     
##   5  0.8039782  0.07331957
##   7  0.8129365  0.07446937
##   9  0.8129266  0.05065965
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 7.
cv$results$Accuracy   
## [1] 0.8039782 0.8129365 0.8129266

We obtain an Accuracy equal to 0.8129365 for k = 7.

As we saw earlier, the two procedures achieve almost the same accuracy. Now we check the Accuracy of both models graphically, for different numbers of folds.

set.seed(1)

control <- trainControl(method="cv", number=10)

mod2 <- train(TrainData, TrainClasses,
               method = "glm",
               preProcess = c("center", "scale"),
               tuneLength = 10,
              trControl = control)


appo=rep(NA, length(5:20))


for (i in 5:20) {control <- trainControl(method="cv", number=i)
mod3 <- train(TrainData, TrainClasses,
              method = "glm",
              preProcess = c("center", "scale"),
              tuneLength = 10,
              trControl = control)
              appo[i-4]=as.numeric(mod3$results[2])
  }

              


mod4 <- train(TrainData, TrainClasses,
              method = "knn",
              preProcess = c("center", "scale"),
              tuneLength = 10,
              trControl = control)
  

rr=length(5:20)   
cc=length(mod4$results$Accuracy) 

appo2=matrix(NA, rr,cc)

for (i in 5:20) {control <- trainControl(method="cv", number=i)
mod5 <- train(TrainData, TrainClasses,
             method = "knn",
             preProcess = c("center", "scale"),
             tuneLength = 10,
             trControl = control)
appo2[i-4,]=mod5$results$Accuracy
  }



# Plot the accuracy of the logistic regression and of k-nn (k = 5,...,23) against the number of folds
cols <- c("Red","Blue","Yellow","Grey","Pink","Green","Violet","Brown","Purple","Orange","Black")
matplot(5:20, cbind(appo, appo2), type = "l", lty = 1, lwd = 1, col = cols,
        xlab = "Folds", ylab = "Accuracy", ylim = c(0.79, 0.83),
        main = "Accuracy of LR and k-nn for k=5,7,9,11,13,15,17,19,21,23 using K-fold cross validation")
legend("topright", c("Logistic Regression","k-nn with k=5","k-nn with k=7","k-nn with k=9","k-nn with k=11","k-nn with k=13","k-nn with k=15","k-nn with k=17","k-nn with k=19","k-nn with k=21","k-nn with k=23"),
       lty = 1, ncol = 1, lwd = 1, col = cols, xjust = 1, merge = TRUE)

As we can see, in the k-NN k-fold model the Accuracy tends to increase when the number of folds is around 13, while the Accuracy of the LR k-fold model tends to remain stable. Now let us see whether our model can improve on the previous result.

set.seed(1)
control <- trainControl(method="cv", number=13)

cv <- train(TrainData, TrainClasses,
              method = "knn",
              preProcess = c("center", "scale"),
              tuneLength = 10,
              trControl = control)

cv    
## k-Nearest Neighbors 
## 
## 449 samples
##   4 predictor
##   2 classes: '1', '2' 
## 
## Pre-processing: centered (4), scaled (4) 
## Resampling: Cross-Validated (13 fold) 
## Summary of sample sizes: 415, 415, 415, 414, 414, 415, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa       
##    5  0.7974790   0.042303700
##    7  0.8020039   0.041251013
##    9  0.8241758   0.070509835
##   11  0.8219780   0.054315503
##   13  0.8197156   0.035254917
##   15  0.8107304  -0.008168822
##   17  0.8107304  -0.008168822
##   19  0.8107304  -0.008168822
##   21  0.8152553   0.000000000
##   23  0.8152553   0.000000000
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.

Now we have obtained an Accuracy equal to 0.8241758 for k = 9, an increase of about 0.01 over the initial value.

To understand which model is really the best we calculate the ROC for both.

set.seed(1)

levels(df4$strategy) <- c("one","two")

control <- trainControl(method="cv", number=13, classProbs=TRUE, summaryFunction=twoClassSummary)

mod5 <- train(TrainData, df4[,5],
              method = "knn",
              metric="ROC",
              preProcess = c("center", "scale"),
              tuneLength = 10,
              trControl = control)
mod5
## k-Nearest Neighbors 
## 
## 449 samples
##   4 predictor
##   2 classes: 'one', 'two' 
## 
## Pre-processing: centered (4), scaled (4) 
## Resampling: Cross-Validated (13 fold) 
## Summary of sample sizes: 415, 415, 415, 414, 414, 415, ... 
## Resampling results across tuning parameters:
## 
##   k   ROC        Sens       Spec      
##    5  0.5867933  0.9617279  0.06959707
##    7  0.6024897  0.9698750  0.06043956
##    9  0.5646146  1.0000000  0.04761905
##   11  0.5394968  1.0000000  0.03663004
##   13  0.5351290  1.0000000  0.02380952
##   15  0.5569502  0.9945055  0.00000000
##   17  0.5882391  0.9945055  0.00000000
##   19  0.5911578  0.9945055  0.00000000
##   21  0.6022878  1.0000000  0.00000000
##   23  0.6041283  1.0000000  0.00000000
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 23.
control <- trainControl(method="cv", number=5, classProbs=TRUE, summaryFunction=twoClassSummary)

mod6 <- train(TrainData,  df4[,5],
              method = "glm",
              metric="ROC",
              preProcess = c("center", "scale"),
              tuneLength = 10,
              trControl = control)
mod6
## Generalized Linear Model 
## 
## 449 samples
##   4 predictor
##   2 classes: 'one', 'two' 
## 
## Pre-processing: centered (4), scaled (4) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 360, 359, 360, 358, 359 
## Resampling results:
## 
##   ROC        Sens       Spec
##   0.6748397  0.9972603  0

The highest ROC belongs to the LR k-fold Cross-validation classifier and is equal to 0.67.

The multi_cutpointr function allows us to know the AUC value, i.e. the probability that a randomly extracted subject is correctly classified, for each variable inserted in the classifier.

mcp <- multi_cutpointr(df4, x = c("family_p", "export_turnov_p", "family_tmt", "involv_n"), class = strategy, pos_class = "two")
summary(mcp)

We therefore find that family_p, export_turnov_p, and family_tmt have a sufficient AUC, between 0.5 and 0.7, while involv_n has an AUC equal to 0.45 and thus does not contribute positively to correct classification.

ROC

How well the test distinguishes the two levels is measured by the area under the ROC curve (AUC), which is equivalent to the probability that a randomly extracted subject is correctly classified.

par(mfrow=c(1,2))

ROC(form=strategy~family_p+export_turnov_p+family_tmt+involv_n , data=df4)

From the graphs shown above we can see the trade-off between sensitivity and 1-specificity (the probability of a false positive) and identify the point at which the maximum efficiency is obtained.
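
The same quantities can also be obtained with pROC, which in addition reports the cut-off maximizing sensitivity + specificity (a minimal sketch, assuming df4, TrainData and mod6 as above; probs and roc_lr are our names for illustration):

# Fitted probabilities of the positive class ("two") from the logistic model
probs <- predict(mod6, newdata = TrainData, type = "prob")[, "two"]

# ROC curve, AUC and the Youden-optimal cut-off
roc_lr <- pROC::roc(df4$strategy, probs)
pROC::auc(roc_lr)
pROC::coords(roc_lr, "best", ret = c("threshold", "sensitivity", "specificity"))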