Reading in the data
compensation<-read_csv ("http://asayanalytics.com/ceos_csv")
# It is not appropraite to combine salary and bonus as one variable. This is because, different companies treat those two items different for tax purposes.
# Combining the two variables together assumes that all the companies in the dada set have the same policies concerning salaries and bonuses.
# Because of the different policies, different variables will influence these two variables differently. It is because of this reason that it makes sense for them to be analyzed as two
# separate independt variables.
colSums(is.na(compensation))
## Company CEO CityofBirth StateofBirth Age
## 0 0 7 47 0
## Undergrad UGDegree UGDate AgeOfUnder Graduate
## 71 73 71 71 318
## GradDegree MBA? MasterPhd? G_Date AgeOfGradu
## 0 0 0 318 318
## YearsFirm YearsCEO Salary Bonus Other
## 0 0 10 98 47
## StGains Compfor5Yrs StockOwned Sales Profits
## 404 132 0 0 2
## ReturnOver5Yrs Industry Industry Code WideIndustry
## 15 0 0 0
Data Wrangling
#Since the salary column has NAs, I am imputing the mean salary to replace the missing data.
class(compensation$Salary)
## [1] "numeric"
compensation$Sal <- ifelse(is.na(compensation$Salary),1,0)
compensation$salary <- as.numeric(compensation$Salary)
Ave_salary <- mean(compensation$Salary, na.rm = TRUE)
compensation$Salary <- replace(compensation$Salary, is.na(compensation$Salary), Ave_salary)
Imputing the mean of Short Term Gains
compensation$Gains <- ifelse(is.na(compensation$StGains),1,0)
compensation$Stgains<- as.numeric(compensation$StGains)
Ave_Gains <- mean(compensation$StGains, na.rm = TRUE)
compensation$StGains <- replace(compensation$StGains, is.na(compensation$StGains), Ave_Gains)
Imouting the mean of Compensation for the last five years.
compensation$yrs <- ifelse(is.na(compensation$Compfor5Yrs),1,0)
compensation$Compfor5Yrs<- as.numeric(compensation$Compfor5Yrs)
Ave_Comp <- mean(compensation$Compfor5Yrs, na.rm = TRUE)
compensation$Compfor5Yrs <- replace(compensation$Compfor5Yrs, is.na(compensation$Compfor5Yrs), Ave_Comp)
Imputing the mean of “Others”
compensation$misc <- ifelse(is.na(compensation$Other),1,0)
compensation$Other<- as.numeric(compensation$Other)
Ave_misc <- mean(compensation$Other, na.rm = TRUE)
compensation$Other <- replace(compensation$Other, is.na(compensation$Other), Ave_misc)
The Model (Salary)
compensation_lm1 <- lm(Salary~poly(Sales ,3, raw=TRUE) +
Age+StGains+Gains+Other+misc+Compfor5Yrs+ yrs+
YearsCEO+as.factor(GradDegree), data=compensation)
summary(compensation_lm1)
##
## Call:
## lm(formula = Salary ~ poly(Sales, 3, raw = TRUE) + Age + StGains +
## Gains + Other + misc + Compfor5Yrs + yrs + YearsCEO + as.factor(GradDegree),
## data = compensation)
##
## Residuals:
## Min 1Q Median 3Q Max
## -917289 -114107 -26818 92349 2144406
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.753e+05 2.571e+05 0.682 0.49569
## poly(Sales, 3, raw = TRUE)1 3.172e+01 3.268e+00 9.708 < 2e-16 ***
## poly(Sales, 3, raw = TRUE)2 -5.620e-04 8.967e-05 -6.267 6.99e-10 ***
## poly(Sales, 3, raw = TRUE)3 2.804e-09 5.448e-10 5.147 3.58e-07 ***
## Age 4.438e+03 1.494e+03 2.971 0.00308 **
## StGains -1.041e-02 3.604e-03 -2.887 0.00402 **
## Gains 2.316e+04 2.093e+04 1.107 0.26887
## Other 4.031e-02 9.989e-03 4.036 6.14e-05 ***
## misc 1.260e+04 3.829e+04 0.329 0.74234
## Compfor5Yrs 5.684e-03 1.134e-03 5.014 7.02e-07 ***
## yrs -6.502e+04 2.788e+04 -2.332 0.02001 *
## YearsCEO 3.472e+03 1.500e+03 2.315 0.02093 *
## as.factor(GradDegree)EdD 1.284e+05 3.419e+05 0.376 0.70735
## as.factor(GradDegree)JD -9.094e+03 2.446e+05 -0.037 0.97036
## as.factor(GradDegree)LLB 3.626e+04 2.483e+05 0.146 0.88394
## as.factor(GradDegree)LLD -1.244e+05 3.409e+05 -0.365 0.71524
## as.factor(GradDegree)MA 1.333e+04 2.485e+05 0.054 0.95726
## as.factor(GradDegree)MBA -1.599e+04 2.425e+05 -0.066 0.94746
## as.factor(GradDegree)MD -2.099e+04 2.793e+05 -0.075 0.94012
## as.factor(GradDegree)MEA -5.136e+03 3.414e+05 -0.015 0.98800
## as.factor(GradDegree)MPA 6.127e+05 2.960e+05 2.070 0.03886 *
## as.factor(GradDegree)MS 3.521e+04 2.456e+05 0.143 0.88605
## as.factor(GradDegree)MSA -1.233e+05 2.953e+05 -0.418 0.67642
## as.factor(GradDegree)MSEE -4.072e+04 2.954e+05 -0.138 0.89041
## as.factor(GradDegree)MSIE -1.504e+05 2.954e+05 -0.509 0.61096
## as.factor(GradDegree)none 5.309e+02 2.421e+05 0.002 0.99825
## as.factor(GradDegree)PhD -1.299e+04 2.471e+05 -0.053 0.95809
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 240600 on 605 degrees of freedom
## Multiple R-squared: 0.3539, Adjusted R-squared: 0.3261
## F-statistic: 12.75 on 26 and 605 DF, p-value: < 2.2e-16
The model (Bonus)
compensation_lm2 <- lm(Bonus~poly(Sales ,3, raw=TRUE) +
Age+StGains+Gains+Other+misc+Compfor5Yrs+ yrs+
YearsCEO+as.factor(GradDegree), data=compensation)
summary(compensation_lm2)
##
## Call:
## lm(formula = Bonus ~ poly(Sales, 3, raw = TRUE) + Age + StGains +
## Gains + Other + misc + Compfor5Yrs + yrs + YearsCEO + as.factor(GradDegree),
## data = compensation)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2208081 -258651 -122727 46929 10384606
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.390e+05 9.664e+05 -0.144 0.88572
## poly(Sales, 3, raw = TRUE)1 6.245e+01 1.551e+01 4.027 6.50e-05 ***
## poly(Sales, 3, raw = TRUE)2 -1.479e-03 5.391e-04 -2.743 0.00631 **
## poly(Sales, 3, raw = TRUE)3 8.942e-09 4.104e-09 2.179 0.02980 *
## Age 7.435e+03 6.403e+03 1.161 0.24610
## StGains 2.415e-02 1.371e-02 1.761 0.07890 .
## Gains -3.680e+04 8.256e+04 -0.446 0.65595
## Other 1.824e-01 3.939e-02 4.632 4.61e-06 ***
## misc -1.619e+05 1.826e+05 -0.886 0.37578
## Compfor5Yrs 1.399e-02 4.436e-03 3.154 0.00170 **
## yrs -1.313e+05 1.123e+05 -1.169 0.24296
## YearsCEO -4.639e+02 6.174e+03 -0.075 0.94014
## as.factor(GradDegree)JD -1.868e+05 9.081e+05 -0.206 0.83707
## as.factor(GradDegree)LLB -1.077e+04 9.256e+05 -0.012 0.99072
## as.factor(GradDegree)LLD 7.417e+04 1.274e+06 0.058 0.95359
## as.factor(GradDegree)MA -1.745e+05 9.267e+05 -0.188 0.85068
## as.factor(GradDegree)MBA -4.159e+04 8.997e+05 -0.046 0.96315
## as.factor(GradDegree)MD 1.000e+05 1.102e+06 0.091 0.92770
## as.factor(GradDegree)MEA -3.153e+05 1.269e+06 -0.249 0.80381
## as.factor(GradDegree)MPA -1.879e+04 1.097e+06 -0.017 0.98634
## as.factor(GradDegree)MS -2.236e+05 9.142e+05 -0.245 0.80689
## as.factor(GradDegree)MSA -5.068e+05 1.098e+06 -0.461 0.64470
## as.factor(GradDegree)MSEE -9.325e+04 1.102e+06 -0.085 0.93258
## as.factor(GradDegree)MSIE -1.303e+05 1.101e+06 -0.118 0.90579
## as.factor(GradDegree)none -2.021e+04 8.985e+05 -0.022 0.98206
## as.factor(GradDegree)PhD -3.620e+05 9.247e+05 -0.391 0.69565
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 894200 on 508 degrees of freedom
## (98 observations deleted due to missingness)
## Multiple R-squared: 0.2023, Adjusted R-squared: 0.1631
## F-statistic: 5.154 on 25 and 508 DF, p-value: 6.423e-14
Checking for multicollinearity
cor(compensation[, c("Age", "StGains", "Other", "YearsCEO", "Compfor5Yrs")], method="spearman") %>%
datatable()
#Using doagnostic plots to check for linearity.
crPlots(compensation_lm1)


Testing for normality using the Shapiro Wilk test
shapiro.test(compensation$Salary)
##
## Shapiro-Wilk normality test
##
## data: compensation$Salary
## W = 0.8482, p-value < 2.2e-16
compensation_Lm_sal <- sample(compensation$Salary,replace=FALSE)
shapiro.test(compensation_Lm_sal)
##
## Shapiro-Wilk normality test
##
## data: compensation_Lm_sal
## W = 0.8482, p-value < 2.2e-16
#From the test above, we see that our p-value is more than 0.05, hence we reject the null hypothesis Variable that #explains the relationship most
lm.beta(compensation_lm1)
##
## Call:
## lm(formula = Salary ~ poly(Sales, 3, raw = TRUE) + Age + StGains +
## Gains + Other + misc + Compfor5Yrs + yrs + YearsCEO + as.factor(GradDegree),
## data = compensation)
##
## Standardized Coefficients::
## (Intercept) poly(Sales, 3, raw = TRUE)1
## 0.0000000000 1.1624855469
## poly(Sales, 3, raw = TRUE)2 poly(Sales, 3, raw = TRUE)3
## -1.9846996527 1.1883028112
## Age StGains
## 0.1079065651 -0.1173273069
## Gains Other
## 0.0379758070 0.1415363174
## misc Compfor5Yrs
## 0.0112815784 0.2194200553
## yrs YearsCEO
## -0.0902354452 0.0916927924
## as.factor(GradDegree)EdD as.factor(GradDegree)JD
## 0.0174236641 -0.0079015586
## as.factor(GradDegree)LLB as.factor(GradDegree)LLD
## 0.0205933391 -0.0168840485
## as.factor(GradDegree)MA as.factor(GradDegree)MBA
## 0.0075672214 -0.0240653335
## as.factor(GradDegree)MD as.factor(GradDegree)MEA
## -0.0049249613 -0.0006969604
## as.factor(GradDegree)MPA as.factor(GradDegree)MS
## 0.1174877801 0.0251526995
## as.factor(GradDegree)MSA as.factor(GradDegree)MSEE
## -0.0236390276 -0.0078081482
## as.factor(GradDegree)MSIE as.factor(GradDegree)none
## -0.0288294853 0.0009061765
## as.factor(GradDegree)PhD
## -0.0083060961
lm.beta(compensation_lm2)
##
## Call:
## lm(formula = Bonus ~ poly(Sales, 3, raw = TRUE) + Age + StGains +
## Gains + Other + misc + Compfor5Yrs + yrs + YearsCEO + as.factor(GradDegree),
## data = compensation)
##
## Standardized Coefficients::
## (Intercept) poly(Sales, 3, raw = TRUE)1
## 0.000000000 0.613126049
## poly(Sales, 3, raw = TRUE)2 poly(Sales, 3, raw = TRUE)3
## -1.125759311 0.653233916
## Age StGains
## 0.051140596 0.088084471
## Gains Other
## -0.018423792 0.198252459
## misc Compfor5Yrs
## -0.035678273 0.171048766
## yrs YearsCEO
## -0.053422223 -0.003570097
## as.factor(GradDegree)JD as.factor(GradDegree)LLB
## -0.051505380 -0.001879986
## as.factor(GradDegree)LLD as.factor(GradDegree)MA
## 0.003283446 -0.029532186
## as.factor(GradDegree)MBA as.factor(GradDegree)MD
## -0.019060701 0.006257252
## as.factor(GradDegree)MEA as.factor(GradDegree)MPA
## -0.013959918 -0.001175183
## as.factor(GradDegree)MS as.factor(GradDegree)MSA
## -0.049272505 -0.031698517
## as.factor(GradDegree)MSEE as.factor(GradDegree)MSIE
## -0.005832634 -0.008152522
## as.factor(GradDegree)none as.factor(GradDegree)PhD
## -0.010341766 -0.065071189
## The variables that most significant in my model is the compensation for five years. This applies to both bonus and salary.
# R deletes missing data and creates a model based on the available data. When a column has missing adata, R just assumes the
#missing data are zeros hence it deletes them and runs the model. It also displays a message saying that some data was deleted.
# Other Z factors Affecting CEO Salary
# size of company- Fortune 50O company CEOs would make more that CEOs of smaller companies.
# Financial performance of the company
# CEO's level of expereinece
# Age of the comoany; is is a fairly new company, is it a start up, is it an old company with established officer compensation packages.