PACKAGES NEEDED

Reading in the data

compensation<-read_csv ("http://asayanalytics.com/ceos_csv")

# It is not appropraite to combine salary and bonus as one variable. This is because, different companies treat those two items different for tax purposes.
# Combining the two variables together assumes that all the companies in the dada set have the same policies concerning salaries and bonuses. 
# Because of the different policies, different variables will influence these two variables differently. It is because of this reason that it makes sense for them to be analyzed as two
# separate independt variables.

colSums(is.na(compensation))
##        Company            CEO    CityofBirth   StateofBirth            Age 
##              0              0              7             47              0 
##      Undergrad       UGDegree         UGDate     AgeOfUnder       Graduate 
##             71             73             71             71            318 
##     GradDegree           MBA?     MasterPhd?         G_Date     AgeOfGradu 
##              0              0              0            318            318 
##      YearsFirm       YearsCEO         Salary          Bonus          Other 
##              0              0             10             98             47 
##        StGains    Compfor5Yrs     StockOwned          Sales        Profits 
##            404            132              0              0              2 
## ReturnOver5Yrs       Industry  Industry Code   WideIndustry 
##             15              0              0              0

Data Wrangling

#Since the salary column has NAs, I am imputing the mean salary to replace the missing data.
class(compensation$Salary)
## [1] "numeric"
compensation$Sal <- ifelse(is.na(compensation$Salary),1,0)

compensation$salary <- as.numeric(compensation$Salary)

Ave_salary <- mean(compensation$Salary, na.rm = TRUE)

compensation$Salary <- replace(compensation$Salary, is.na(compensation$Salary), Ave_salary)

Imputing the mean of Short Term Gains

compensation$Gains <- ifelse(is.na(compensation$StGains),1,0)

compensation$Stgains<- as.numeric(compensation$StGains)

Ave_Gains <- mean(compensation$StGains, na.rm = TRUE)

compensation$StGains <- replace(compensation$StGains, is.na(compensation$StGains), Ave_Gains)

Imouting the mean of Compensation for the last five years.

compensation$yrs <- ifelse(is.na(compensation$Compfor5Yrs),1,0)

compensation$Compfor5Yrs<- as.numeric(compensation$Compfor5Yrs)

Ave_Comp <- mean(compensation$Compfor5Yrs, na.rm = TRUE)

compensation$Compfor5Yrs <- replace(compensation$Compfor5Yrs, is.na(compensation$Compfor5Yrs), Ave_Comp)

Imputing the mean of “Others”

compensation$misc <- ifelse(is.na(compensation$Other),1,0)

compensation$Other<- as.numeric(compensation$Other)

Ave_misc <- mean(compensation$Other, na.rm = TRUE)

compensation$Other <- replace(compensation$Other, is.na(compensation$Other), Ave_misc)

The Model (Salary)

compensation_lm1 <- lm(Salary~poly(Sales ,3, raw=TRUE) +
                         Age+StGains+Gains+Other+misc+Compfor5Yrs+ yrs+
                         YearsCEO+as.factor(GradDegree), data=compensation)

 summary(compensation_lm1)
## 
## Call:
## lm(formula = Salary ~ poly(Sales, 3, raw = TRUE) + Age + StGains + 
##     Gains + Other + misc + Compfor5Yrs + yrs + YearsCEO + as.factor(GradDegree), 
##     data = compensation)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -917289 -114107  -26818   92349 2144406 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  1.753e+05  2.571e+05   0.682  0.49569    
## poly(Sales, 3, raw = TRUE)1  3.172e+01  3.268e+00   9.708  < 2e-16 ***
## poly(Sales, 3, raw = TRUE)2 -5.620e-04  8.967e-05  -6.267 6.99e-10 ***
## poly(Sales, 3, raw = TRUE)3  2.804e-09  5.448e-10   5.147 3.58e-07 ***
## Age                          4.438e+03  1.494e+03   2.971  0.00308 ** 
## StGains                     -1.041e-02  3.604e-03  -2.887  0.00402 ** 
## Gains                        2.316e+04  2.093e+04   1.107  0.26887    
## Other                        4.031e-02  9.989e-03   4.036 6.14e-05 ***
## misc                         1.260e+04  3.829e+04   0.329  0.74234    
## Compfor5Yrs                  5.684e-03  1.134e-03   5.014 7.02e-07 ***
## yrs                         -6.502e+04  2.788e+04  -2.332  0.02001 *  
## YearsCEO                     3.472e+03  1.500e+03   2.315  0.02093 *  
## as.factor(GradDegree)EdD     1.284e+05  3.419e+05   0.376  0.70735    
## as.factor(GradDegree)JD     -9.094e+03  2.446e+05  -0.037  0.97036    
## as.factor(GradDegree)LLB     3.626e+04  2.483e+05   0.146  0.88394    
## as.factor(GradDegree)LLD    -1.244e+05  3.409e+05  -0.365  0.71524    
## as.factor(GradDegree)MA      1.333e+04  2.485e+05   0.054  0.95726    
## as.factor(GradDegree)MBA    -1.599e+04  2.425e+05  -0.066  0.94746    
## as.factor(GradDegree)MD     -2.099e+04  2.793e+05  -0.075  0.94012    
## as.factor(GradDegree)MEA    -5.136e+03  3.414e+05  -0.015  0.98800    
## as.factor(GradDegree)MPA     6.127e+05  2.960e+05   2.070  0.03886 *  
## as.factor(GradDegree)MS      3.521e+04  2.456e+05   0.143  0.88605    
## as.factor(GradDegree)MSA    -1.233e+05  2.953e+05  -0.418  0.67642    
## as.factor(GradDegree)MSEE   -4.072e+04  2.954e+05  -0.138  0.89041    
## as.factor(GradDegree)MSIE   -1.504e+05  2.954e+05  -0.509  0.61096    
## as.factor(GradDegree)none    5.309e+02  2.421e+05   0.002  0.99825    
## as.factor(GradDegree)PhD    -1.299e+04  2.471e+05  -0.053  0.95809    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 240600 on 605 degrees of freedom
## Multiple R-squared:  0.3539, Adjusted R-squared:  0.3261 
## F-statistic: 12.75 on 26 and 605 DF,  p-value: < 2.2e-16

The model (Bonus)

 compensation_lm2 <- lm(Bonus~poly(Sales ,3, raw=TRUE) +
                          Age+StGains+Gains+Other+misc+Compfor5Yrs+ yrs+
                          YearsCEO+as.factor(GradDegree), data=compensation)
 summary(compensation_lm2)
## 
## Call:
## lm(formula = Bonus ~ poly(Sales, 3, raw = TRUE) + Age + StGains + 
##     Gains + Other + misc + Compfor5Yrs + yrs + YearsCEO + as.factor(GradDegree), 
##     data = compensation)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2208081  -258651  -122727    46929 10384606 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -1.390e+05  9.664e+05  -0.144  0.88572    
## poly(Sales, 3, raw = TRUE)1  6.245e+01  1.551e+01   4.027 6.50e-05 ***
## poly(Sales, 3, raw = TRUE)2 -1.479e-03  5.391e-04  -2.743  0.00631 ** 
## poly(Sales, 3, raw = TRUE)3  8.942e-09  4.104e-09   2.179  0.02980 *  
## Age                          7.435e+03  6.403e+03   1.161  0.24610    
## StGains                      2.415e-02  1.371e-02   1.761  0.07890 .  
## Gains                       -3.680e+04  8.256e+04  -0.446  0.65595    
## Other                        1.824e-01  3.939e-02   4.632 4.61e-06 ***
## misc                        -1.619e+05  1.826e+05  -0.886  0.37578    
## Compfor5Yrs                  1.399e-02  4.436e-03   3.154  0.00170 ** 
## yrs                         -1.313e+05  1.123e+05  -1.169  0.24296    
## YearsCEO                    -4.639e+02  6.174e+03  -0.075  0.94014    
## as.factor(GradDegree)JD     -1.868e+05  9.081e+05  -0.206  0.83707    
## as.factor(GradDegree)LLB    -1.077e+04  9.256e+05  -0.012  0.99072    
## as.factor(GradDegree)LLD     7.417e+04  1.274e+06   0.058  0.95359    
## as.factor(GradDegree)MA     -1.745e+05  9.267e+05  -0.188  0.85068    
## as.factor(GradDegree)MBA    -4.159e+04  8.997e+05  -0.046  0.96315    
## as.factor(GradDegree)MD      1.000e+05  1.102e+06   0.091  0.92770    
## as.factor(GradDegree)MEA    -3.153e+05  1.269e+06  -0.249  0.80381    
## as.factor(GradDegree)MPA    -1.879e+04  1.097e+06  -0.017  0.98634    
## as.factor(GradDegree)MS     -2.236e+05  9.142e+05  -0.245  0.80689    
## as.factor(GradDegree)MSA    -5.068e+05  1.098e+06  -0.461  0.64470    
## as.factor(GradDegree)MSEE   -9.325e+04  1.102e+06  -0.085  0.93258    
## as.factor(GradDegree)MSIE   -1.303e+05  1.101e+06  -0.118  0.90579    
## as.factor(GradDegree)none   -2.021e+04  8.985e+05  -0.022  0.98206    
## as.factor(GradDegree)PhD    -3.620e+05  9.247e+05  -0.391  0.69565    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 894200 on 508 degrees of freedom
##   (98 observations deleted due to missingness)
## Multiple R-squared:  0.2023, Adjusted R-squared:  0.1631 
## F-statistic: 5.154 on 25 and 508 DF,  p-value: 6.423e-14

Checking for multicollinearity

 cor(compensation[, c("Age", "StGains", "Other", "YearsCEO", "Compfor5Yrs")], method="spearman") %>%
   datatable()
 #Using doagnostic plots to check for linearity.
 crPlots(compensation_lm1)

Testing for normality using the Shapiro Wilk test

 shapiro.test(compensation$Salary)
## 
##  Shapiro-Wilk normality test
## 
## data:  compensation$Salary
## W = 0.8482, p-value < 2.2e-16
compensation_Lm_sal <- sample(compensation$Salary,replace=FALSE)
 shapiro.test(compensation_Lm_sal)
## 
##  Shapiro-Wilk normality test
## 
## data:  compensation_Lm_sal
## W = 0.8482, p-value < 2.2e-16
#From the test above, we see that  our  p-value is more  than 0.05, hence we reject the null hypothesis Variable that #explains the relationship most
 
 
 
 lm.beta(compensation_lm1) 
## 
## Call:
## lm(formula = Salary ~ poly(Sales, 3, raw = TRUE) + Age + StGains + 
##     Gains + Other + misc + Compfor5Yrs + yrs + YearsCEO + as.factor(GradDegree), 
##     data = compensation)
## 
## Standardized Coefficients::
##                 (Intercept) poly(Sales, 3, raw = TRUE)1 
##                0.0000000000                1.1624855469 
## poly(Sales, 3, raw = TRUE)2 poly(Sales, 3, raw = TRUE)3 
##               -1.9846996527                1.1883028112 
##                         Age                     StGains 
##                0.1079065651               -0.1173273069 
##                       Gains                       Other 
##                0.0379758070                0.1415363174 
##                        misc                 Compfor5Yrs 
##                0.0112815784                0.2194200553 
##                         yrs                    YearsCEO 
##               -0.0902354452                0.0916927924 
##    as.factor(GradDegree)EdD     as.factor(GradDegree)JD 
##                0.0174236641               -0.0079015586 
##    as.factor(GradDegree)LLB    as.factor(GradDegree)LLD 
##                0.0205933391               -0.0168840485 
##     as.factor(GradDegree)MA    as.factor(GradDegree)MBA 
##                0.0075672214               -0.0240653335 
##     as.factor(GradDegree)MD    as.factor(GradDegree)MEA 
##               -0.0049249613               -0.0006969604 
##    as.factor(GradDegree)MPA     as.factor(GradDegree)MS 
##                0.1174877801                0.0251526995 
##    as.factor(GradDegree)MSA   as.factor(GradDegree)MSEE 
##               -0.0236390276               -0.0078081482 
##   as.factor(GradDegree)MSIE   as.factor(GradDegree)none 
##               -0.0288294853                0.0009061765 
##    as.factor(GradDegree)PhD 
##               -0.0083060961
 lm.beta(compensation_lm2)
## 
## Call:
## lm(formula = Bonus ~ poly(Sales, 3, raw = TRUE) + Age + StGains + 
##     Gains + Other + misc + Compfor5Yrs + yrs + YearsCEO + as.factor(GradDegree), 
##     data = compensation)
## 
## Standardized Coefficients::
##                 (Intercept) poly(Sales, 3, raw = TRUE)1 
##                 0.000000000                 0.613126049 
## poly(Sales, 3, raw = TRUE)2 poly(Sales, 3, raw = TRUE)3 
##                -1.125759311                 0.653233916 
##                         Age                     StGains 
##                 0.051140596                 0.088084471 
##                       Gains                       Other 
##                -0.018423792                 0.198252459 
##                        misc                 Compfor5Yrs 
##                -0.035678273                 0.171048766 
##                         yrs                    YearsCEO 
##                -0.053422223                -0.003570097 
##     as.factor(GradDegree)JD    as.factor(GradDegree)LLB 
##                -0.051505380                -0.001879986 
##    as.factor(GradDegree)LLD     as.factor(GradDegree)MA 
##                 0.003283446                -0.029532186 
##    as.factor(GradDegree)MBA     as.factor(GradDegree)MD 
##                -0.019060701                 0.006257252 
##    as.factor(GradDegree)MEA    as.factor(GradDegree)MPA 
##                -0.013959918                -0.001175183 
##     as.factor(GradDegree)MS    as.factor(GradDegree)MSA 
##                -0.049272505                -0.031698517 
##   as.factor(GradDegree)MSEE   as.factor(GradDegree)MSIE 
##                -0.005832634                -0.008152522 
##   as.factor(GradDegree)none    as.factor(GradDegree)PhD 
##                -0.010341766                -0.065071189
 ## The variables that most significant in my model is the compensation for five years. This applies to both bonus and salary.
 # R deletes missing data and creates a model based on the available data. When a column has missing adata, R just assumes the 
 #missing data are zeros hence it deletes them and runs the model. It also displays a message saying that some data was deleted.
 
 
 # Other Z factors  Affecting CEO Salary
 # size of company- Fortune 50O company CEOs would make more that CEOs of smaller companies.
 # Financial performance of the company
 
 # CEO's level of expereinece
 
 # Age of the comoany; is is a fairly new company, is it a start up, is it an old company with established officer compensation packages.