MBA Starting Salaries Analysis

NAME: Afreen Banu

EMAIL: afreen.banu1995@gmail.com

COMPANY: Amazon

# Read the MBA starting-salaries survey from the working directory and
# preview its first six rows.
myData <- read.csv("MBA Starting Salaries.csv")
head(myData, n = 6L)
##   age sex  sex_2 gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 1  23   2 Female      620       77       87       87   3.4  3.00       1
## 2  24   1   Male      610       90       71       87   3.5  4.00       1
## 3  24   1   Male      670       99       78       95   3.3  3.25       1
## 4  24   1   Male      570       56       81       75   3.3  2.67       1
## 5  24   2 Female      710       93       98       98   3.6  3.75       1
## 6  24   1   Male      640       82       89       91   3.9  3.75       1
##   work_yrs frstlang frstlang_2 salary satis
## 1        2        1    English      0     7
## 2        2        1    English      0     6
## 3        2        1    English      0     6
## 4        1        1    English      0     7
## 5        2        1    English    999     5
## 6        2        1    English      0     6
# attach() places a copy of myData's columns on the search path, so bare names
# such as `salary` or `sex` in later calls resolve to that copy. The copy is
# NOT refreshed when myData is subsetted or modified afterwards.
attach(myData)
# Show the type and the first few values of every column.
str(myData)
## 'data.frame':    274 obs. of  15 variables:
##  $ age       : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex       : int  2 1 1 1 2 1 1 2 1 1 ...
##  $ sex_2     : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 2 2 1 2 2 ...
##  $ gmat_tot  : int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc  : int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc  : int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc  : int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg     : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg     : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs  : int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang  : int  1 1 1 1 1 1 1 1 2 1 ...
##  $ frstlang_2: Factor w/ 2 levels "English","Other": 1 1 1 1 1 1 1 1 2 1 ...
##  $ salary    : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis     : int  7 6 6 7 5 6 5 6 4 998 ...
library(lattice)
# Salary counts, one panel per value of sex.
histogram(
  ~ salary | sex,
  data = myData,
  type = "count",
  breaks = 5,
  col = c("purple", "green", "pink")
)

# Salary counts, one panel per satisfaction rating.
histogram(
  ~ salary | satis,
  data = myData,
  type = "count",
  breaks = 5,
  col = c("purple", "green", "pink")
)

# Satisfaction against salary; the satisfaction axis is drawn on a log scale.
with(myData, plot(salary, satis, cex = 0.6, log = "y"))

library(car)
# Pairwise scatterplots of salary against work experience and age, with a
# histogram of each variable on the diagonal.
scatterplotMatrix(
  ~ salary + work_yrs + age,
  data = myData,
  cex = 0.6,
  diagonal = "histogram"
)

# Same layout for salary against the GMAT total and percentile scores.
scatterplotMatrix(
  ~ salary + gmat_qpc + gmat_tot + gmat_tpc + gmat_vpc,
  data = myData,
  cex = 0.6,
  diagonal = "histogram"
)

library("corrgram")
# Correlogram of the survey variables: shaded cells below the diagonal, pie
# glyphs above it, variable names on the diagonal. order = TRUE lets corrgram
# reorder the variables instead of using column order.
# The title previously read "Correlation of relative price!!", which does not
# describe this data set.
corrgram(myData, order = TRUE, lower.panel = panel.shade,
  upper.panel = panel.pie, text.panel = panel.txt,
  main = "Correlogram of MBA starting-salary variables")

# Cross-tabulate satisfaction rating (rows) against sex (columns) and print
# the counts together with row and column totals.
# NOTE(review): satis includes the value 998, far outside the 1-7 ratings;
# it looks like a non-response code -- confirm before interpreting that row.
gender <- xtabs(formula = ~ satis + sex, data = myData)
addmargins(gender)
##      sex
## satis   1   2 Sum
##   1     1   0   1
##   2     0   1   1
##   3     4   1   5
##   4    12   5  17
##   5    53  21  74
##   6    79  18  97
##   7    20  13  33
##   998  37   9  46
##   Sum 206  68 274
# Column proportions: share of each satisfaction rating within each sex.
prop.table(gender, margin = 2)
##      sex
## satis           1           2
##   1   0.004854369 0.000000000
##   2   0.000000000 0.014705882
##   3   0.019417476 0.014705882
##   4   0.058252427 0.073529412
##   5   0.257281553 0.308823529
##   6   0.383495146 0.264705882
##   7   0.097087379 0.191176471
##   998 0.179611650 0.132352941
# Pearson chi-squared test of independence between satisfaction rating and sex.
# R warns that the chi-squared approximation may be incorrect; several cells of
# the table hold very few observations, which is the usual cause.
chisq.test(gender)
## Warning in chisq.test(gender): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  gender
## X-squared = 10.593, df = 7, p-value = 0.1574
# Cross-tabulate satisfaction rating (rows) against quarter (columns) and
# print the counts together with row and column totals.
quat <- xtabs(formula = ~ satis + quarter, data = myData)
addmargins(quat)
##      quarter
## satis   1   2   3   4 Sum
##   1     0   1   0   0   1
##   2     0   0   1   0   1
##   3     1   1   0   3   5
##   4     4   3   2   8  17
##   5    20  24  15  15  74
##   6    23  25  27  22  97
##   7    10   5  10   8  33
##   998  11  11  15   9  46
##   Sum  69  70  70  65 274
# Row proportions: how each satisfaction rating is spread across quarters.
prop.table(quat, margin = 1)
##      quarter
## satis         1         2         3         4
##   1   0.0000000 1.0000000 0.0000000 0.0000000
##   2   0.0000000 0.0000000 1.0000000 0.0000000
##   3   0.2000000 0.2000000 0.0000000 0.6000000
##   4   0.2352941 0.1764706 0.1176471 0.4705882
##   5   0.2702703 0.3243243 0.2027027 0.2027027
##   6   0.2371134 0.2577320 0.2783505 0.2268041
##   7   0.3030303 0.1515152 0.3030303 0.2424242
##   998 0.2391304 0.2391304 0.3260870 0.1956522
# Column proportions: share of each satisfaction rating within each quarter.
prop.table(quat, margin = 2)
##      quarter
## satis          1          2          3          4
##   1   0.00000000 0.01428571 0.00000000 0.00000000
##   2   0.00000000 0.00000000 0.01428571 0.00000000
##   3   0.01449275 0.01428571 0.00000000 0.04615385
##   4   0.05797101 0.04285714 0.02857143 0.12307692
##   5   0.28985507 0.34285714 0.21428571 0.23076923
##   6   0.33333333 0.35714286 0.38571429 0.33846154
##   7   0.14492754 0.07142857 0.14285714 0.12307692
##   998 0.15942029 0.15714286 0.21428571 0.13846154
# Pearson chi-squared test of independence between satisfaction rating and
# quarter. The approximation warning applies here too: several cells of the
# table hold very few observations.
chisq.test(quat)
## Warning in chisq.test(quat): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  quat
## X-squared = 21.92, df = 21, p-value = 0.4042
# Cross-tabulate satisfaction rating (rows) against first language (columns)
# and print the counts together with row and column totals.
lang <- xtabs(formula = ~ satis + frstlang, data = myData)
addmargins(lang)
##      frstlang
## satis   1   2 Sum
##   1     1   0   1
##   2     0   1   1
##   3     3   2   5
##   4    10   7  17
##   5    67   7  74
##   6    92   5  97
##   7    31   2  33
##   998  38   8  46
##   Sum 242  32 274
# Row proportions: first-language split within each satisfaction rating.
prop.table(lang, margin = 1)
##      frstlang
## satis          1          2
##   1   1.00000000 0.00000000
##   2   0.00000000 1.00000000
##   3   0.60000000 0.40000000
##   4   0.58823529 0.41176471
##   5   0.90540541 0.09459459
##   6   0.94845361 0.05154639
##   7   0.93939394 0.06060606
##   998 0.82608696 0.17391304
# Column proportions: share of each satisfaction rating within each
# first-language group.
prop.table(lang, margin = 2)
##      frstlang
## satis           1           2
##   1   0.004132231 0.000000000
##   2   0.000000000 0.031250000
##   3   0.012396694 0.062500000
##   4   0.041322314 0.218750000
##   5   0.276859504 0.218750000
##   6   0.380165289 0.156250000
##   7   0.128099174 0.062500000
##   998 0.157024793 0.250000000
# Pearson chi-squared test of independence between satisfaction rating and
# first language. The approximation warning applies here too: the second
# language group is small, so many cells hold very few observations.
chisq.test(lang)
## Warning in chisq.test(lang): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  lang
## X-squared = 32.744, df = 7, p-value = 2.954e-05
# Welch two-sample t-test: mean salary compared between the two
# first-language groups.
t.test(salary ~ frstlang, data = myData)
## 
##  Welch Two Sample t-test
## 
## data:  salary by frstlang
## t = 1.3595, df = 38.488, p-value = 0.1819
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -6698.237 34122.797
## sample estimates:
## mean in group 1 mean in group 2 
##        40627.12        26914.84
# Welch two-sample t-test: mean salary compared between the two sexes.
t.test(salary ~ sex, data = myData)
## 
##  Welch Two Sample t-test
## 
## data:  salary by sex
## t = -1.123, df = 111.89, p-value = 0.2638
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -22412.277   6197.373
## sample estimates:
## mean in group 1 mean in group 2 
##        37013.62        45121.07
# Test the linear association between salary and each numeric covariate.
#
# These were previously t.test(salary, x) calls. A two-sample t-test only asks
# whether mean(salary) equals mean(x); salary and satisfaction / GMAT / years /
# age are on unrelated scales, so that comparison is always "significant" and
# says nothing about whether the variables are related. cor.test() reports the
# Pearson correlation with its confidence interval and p-value instead.
#
# NOTE(review): salary and satis still contain the codes 0 / 998 / 999 at this
# point (they are only filtered further down), so read these with caution.
# NOTE: the printed results must be regenerated by re-running this chunk; the
# old t-test output was removed because it no longer matches the code.
cor.test(~ salary + satis, data = myData)
cor.test(~ salary + gmat_tot, data = myData)
cor.test(~ salary + work_yrs, data = myData)
cor.test(~ salary + age, data = myData)

Regression models:

# Simple linear regression of salary on years of work experience.
model1 <- lm(formula = salary ~ work_yrs)
summary(model1)
## 
## Call:
## lm(formula = salary ~ work_yrs)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -41604 -38759 -37903  58063 179392 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  38474.9     4817.1   7.987 3.92e-14 ***
## work_yrs       142.2      955.7   0.149    0.882    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 51040 on 272 degrees of freedom
## Multiple R-squared:  8.142e-05,  Adjusted R-squared:  -0.003595 
## F-statistic: 0.02215 on 1 and 272 DF,  p-value: 0.8818
# Simple linear regression of salary on quarter; quarter enters as a single
# numeric predictor (one slope), not as a factor.
model2 <- lm(formula = salary ~ quarter)
summary(model2)
## 
## Call:
## lm(formula = salary ~ quarter)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -50178 -41635 -27543  55367 192457 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    57723       7452   7.746 1.89e-13 ***
## quarter        -7545       2745  -2.748  0.00639 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50350 on 272 degrees of freedom
## Multiple R-squared:  0.02702,    Adjusted R-squared:  0.02344 
## F-statistic: 7.553 on 1 and 272 DF,  p-value: 0.006392
# Simple linear regression of salary on sex, using the numeric 1/2 coding.
model3 <- lm(formula = salary ~ sex)
summary(model3)
## 
## Call:
## lm(formula = salary ~ sex)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -45121 -37014 -36016  55879 174879 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    28906       9407   3.073  0.00234 **
## sex             8108       7122   1.138  0.25598   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50920 on 272 degrees of freedom
## Multiple R-squared:  0.004742,   Adjusted R-squared:  0.001082 
## F-statistic: 1.296 on 1 and 272 DF,  p-value: 0.256
# Simple linear regression of salary on first language, using the numeric
# 1/2 coding.
model4 <- lm(formula = salary ~ frstlang)
summary(model4)
## 
## Call:
## lm(formula = salary ~ frstlang)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -40627 -40627 -39628  56373 193085 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    54339      11116   4.889 1.74e-06 ***
## frstlang      -13712       9566  -1.433    0.153    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50850 on 272 degrees of freedom
## Multiple R-squared:  0.007498,   Adjusted R-squared:  0.003849 
## F-statistic: 2.055 on 1 and 272 DF,  p-value: 0.1529
# Simple linear regression of salary on total GMAT score.
model5 <- lm(formula = salary ~ gmat_tot)
summary(model5)
## 
## Call:
## lm(formula = salary ~ gmat_tot)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -47274 -38999 -36053  56932 175160 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 69179.83   33352.12   2.074    0.039 *
## gmat_tot      -48.68      53.61  -0.908    0.365  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50970 on 272 degrees of freedom
## Multiple R-squared:  0.003022,   Adjusted R-squared:  -0.0006435 
## F-statistic: 0.8245 on 1 and 272 DF,  p-value: 0.3647
# Multiple regression of salary on quarter, satisfaction, work experience
# and age together.
model6 <- lm(formula = salary ~ quarter + satis + work_yrs + age)
summary(model6)
## 
## Call:
## lm(formula = salary ~ quarter + satis + work_yrs + age)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -69411 -41662  -5287  43706 198650 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 164432.615  35974.583   4.571 7.41e-06 ***
## quarter      -7385.029   2570.857  -2.873  0.00440 ** 
## satis          -47.999      7.705  -6.229 1.80e-09 ***
## work_yrs      3347.801   1718.224   1.948  0.05241 .  
## age          -4086.799   1496.400  -2.731  0.00673 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46920 on 269 degrees of freedom
## Multiple R-squared:  0.1643, Adjusted R-squared:  0.1519 
## F-statistic: 13.23 on 4 and 269 DF,  p-value: 7.531e-10

Of the models fitted above, salary is best explained by quarter, job satisfaction, work years and age combined (model6, adjusted R-squared 0.15).

# NOTE: the printed head(placed) / head(notPlaced) listings and the gender
# percentages further down were produced before the indexing fix below and
# must be regenerated by re-running the script.

# Keep only rows whose salary is not one of the codes 998 / 999 (presumably
# survey non-response codes -- confirm against the data dictionary).
# which() also drops rows where salary is NA.
myData <- myData[which(myData$salary != 998 & myData$salary != 999), ]

# Split by placement status using the FILTERED data frame's own salary column.
# The bare name `salary` resolves to the attached copy of the unfiltered data,
# whose row positions no longer line up with myData after the filter above;
# indexing with it put salary-0 rows into `placed`.
notPlaced <- myData[which(myData$salary == 0), ]
placed <- myData[which(myData$salary != 0), ]

# Binary response for the logistic regression: 1 = non-zero salary, 0 = zero.
myData$placed <- ifelse(myData$salary != 0, 1, 0)

# Training / hold-out rows for the logistic regression.
# NOTE(review): these are the first 81 rows in file order, not a random
# sample, and any rows beyond 81 are unused; consider a seeded random split.
train <- myData[1:60, ]
test <- myData[61:81, ]
head(myData)
##   age sex  sex_2 gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 1  23   2 Female      620       77       87       87   3.4  3.00       1
## 2  24   1   Male      610       90       71       87   3.5  4.00       1
## 3  24   1   Male      670       99       78       95   3.3  3.25       1
## 4  24   1   Male      570       56       81       75   3.3  2.67       1
## 6  24   1   Male      640       82       89       91   3.9  3.75       1
## 7  25   1   Male      610       89       74       87   3.4  3.50       1
##   work_yrs frstlang frstlang_2 salary satis placed
## 1        2        1    English      0     7      0
## 2        2        1    English      0     6      0
## 3        2        1    English      0     6      0
## 4        1        1    English      0     7      0
## 6        2        1    English      0     6      0
## 7        2        1    English      0     5      0
# Preview the first six rows of the `placed` subset.
head(placed, n = 6L)
##    age sex  sex_2 gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 6   24   1   Male      640       82       89       91  3.90  3.75       1
## 23  27   1   Male      750       99       98       99  3.40  3.50       1
## 24  28   2 Female      540       75       50       65  3.60  4.00       1
## 25  29   1   Male      580       56       87       78  3.64  3.33       1
## 27  31   2 Female      560       60       78       72  3.30  3.75       1
## 28  32   1   Male      760       99       99       99  3.40  3.00       1
##    work_yrs frstlang frstlang_2 salary satis
## 6         2        1    English      0     6
## 23        1        2      Other      0     5
## 24        5        1    English      0     5
## 25        3        1    English      0     5
## 27       10        1    English      0     7
## 28        5        1    English      0     5
# Preview the first six rows of the `notPlaced` subset.
head(notPlaced, n = 6L)
##   age sex  sex_2 gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 1  23   2 Female      620       77       87       87   3.4  3.00       1
## 2  24   1   Male      610       90       71       87   3.5  4.00       1
## 3  24   1   Male      670       99       78       95   3.3  3.25       1
## 4  24   1   Male      570       56       81       75   3.3  2.67       1
## 7  25   1   Male      610       89       74       87   3.4  3.50       1
## 8  25   2 Female      650       88       89       92   3.3  3.75       1
##   work_yrs frstlang frstlang_2 salary satis
## 1        2        1    English      0     7
## 2        2        1    English      0     6
## 3        2        1    English      0     6
## 4        1        1    English      0     7
## 7        2        1    English      0     5
## 8        2        1    English      0     6
# Count each sex code within the not-placed and placed subsets.
npGender <- table(notPlaced[["sex"]])
pGender <- table(placed[["sex"]])
# Sex split of the not-placed subset, as percentages.
100 * prop.table(npGender)
## 
##        1        2 
## 76.47059 23.52941
# Sex split of the placed subset, as percentages.
100 * prop.table(pGender)
## 
##    1    2 
## 69.6 30.4
library(lattice)

# The par(mfrow = ...) calls that used to surround these plots were removed:
# lattice draws with grid graphics, so base-graphics layout settings have no
# effect on bwplot() / histogram(), and the reset to c(1, 1) was duplicated.

# Box-and-whisker plot of GMAT totals in the not-placed subset.
bwplot(notPlaced$gmat_tot, horizontal = TRUE,
       xlab = "Not placed GMAT score", col = "yellow")

# Age distribution (counts) in the not-placed subset.
histogram(~notPlaced$age, type = "count", breaks = 5,
          col = c("purple", "green", "pink"),
          main = "Not placed age distribution")

# Age distribution (counts) in the placed subset.
histogram(~placed$age, type = "count", breaks = 5,
          col = c("purple", "green", "pink"),
          main = "Placed age distribution")

Logistic regression:

# Logistic regression of the placed indicator on GMAT total, first language
# and age, fitted on the training rows only.
modelp <- glm(
  formula = placed ~ gmat_tot + frstlang + age,
  family = binomial(link = "logit"),
  data = train
)
summary(modelp)
## 
## Call:
## glm(formula = placed ~ gmat_tot + frstlang + age, family = binomial(link = "logit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4377  -1.3199   0.9556   1.0207   1.5529  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)  1.187993   3.631206   0.327    0.744
## gmat_tot     0.001581   0.004980   0.318    0.751
## frstlang    -1.107950   1.261055  -0.879    0.380
## age         -0.024319   0.052268  -0.465    0.642
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 81.503  on 59  degrees of freedom
## Residual deviance: 80.375  on 56  degrees of freedom
## AIC: 88.375
## 
## Number of Fisher Scoring iterations: 4
# Sequential analysis of deviance: chi-squared test for each term as it is
# added to the model, in formula order.
anova(modelp, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: placed
## 
## Terms added sequentially (first to last)
## 
## 
##          Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL                        59     81.503         
## gmat_tot  1  0.07283        58     81.430   0.7873
## frstlang  1  0.83876        57     80.592   0.3598
## age       1  0.21627        56     80.375   0.6419
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
# Pseudo R-squared measures (McFadden, r2ML, r2CU) for the logistic model.
pR2(object = modelp)
##          llh      llhNull           G2     McFadden         r2ML 
## -40.18766470 -40.75159596   1.12786251   0.01383826   0.01862213 
##         r2CU 
##   0.02506597
# Predicted placement probabilities for the hold-out rows.
fitted.results <- predict(modelp, newdata = test, type = "response")
# Classify as placed (1) when the predicted probability exceeds 0.5, else 0.
fitted.results <- ifelse(fitted.results > 0.5, 1, 0)
# Despite its name, misClasificError holds the share of CORRECT predictions
# (accuracy), which is what the label printed below reports.
misClasificError <- mean(fitted.results == test$placed)
print(paste("Accuracy", misClasificError))
## [1] "Accuracy 0.0952380952380952"
# Box-and-whisker plot of GMAT totals in the placed subset.
bwplot(placed$gmat_tot, horizontal = TRUE,
       xlab = "Placed GMAT score", col = "yellow")