Basic Analysis

# Load the MBA starting-salary data from the working directory.
mba <- read.csv("MBAStartingSalariesData.csv")

# Keep only rows whose salary is neither 998 nor 999.
# NOTE(review): presumably 998/999 are codes for an unreported salary --
# confirm against the data dictionary.
sal <- subset(mba, salary != 998 & salary != 999)

# Split the remaining rows by whether the recorded salary is zero.
nojob <- subset(sal, salary == 0)
job <- subset(sal, salary != 0)

# Later statements refer to the columns of sal by bare name, so sal is put
# on the search path here.
attach(sal)
head(sal)
##   age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter work_yrs
## 1  23   2      620       77       87       87   3.4  3.00       1        2
## 2  24   1      610       90       71       87   3.5  4.00       1        2
## 3  24   1      670       99       78       95   3.3  3.25       1        2
## 4  24   1      570       56       81       75   3.3  2.67       1        1
## 6  24   1      640       82       89       91   3.9  3.75       1        2
## 7  25   1      610       89       74       87   3.4  3.50       1        2
##   frstlang salary satis
## 1        1      0     7
## 2        1      0     6
## 3        1      0     6
## 4        1      0     7
## 6        1      0     6
## 7        1      0     5
# Compact overview of sal: one line per column with its type and first values.
str(object = sal)
## 'data.frame':    193 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 25 25 27 27 28 ...
##  $ sex     : int  2 1 1 1 1 1 2 1 1 2 ...
##  $ gmat_tot: int  620 610 670 570 640 610 650 740 750 540 ...
##  $ gmat_qpc: int  77 90 99 56 82 89 88 99 99 75 ...
##  $ gmat_vpc: int  87 71 78 81 89 74 89 96 98 50 ...
##  $ gmat_tpc: int  87 87 95 75 91 87 92 99 99 65 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.9 3.4 3.3 3.5 3.4 3.6 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.5 3.75 3.5 3.5 4 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 3 1 5 ...
##  $ frstlang: int  1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ satis   : int  7 6 6 7 6 5 6 6 5 5 ...
# Minimum, quartiles, mean and maximum of every column of sal.
summary(object = sal)
##       age             sex          gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.00   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.00   1st Qu.:570.0   1st Qu.:72.00  
##  Median :27.00   Median :1.00   Median :610.0   Median :82.00  
##  Mean   :27.59   Mean   :1.28   Mean   :615.2   Mean   :79.35  
##  3rd Qu.:29.00   3rd Qu.:2.00   3rd Qu.:650.0   3rd Qu.:91.00  
##  Max.   :48.00   Max.   :2.00   Max.   :760.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc         s_avg           f_avg      
##  Min.   :22.00   Min.   : 0.00   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:75.00   1st Qu.:2.800   1st Qu.:2.750  
##  Median :81.00   Median :87.00   Median :3.090   Median :3.000  
##  Mean   :78.13   Mean   :83.48   Mean   :3.064   Mean   :3.078  
##  3rd Qu.:91.00   3rd Qu.:93.00   3rd Qu.:3.300   3rd Qu.:3.330  
##  Max.   :99.00   Max.   :99.00   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.000   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median : 85000  
##  Mean   :2.394   Mean   : 4.104   Mean   :1.078   Mean   : 54985  
##  3rd Qu.:3.000   3rd Qu.: 5.000   3rd Qu.:1.000   3rd Qu.:100000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :3.000  
##  1st Qu.:5.000  
##  Median :6.000  
##  Mean   :5.762  
##  3rd Qu.:6.000  
##  Max.   :7.000
library(psych)
# Extended descriptive statistics for every column of sal (sd, trimmed mean,
# mad, range, skew, kurtosis and standard error in addition to the basics).
psych::describe(x = sal)
##          vars   n     mean       sd   median  trimmed      mad min    max
## age         1 193    27.59     4.22    27.00    26.86     2.97  22     48
## sex         2 193     1.28     0.45     1.00     1.23     0.00   1      2
## gmat_tot    3 193   615.23    56.54   610.00   614.19    59.30 450    760
## gmat_qpc    4 193    79.35    15.15    82.00    80.92    14.83  28     99
## gmat_vpc    5 193    78.13    16.10    81.00    79.87    14.83  22     99
## gmat_tpc    6 193    83.48    13.53    87.00    85.08    11.86   0     99
## s_avg       7 193     3.06     0.38     3.09     3.08     0.43   2      4
## f_avg       8 193     3.08     0.52     3.00     3.11     0.37   0      4
## quarter     9 193     2.39     1.10     2.00     2.37     1.48   1      4
## work_yrs   10 193     4.10     3.69     3.00     3.37     1.48   0     22
## frstlang   11 193     1.08     0.27     1.00     1.00     0.00   1      2
## salary     12 193 54985.32 53152.39 85000.00 52726.81 51891.00   0 220000
## satis      13 193     5.76     0.77     6.00     5.75     1.48   3      7
##           range  skew kurtosis      se
## age          26  1.93     4.55    0.30
## sex           1  0.97    -1.06    0.03
## gmat_tot    310  0.08    -0.31    4.07
## gmat_qpc     71 -0.88     0.23    1.09
## gmat_vpc     77 -0.90     0.36    1.16
## gmat_tpc     99 -1.87     7.03    0.97
## s_avg         2 -0.27    -0.15    0.03
## f_avg         4 -2.17    11.03    0.04
## quarter       3  0.13    -1.32    0.08
## work_yrs     22  2.47     7.02    0.27
## frstlang      1  3.13     7.84    0.02
## salary   220000  0.10    -1.45 3825.99
## satis         4 -0.17    -0.06    0.06
# Frequency counts of the coded categorical columns of sal. Naming each
# argument keeps the variable name as the header of the printed table.
table(sex = sal$sex)
## sex
##   1   2 
## 139  54
table(quarter = sal$quarter)
## quarter
##  1  2  3  4 
## 53 52 47 41
table(frstlang = sal$frstlang)
## frstlang
##   1   2 
## 178  15
# Salaries of the rows of sal whose recorded salary is non-zero.
jobsalary <- subset(sal$salary, sal$salary != 0)
summary(object = jobsalary)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   64000   95000  100000  103031  106000  220000
# Extended descriptive statistics for the same vector.
psych::describe(x = jobsalary)
##    vars   n     mean      sd median  trimmed  mad   min    max  range skew
## X1    1 103 103030.7 17868.8  1e+05 101065.1 7413 64000 220000 156000 3.18
##    kurtosis      se
## X1    17.16 1760.67

Plots

# Distribution of age.
hist(age, breaks = 13, col = "lightblue")

# GMAT quantitative and verbal percentiles side by side under one outer title.
# par() returns the previous values of the settings it changes; they are put
# back once the figure is complete so that the two-panel layout and the outer
# margin do not leak into later plots.
old_par <- par(mfrow = c(1, 2), oma = c(0, 0, 2, 0))
boxplot(gmat_qpc, ylim = c(20, 100), main = "Quantitative")
boxplot(gmat_vpc, main = "Verbal")
title("GMAT Percentile", outer = TRUE)
par(old_par)

# Spring and fall MBA averages side by side under one outer title.
old_par <- par(mfrow = c(1, 2), oma = c(0, 0, 2, 0))
boxplot(s_avg, main = "Spring")
# The y-axis is limited to 2-4, so the f_avg minimum of 0 reported by
# summary(sal) lies outside the plotted range.
boxplot(f_avg, ylim = c(2, 4), main = "Fall")
title("MBA Average", outer = TRUE)
par(old_par)

# GMAT total score and overall percentile side by side (no outer title).
old_par <- par(mfrow = c(1, 2))
boxplot(gmat_tot, main = "Total Score")
boxplot(gmat_tpc, main = "Overall Percentile")
par(old_par)

# Distribution of years of work experience, drawn on a full single panel.
hist(work_yrs, breaks = 11, col = "lightblue")

library(lattice)
# Box-and-whisker plot of the non-zero salaries (one-sided formula form).
bwplot(~ jobsalary, xlab = "Salary")

# Box-and-whisker plot of the satisfaction rating over all rows of sal.
bwplot(~ satis, data = sal, xlab = "Satisfaction")

# Swap the attached data: from here on bare column names refer to job, the
# rows with a non-zero salary.
detach(sal)
attach(job)
library(car)
# Scatterplots for the rows of job; the data frame is passed explicitly.
scatterplot(
  work_yrs ~ age,
  data = job, cex = 0.9, pch = 19,
  main = " Work experience vs Age"
)

scatterplot(
  salary ~ age,
  data = job, cex = 0.9, pch = 19,
  main = " Salary vs Age"
)

scatterplot(
  salary ~ gmat_tpc,
  data = job, cex = 0.9, pch = 19,
  main = " Salary vs GMAT Percentile"
)

Correlations

# Pairwise correlations between all columns of job, rounded to two decimals.
round(cor(job), digits = 2)
##            age   sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## age       1.00 -0.14    -0.08    -0.17     0.02    -0.10  0.16 -0.22
## sex      -0.14  1.00    -0.02    -0.15     0.05    -0.05  0.08  0.17
## gmat_tot -0.08 -0.02     1.00     0.67     0.78     0.97  0.17  0.12
## gmat_qpc -0.17 -0.15     0.67     1.00     0.09     0.66  0.02  0.10
## gmat_vpc  0.02  0.05     0.78     0.09     1.00     0.78  0.16  0.02
## gmat_tpc -0.10 -0.05     0.97     0.66     0.78     1.00  0.14  0.07
## s_avg     0.16  0.08     0.17     0.02     0.16     0.14  1.00  0.45
## f_avg    -0.22  0.17     0.12     0.10     0.02     0.07  0.45  1.00
## quarter  -0.13 -0.02    -0.11     0.01    -0.13    -0.10 -0.84 -0.43
## work_yrs  0.88 -0.09    -0.12    -0.18    -0.03    -0.13  0.16 -0.22
## frstlang  0.35  0.08    -0.13     0.01    -0.22    -0.16 -0.14 -0.05
## salary    0.50 -0.17    -0.09     0.01    -0.14    -0.13  0.10 -0.11
## satis     0.11 -0.09     0.06     0.00     0.15     0.12 -0.14 -0.12
##          quarter work_yrs frstlang salary satis
## age        -0.13     0.88     0.35   0.50  0.11
## sex        -0.02    -0.09     0.08  -0.17 -0.09
## gmat_tot   -0.11    -0.12    -0.13  -0.09  0.06
## gmat_qpc    0.01    -0.18     0.01   0.01  0.00
## gmat_vpc   -0.13    -0.03    -0.22  -0.14  0.15
## gmat_tpc   -0.10    -0.13    -0.16  -0.13  0.12
## s_avg      -0.84     0.16    -0.14   0.10 -0.14
## f_avg      -0.43    -0.22    -0.05  -0.11 -0.12
## quarter     1.00    -0.13     0.11  -0.13  0.23
## work_yrs   -0.13     1.00     0.20   0.45  0.06
## frstlang    0.11     0.20     1.00   0.27  0.09
## salary     -0.13     0.45     0.27   1.00 -0.04
## satis       0.23     0.06     0.09  -0.04  1.00
library(corrgram)
# Correlogram of the columns of job, with reordered variables and pie glyphs
# in the upper triangle.
corrgram(x = job, order = TRUE, upper.panel = panel.pie)

Test for dependency

# Two-sample t-test of salary between the two sex codes, on the rows of job.
t.test(salary ~ sex, data = job)
## 
##  Welch Two Sample t-test
## 
## data:  salary by sex
## t = 1.3628, df = 38.115, p-value = 0.1809
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3128.55 16021.72
## sample estimates:
## mean in group 1 mean in group 2 
##       104970.97        98524.39
# Two-sample t-test of salary between the two first-language codes, on the
# rows of job.
t.test(salary ~ frstlang, data = job)
## 
##  Welch Two Sample t-test
## 
## data:  salary by frstlang
## t = -1.1202, df = 6.0863, p-value = 0.3049
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -59933.62  22202.25
## sample estimates:
## mean in group 1 mean in group 2 
##        101748.6        120614.3

Neither t-test is significant at the 5% level (p = 0.18 for sex, p = 0.30 for first language), so we cannot conclude that mean salary differs by sex or by first language.

# Chi-squared tests of independence between pairs of coded variables,
# evaluated on the rows of job.
with(job, chisq.test(table(sex, quarter)))
## 
##  Pearson's Chi-squared test
## 
## data:  table(sex, quarter)
## X-squared = 0.76332, df = 3, p-value = 0.8582
with(job, chisq.test(table(sex, satis)))
## Warning in chisq.test(table(sex, satis)): Chi-squared approximation may be
## incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(sex, satis)
## X-squared = 7.3413, df = 4, p-value = 0.1189
with(job, chisq.test(table(frstlang, satis)))
## Warning in chisq.test(table(frstlang, satis)): Chi-squared approximation
## may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(frstlang, satis)
## X-squared = 0.95627, df = 4, p-value = 0.9164

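None of the chi-square tests is significant at the 5% level, so we cannot conclude that sex is associated with quarter or with satisfaction, or that first language is associated with satisfaction. The warnings indicate that the chi-squared approximation may be inaccurate for the two tests involving satisfaction, so those p-values are approximate.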

Regression

Age and work_yrs are highly correlated (r = 0.88), and the GMAT total score is highly correlated with both the GMAT total percentile (r = 0.97) and the GMAT verbal percentile (r = 0.78). Therefore only one variable from each of these correlated groups is used in any one model.

# Model 1: regress salary on every column of job except the ones listed in
# the parentheses, leaving age, gmat_qpc, gmat_tpc, s_avg and f_avg.
fit1 <- lm(
  salary ~ . - (satis + sex + frstlang + quarter + work_yrs + gmat_tot +
    gmat_vpc),
  data = job
)
summary(object = fit1)
## 
## Call:
## lm(formula = salary ~ . - (satis + sex + frstlang + quarter + 
##     work_yrs + gmat_tot + gmat_vpc), data = job)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -29470  -8566   -211   4624  74865 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  31568.0    22091.8   1.429   0.1562    
## age           2725.7      501.6   5.434 4.09e-07 ***
## gmat_qpc       376.5      153.1   2.459   0.0157 *  
## gmat_tpc      -450.7      186.5  -2.416   0.0175 *  
## s_avg         3569.4     4763.3   0.749   0.4555    
## f_avg        -1452.4     3689.5  -0.394   0.6947    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15310 on 97 degrees of freedom
## Multiple R-squared:  0.3019, Adjusted R-squared:  0.2659 
## F-statistic:  8.39 on 5 and 97 DF,  p-value: 1.263e-06
# Model 2: regress salary on every column of job except the ones listed in
# the parentheses, leaving gmat_qpc, gmat_vpc, s_avg, f_avg and work_yrs.
fit2 <- lm(
  salary ~ . - (satis + sex + frstlang + quarter + age + gmat_tot +
    gmat_tpc),
  data = job
)
summary(object = fit2)
## 
## Call:
## lm(formula = salary ~ . - (satis + sex + frstlang + quarter + 
##     age + gmat_tot + gmat_tpc), data = job)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -32873  -8428  -1373   3678  85757 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  88871.2    17521.6   5.072 1.89e-06 ***
## gmat_qpc       151.8      121.0   1.254    0.213    
## gmat_vpc      -161.3      100.2  -1.611    0.111    
## s_avg         3371.6     4977.7   0.677    0.500    
## f_avg        -1779.3     3855.2  -0.462    0.645    
## work_yrs      2666.1      573.1   4.652 1.04e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15990 on 97 degrees of freedom
## Multiple R-squared:  0.2383, Adjusted R-squared:  0.199 
## F-statistic: 6.069 on 5 and 97 DF,  p-value: 6.246e-05

The first model fits better than the second: its R-squared is higher (about 30% versus about 24%). In the first model, salary depends strongly on age and, more weakly, on the GMAT quantitative percentile and the GMAT overall percentile.

# Estimated coefficients of the first salary model.
coef(fit1)
## (Intercept)         age    gmat_qpc    gmat_tpc       s_avg       f_avg 
##  31568.0328   2725.7455    376.5486   -450.6843   3569.4119  -1452.3981

Analysis of Job vs No job

# Flag every row of sal: 1 when the recorded salary is non-zero, 0 otherwise.
# The comparison is vectorised over the whole column, so it does not depend
# on the number of rows.
sal$placed <- as.numeric(sal$salary != 0)
# Attach sal again so that the new placed column, like the others, can be
# referred to by bare name in the statements that follow.
attach(sal)
## The following objects are masked from job:
## 
##     age, f_avg, frstlang, gmat_qpc, gmat_tot, gmat_tpc, gmat_vpc,
##     quarter, s_avg, salary, satis, sex, work_yrs
# Chi-squared tests of independence between the placed flag and each coded
# variable, evaluated on the rows of sal.
with(sal, chisq.test(table(sex, placed)))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(sex, placed)
## X-squared = 0.29208, df = 1, p-value = 0.5889
with(sal, chisq.test(table(quarter, placed)))
## 
##  Pearson's Chi-squared test
## 
## data:  table(quarter, placed)
## X-squared = 4.9172, df = 3, p-value = 0.178
with(sal, chisq.test(table(frstlang, placed)))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(frstlang, placed)
## X-squared = 0.074127, df = 1, p-value = 0.7854
with(sal, chisq.test(table(satis, placed)))
## Warning in chisq.test(table(satis, placed)): Chi-squared approximation may
## be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(satis, placed)
## X-squared = 8.3271, df = 4, p-value = 0.08031

Thus, at the 5% level, we cannot conclude that sex, quarter, first language, or satisfaction is associated with whether a student was placed.

Logistic Regression

# Logistic regression of the placed flag on age, work experience, the GMAT
# measures and the two MBA averages, fitted on sal.
temp <- placed ~ age + work_yrs + gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc +
  f_avg + s_avg
fit <- glm(
  formula = temp,
  family = binomial(link = "logit"),
  data = sal
)
summary(object = fit)
## 
## Call:
## glm(formula = temp, family = binomial(link = "logit"), data = sal)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7445  -1.1746   0.8156   1.0280   2.1857  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  6.69268    3.82908   1.748  0.08049 . 
## age         -0.21627    0.08083  -2.676  0.00746 **
## work_yrs     0.12450    0.08961   1.389  0.16472   
## gmat_tot    -0.01314    0.01259  -1.044  0.29648   
## gmat_qpc    -0.01876    0.04496  -0.417  0.67650   
## gmat_vpc    -0.01273    0.04218  -0.302  0.76274   
## gmat_tpc     0.09306    0.06431   1.447  0.14790   
## f_avg       -0.14026    0.35626  -0.394  0.69381   
## s_avg        0.68314    0.50185   1.361  0.17344   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 266.68  on 192  degrees of freedom
## Residual deviance: 248.23  on 184  degrees of freedom
## AIC: 266.23
## 
## Number of Fisher Scoring iterations: 5
# Analysis of deviance for the logistic model, with chi-squared tests.
anova(object = fit, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: placed
## 
## Terms added sequentially (first to last)
## 
## 
##          Df Deviance Resid. Df Resid. Dev Pr(>Chi)   
## NULL                       192     266.68            
## age       1   8.4714       191     258.21 0.003608 **
## work_yrs  1   2.6614       190     255.55 0.102809   
## gmat_tot  1   0.0010       189     255.54 0.975126   
## gmat_qpc  1   0.0490       188     255.50 0.824850   
## gmat_vpc  1   1.1592       187     254.34 0.281637   
## gmat_tpc  1   4.0546       186     250.28 0.044052 * 
## f_avg     1   0.1732       185     250.11 0.677243   
## s_avg     1   1.8820       184     248.23 0.170104   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

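Thus age and the GMAT overall percentile appear to be related to whether a student was placed. Note that the analysis-of-deviance table adds terms sequentially, so the result for gmat_tpc depends on the order of the terms; in the coefficient table above, only age is individually significant.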