Describing the data

# Load the MBA starting-salary survey from the working directory.
mba_sal <- read.csv("MBA Starting Salaries Data.csv")

# Rows whose salary is exactly 0 (kept under the name `no_job`).
# `salary == 0` already implies salary is neither 998 nor 999, so no further
# conditions are needed.
no_job <- subset(mba_sal, salary == 0)

# Rows with an actual salary figure: 0, 998 and 999 are treated as non-salary
# codes and excluded.
# NOTE(review): presumably 998/999 mean "not reported"/"not disclosed" --
# confirm against the data dictionary.
placed <- subset(mba_sal, salary != 998 & salary != 999 & salary != 0)

# No attach(placed): every later call names its data frame explicitly
# (`placed$...` or `data = placed`), and attaching would only add a second,
# silently stale copy of the columns to the search path.
summary(placed)
##       age             sex           gmat_tot      gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :500   Min.   :39.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580   1st Qu.:72.00  
##  Median :26.00   Median :1.000   Median :620   Median :82.00  
##  Mean   :26.78   Mean   :1.301   Mean   :616   Mean   :79.73  
##  3rd Qu.:28.00   3rd Qu.:2.000   3rd Qu.:655   3rd Qu.:89.00  
##  Max.   :40.00   Max.   :2.000   Max.   :720   Max.   :99.00  
##     gmat_vpc        gmat_tpc         s_avg           f_avg      
##  Min.   :30.00   Min.   :51.00   Min.   :2.200   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.00   1st Qu.:2.850   1st Qu.:2.915  
##  Median :81.00   Median :87.00   Median :3.100   Median :3.250  
##  Mean   :78.56   Mean   :84.52   Mean   :3.092   Mean   :3.091  
##  3rd Qu.:92.00   3rd Qu.:93.50   3rd Qu.:3.400   3rd Qu.:3.415  
##  Max.   :99.00   Max.   :99.00   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs        frstlang         salary      
##  Min.   :1.000   Min.   : 0.00   Min.   :1.000   Min.   : 64000  
##  1st Qu.:1.000   1st Qu.: 2.00   1st Qu.:1.000   1st Qu.: 95000  
##  Median :2.000   Median : 3.00   Median :1.000   Median :100000  
##  Mean   :2.262   Mean   : 3.68   Mean   :1.068   Mean   :103031  
##  3rd Qu.:3.000   3rd Qu.: 4.00   3rd Qu.:1.000   3rd Qu.:106000  
##  Max.   :4.000   Max.   :16.00   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :3.000  
##  1st Qu.:5.000  
##  Median :6.000  
##  Mean   :5.883  
##  3rd Qu.:6.000  
##  Max.   :7.000
# psych supplies describe(): per-column n, mean, sd, median, trimmed mean,
# mad, min, max, range, skew, kurtosis and standard error.
library(psych)
psych::describe(x = placed)
##          vars   n      mean       sd   median   trimmed     mad     min
## age         1 103     26.78     3.27 2.60e+01     26.30    2.97    22.0
## sex         2 103      1.30     0.46 1.00e+00      1.25    0.00     1.0
## gmat_tot    3 103    616.02    50.69 6.20e+02    615.90   59.30   500.0
## gmat_qpc    4 103     79.73    13.39 8.20e+01     81.05   13.34    39.0
## gmat_vpc    5 103     78.56    16.14 8.10e+01     80.33   16.31    30.0
## gmat_tpc    6 103     84.52    11.01 8.70e+01     85.60   11.86    51.0
## s_avg       7 103      3.09     0.38 3.10e+00      3.10    0.44     2.2
## f_avg       8 103      3.09     0.49 3.25e+00      3.13    0.37     0.0
## quarter     9 103      2.26     1.12 2.00e+00      2.20    1.48     1.0
## work_yrs   10 103      3.68     3.01 3.00e+00      3.11    1.48     0.0
## frstlang   11 103      1.07     0.25 1.00e+00      1.00    0.00     1.0
## salary     12 103 103030.74 17868.80 1.00e+05 101065.06 7413.00 64000.0
## satis      13 103      5.88     0.78 6.00e+00      5.89    1.48     3.0
##             max    range  skew kurtosis      se
## age          40     18.0  1.92     4.90    0.32
## sex           2      1.0  0.86    -1.28    0.05
## gmat_tot    720    220.0  0.01    -0.69    4.99
## gmat_qpc     99     60.0 -0.81     0.17    1.32
## gmat_vpc     99     69.0 -0.87     0.21    1.59
## gmat_tpc     99     48.0 -0.84     0.19    1.08
## s_avg         4      1.8 -0.13    -0.61    0.04
## f_avg         4      4.0 -2.52    13.86    0.05
## quarter       4      3.0  0.27    -1.34    0.11
## work_yrs     16     16.0  2.48     6.83    0.30
## frstlang      2      1.0  3.38     9.54    0.02
## salary   220000 156000.0  3.18    17.16 1760.67
## satis         7      4.0 -0.40     0.44    0.08

Including Plots

library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
# Age distribution of the graduates with a reported salary (`placed`).
hist(placed$age, main = "Frequency distribution of age amongst working people", xlab = "Age"
  , border = "blue", col = "red",ylim = c(0,40),xlim = c(20,40), breaks = 5)

# Age distribution of the graduates without a job. This must read from
# `no_job`; plotting `placed$age` here would just repeat the histogram above
# under a different title.
hist(no_job$age, main = "Frequency distribution of age amongst the jobless", xlab = "Age"
  , border = "blue", col = "red",ylim = c(0,40),xlim = c(20,40), breaks = 5)

# Salary against age for placed graduates; colour encodes sex, point size
# encodes the satisfaction rating.
qplot(x = salary, y = age, data = placed, color = factor(sex), size = satis)

# GMAT total against years of work experience for graduates without a job,
# with the same colour/size encoding.
qplot(x = gmat_tot, y = work_yrs, data = no_job, color = factor(sex), size = satis)

library(corrgram)
# Correlogram of every column in `placed`: shaded cells below the diagonal,
# correlation coefficients above it, variable names on the diagonal.
corrgram(
  placed,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.cor,
  text.panel = panel.txt,
  main = "MBA starting salary analysis Correlogram"
)

Observations:-

It is pretty clear from the correlogram that salary depends significantly on only two variables, namely work_yrs and age. We can also see from the scatter plot that most people gave a satisfaction rating of 4, 5 or 6, and that a rating of 3 or 7 is rare. We can also see that men fare better on the starting-salary scale.

Variance - Covariance Matrix

# Variance-covariance matrix of the continuous measures in `placed`.
# `x` and `y` hold the same nine columns, so cov(x, y) is the ordinary
# covariance matrix of those columns.
x <- placed[, c(
  "age",
  "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
  "s_avg", "f_avg",
  "work_yrs",
  "salary"
)]
y <- x
cov(x, y)
##                    age      gmat_tot      gmat_qpc      gmat_vpc
## age         10.7045498    -13.054445   -7.22796497  9.505045e-01
## gmat_tot   -13.0544451   2569.293737  452.14258519  6.386360e+02
## gmat_qpc    -7.2279650    452.142585  179.18027794  2.045850e+01
## gmat_vpc     0.9505045    638.636018   20.45849990  2.606602e+02
## gmat_tpc    -3.4602132    539.362269   97.03607462  1.393882e+02
## s_avg        0.1938587      3.299562    0.07838473  9.694594e-01
## f_avg       -0.3462517      3.027432    0.64252142  1.803303e-01
## work_yrs     8.6728536    -18.738816   -7.36245955 -1.366838e+00
## salary   29210.5193223 -82124.485056 3382.43784504 -3.964803e+04
##               gmat_tpc        s_avg         f_avg      work_yrs
## age      -3.460213e+00   0.19385875   -0.34625167     8.6728536
## gmat_tot  5.393623e+02   3.29956215    3.02743194   -18.7388159
## gmat_qpc  9.703607e+01   0.07838473    0.64252142    -7.3624595
## gmat_vpc  1.393882e+02   0.96945936    0.18033029    -1.3668380
## gmat_tpc  1.211342e+02   0.58062916    0.37850562    -4.3892062
## s_avg     5.806292e-01   0.14325138    0.08231046     0.1860480
## f_avg     3.785056e-01   0.08231046    0.23786375    -0.3176271
## work_yrs -4.389206e+00   0.18604797   -0.31762707     9.0630116
## salary   -2.596339e+04 688.02042071 -924.11288026 24458.1995050
##                 salary
## age       2.921052e+04
## gmat_tot -8.212449e+04
## gmat_qpc  3.382438e+03
## gmat_vpc -3.964803e+04
## gmat_tpc -2.596339e+04
## s_avg     6.880204e+02
## f_avg    -9.241129e+02
## work_yrs  2.445820e+04
## salary    3.192940e+08
# Contingency table: one row per distinct salary, one column per frstlang
# code, counting the placed graduates in each cell.
con_table <- xtabs(
  formula = ~ salary + frstlang,
  data = placed
)
con_table
##         frstlang
## salary    1  2
##   64000   1  0
##   77000   1  0
##   78256   1  0
##   82000   1  0
##   85000   4  0
##   86000   2  0
##   88000   1  0
##   88500   1  0
##   90000   3  0
##   92000   3  0
##   93000   3  0
##   95000   7  0
##   96000   4  0
##   96500   1  0
##   97000   2  0
##   98000   8  2
##   99000   0  1
##   100000  9  0
##   100400  1  0
##   101000  2  0
##   101100  1  0
##   101600  1  0
##   102500  1  0
##   103000  1  0
##   104000  1  1
##   105000 11  0
##   106000  3  0
##   107000  1  0
##   107300  0  1
##   107500  1  0
##   108000  2  0
##   110000  1  0
##   112000  3  0
##   115000  5  0
##   118000  0  1
##   120000  4  0
##   126710  1  0
##   130000  1  0
##   145800  1  0
##   146000  1  0
##   162000  1  0
##   220000  0  1

Modelling the data

# Data set for the association tests: graduates with an actual salary figure.
# A plain `salary > 0` filter also lets through the rows coded 998 and 999,
# which `placed` excludes as non-salary codes; they would enter the tests as
# two extra "salary" categories. Exclude them here as well.
mbasalary <- subset(
  mba_sal,
  salary != 0 & salary != 998 & salary != 999,
  select = age:satis
)

# Pearson chi-squared test of independence between age and salary. Both
# vectors are cross-tabulated, so every distinct value is its own category.
# NOTE(review): the console output recorded after these calls was produced
# with the `salary > 0` filter and needs to be regenerated.
chisq.test(mbasalary$age, mbasalary$salary)
## Warning in chisq.test(mbasalary$age, mbasalary$salary): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mbasalary$age and mbasalary$salary
## X-squared = 948.87, df = 602, p-value < 2.2e-16
# Pearson chi-squared test of independence between sex and salary; each
# distinct salary value is treated as a separate category.
chisq.test(mbasalary$sex, mbasalary$salary)
## Warning in chisq.test(mbasalary$sex, mbasalary$salary): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mbasalary$sex and mbasalary$salary
## X-squared = 64.319, df = 43, p-value = 0.0192
# Pearson chi-squared test of independence between overall GMAT percentile
# (gmat_tpc) and salary, both cross-tabulated by distinct value.
chisq.test(mbasalary$gmat_tpc, mbasalary$salary)
## Warning in chisq.test(mbasalary$gmat_tpc, mbasalary$salary): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mbasalary$gmat_tpc and mbasalary$salary
## X-squared = 1829.5, df = 1548, p-value = 8.178e-07
# Pearson chi-squared test of independence between s_avg and salary. Every
# distinct average becomes its own category, so the table is very sparse.
chisq.test(mbasalary$s_avg, mbasalary$salary)
## Warning in chisq.test(mbasalary$s_avg, mbasalary$salary): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mbasalary$s_avg and mbasalary$salary
## X-squared = 919.46, df = 1118, p-value = 1
# Pearson chi-squared test of independence between f_avg and salary. Every
# distinct average becomes its own category, so the table is very sparse.
chisq.test(mbasalary$f_avg, mbasalary$salary)
## Warning in chisq.test(mbasalary$f_avg, mbasalary$salary): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mbasalary$f_avg and mbasalary$salary
## X-squared = 635.53, df = 817, p-value = 1
# Pearson chi-squared test of independence between quarter (coded 1-4) and
# salary.
chisq.test(mbasalary$quarter, mbasalary$salary)
## Warning in chisq.test(mbasalary$quarter, mbasalary$salary): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mbasalary$quarter and mbasalary$salary
## X-squared = 139.17, df = 129, p-value = 0.2551
# Pearson chi-squared test of independence between years of work experience
# and salary, both cross-tabulated by distinct value.
chisq.test(mbasalary$work_yrs, mbasalary$salary)
## Warning in chisq.test(mbasalary$work_yrs, mbasalary$salary): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mbasalary$work_yrs and mbasalary$salary
## X-squared = 773.16, df = 516, p-value = 1.433e-12
# Pearson chi-squared test of independence between first language (frstlang,
# coded 1/2) and salary.
chisq.test(mbasalary$frstlang, mbasalary$salary)
## Warning in chisq.test(mbasalary$frstlang, mbasalary$salary): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mbasalary$frstlang and mbasalary$salary
## X-squared = 48.273, df = 43, p-value = 0.2682
# Pearson chi-squared test of independence between the satisfaction rating
# (satis) and salary.
chisq.test(mbasalary$satis, mbasalary$salary)
## Warning in chisq.test(mbasalary$satis, mbasalary$salary): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mbasalary$satis and mbasalary$salary
## X-squared = 391.04, df = 301, p-value = 0.0003578

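The results suggest that age, GMAT percentile, work experience and first language are the factors that affect starting salary. This is broadly consistent with the corrgram, the only two exceptions being GMAT percentile and first language. (Note: in the output above the first-language test has p = 0.27, so on its own it does not support this conclusion at the 5% level, whereas the tests for sex (p = 0.019) and satisfaction (p < 0.001) are below 0.05.)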

Regression model

# Linear model of salary on age and years of work experience, fitted on the
# `placed` rows. The fit is printed, not stored.
# NOTE(review): columns are referenced as placed$... inside the formula rather
# than through `data = placed`; the printed Call and coefficient names
# reflect that spelling.
lm(formula = placed$salary ~ placed$age+placed$work_yrs)
## 
## Call:
## lm(formula = placed$salary ~ placed$age + placed$work_yrs)
## 
## Coefficients:
##     (Intercept)       placed$age  placed$work_yrs  
##         36967.5           2413.8            388.8
# Linear model of salary on age, work experience, GMAT total score and first
# language, fitted on the `placed` rows. frstlang enters as a numeric 1/2
# code, not as a factor. The fit is printed, not stored.
lm(formula = placed$salary ~ placed$age+placed$work_yrs+placed$gmat_tot+placed$frstlang)
## 
## Call:
## lm(formula = placed$salary ~ placed$age + placed$work_yrs + placed$gmat_tot + 
##     placed$frstlang)
## 
## Coefficients:
##     (Intercept)       placed$age  placed$work_yrs  placed$gmat_tot  
##        47113.96          1938.71           684.39           -11.75  
## placed$frstlang  
##         8171.85
# Linear model of salary on age, work experience and quantitative GMAT
# percentile (gmat_qpc), fitted on the `placed` rows. The fit is printed, not
# stored.
lm(formula = placed$salary ~ placed$age+placed$work_yrs+placed$gmat_qpc)
## 
## Call:
## lm(formula = placed$salary ~ placed$age + placed$work_yrs + placed$gmat_qpc)
## 
## Coefficients:
##     (Intercept)       placed$age  placed$work_yrs  placed$gmat_qpc  
##         25410.4           2424.1            490.0            136.8

The intercept tells us that, in the model that uses only age and work experience, the predicted starting salary is about 36,967 when both predictors are equal to 0. This suggests that these two are not the only factors affecting starting salary, but that work experience, age and the quantitative GMAT percentile affect it more than the other factors.