# Load the raw MBA starting-salary survey data.
mba_sal <- read.csv("MBA Starting Salaries Data.csv")

# Rows whose salary is exactly 0 (a salary of 0 can never be 998 or 999,
# so no further exclusion is needed here).
no_job <- subset(mba_sal, salary == 0)

# Rows whose salary is none of the special values 998, 999 or 0.
# NOTE(review): 998/999 look like missing-data codes -- confirm against the
# data dictionary.
placed <- subset(mba_sal, !(salary == 998 | salary == 999 | salary == 0))

# Put the columns of `placed` on the search path, then print per-column
# summary statistics.
attach(placed)
summary(placed)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :500 Min. :39.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580 1st Qu.:72.00
## Median :26.00 Median :1.000 Median :620 Median :82.00
## Mean :26.78 Mean :1.301 Mean :616 Mean :79.73
## 3rd Qu.:28.00 3rd Qu.:2.000 3rd Qu.:655 3rd Qu.:89.00
## Max. :40.00 Max. :2.000 Max. :720 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :30.00 Min. :51.00 Min. :2.200 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.00 1st Qu.:2.850 1st Qu.:2.915
## Median :81.00 Median :87.00 Median :3.100 Median :3.250
## Mean :78.56 Mean :84.52 Mean :3.092 Mean :3.091
## 3rd Qu.:92.00 3rd Qu.:93.50 3rd Qu.:3.400 3rd Qu.:3.415
## Max. :99.00 Max. :99.00 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.00 Min. :1.000 Min. : 64000
## 1st Qu.:1.000 1st Qu.: 2.00 1st Qu.:1.000 1st Qu.: 95000
## Median :2.000 Median : 3.00 Median :1.000 Median :100000
## Mean :2.262 Mean : 3.68 Mean :1.068 Mean :103031
## 3rd Qu.:3.000 3rd Qu.: 4.00 3rd Qu.:1.000 3rd Qu.:106000
## Max. :4.000 Max. :16.00 Max. :2.000 Max. :220000
## satis
## Min. :3.000
## 1st Qu.:5.000
## Median :6.000
## Mean :5.883
## 3rd Qu.:6.000
## Max. :7.000
library(psych)
# Print psych's descriptive statistics (n, mean, sd, median, trimmed mean,
# mad, min, max, range, skew, kurtosis, se) for every column of `placed`.
describe(placed)
## vars n mean sd median trimmed mad min
## age 1 103 26.78 3.27 2.60e+01 26.30 2.97 22.0
## sex 2 103 1.30 0.46 1.00e+00 1.25 0.00 1.0
## gmat_tot 3 103 616.02 50.69 6.20e+02 615.90 59.30 500.0
## gmat_qpc 4 103 79.73 13.39 8.20e+01 81.05 13.34 39.0
## gmat_vpc 5 103 78.56 16.14 8.10e+01 80.33 16.31 30.0
## gmat_tpc 6 103 84.52 11.01 8.70e+01 85.60 11.86 51.0
## s_avg 7 103 3.09 0.38 3.10e+00 3.10 0.44 2.2
## f_avg 8 103 3.09 0.49 3.25e+00 3.13 0.37 0.0
## quarter 9 103 2.26 1.12 2.00e+00 2.20 1.48 1.0
## work_yrs 10 103 3.68 3.01 3.00e+00 3.11 1.48 0.0
## frstlang 11 103 1.07 0.25 1.00e+00 1.00 0.00 1.0
## salary 12 103 103030.74 17868.80 1.00e+05 101065.06 7413.00 64000.0
## satis 13 103 5.88 0.78 6.00e+00 5.89 1.48 3.0
## max range skew kurtosis se
## age 40 18.0 1.92 4.90 0.32
## sex 2 1.0 0.86 -1.28 0.05
## gmat_tot 720 220.0 0.01 -0.69 4.99
## gmat_qpc 99 60.0 -0.81 0.17 1.32
## gmat_vpc 99 69.0 -0.87 0.21 1.59
## gmat_tpc 99 48.0 -0.84 0.19 1.08
## s_avg 4 1.8 -0.13 -0.61 0.04
## f_avg 4 4.0 -2.52 13.86 0.05
## quarter 4 3.0 0.27 -1.34 0.11
## work_yrs 16 16.0 2.48 6.83 0.30
## frstlang 2 1.0 3.38 9.54 0.02
## salary 220000 156000.0 3.18 17.16 1760.67
## satis 7 4.0 -0.40 0.44 0.08
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
# Age distribution of graduates who reported a salary (`placed`).
hist(
  placed$age,
  main = "Frequency distribution of age amongst working people",
  xlab = "Age",
  border = "blue", col = "red",
  ylim = c(0, 40), xlim = c(20, 40), breaks = 5
)

# Age distribution of graduates with a salary of 0 (`no_job`).
# This plot must use no_job$age; plotting placed$age here would just repeat
# the histogram above under a different title.
hist(
  no_job$age,
  main = "Frequency distribution of age amongst the jobless",
  xlab = "Age",
  border = "blue", col = "red",
  ylim = c(0, 40), xlim = c(20, 40), breaks = 5
)

# Salary vs. age for placed graduates; colour encodes sex, point size encodes
# the satisfaction rating.
qplot(x = salary, y = age, data = placed, color = factor(sex), size = satis)

# GMAT total vs. years of work experience for the no-job group, with the same
# colour and size encoding.
qplot(x = gmat_tot, y = work_yrs, data = no_job, color = factor(sex), size = satis)
library(corrgram)
# Correlogram of all columns in `placed`: shaded cells below the diagonal,
# correlation values above it, variable names on the diagonal.
corrgram(
  placed,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.cor,
  text.panel = panel.txt,
  main = "MBA starting salary analysis Correlogram"
)
It is pretty clear from the correlogram that salary is significantly dependent on only two variables, namely work_yrs and age. We can also see from the scatter plot that most people gave a satisfaction rating of 4, 5 or 6, and that a rating of 3 or 7 is rare. We can also see that men fare better on the starting salary scale.
# Covariance matrix of the numeric measures in `placed`.
# x and y hold the same set of columns, so cov(x, y) is the covariance of
# that column set with itself.
x <- placed[, c(
  "age", "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
  "s_avg", "f_avg", "work_yrs", "salary"
)]
y <- x
cov(x, y)
## age gmat_tot gmat_qpc gmat_vpc
## age 10.7045498 -13.054445 -7.22796497 9.505045e-01
## gmat_tot -13.0544451 2569.293737 452.14258519 6.386360e+02
## gmat_qpc -7.2279650 452.142585 179.18027794 2.045850e+01
## gmat_vpc 0.9505045 638.636018 20.45849990 2.606602e+02
## gmat_tpc -3.4602132 539.362269 97.03607462 1.393882e+02
## s_avg 0.1938587 3.299562 0.07838473 9.694594e-01
## f_avg -0.3462517 3.027432 0.64252142 1.803303e-01
## work_yrs 8.6728536 -18.738816 -7.36245955 -1.366838e+00
## salary 29210.5193223 -82124.485056 3382.43784504 -3.964803e+04
## gmat_tpc s_avg f_avg work_yrs
## age -3.460213e+00 0.19385875 -0.34625167 8.6728536
## gmat_tot 5.393623e+02 3.29956215 3.02743194 -18.7388159
## gmat_qpc 9.703607e+01 0.07838473 0.64252142 -7.3624595
## gmat_vpc 1.393882e+02 0.96945936 0.18033029 -1.3668380
## gmat_tpc 1.211342e+02 0.58062916 0.37850562 -4.3892062
## s_avg 5.806292e-01 0.14325138 0.08231046 0.1860480
## f_avg 3.785056e-01 0.08231046 0.23786375 -0.3176271
## work_yrs -4.389206e+00 0.18604797 -0.31762707 9.0630116
## salary -2.596339e+04 688.02042071 -924.11288026 24458.1995050
## salary
## age 2.921052e+04
## gmat_tot -8.212449e+04
## gmat_qpc 3.382438e+03
## gmat_vpc -3.964803e+04
## gmat_tpc -2.596339e+04
## s_avg 6.880204e+02
## f_avg -9.241129e+02
## work_yrs 2.445820e+04
## salary 3.192940e+08
# Cross-tabulate salary against frstlang for the rows in `placed`, then
# print the resulting contingency table.
con_table <- xtabs(~ salary + frstlang, data = placed)
print(con_table)
## frstlang
## salary 1 2
## 64000 1 0
## 77000 1 0
## 78256 1 0
## 82000 1 0
## 85000 4 0
## 86000 2 0
## 88000 1 0
## 88500 1 0
## 90000 3 0
## 92000 3 0
## 93000 3 0
## 95000 7 0
## 96000 4 0
## 96500 1 0
## 97000 2 0
## 98000 8 2
## 99000 0 1
## 100000 9 0
## 100400 1 0
## 101000 2 0
## 101100 1 0
## 101600 1 0
## 102500 1 0
## 103000 1 0
## 104000 1 1
## 105000 11 0
## 106000 3 0
## 107000 1 0
## 107300 0 1
## 107500 1 0
## 108000 2 0
## 110000 1 0
## 112000 3 0
## 115000 5 0
## 118000 0 1
## 120000 4 0
## 126710 1 0
## 130000 1 0
## 145800 1 0
## 146000 1 0
## 162000 1 0
## 220000 0 1
# Rows with a real reported salary, keeping columns age through satis.
# The 998 and 999 salary codes are excluded as well as 0, matching the filter
# used for `placed`; with `salary > 0` alone those two codes would be kept and
# treated as two extra salary levels in every test below.
# NOTE(review): the recorded chi-squared outputs that follow were produced
# with the 998/999 rows included -- re-run to refresh them.
mbasalary <- subset(
  mba_sal,
  salary != 998 & salary != 999 & salary > 0,
  select = age:satis
)

# Pearson chi-squared test between age and salary.
chisq.test(mbasalary$age, mbasalary$salary)
## Warning in chisq.test(mbasalary$age, mbasalary$salary): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mbasalary$age and mbasalary$salary
## X-squared = 948.87, df = 602, p-value < 2.2e-16
# Pearson chi-squared test between sex and salary.
chisq.test(mbasalary$sex, mbasalary$salary)
## Warning in chisq.test(mbasalary$sex, mbasalary$salary): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mbasalary$sex and mbasalary$salary
## X-squared = 64.319, df = 43, p-value = 0.0192
# Pearson chi-squared test between overall GMAT percentile (gmat_tpc) and salary.
chisq.test(mbasalary$gmat_tpc, mbasalary$salary)
## Warning in chisq.test(mbasalary$gmat_tpc, mbasalary$salary): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mbasalary$gmat_tpc and mbasalary$salary
## X-squared = 1829.5, df = 1548, p-value = 8.178e-07
# Pearson chi-squared test between s_avg and salary.
chisq.test(mbasalary$s_avg, mbasalary$salary)
## Warning in chisq.test(mbasalary$s_avg, mbasalary$salary): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mbasalary$s_avg and mbasalary$salary
## X-squared = 919.46, df = 1118, p-value = 1
# Pearson chi-squared test between f_avg and salary.
chisq.test(mbasalary$f_avg, mbasalary$salary)
## Warning in chisq.test(mbasalary$f_avg, mbasalary$salary): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mbasalary$f_avg and mbasalary$salary
## X-squared = 635.53, df = 817, p-value = 1
# Pearson chi-squared test between quarter and salary.
chisq.test(mbasalary$quarter, mbasalary$salary)
## Warning in chisq.test(mbasalary$quarter, mbasalary$salary): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mbasalary$quarter and mbasalary$salary
## X-squared = 139.17, df = 129, p-value = 0.2551
# Pearson chi-squared test between years of work experience and salary.
chisq.test(mbasalary$work_yrs, mbasalary$salary)
## Warning in chisq.test(mbasalary$work_yrs, mbasalary$salary): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mbasalary$work_yrs and mbasalary$salary
## X-squared = 773.16, df = 516, p-value = 1.433e-12
# Pearson chi-squared test between first language (frstlang) and salary.
chisq.test(mbasalary$frstlang, mbasalary$salary)
## Warning in chisq.test(mbasalary$frstlang, mbasalary$salary): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mbasalary$frstlang and mbasalary$salary
## X-squared = 48.273, df = 43, p-value = 0.2682
# Pearson chi-squared test between the satisfaction rating (satis) and salary.
chisq.test(mbasalary$satis, mbasalary$salary)
## Warning in chisq.test(mbasalary$satis, mbasalary$salary): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mbasalary$satis and mbasalary$salary
## X-squared = 391.04, df = 301, p-value = 0.0003578
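The results suggest that age, GMAT percentile, work experience and first language are the factors associated with starting salary. This is partly consistent with the corrgram, the only two exceptions being GMAT percentile and first language. Note, however, that in the tests above first language has p = 0.27 (not significant at the 5% level), whereas sex (p = 0.019) and satisfaction (p = 0.0004) are significant, and every test carries a warning that the chi-squared approximation may be incorrect.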
# Fit a linear model of salary on age and years of work experience; the
# result is not stored, so the fitted coefficients are printed.
lm(formula = placed$salary ~ placed$age+placed$work_yrs)
##
## Call:
## lm(formula = placed$salary ~ placed$age + placed$work_yrs)
##
## Coefficients:
## (Intercept) placed$age placed$work_yrs
## 36967.5 2413.8 388.8
# Same model with GMAT total score and first language added as predictors;
# the fitted coefficients are printed.
lm(formula = placed$salary ~ placed$age+placed$work_yrs+placed$gmat_tot+placed$frstlang)
##
## Call:
## lm(formula = placed$salary ~ placed$age + placed$work_yrs + placed$gmat_tot +
## placed$frstlang)
##
## Coefficients:
## (Intercept) placed$age placed$work_yrs placed$gmat_tot
## 47113.96 1938.71 684.39 -11.75
## placed$frstlang
## 8171.85
# Model of salary on age, work experience and the quantitative GMAT
# percentile (gmat_qpc); the fitted coefficients are printed.
lm(formula = placed$salary ~ placed$age+placed$work_yrs+placed$gmat_qpc)
##
## Call:
## lm(formula = placed$salary ~ placed$age + placed$work_yrs + placed$gmat_qpc)
##
## Coefficients:
## (Intercept) placed$age placed$work_yrs placed$gmat_qpc
## 25410.4 2424.1 490.0 136.8
In the first model, the intercept tells us that the predicted mean starting salary is around 36967 when work experience and age are both set to 0 (an extrapolation, since the youngest graduate in the data is 22). This suggests that these two are not the only factors affecting the starting salary, but work experience, age and quantitative GMAT percentile affect the salary more than other factors.