Read the data
# Load the MBA starting-salaries survey into a data frame. The path is
# relative to the current working directory.
mba <- read.csv("MBA Starting Salaries Data.csv")
Summary Statistics
# Per-column summary (min, quartiles, mean, max) of the full survey.
# NOTE(review): salary and satis contain values such as 998/999, presumably
# non-response codes rather than real measurements; confirm against the
# data dictionary before interpreting their means.
summary(mba)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
Creating Data Frames And Finding Summary
# Build the "placed with a reported salary" data set in three steps:
# drop rows whose salary is 0, then 999, then 998. The intermediate data
# frames are kept because they are created as top-level objects.
jobmba <- subset(mba, !(salary == 0))
jobmba1 <- subset(jobmba, !(salary == 999))
jobf <- subset(jobmba1, !(salary == 998))
# Show the summary for the first five columns and for column 12 only.
summary(jobf)[, c(seq_len(5), 12)]
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :500 Min. :39.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580 1st Qu.:72.00
## Median :26.00 Median :1.000 Median :620 Median :82.00
## Mean :26.78 Mean :1.301 Mean :616 Mean :79.73
## 3rd Qu.:28.00 3rd Qu.:2.000 3rd Qu.:655 3rd Qu.:89.00
## Max. :40.00 Max. :2.000 Max. :720 Max. :99.00
## gmat_vpc salary
## Min. :30.00 Min. : 64000
## 1st Qu.:71.00 1st Qu.: 95000
## Median :81.00 Median :100000
## Mean :78.56 Mean :103031
## 3rd Qu.:92.00 3rd Qu.:106000
## Max. :99.00 Max. :220000
Analysing Various Factors like Age, Sex, Salary etc. via Various Plots
# Recode sex from its numeric codes (1, 2) into a labelled factor, then draw
# a bar chart of the counts per level.
mba$sex <- factor(mba$sex, levels = c(1, 2), labels = c("Male", "Female"))
plot(mba$sex, main = "Sex Ratio In MBA Schools", col = "grey")

# Distribution of total GMAT scores.
hist(mba$gmat_tot,
     breaks = 15,
     col = "red",
     main = "GMAT scores of Students",
     xlab = "GMAT total score")

# Distribution of years of work experience.
hist(mba$work_yrs,
     breaks = 5,
     col = "light green",
     main = "Work Experience",
     xlab = "years of work experience")

# Horizontal box plot of the salary column of the full survey.
boxplot(mba$salary,
        horizontal = TRUE,
        col = "yellow",
        main = "Salary Details",
        xlab = "Salary")

# Recode first language from its numeric codes (1, 2) into a labelled factor,
# then draw a bar chart of the counts per level.
mba$frstlang <- factor(mba$frstlang, levels = c(1, 2),
                       labels = c("English", "Others"))
plot(mba$frstlang, main = "Language Distribution", col = "yellow")

# Keep only satisfaction ratings on the 1-7 scale (the column also holds
# values up to 998). Compare against the number 7, not the string '7':
# comparing a number with a string makes R coerce both sides to character and
# compare them as text, which excluded 998 only by accident.
f1 <- mba[which(mba$satis <= 7), ]
hist(f1$satis,
     breaks = 5,
     col = "green",
     xlab = "Degree of Satisfaction (1=low,7=high)",
     main = "Satisfaction distribution of MBA students")

Drawing Scatter Plot
# Scatter plot of salary against total GMAT score for the rows in jobf.
# NOTE(review): scatterplot() is not a base R function. It presumably comes
# from the car package, which no visible line loads; confirm that
# library(car) runs before this call and that the installed version still
# accepts the spread / smoother.args arguments.
scatterplot(salary ~gmat_tot,data=jobf,
spread=FALSE, smoother.args=list(lty=2),
main="Scatter plot of salary vs Gmat total",
xlab="Gmat score",
ylab="salary")

Drawing Box Plots
library(lattice)
## Warning: package 'lattice' was built under R version 3.3.3
# mba$frstlang is already converted to a labelled factor earlier in this
# script. Calling factor() with levels = c(1, 2) on that factor matches none
# of its values ("English"/"Others") and turns the whole column into NA, so
# only recode while the column still holds the raw numeric codes.
if (!is.factor(mba$frstlang)) {
  mba$frstlang <- factor(mba$frstlang, levels = c(1, 2),
                         labels = c("English", "Others"))
}
# Salary by first language for the rows in jobf. jobf is created before the
# recoding above, so its frstlang column still holds the codes 1 and 2.
# horizontal must be the logical TRUE, not the string "TRUE".
bwplot(frstlang ~ salary, data = jobf, horizontal = TRUE,
       ylab = "language spoken (1=english,2=rest)")

# Salary by sex, using every row of mba.
bwplot(sex ~ salary,
       data = mba,
       xlab = "Salary",
       horizontal = TRUE)

# Salary against satisfaction rating for the rows in jobf.
bwplot(satis ~ salary, data = jobf, xlab = "Salary")

# Salary against years of work experience for the rows in jobf.
bwplot(work_yrs ~ salary, data = jobf)

Comparing Factors
# Salary by years of work experience, one panel per sex.
bwplot(work_yrs ~ salary | sex,
       data = mba,
       xlab = "Income",
       horizontal = TRUE)

# Salary by quarter, one panel per sex.
bwplot(quarter ~ salary | sex,
       data = mba,
       xlab = "Income",
       horizontal = TRUE)

library(corrgram)
## Warning: package 'corrgram' was built under R version 3.3.3
# Correlogram of the mba data frame: shaded cells below the diagonal, pie
# glyphs above it, and variable names on the diagonal.
corrgram(mba,
         main = "Corrgram of salary intercorrelations",
         text.panel = panel.txt,
         upper.panel = panel.pie,
         lower.panel = panel.shade)

# Covariance matrix of the numeric variables in jobf. x and y hold the same
# columns, so cov(x, y) is the covariance of that column set with itself.
x <- jobf[, c("age", "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
              "s_avg", "f_avg", "work_yrs", "salary")]
y <- x
cov(x, y)
## age gmat_tot gmat_qpc gmat_vpc
## age 10.7045498 -13.054445 -7.22796497 9.505045e-01
## gmat_tot -13.0544451 2569.293737 452.14258519 6.386360e+02
## gmat_qpc -7.2279650 452.142585 179.18027794 2.045850e+01
## gmat_vpc 0.9505045 638.636018 20.45849990 2.606602e+02
## gmat_tpc -3.4602132 539.362269 97.03607462 1.393882e+02
## s_avg 0.1938587 3.299562 0.07838473 9.694594e-01
## f_avg -0.3462517 3.027432 0.64252142 1.803303e-01
## work_yrs 8.6728536 -18.738816 -7.36245955 -1.366838e+00
## salary 29210.5193223 -82124.485056 3382.43784504 -3.964803e+04
## gmat_tpc s_avg f_avg work_yrs
## age -3.460213e+00 0.19385875 -0.34625167 8.6728536
## gmat_tot 5.393623e+02 3.29956215 3.02743194 -18.7388159
## gmat_qpc 9.703607e+01 0.07838473 0.64252142 -7.3624595
## gmat_vpc 1.393882e+02 0.96945936 0.18033029 -1.3668380
## gmat_tpc 1.211342e+02 0.58062916 0.37850562 -4.3892062
## s_avg 5.806292e-01 0.14325138 0.08231046 0.1860480
## f_avg 3.785056e-01 0.08231046 0.23786375 -0.3176271
## work_yrs -4.389206e+00 0.18604797 -0.31762707 9.0630116
## salary -2.596339e+04 688.02042071 -924.11288026 24458.1995050
## salary
## age 2.921052e+04
## gmat_tot -8.212449e+04
## gmat_qpc 3.382438e+03
## gmat_vpc -3.964803e+04
## gmat_tpc -2.596339e+04
## s_avg 6.880204e+02
## f_avg -9.241129e+02
## work_yrs 2.445820e+04
## salary 3.192940e+08
Contingency tables showing the effect of various factors on the starting salary
# Cross-tabulate the count of graduates for every salary value by sex code.
mytable <- xtabs(~ salary + sex, data = jobf)
mytable
## sex
## salary 1 2
## 64000 0 1
## 77000 1 0
## 78256 0 1
## 82000 0 1
## 85000 1 3
## 86000 0 2
## 88000 0 1
## 88500 1 0
## 90000 3 0
## 92000 2 1
## 93000 2 1
## 95000 4 3
## 96000 3 1
## 96500 1 0
## 97000 2 0
## 98000 6 4
## 99000 0 1
## 100000 4 5
## 100400 1 0
## 101000 0 2
## 101100 1 0
## 101600 1 0
## 102500 1 0
## 103000 1 0
## 104000 2 0
## 105000 11 0
## 106000 2 1
## 107000 1 0
## 107300 1 0
## 107500 1 0
## 108000 2 0
## 110000 0 1
## 112000 3 0
## 115000 5 0
## 118000 1 0
## 120000 3 1
## 126710 1 0
## 130000 1 0
## 145800 1 0
## 146000 1 0
## 162000 1 0
## 220000 0 1
# Pearson chi-squared test of independence on the salary-by-sex table.
# NOTE(review): nearly every cell count in mytable is 0 or 1, so the
# chi-squared approximation is unreliable here (R emits a warning for this
# call); treat the p-value with caution.
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 52.681, df = 41, p-value = 0.1045
# Welch two-sample t-test comparing mean salary between the two sex groups
# in jobf (two-sided alternative).
t.test(salary ~ sex, data=jobf)
##
## Welch Two Sample t-test
##
## data: salary by sex
## t = 1.3628, df = 38.115, p-value = 0.1809
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3128.55 16021.72
## sample estimates:
## mean in group 1 mean in group 2
## 104970.97 98524.39
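The t-test does not show a statistically significant difference between the salaries of men and women (p = 0.18). The sample mean for men (104,971) is higher than for women (98,524), but the 95% confidence interval for the difference includes zero.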
# Cross-tabulate the count of graduates for every salary value by
# first-language code.
mytable1 <- xtabs(~ salary + frstlang, data = jobf)
mytable1
## frstlang
## salary 1 2
## 64000 1 0
## 77000 1 0
## 78256 1 0
## 82000 1 0
## 85000 4 0
## 86000 2 0
## 88000 1 0
## 88500 1 0
## 90000 3 0
## 92000 3 0
## 93000 3 0
## 95000 7 0
## 96000 4 0
## 96500 1 0
## 97000 2 0
## 98000 8 2
## 99000 0 1
## 100000 9 0
## 100400 1 0
## 101000 2 0
## 101100 1 0
## 101600 1 0
## 102500 1 0
## 103000 1 0
## 104000 1 1
## 105000 11 0
## 106000 3 0
## 107000 1 0
## 107300 0 1
## 107500 1 0
## 108000 2 0
## 110000 1 0
## 112000 3 0
## 115000 5 0
## 118000 0 1
## 120000 4 0
## 126710 1 0
## 130000 1 0
## 145800 1 0
## 146000 1 0
## 162000 1 0
## 220000 0 1
# Pearson chi-squared test of independence on the salary-by-first-language
# table.
# NOTE(review): the table is very sparse (most cells are 0 or 1), and R warns
# that the chi-squared approximation may be incorrect for this call.
chisq.test(mytable1)
## Warning in chisq.test(mytable1): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable1
## X-squared = 69.847, df = 41, p-value = 0.003296
# Welch two-sample t-test comparing mean salary between the two
# first-language groups in jobf (two-sided alternative).
t.test(salary ~ frstlang, data=jobf)
##
## Welch Two Sample t-test
##
## data: salary by frstlang
## t = -1.1202, df = 6.0863, p-value = 0.3049
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -59933.62 22202.25
## sample estimates:
## mean in group 1 mean in group 2
## 101748.6 120614.3
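The chi-squared test suggests an association between first language and starting salary (p = 0.003), but the approximation is unreliable because most cells contain very few observations, and the t-test finds no significant difference in mean salary between the two groups (p = 0.30). English-speaking graduates account for far more of the placements (96 of 103) than speakers of other languages.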
# Cross-tabulate the count of graduates for every salary value by years of
# work experience.
mytable2 <- xtabs(~ salary + work_yrs, data = jobf)
mytable2
## work_yrs
## salary 0 1 2 3 4 5 6 7 8 10 15 16
## 64000 0 0 1 0 0 0 0 0 0 0 0 0
## 77000 0 0 1 0 0 0 0 0 0 0 0 0
## 78256 0 1 0 0 0 0 0 0 0 0 0 0
## 82000 0 1 0 0 0 0 0 0 0 0 0 0
## 85000 0 1 2 1 0 0 0 0 0 0 0 0
## 86000 0 0 1 1 0 0 0 0 0 0 0 0
## 88000 0 0 0 1 0 0 0 0 0 0 0 0
## 88500 0 0 0 1 0 0 0 0 0 0 0 0
## 90000 0 0 2 0 0 1 0 0 0 0 0 0
## 92000 0 0 3 0 0 0 0 0 0 0 0 0
## 93000 0 0 0 0 1 1 0 0 1 0 0 0
## 95000 1 1 2 2 0 1 0 0 0 0 0 0
## 96000 0 1 2 0 1 0 0 0 0 0 0 0
## 96500 0 0 1 0 0 0 0 0 0 0 0 0
## 97000 0 0 0 1 1 0 0 0 0 0 0 0
## 98000 0 0 7 1 1 0 0 1 0 0 0 0
## 99000 0 0 0 0 0 1 0 0 0 0 0 0
## 100000 0 0 6 1 1 0 1 0 0 0 0 0
## 100400 0 0 0 1 0 0 0 0 0 0 0 0
## 101000 0 0 2 0 0 0 0 0 0 0 0 0
## 101100 0 0 0 0 0 0 0 0 1 0 0 0
## 101600 0 0 0 1 0 0 0 0 0 0 0 0
## 102500 0 0 0 0 0 0 1 0 0 0 0 0
## 103000 0 0 0 1 0 0 0 0 0 0 0 0
## 104000 0 0 0 0 2 0 0 0 0 0 0 0
## 105000 0 0 4 4 0 1 1 0 0 0 0 1
## 106000 0 0 0 0 0 0 2 0 1 0 0 0
## 107000 0 0 1 0 0 0 0 0 0 0 0 0
## 107300 0 0 1 0 0 0 0 0 0 0 0 0
## 107500 0 0 0 1 0 0 0 0 0 0 0 0
## 108000 0 0 0 1 1 0 0 0 0 0 0 0
## 110000 0 0 0 0 0 0 1 0 0 0 0 0
## 112000 0 0 1 0 0 0 1 0 0 0 0 1
## 115000 0 2 0 1 2 0 0 0 0 0 0 0
## 118000 0 0 0 0 0 0 0 0 0 1 0 0
## 120000 0 0 0 1 0 2 0 0 1 0 0 0
## 126710 0 0 0 1 0 0 0 0 0 0 0 0
## 130000 0 0 0 0 1 0 0 0 0 0 0 0
## 145800 0 0 1 0 0 0 0 0 0 0 0 0
## 146000 0 0 0 0 0 0 0 0 0 0 1 0
## 162000 0 1 0 0 0 0 0 0 0 0 0 0
## 220000 0 0 0 0 0 0 0 0 0 0 1 0
# Pearson chi-squared test of independence on the salary-by-work-experience
# table.
# NOTE(review): the table is very sparse (most cells are 0), and R warns that
# the chi-squared approximation may be incorrect for this call.
chisq.test(mytable2)
## Warning in chisq.test(mytable2): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable2
## X-squared = 535.23, df = 451, p-value = 0.003809
Regression Analysis
# Model 1: linear regression of salary on quarter, the two grade averages
# and age, fitted on jobf.
m1 <- lm(salary ~ quarter + s_avg + f_avg + age, data = jobf)
summary(m1)
##
## Call:
## lm(formula = salary ~ quarter + s_avg + f_avg + age, data = jobf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25092 -8321 -2324 4764 83005
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 54566.0 32639.2 1.672 0.0978 .
## quarter -2569.0 2580.7 -0.995 0.3220
## s_avg -4672.8 7788.1 -0.600 0.5499
## f_avg -922.1 3793.1 -0.243 0.8084
## age 2673.1 508.9 5.252 8.75e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15700 on 98 degrees of freedom
## Multiple R-squared: 0.2578, Adjusted R-squared: 0.2275
## F-statistic: 8.511 on 4 and 98 DF, p-value: 6.155e-06
# Model 2: linear regression of salary on the four GMAT measures, fitted on
# jobf.
m2 <- lm(salary ~ gmat_tot + gmat_tpc + gmat_vpc + gmat_qpc, data = jobf)
summary(m2)
##
## Call:
## lm(formula = salary ~ gmat_tot + gmat_tpc + gmat_vpc + gmat_qpc,
## data = jobf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40370 -8250 -2164 5253 100097
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 109539.54 48054.24 2.279 0.0248 *
## gmat_tot 55.01 181.71 0.303 0.7627
## gmat_tpc -1663.16 801.57 -2.075 0.0406 *
## gmat_vpc 546.10 543.85 1.004 0.3178
## gmat_qpc 718.40 541.90 1.326 0.1880
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17670 on 98 degrees of freedom
## Multiple R-squared: 0.06089, Adjusted R-squared: 0.02256
## F-statistic: 1.589 on 4 and 98 DF, p-value: 0.1834
# Model 3: linear regression of salary on sex, first language, satisfaction,
# work experience and quarter, fitted on jobf. jobf is created before the
# factor recoding of mba, so sex and frstlang enter as numeric codes here.
m3 <- lm(salary ~ sex + frstlang + satis + work_yrs + quarter, data = jobf)
summary(m3)
##
## Call:
## lm(formula = salary ~ sex + frstlang + satis + work_yrs + quarter,
## data = jobf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29352 -8342 -1943 5264 83154
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99601.3 13868.8 7.182 1.41e-10 ***
## sex -6037.8 3394.5 -1.779 0.0784 .
## frstlang 15448.4 6326.4 2.442 0.0164 *
## satis -1800.6 2041.0 -0.882 0.3798
## work_yrs 2321.1 534.1 4.346 3.42e-05 ***
## quarter -1397.2 1441.5 -0.969 0.3348
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15590 on 97 degrees of freedom
## Multiple R-squared: 0.2765, Adjusted R-squared: 0.2392
## F-statistic: 7.414 on 5 and 97 DF, p-value: 6.324e-06
Details of people who did not get a job
# Rows of mba whose salary is exactly 0, followed by a per-column summary.
nojob <- subset(mba, subset = salary == 0)
summary(nojob)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Male :67 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 Female:23 1st Qu.:570.0 1st Qu.:68.25
## Median :27.00 Median :610.0 Median :82.00
## Mean :28.51 Mean :614.3 Mean :78.91
## 3rd Qu.:29.75 3rd Qu.:650.0 3rd Qu.:93.00
## Max. :48.00 Max. :760.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :22.00 Min. : 0.00 Min. :2.000 Min. :0.000
## 1st Qu.:70.25 1st Qu.:73.50 1st Qu.:2.800 1st Qu.:2.750
## Median :81.00 Median :86.00 Median :3.000 Median :3.000
## Mean :77.63 Mean :82.29 Mean :3.031 Mean :3.062
## 3rd Qu.:89.00 3rd Qu.:93.00 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.00 Max. :3.900 Max. :4.000
## quarter work_yrs frstlang salary satis
## Min. :1.000 Min. : 0.000 English: 0 Min. :0 Min. :4.000
## 1st Qu.:2.000 1st Qu.: 2.000 Others : 0 1st Qu.:0 1st Qu.:5.000
## Median :2.500 Median : 3.000 NA's :90 Median :0 Median :6.000
## Mean :2.544 Mean : 4.589 Mean :0 Mean :5.622
## 3rd Qu.:3.000 3rd Qu.: 5.000 3rd Qu.:0 3rd Qu.:6.000
## Max. :4.000 Max. :22.000 Max. :0 Max. :7.000
# Distribution of satisfaction ratings among the rows with salary == 0.
hist(nojob$satis,
     breaks = 5,
     col = "blue",
     main = "MBA Satisfaction",
     xlab = "Satisfaction")

# Pearson chi-squared test of association between years of work experience
# and satisfaction rating among the rows with salary == 0.
# NOTE(review): R warns that the chi-squared approximation may be incorrect
# for this call, so treat the p-value with caution.
chisq.test(nojob$work_yrs,nojob$satis)
## Warning in chisq.test(nojob$work_yrs, nojob$satis): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: nojob$work_yrs and nojob$satis
## X-squared = 44.974, df = 48, p-value = 0.5976