Read the data

# Load the survey data from a CSV file in the current working directory.
mba <- read.csv("MBA Starting Salaries Data.csv")

Summary Statistics

# Print per-column summary statistics for the full data set.
# NOTE(review): the output shows values such as 0 and 999 in salary and 998 in
# satis; these look like placeholder codes rather than real measurements --
# confirm against the data dictionary before interpreting the means.
summary(mba)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0

Creating Data Frames And Finding Summary

# Build the analysis subset in three steps, dropping rows whose salary equals
# 0, then 999, then 998. which() discards NA comparisons, as subset() does.
jobmba  <- mba[which(mba$salary != 0), ]
jobmba1 <- jobmba[which(jobmba$salary != 999), ]
jobf    <- jobmba1[which(jobmba1$salary != 998), ]

# Show the summary columns 1-5 and 12 (age .. gmat_vpc and salary).
summary(jobf)[, c(1:5, 12)]
##       age             sex           gmat_tot      gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :500   Min.   :39.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580   1st Qu.:72.00  
##  Median :26.00   Median :1.000   Median :620   Median :82.00  
##  Mean   :26.78   Mean   :1.301   Mean   :616   Mean   :79.73  
##  3rd Qu.:28.00   3rd Qu.:2.000   3rd Qu.:655   3rd Qu.:89.00  
##  Max.   :40.00   Max.   :2.000   Max.   :720   Max.   :99.00  
##     gmat_vpc         salary      
##  Min.   :30.00   Min.   : 64000  
##  1st Qu.:71.00   1st Qu.: 95000  
##  Median :81.00   Median :100000  
##  Mean   :78.56   Mean   :103031  
##  3rd Qu.:92.00   3rd Qu.:106000  
##  Max.   :99.00   Max.   :220000

Analysing Various Factors like Age, Sex, Salary etc. via Various Plots.

# Recode the 1/2 sex indicator as a labelled factor and chart the counts.
mba$sex <- factor(mba$sex, levels = c(1, 2), labels = c("Male", "Female"))
plot(mba$sex, main = "Sex Ratio In MBA Schools", col = "grey")

# Distribution of total GMAT score.
hist(mba$gmat_tot,
     breaks = 15,
     col = "red",
     main = "GMAT scores of Students",
     xlab = "GMAT total score")

# Distribution of years of work experience.
hist(mba$work_yrs,
     breaks = 5,
     col = "light green",
     main = "Work Experience",
     xlab = "years of work experience")

# Horizontal box plot of the salary column.
# NOTE(review): mba$salary still contains the values 0, 998 and 999 that the
# script filters out when building jobf -- confirm they are meant to be drawn.
boxplot(mba$salary,
        horizontal = TRUE,
        col = "yellow",
        main = "Salary Details",
        xlab = "Salary")

# Recode the 1/2 first-language indicator as a labelled factor and chart it.
mba$frstlang <- factor(mba$frstlang, levels = c(1, 2),
                       labels = c("English", "Others"))
plot(mba$frstlang, main = "Language Distribution", col = "yellow")

# Keep only rows whose satisfaction score is at most 7, which drops larger
# values such as the 998 seen in the summary output.
# The threshold must be numeric: comparing a number with the string '7' makes R
# coerce both sides to character and compare them lexically, so a value like
# 10 would wrongly pass ("10" <= "7" is TRUE).
f1 <- mba[which(mba$satis <= 7), ]
hist(f1$satis,
     breaks = 5,
     col = "green",
     xlab = "Degree of Satisfaction (1=low,7=high)",
     main = "Satisfaction  distribution of MBA students")

Salary comparison to Age,Work Years,Sex,Satisfaction,MBA Scores,First Language

library(car)    
## Warning: package 'car' was built under R version 3.3.3
# Salary against age for the filtered subset (jobf), using car::scatterplot.
scatterplot(
  salary ~ age,
  data = jobf,
  xlab = "age",
  ylab = "salary",
  main = "Scatter plot of salary vs age"
)

Drawing Scatter Plot

# Salary against total GMAT score for the filtered subset (jobf).
# NOTE(review): `spread` and `smoother.args` are argument names from older car
# releases; newer releases appear to configure the smoother through `smooth=`.
# Confirm against the installed car version.
scatterplot(
  salary ~ gmat_tot,
  data = jobf,
  spread = FALSE,
  smoother.args = list(lty = 2),
  xlab = "Gmat score",
  ylab = "salary",
  main = "Scatter plot of salary vs Gmat total"
)

Drawing Box Plots

library(lattice)
## Warning: package 'lattice' was built under R version 3.3.3
# Box plot of salary by first language for the filtered subset (jobf).
# jobf was built before mba$frstlang was relabelled, so it still carries the
# raw 1/2 codes that the axis label refers to.
# Do not re-apply factor(levels = c(1, 2)) to mba$frstlang here: that column
# already holds the labels "English"/"Others", so matching against 1/2 would
# turn every value into NA.
bwplot(frstlang ~ salary,
       data = jobf,
       horizontal = TRUE,
       ylab = "language spoken (1=english,2=rest)")

# Box plot of salary by sex, using the labelled sex factor from mba.
# Rows whose salary is 0, 998 or 999 are excluded -- the same values the script
# strips out when building jobf -- so that these codes are not drawn as if they
# were real salaries, and the plot matches the other salary comparisons.
bwplot(sex ~ salary,
       data = subset(mba, salary != 0 & salary != 998 & salary != 999),
       horizontal = TRUE,
       xlab = "Salary")

# Salary distribution for each satisfaction score in the filtered subset.
bwplot(satis ~ salary, data = jobf, xlab = "Salary")

# Salary distribution for each value of work experience in the filtered subset.
bwplot(work_yrs ~ salary, data = jobf)

Comparing Factors

# Salary by work experience, one panel per sex.
# Rows whose salary is 0, 998 or 999 are excluded -- the same values the script
# strips out when building jobf -- so these codes are not plotted as incomes.
bwplot(work_yrs ~ salary | sex,
       data = subset(mba, salary != 0 & salary != 998 & salary != 999),
       horizontal = TRUE,
       xlab = "Income")

# Salary by quarter, one panel per sex, with the same rows excluded.
bwplot(quarter ~ salary | sex,
       data = subset(mba, salary != 0 & salary != 998 & salary != 999),
       horizontal = TRUE,
       xlab = "Income")

library(corrgram)
## Warning: package 'corrgram' was built under R version 3.3.3
# Correlogram of the survey variables: shaded cells below the diagonal, pies
# above it, variable names on the diagonal.
# Rows whose salary is 0, 998 or 999 are excluded -- the same values the script
# strips out when building jobf -- so that these codes do not distort the
# salary correlations.
corrgram(subset(mba, salary != 0 & salary != 998 & salary != 999),
         lower.panel = panel.shade,
         upper.panel = panel.pie,
         text.panel = panel.txt,
         main = "Corrgram of salary intercorrelations")

# Covariance matrix of the numeric survey columns in the filtered subset.
# x and y hold the same columns, so this is the covariance of the set with itself.
x <- jobf[, c("age", "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
              "s_avg", "f_avg", "work_yrs", "salary")]
y <- x
cov(x, y)
##                    age      gmat_tot      gmat_qpc      gmat_vpc
## age         10.7045498    -13.054445   -7.22796497  9.505045e-01
## gmat_tot   -13.0544451   2569.293737  452.14258519  6.386360e+02
## gmat_qpc    -7.2279650    452.142585  179.18027794  2.045850e+01
## gmat_vpc     0.9505045    638.636018   20.45849990  2.606602e+02
## gmat_tpc    -3.4602132    539.362269   97.03607462  1.393882e+02
## s_avg        0.1938587      3.299562    0.07838473  9.694594e-01
## f_avg       -0.3462517      3.027432    0.64252142  1.803303e-01
## work_yrs     8.6728536    -18.738816   -7.36245955 -1.366838e+00
## salary   29210.5193223 -82124.485056 3382.43784504 -3.964803e+04
##               gmat_tpc        s_avg         f_avg      work_yrs
## age      -3.460213e+00   0.19385875   -0.34625167     8.6728536
## gmat_tot  5.393623e+02   3.29956215    3.02743194   -18.7388159
## gmat_qpc  9.703607e+01   0.07838473    0.64252142    -7.3624595
## gmat_vpc  1.393882e+02   0.96945936    0.18033029    -1.3668380
## gmat_tpc  1.211342e+02   0.58062916    0.37850562    -4.3892062
## s_avg     5.806292e-01   0.14325138    0.08231046     0.1860480
## f_avg     3.785056e-01   0.08231046    0.23786375    -0.3176271
## work_yrs -4.389206e+00   0.18604797   -0.31762707     9.0630116
## salary   -2.596339e+04 688.02042071 -924.11288026 24458.1995050
##                 salary
## age       2.921052e+04
## gmat_tot -8.212449e+04
## gmat_qpc  3.382438e+03
## gmat_vpc -3.964803e+04
## gmat_tpc -2.596339e+04
## s_avg     6.880204e+02
## f_avg    -9.241129e+02
## work_yrs  2.445820e+04
## salary    3.192940e+08

Contingency tables showing the effect of various factors on the starting salary

# Cross-tabulate each distinct salary value against sex, then print the table.
mytable <- xtabs(~ salary + sex, data = jobf)
mytable
##         sex
## salary    1  2
##   64000   0  1
##   77000   1  0
##   78256   0  1
##   82000   0  1
##   85000   1  3
##   86000   0  2
##   88000   0  1
##   88500   1  0
##   90000   3  0
##   92000   2  1
##   93000   2  1
##   95000   4  3
##   96000   3  1
##   96500   1  0
##   97000   2  0
##   98000   6  4
##   99000   0  1
##   100000  4  5
##   100400  1  0
##   101000  0  2
##   101100  1  0
##   101600  1  0
##   102500  1  0
##   103000  1  0
##   104000  2  0
##   105000 11  0
##   106000  2  1
##   107000  1  0
##   107300  1  0
##   107500  1  0
##   108000  2  0
##   110000  0  1
##   112000  3  0
##   115000  5  0
##   118000  1  0
##   120000  3  1
##   126710  1  0
##   130000  1  0
##   145800  1  0
##   146000  1  0
##   162000  1  0
##   220000  0  1
 # Pearson chi-squared test on the salary-by-sex table.
 # Most cells hold 0 or 1 observations, which is why R warns below that the
 # chi-squared approximation may be incorrect.
 chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable
## X-squared = 52.681, df = 41, p-value = 0.1045
# Welch two-sample t-test comparing mean salary between the two sex codes.
t.test(salary ~ sex, data=jobf)
## 
##  Welch Two Sample t-test
## 
## data:  salary by sex
## t = 1.3628, df = 38.115, p-value = 0.1809
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3128.55 16021.72
## sample estimates:
## mean in group 1 mean in group 2 
##       104970.97        98524.39

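Neither test shows a statistically significant difference between the salaries of men and women at the 5% level (chi-squared p = 0.1045, Welch t-test p = 0.1809). Men have a higher mean salary in this sample (about 104,971 versus 98,524), but the 95% confidence interval for the difference (-3,129 to 16,022) includes zero.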

 # Cross-tabulate each distinct salary value against first language, then print.
 mytable1 <- xtabs(~ salary + frstlang, data = jobf)
 mytable1
##         frstlang
## salary    1  2
##   64000   1  0
##   77000   1  0
##   78256   1  0
##   82000   1  0
##   85000   4  0
##   86000   2  0
##   88000   1  0
##   88500   1  0
##   90000   3  0
##   92000   3  0
##   93000   3  0
##   95000   7  0
##   96000   4  0
##   96500   1  0
##   97000   2  0
##   98000   8  2
##   99000   0  1
##   100000  9  0
##   100400  1  0
##   101000  2  0
##   101100  1  0
##   101600  1  0
##   102500  1  0
##   103000  1  0
##   104000  1  1
##   105000 11  0
##   106000  3  0
##   107000  1  0
##   107300  0  1
##   107500  1  0
##   108000  2  0
##   110000  1  0
##   112000  3  0
##   115000  5  0
##   118000  0  1
##   120000  4  0
##   126710  1  0
##   130000  1  0
##   145800  1  0
##   146000  1  0
##   162000  1  0
##   220000  0  1
# Pearson chi-squared test on the salary-by-first-language table.
# The table is sparse, which is why R warns below that the approximation may
# be incorrect.
chisq.test(mytable1)
## Warning in chisq.test(mytable1): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable1
## X-squared = 69.847, df = 41, p-value = 0.003296
 # Welch two-sample t-test comparing mean salary between the two language codes.
 t.test(salary ~ frstlang, data=jobf)
## 
##  Welch Two Sample t-test
## 
## data:  salary by frstlang
## t = -1.1202, df = 6.0863, p-value = 0.3049
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -59933.62  22202.25
## sample estimates:
## mean in group 1 mean in group 2 
##        101748.6        120614.3

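There is an association between first language and salary according to the chi-squared test (p = 0.0033), although R warns that the approximation may be incorrect for such a sparse table. The Welch t-test does not find a significant difference in mean salary (p = 0.3049). English-speaking graduates make up the large majority of those placed, while the few graduates with another first language have a higher sample mean salary (about 120,614 versus 101,749).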

# Cross-tabulate each distinct salary value against years of work experience.
mytable2 <- xtabs(~ salary + work_yrs, data = jobf)
mytable2
##         work_yrs
## salary   0 1 2 3 4 5 6 7 8 10 15 16
##   64000  0 0 1 0 0 0 0 0 0  0  0  0
##   77000  0 0 1 0 0 0 0 0 0  0  0  0
##   78256  0 1 0 0 0 0 0 0 0  0  0  0
##   82000  0 1 0 0 0 0 0 0 0  0  0  0
##   85000  0 1 2 1 0 0 0 0 0  0  0  0
##   86000  0 0 1 1 0 0 0 0 0  0  0  0
##   88000  0 0 0 1 0 0 0 0 0  0  0  0
##   88500  0 0 0 1 0 0 0 0 0  0  0  0
##   90000  0 0 2 0 0 1 0 0 0  0  0  0
##   92000  0 0 3 0 0 0 0 0 0  0  0  0
##   93000  0 0 0 0 1 1 0 0 1  0  0  0
##   95000  1 1 2 2 0 1 0 0 0  0  0  0
##   96000  0 1 2 0 1 0 0 0 0  0  0  0
##   96500  0 0 1 0 0 0 0 0 0  0  0  0
##   97000  0 0 0 1 1 0 0 0 0  0  0  0
##   98000  0 0 7 1 1 0 0 1 0  0  0  0
##   99000  0 0 0 0 0 1 0 0 0  0  0  0
##   100000 0 0 6 1 1 0 1 0 0  0  0  0
##   100400 0 0 0 1 0 0 0 0 0  0  0  0
##   101000 0 0 2 0 0 0 0 0 0  0  0  0
##   101100 0 0 0 0 0 0 0 0 1  0  0  0
##   101600 0 0 0 1 0 0 0 0 0  0  0  0
##   102500 0 0 0 0 0 0 1 0 0  0  0  0
##   103000 0 0 0 1 0 0 0 0 0  0  0  0
##   104000 0 0 0 0 2 0 0 0 0  0  0  0
##   105000 0 0 4 4 0 1 1 0 0  0  0  1
##   106000 0 0 0 0 0 0 2 0 1  0  0  0
##   107000 0 0 1 0 0 0 0 0 0  0  0  0
##   107300 0 0 1 0 0 0 0 0 0  0  0  0
##   107500 0 0 0 1 0 0 0 0 0  0  0  0
##   108000 0 0 0 1 1 0 0 0 0  0  0  0
##   110000 0 0 0 0 0 0 1 0 0  0  0  0
##   112000 0 0 1 0 0 0 1 0 0  0  0  1
##   115000 0 2 0 1 2 0 0 0 0  0  0  0
##   118000 0 0 0 0 0 0 0 0 0  1  0  0
##   120000 0 0 0 1 0 2 0 0 1  0  0  0
##   126710 0 0 0 1 0 0 0 0 0  0  0  0
##   130000 0 0 0 0 1 0 0 0 0  0  0  0
##   145800 0 0 1 0 0 0 0 0 0  0  0  0
##   146000 0 0 0 0 0 0 0 0 0  0  1  0
##   162000 0 1 0 0 0 0 0 0 0  0  0  0
##   220000 0 0 0 0 0 0 0 0 0  0  1  0
 # Pearson chi-squared test on the salary-by-work-experience table.
 # The table is sparse, which is why R warns below that the approximation may
 # be incorrect.
 chisq.test(mytable2)
## Warning in chisq.test(mytable2): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable2
## X-squared = 535.23, df = 451, p-value = 0.003809

Regression Analysis

 # Linear model of salary on quartile ranking, spring/fall averages and age.
 m1 <- lm(salary ~ quarter + s_avg + f_avg + age, data = jobf)
 summary(m1)
## 
## Call:
## lm(formula = salary ~ quarter + s_avg + f_avg + age, data = jobf)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -25092  -8321  -2324   4764  83005 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  54566.0    32639.2   1.672   0.0978 .  
## quarter      -2569.0     2580.7  -0.995   0.3220    
## s_avg        -4672.8     7788.1  -0.600   0.5499    
## f_avg         -922.1     3793.1  -0.243   0.8084    
## age           2673.1      508.9   5.252 8.75e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15700 on 98 degrees of freedom
## Multiple R-squared:  0.2578, Adjusted R-squared:  0.2275 
## F-statistic: 8.511 on 4 and 98 DF,  p-value: 6.155e-06
 # Linear model of salary on the four GMAT measures.
 m2 <- lm(salary ~ gmat_tot + gmat_tpc + gmat_vpc + gmat_qpc, data = jobf)
 summary(m2)
## 
## Call:
## lm(formula = salary ~ gmat_tot + gmat_tpc + gmat_vpc + gmat_qpc, 
##     data = jobf)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -40370  -8250  -2164   5253 100097 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 109539.54   48054.24   2.279   0.0248 *
## gmat_tot        55.01     181.71   0.303   0.7627  
## gmat_tpc     -1663.16     801.57  -2.075   0.0406 *
## gmat_vpc       546.10     543.85   1.004   0.3178  
## gmat_qpc       718.40     541.90   1.326   0.1880  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17670 on 98 degrees of freedom
## Multiple R-squared:  0.06089,    Adjusted R-squared:  0.02256 
## F-statistic: 1.589 on 4 and 98 DF,  p-value: 0.1834
# Linear model of salary on sex, first language, satisfaction, work experience
# and quartile. In jobf, sex and frstlang enter as their numeric 1/2 codes.
m3 <- lm(salary ~ sex + frstlang + satis + work_yrs + quarter, data = jobf)
summary(m3)
## 
## Call:
## lm(formula = salary ~ sex + frstlang + satis + work_yrs + quarter, 
##     data = jobf)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -29352  -8342  -1943   5264  83154 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  99601.3    13868.8   7.182 1.41e-10 ***
## sex          -6037.8     3394.5  -1.779   0.0784 .  
## frstlang     15448.4     6326.4   2.442   0.0164 *  
## satis        -1800.6     2041.0  -0.882   0.3798    
## work_yrs      2321.1      534.1   4.346 3.42e-05 ***
## quarter      -1397.2     1441.5  -0.969   0.3348    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15590 on 97 degrees of freedom
## Multiple R-squared:  0.2765, Adjusted R-squared:  0.2392 
## F-statistic: 7.414 on 5 and 97 DF,  p-value: 6.324e-06

Details of people who did not get a job

# Rows of mba whose salary is recorded as 0. which() discards NA comparisons,
# as subset() does.
nojob <- mba[which(mba$salary == 0), ]

# Per-column summary statistics for that group.
summary(nojob)
##       age            sex        gmat_tot        gmat_qpc    
##  Min.   :22.00   Male  :67   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   Female:23   1st Qu.:570.0   1st Qu.:68.25  
##  Median :27.00               Median :610.0   Median :82.00  
##  Mean   :28.51               Mean   :614.3   Mean   :78.91  
##  3rd Qu.:29.75               3rd Qu.:650.0   3rd Qu.:93.00  
##  Max.   :48.00               Max.   :760.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc         s_avg           f_avg      
##  Min.   :22.00   Min.   : 0.00   Min.   :2.000   Min.   :0.000  
##  1st Qu.:70.25   1st Qu.:73.50   1st Qu.:2.800   1st Qu.:2.750  
##  Median :81.00   Median :86.00   Median :3.000   Median :3.000  
##  Mean   :77.63   Mean   :82.29   Mean   :3.031   Mean   :3.062  
##  3rd Qu.:89.00   3rd Qu.:93.00   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.00   Max.   :3.900   Max.   :4.000  
##     quarter         work_yrs         frstlang      salary      satis      
##  Min.   :1.000   Min.   : 0.000   English: 0   Min.   :0   Min.   :4.000  
##  1st Qu.:2.000   1st Qu.: 2.000   Others : 0   1st Qu.:0   1st Qu.:5.000  
##  Median :2.500   Median : 3.000   NA's   :90   Median :0   Median :6.000  
##  Mean   :2.544   Mean   : 4.589                Mean   :0   Mean   :5.622  
##  3rd Qu.:3.000   3rd Qu.: 5.000                3rd Qu.:0   3rd Qu.:6.000  
##  Max.   :4.000   Max.   :22.000                Max.   :0   Max.   :7.000
 # Distribution of satisfaction scores among the salary == 0 group.
 hist(nojob$satis,
      breaks = 5,
      col = "blue",
      main = "MBA Satisfaction",
      xlab = "Satisfaction")

 # Chi-squared test of independence between work experience and satisfaction
 # within that group.
 chisq.test(nojob$work_yrs, nojob$satis)
## Warning in chisq.test(nojob$work_yrs, nojob$satis): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  nojob$work_yrs and nojob$satis
## X-squared = 44.974, df = 48, p-value = 0.5976