This is my R markdown document of the case study on MBA starting salary.

Read the data in R

# Read the data
# read.csv() takes the file name directly; the previous paste(..., sep = "")
# wrapper around a single string was a no-op.
salary.df <- read.csv("MBASalData.csv")
# View() opens an interactive data viewer, so only call it when a user is
# actually sitting at the session (not while rendering or running a script).
if (interactive()) {
  View(salary.df)
}

Summarize the data set.

# Summarize the data
# NOTE(review): attach() puts the columns of salary.df on the search path, so
# any later call that uses a bare column name without a working `data`
# argument silently reads from the full salary.df. Prefer salary.df$col or
# with(); attach() is kept here because removing it would change how such
# later calls resolve.
attach(salary.df)
library(psych)
# Per-column descriptive statistics (n, mean, sd, median, range, skew, ...).
describe(salary.df)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45

Generate boxplots to visualize the distribution of each variable independently.

# One horizontal boxplot per variable. The names are columns of salary.df and
# the values are the plot titles; plots are drawn in the order listed.
boxplot_titles <- c(
  gmat_tot = "GMAT Total score",
  gmat_qpc = "quantitative GMAT percentile",
  gmat_vpc = "verbal GMAT percentile",
  gmat_tpc = "total GMAT percentile",
  s_avg    = "spring MBA average",
  f_avg    = "fall MBA average",
  work_yrs = "years of work experience",
  salary   = "Starting salary",
  satis    = "degree of satisfaction with MBA program "
)

for (column in names(boxplot_titles)) {
  boxplot(salary.df[[column]],
          horizontal = TRUE,
          main = boxplot_titles[[column]])
}

Draw scatter plots to understand how the variables are correlated pairwise.

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
library(car)
library(psych)

# Pairwise scatterplots of the four GMAT measures, with a histogram of each
# variable on the diagonal.
scatterplotMatrix(
  formula = ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc,
  data = salary.df,
  diagonal = "histogram"
)

# Pairwise scatterplots of total GMAT percentile, work experience, salary and
# satisfaction, again with histograms on the diagonal.
scatterplotMatrix(
  formula = ~ gmat_tpc + work_yrs + salary + satis,
  data = salary.df,
  diagonal = "histogram"
)

CORRELATION MATRIX

library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
## 
##     describe
## The following objects are masked from 'package:base':
## 
##     format.pval, units
# Correlations (and their p-values) among salary, age, sex and work
# experience, computed over all rows of salary.df.
# NOTE(review): at this point the salary column still contains the 998/999
# codes and zeros that are filtered out further down - confirm that computing
# the correlations on the unfiltered data is intended.
colsalary <- c("salary", "age", "sex", "work_yrs")
corMatrix <- rcorr(as.matrix(salary.df[colsalary]))
corMatrix
##          salary   age   sex work_yrs
## salary     1.00 -0.06  0.07     0.01
## age       -0.06  1.00 -0.03     0.86
## sex        0.07 -0.03  1.00    -0.01
## work_yrs   0.01  0.86 -0.01     1.00
## 
## n= 274 
## 
## 
## P
##          salary age    sex    work_yrs
## salary          0.3020 0.2560 0.8818  
## age      0.3020        0.6432 0.0000  
## sex      0.2560 0.6432        0.8523  
## work_yrs 0.8818 0.0000 0.8523

Corrgram

library(corrgram)
# Corrgram over every column of salary.df in its original column order:
# panel.shade below the diagonal, panel.pie above it, panel.minmax on the
# diagonal and panel.txt for the variable labels.
corrgram(
  salary.df,
  order = FALSE,
  main = "Corrgram of MBA Salary Data",
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  diag.panel = panel.minmax,
  text.panel = panel.txt
)

Take a subset of the data set that excludes the rows whose salary is coded 998 or 999, so that it contains both types of students: those who actually got a job and those who did not.

# Keep only the rows whose salary is not one of the codes 998 / 999.
# Rows with a missing salary are dropped as well, which matches what the
# previous which()-based filter did.
job.df <- salary.df[!is.na(salary.df$salary) &
                      !(salary.df$salary %in% c(998, 999)), ]
# View() opens an interactive data viewer, so only call it in an interactive
# session.
if (interactive()) {
  View(job.df)
}

Take a subset of the dataset consisting of only those people who actually got a job.

# Keep only the rows with a non-zero salary, i.e. the students who got a job.
# Rows with a missing salary are dropped as well, which matches what the
# previous which()-based filter did.
gotjob.df <- job.df[!is.na(job.df$salary) & job.df$salary != 0, ]
# View() opens an interactive data viewer, so only call it in an interactive
# session.
if (interactive()) {
  View(gotjob.df)
}

Here y = salary, and the candidate explanatory variables (considered here as ‘x’) are factors such as GMAT percentile, sex, first language, work experience, spring MBA average and fall MBA average.

Draw Contingency Tables

# Counts of students by starting salary (rows) and sex (columns), among those
# who got a job.
mytable <- xtabs(formula = ~ salary + sex, data = gotjob.df)
mytable
##         sex
## salary    1  2
##   64000   0  1
##   77000   1  0
##   78256   0  1
##   82000   0  1
##   85000   1  3
##   86000   0  2
##   88000   0  1
##   88500   1  0
##   90000   3  0
##   92000   2  1
##   93000   2  1
##   95000   4  3
##   96000   3  1
##   96500   1  0
##   97000   2  0
##   98000   6  4
##   99000   0  1
##   100000  4  5
##   100400  1  0
##   101000  0  2
##   101100  1  0
##   101600  1  0
##   102500  1  0
##   103000  1  0
##   104000  2  0
##   105000 11  0
##   106000  2  1
##   107000  1  0
##   107300  1  0
##   107500  1  0
##   108000  2  0
##   110000  0  1
##   112000  3  0
##   115000  5  0
##   118000  1  0
##   120000  3  1
##   126710  1  0
##   130000  1  0
##   145800  1  0
##   146000  1  0
##   162000  1  0
##   220000  0  1
# Counts of students by first language (rows) and starting salary (columns),
# among those who got a job.
mytable <- xtabs(formula = ~ frstlang + salary, data = gotjob.df)
mytable
##         salary
## frstlang 64000 77000 78256 82000 85000 86000 88000 88500 90000 92000 93000
##        1     1     1     1     1     4     2     1     1     3     3     3
##        2     0     0     0     0     0     0     0     0     0     0     0
##         salary
## frstlang 95000 96000 96500 97000 98000 99000 100000 100400 101000 101100
##        1     7     4     1     2     8     0      9      1      2      1
##        2     0     0     0     0     2     1      0      0      0      0
##         salary
## frstlang 101600 102500 103000 104000 105000 106000 107000 107300 107500
##        1      1      1      1      1     11      3      1      0      1
##        2      0      0      0      1      0      0      0      1      0
##         salary
## frstlang 108000 110000 112000 115000 118000 120000 126710 130000 145800
##        1      2      1      3      5      0      4      1      1      1
##        2      0      0      0      0      1      0      0      0      0
##         salary
## frstlang 146000 162000 220000
##        1      1      1      0
##        2      0      0      1
# Counts of students by starting salary (rows) and years of work experience
# (columns), among those who got a job.
mytable <- xtabs(formula = ~ salary + work_yrs, data = gotjob.df)
mytable
##         work_yrs
## salary   0 1 2 3 4 5 6 7 8 10 15 16
##   64000  0 0 1 0 0 0 0 0 0  0  0  0
##   77000  0 0 1 0 0 0 0 0 0  0  0  0
##   78256  0 1 0 0 0 0 0 0 0  0  0  0
##   82000  0 1 0 0 0 0 0 0 0  0  0  0
##   85000  0 1 2 1 0 0 0 0 0  0  0  0
##   86000  0 0 1 1 0 0 0 0 0  0  0  0
##   88000  0 0 0 1 0 0 0 0 0  0  0  0
##   88500  0 0 0 1 0 0 0 0 0  0  0  0
##   90000  0 0 2 0 0 1 0 0 0  0  0  0
##   92000  0 0 3 0 0 0 0 0 0  0  0  0
##   93000  0 0 0 0 1 1 0 0 1  0  0  0
##   95000  1 1 2 2 0 1 0 0 0  0  0  0
##   96000  0 1 2 0 1 0 0 0 0  0  0  0
##   96500  0 0 1 0 0 0 0 0 0  0  0  0
##   97000  0 0 0 1 1 0 0 0 0  0  0  0
##   98000  0 0 7 1 1 0 0 1 0  0  0  0
##   99000  0 0 0 0 0 1 0 0 0  0  0  0
##   100000 0 0 6 1 1 0 1 0 0  0  0  0
##   100400 0 0 0 1 0 0 0 0 0  0  0  0
##   101000 0 0 2 0 0 0 0 0 0  0  0  0
##   101100 0 0 0 0 0 0 0 0 1  0  0  0
##   101600 0 0 0 1 0 0 0 0 0  0  0  0
##   102500 0 0 0 0 0 0 1 0 0  0  0  0
##   103000 0 0 0 1 0 0 0 0 0  0  0  0
##   104000 0 0 0 0 2 0 0 0 0  0  0  0
##   105000 0 0 4 4 0 1 1 0 0  0  0  1
##   106000 0 0 0 0 0 0 2 0 1  0  0  0
##   107000 0 0 1 0 0 0 0 0 0  0  0  0
##   107300 0 0 1 0 0 0 0 0 0  0  0  0
##   107500 0 0 0 1 0 0 0 0 0  0  0  0
##   108000 0 0 0 1 1 0 0 0 0  0  0  0
##   110000 0 0 0 0 0 0 1 0 0  0  0  0
##   112000 0 0 1 0 0 0 1 0 0  0  0  1
##   115000 0 2 0 1 2 0 0 0 0  0  0  0
##   118000 0 0 0 0 0 0 0 0 0  1  0  0
##   120000 0 0 0 1 0 2 0 0 1  0  0  0
##   126710 0 0 0 1 0 0 0 0 0  0  0  0
##   130000 0 0 0 0 1 0 0 0 0  0  0  0
##   145800 0 0 1 0 0 0 0 0 0  0  0  0
##   146000 0 0 0 0 0 0 0 0 0  0  1  0
##   162000 0 1 0 0 0 0 0 0 0  0  0  0
##   220000 0 0 0 0 0 0 0 0 0  0  1  0

t-test analysis

# Welch two-sample t-test of salary against age, restricted to gotjob.df.
# The default t.test(x, y) method has no `data` argument, so the previous
# `data=gotjob.df` was silently ignored and `salary` / `age` were looked up on
# the search path, i.e. in the attached full data set (the recorded output
# shows df = 273 and a mean salary of 39025.69, the all-rows values).
# with() makes both columns come from gotjob.df; re-run to refresh the output.
# NOTE(review): comparing mean salary with mean age is not a meaningful
# hypothesis - confirm which comparison was actually intended.
with(gotjob.df, t.test(salary, age))
## 
##  Welch Two Sample t-test
## 
## data:  salary and age
## t = 12.67, df = 273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  32938.51 45058.15
## sample estimates:
##   mean of x   mean of y 
## 39025.68978    27.35766
# Welch two-sample t-test of salary against years of work experience,
# restricted to gotjob.df.
# The default t.test(x, y) method has no `data` argument, so the previous
# `data=gotjob.df` was silently ignored and `salary` / `work_yrs` were looked
# up on the search path, i.e. in the attached full data set (the recorded
# output shows df = 273 and a mean salary of 39025.69, the all-rows values).
# with() makes both columns come from gotjob.df; re-run to refresh the output.
# NOTE(review): comparing mean salary with mean years of experience is not a
# meaningful hypothesis - confirm which comparison was actually intended.
with(gotjob.df, t.test(salary, work_yrs))
## 
##  Welch Two Sample t-test
## 
## data:  salary and work_yrs
## t = 12.677, df = 273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  32961.99 45081.64
## sample estimates:
##    mean of x    mean of y 
## 39025.689781     3.872263

Regression analysis

# Model 1: salary explained by academic measures only (total GMAT percentile,
# spring MBA average, fall MBA average), fitted on students who got a job.
m1 <- lm(formula = salary ~ gmat_tpc + s_avg + f_avg, data = gotjob.df)
summary(m1)
## 
## Call:
## lm(formula = salary ~ gmat_tpc + s_avg + f_avg, data = gotjob.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -45140  -7934  -1887   3623 112357 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 114442.5    19076.0   5.999 3.26e-08 ***
## gmat_tpc      -239.5      159.7  -1.499   0.1370    
## s_avg         9719.3     5176.8   1.877   0.0634 .  
## f_avg        -6867.3     3988.1  -1.722   0.0882 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17580 on 99 degrees of freedom
## Multiple R-squared:  0.06029,    Adjusted R-squared:  0.03182 
## F-statistic: 2.117 on 3 and 99 DF,  p-value: 0.1029
# Model 2: salary explained by background variables (work experience, first
# language, sex, age), fitted on students who got a job.
m2 <- lm(formula = salary ~ work_yrs + frstlang + sex + age, data = gotjob.df)
summary(m2)
## 
## Call:
## lm(formula = salary ~ work_yrs + frstlang + sex + age, data = gotjob.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -29056  -9070  -1211   5858  79078 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  51623.0    24708.1   2.089   0.0393 *
## work_yrs       916.9     1119.1   0.819   0.4146  
## frstlang     10017.9     6785.8   1.476   0.1431  
## sex          -4655.5     3418.8  -1.362   0.1764  
## age           1620.5     1089.7   1.487   0.1402  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15510 on 98 degrees of freedom
## Multiple R-squared:  0.2763, Adjusted R-squared:  0.2468 
## F-statistic: 9.355 on 4 and 98 DF,  p-value: 1.907e-06
# Model 3: background variables plus the spring and fall MBA averages, fitted
# on students who got a job. The leading unary `+` in the formula is kept so
# the printed Call matches the recorded output.
m3 <- lm(
  formula = salary ~ +frstlang + sex + s_avg + f_avg + age + work_yrs,
  data = gotjob.df
)
summary(m3)
## 
## Call:
## lm(formula = salary ~ +frstlang + sex + s_avg + f_avg + age + 
##     work_yrs, data = gotjob.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -30413  -8757  -1921   5966  81580 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  45104.8    27753.7   1.625    0.107
## frstlang     11264.2     7061.9   1.595    0.114
## sex          -4954.4     3490.7  -1.419    0.159
## s_avg         3456.5     4953.5   0.698    0.487
## f_avg         -603.8     3812.6  -0.158    0.874
## age           1499.4     1113.9   1.346    0.181
## work_yrs       915.9     1130.2   0.810    0.420
## 
## Residual standard error: 15620 on 96 degrees of freedom
## Multiple R-squared:  0.2804, Adjusted R-squared:  0.2354 
## F-statistic: 6.233 on 6 and 96 DF,  p-value: 1.481e-05
# Model 4: salary explained by work experience and first language only,
# fitted on students who got a job.
m4 <- lm(formula = salary ~ work_yrs + frstlang, data = gotjob.df)
summary(m4)
## 
## Call:
## lm(formula = salary ~ work_yrs + frstlang, data = gotjob.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33972  -8955   -455   4545  76681 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  79941.4     6788.8  11.775  < 2e-16 ***
## work_yrs      2483.3      527.9   4.704 8.18e-06 ***
## frstlang     13064.0     6283.2   2.079   0.0402 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15740 on 100 degrees of freedom
## Multiple R-squared:  0.2396, Adjusted R-squared:  0.2244 
## F-statistic: 15.75 on 2 and 100 DF,  p-value: 1.128e-06
# Model 5: salary explained by work experience, sex and age, fitted on
# students who got a job.
m5 <- lm(formula = salary ~ work_yrs + sex + age, data = gotjob.df)
summary(m5)
## 
## Call:
## lm(formula = salary ~ work_yrs + sex + age, data = gotjob.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -29250  -9239  -1146   5429  84318 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  45674.9    24522.2   1.863   0.0655 .
## work_yrs       478.2     1085.3   0.441   0.6604  
## sex          -3852.2     3395.2  -1.135   0.2593  
## age           2263.4     1004.8   2.253   0.0265 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15600 on 99 degrees of freedom
## Multiple R-squared:  0.2602, Adjusted R-squared:  0.2378 
## F-statistic: 11.61 on 3 and 99 DF,  p-value: 1.389e-06
# Model 6: salary explained by work experience and age, fitted on students
# who got a job.
m6 <- lm(formula = salary ~ work_yrs + age, data = gotjob.df)
summary(m6)
## 
## Call:
## lm(formula = salary ~ work_yrs + age, data = gotjob.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31675  -8099  -2108   4411  80650 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  36967.5    23323.8   1.585   0.1161  
## work_yrs       388.8     1084.0   0.359   0.7206  
## age           2413.8      997.4   2.420   0.0173 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15620 on 100 degrees of freedom
## Multiple R-squared:  0.2506, Adjusted R-squared:  0.2356 
## F-statistic: 16.72 on 2 and 100 DF,  p-value: 5.438e-07
# Model 7: salary explained by age alone, fitted on students who got a job.
m7 <- lm(formula = salary ~ age,
         data = gotjob.df)
summary(m7)
## 
## Call:
## lm(formula = salary ~ age, data = gotjob.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31454  -8533  -2182   4546  80886 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  29962.6    12697.8   2.360   0.0202 *  
## age           2728.8      470.7   5.797 7.75e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15550 on 101 degrees of freedom
## Multiple R-squared:  0.2496, Adjusted R-squared:  0.2422 
## F-statistic:  33.6 on 1 and 101 DF,  p-value: 7.748e-08
# Model 8: salary explained by the spring and fall MBA averages plus age,
# fitted on students who got a job.
m8 <- lm(formula = salary ~ s_avg + f_avg + age, data = gotjob.df)
summary(m8)
## 
## Call:
## lm(formula = salary ~ s_avg + f_avg + age, data = gotjob.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -32079  -8073  -2362   4671  82120 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  28077.5    18901.4   1.485    0.141    
## s_avg         1435.6     4795.8   0.299    0.765    
## f_avg         -469.6     3765.6  -0.125    0.901    
## age           2687.6      508.7   5.283 7.55e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15700 on 99 degrees of freedom
## Multiple R-squared:  0.2503, Adjusted R-squared:  0.2276 
## F-statistic: 11.02 on 3 and 99 DF,  p-value: 2.635e-06
# Model 9: all of the predictors used in the earlier models (work experience,
# first language, total GMAT percentile, sex, age, spring and fall MBA
# averages), fitted on students who got a job.
m9 <- lm(
  formula = salary ~ work_yrs + frstlang + gmat_tpc + sex + age + s_avg + f_avg,
  data = gotjob.df
)
summary(m9)
## 
## Call:
## lm(formula = salary ~ work_yrs + frstlang + gmat_tpc + sex + 
##     age + s_avg + f_avg, data = gotjob.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -32354  -9144  -1995   6557  78985 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  53884.7    29504.9   1.826   0.0709 .
## work_yrs       775.0     1142.7   0.678   0.4993  
## frstlang     10442.9     7130.6   1.465   0.1464  
## gmat_tpc      -128.9      145.8  -0.885   0.3786  
## sex          -5095.5     3498.3  -1.457   0.1485  
## age           1578.8     1118.7   1.411   0.1614  
## s_avg         4030.7     5001.4   0.806   0.4223  
## f_avg         -669.5     3817.7  -0.175   0.8612  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15640 on 95 degrees of freedom
## Multiple R-squared:  0.2862, Adjusted R-squared:  0.2336 
## F-statistic: 5.442 on 7 and 95 DF,  p-value: 2.828e-05

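From the above analysis of all the models, I found that the adjusted R-squared value is highest for model 2 (m2), at 0.2468. So model 2 is the model that best fits the data among those tried. This 24.68% is not a high percentage, and none of the individual predictors in m2 is significant at the 5% level on its own, even though the model as a whole is (F-test p-value 1.907e-06); the age-only model m7 reaches almost the same adjusted R-squared (0.2422). We can therefore conclude, tentatively, that starting salary depends mostly on work experience, first language, sex and age.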