Reading the dataset

# Load the MBA starting-salary survey. The file name is a complete literal,
# so it goes straight to read.csv(); wrapping a single string in
# paste(..., sep = "") did nothing.
salary <- read.csv("MBA Starting Salaries Data.csv")
#View(salary)
# Show a handful of sampled rows as a quick sanity check of the import.
some(salary)
##     age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 8    25   2      650       88       89       92   3.3  3.75       1
## 42   25   2      560       52       81       72   3.3  3.50       1
## 84   25   1      690       96       89       97   3.0  3.00       2
## 156  26   1      660       88       93       94   2.9  2.75       3
## 166  27   1      730       95       99       99   2.9  3.33       3
## 181  29   1      560       57       74       73   2.8  3.00       3
## 184  34   1      610       82       78       86   2.7  3.00       3
## 212  25   1      600       53       95       84   2.5  3.00       4
## 218  25   1      700       99       87       98   2.0  2.00       4
## 260  26   2      630       85       81       90   2.9  3.25       4
##     work_yrs frstlang salary satis
## 8          2        1      0     6
## 42         1        1  95000     5
## 84         3        1    998   998
## 156        3        2    998   998
## 166        0        1    999     5
## 181        4        1    999     5
## 184       12        1      0     5
## 212        2        1    999     4
## 218        1        1      0     7
## 260        3        1  86000     5

Summarizing it

# Compact structure listing: row/column counts and the type of every column.
str(salary)
## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : int  2 1 1 1 2 1 1 2 1 1 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: int  1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : int  7 6 6 7 5 6 5 6 4 998 ...
# Per-column five-number summary plus mean. The 998/999 placeholder codes are
# still present here, so the salary and satis statistics are distorted by them.
summary(salary)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0
# Extended descriptive statistics (trimmed mean, mad, skew, kurtosis, se).
# The call is namespace-qualified because Hmisc, attached further down, also
# exports describe() and masks the psych version once loaded.
psych::describe(salary)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45

Let’s see how many people didn’t give out information

# Count, per column, the entries equal to the placeholder code 999.
# vapply() guarantees exactly one integer per column, and na.rm = TRUE keeps
# a stray NA from turning a column's whole count into NA.
vapply(salary, function(x) sum(x == 999, na.rm = TRUE), integer(1))
##      age      sex gmat_tot gmat_qpc gmat_vpc gmat_tpc    s_avg    f_avg 
##        0        0        0        0        0        0        0        0 
##  quarter work_yrs frstlang   salary    satis 
##        0        0        0       35        0

so 35 people didn’t provide data about their salaries

# Count, per column, the entries equal to the placeholder code 998.
# vapply() guarantees exactly one integer per column, and na.rm = TRUE keeps
# a stray NA from turning a column's whole count into NA.
vapply(salary, function(x) sum(x == 998, na.rm = TRUE), integer(1))
##      age      sex gmat_tot gmat_qpc gmat_vpc gmat_tpc    s_avg    f_avg 
##        0        0        0        0        0        0        0        0 
##  quarter work_yrs frstlang   salary    satis 
##        0        0        0       46       46

and 46 people didn’t respond

# Keep only the respondents with a usable salary entry, i.e. rows whose
# salary is neither of the placeholder codes 998 and 999.
sal <- subset(salary, !(salary == 999 | salary == 998))
#View(sal)
#plot.new()
# Put sal's columns on the search path so they can be referenced by bare
# name. The data frame called `salary` in the global environment still takes
# precedence over the attached `salary` column.
attach(sal)
## The following object is masked _by_ .GlobalEnv:
## 
##     salary
# Histogram of the sex code among respondents with a usable salary entry.
# The column is taken from `sal` explicitly instead of depending on attach()
# having placed it on the search path.
histogram(~sex, data = sal,
          main = "Comparison of No. of respondent males vs females",
          xlab = "sex(1 = male             2 = female)")

# Horizontal box plots of salary split by sex (1 = male, 2 = female).
# The title previously read "Comaprison".
boxplot(salary ~ sex, data = sal, horizontal = TRUE,
        col = c("lightblue4", "pink"), yaxt = "n",
        main = "Comparison of salary of males and females",
        xlab = "Salary", ylab = "Sex")
# The default group axis is suppressed above (yaxt = "n"); label the two
# groups by name instead of by code.
axis(side = 2, at = c(1, 2), labels = c("Male", "Female"))

So, there isn’t much difference between salaries of males and females.

Variation of GMAT Scores

# Distribution of total GMAT scores among respondents with a usable salary
# entry; the column is read from `sal` explicitly rather than via attach().
hist(sal$gmat_tot, col = "grey", main = "Variation in total GMAT scores",
     xlab = "Score(out of 800)")

GMAT Percentiles

# Three-panel figure: the total percentile spans the top row, with the
# quantitative and verbal percentiles side by side underneath.
layout(matrix(c(1, 1, 2, 3), 2, 2, byrow = TRUE))
hist(sal$gmat_tpc, col = "peachpuff", main = "GMAT Percentile(Total)",
     xlab = "Percentile")
hist(sal$gmat_qpc, col = "khaki", main = "GMAT Percentile(Quantitative)",
     xlab = "Percentile")
hist(sal$gmat_vpc, col = "lightblue1", main = "GMAT Percentile(Verbal)",
     xlab = "Percentile")
# Restore the single-panel layout so later figures are not squeezed into the
# cells of this grid.
layout(1)

Dependence of Verbal and Quantitative scores on native language

# Verbal percentile split by first language (1 = English, 2 = non-English).
# boxplot() starts its own frame, so the preceding plot.new() only produced
# an empty page and has been dropped; the data come from `sal` explicitly.
boxplot(gmat_vpc ~ frstlang, data = sal, horizontal = TRUE,
        col = c("turquoise", "lightgreen"), yaxt = "n",
        main = "Variation of Verbal score with native language",
        xlab = "Score", ylab = "Native language")
# Replace the suppressed group axis with readable labels.
axis(side = 2, at = c(1, 2), labels = c("English", "Non-English"))

# Quantitative percentile split by first language (1 = English,
# 2 = non-English). boxplot() starts its own frame, so the preceding
# plot.new() only produced an empty page and has been dropped; the data come
# from `sal` explicitly.
boxplot(gmat_qpc ~ frstlang, data = sal, horizontal = TRUE,
        col = c("turquoise", "lightgreen"), yaxt = "n",
        main = "Variation of Quantitative score with native language",
        xlab = "Score", ylab = "Native language")
# Replace the suppressed group axis with readable labels.
axis(side = 2, at = c(1, 2), labels = c("English", "Non-English"))

As expected, people having English as their mother-tongue score better in verbal whereas the scores in Quantitative section are independent of mother tongue.

# Spring average (s_avg) against quarter. This uses the full `salary` data,
# so rows carrying the 998/999 salary codes are included.
xyplot(s_avg ~ quarter , data = salary)

# Same view for the fall average (f_avg).
xyplot(f_avg ~ quarter , data = salary)

**We see that there is not much variation in the fall average, so it is the performance in spring that separates people.**

Creating a data frame of only the placed people

# Graduates who reported an actual salary figure: requiring more than 1000
# leaves out the zero entries as well as the placeholder codes.
placed <- subset(sal, salary > 1000)
#View(placed)
summary(placed)
##       age             sex           gmat_tot      gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :500   Min.   :39.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580   1st Qu.:72.00  
##  Median :26.00   Median :1.000   Median :620   Median :82.00  
##  Mean   :26.78   Mean   :1.301   Mean   :616   Mean   :79.73  
##  3rd Qu.:28.00   3rd Qu.:2.000   3rd Qu.:655   3rd Qu.:89.00  
##  Max.   :40.00   Max.   :2.000   Max.   :720   Max.   :99.00  
##     gmat_vpc        gmat_tpc         s_avg           f_avg      
##  Min.   :30.00   Min.   :51.00   Min.   :2.200   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.00   1st Qu.:2.850   1st Qu.:2.915  
##  Median :81.00   Median :87.00   Median :3.100   Median :3.250  
##  Mean   :78.56   Mean   :84.52   Mean   :3.092   Mean   :3.091  
##  3rd Qu.:92.00   3rd Qu.:93.50   3rd Qu.:3.400   3rd Qu.:3.415  
##  Max.   :99.00   Max.   :99.00   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs        frstlang         salary      
##  Min.   :1.000   Min.   : 0.00   Min.   :1.000   Min.   : 64000  
##  1st Qu.:1.000   1st Qu.: 2.00   1st Qu.:1.000   1st Qu.: 95000  
##  Median :2.000   Median : 3.00   Median :1.000   Median :100000  
##  Mean   :2.262   Mean   : 3.68   Mean   :1.068   Mean   :103031  
##  3rd Qu.:3.000   3rd Qu.: 4.00   3rd Qu.:1.000   3rd Qu.:106000  
##  Max.   :4.000   Max.   :16.00   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :3.000  
##  1st Qu.:5.000  
##  Median :6.000  
##  Mean   :5.883  
##  3rd Qu.:6.000  
##  Max.   :7.000

Salary vs English

# Salary of placed graduates split by first language (1 = English,
# 2 = non-English). boxplot() starts its own frame, so the preceding
# plot.new() only produced an empty page and has been dropped.
boxplot(salary ~ frstlang, data = placed, horizontal = TRUE,
        col = c("turquoise", "lightgreen"), yaxt = "n",
        main = "Variation of salary with native language",
        xlab = "Salary", ylab = "Native language")
# Replace the suppressed group axis with readable labels.
axis(side = 2, at = c(1, 2), labels = c("English", "Non-English"))

How satisfied are the rich?

# Salary of placed graduates at each satisfaction rating. The axis label
# previously read "& being the highest", a shifted-key slip for "7" (the top
# rating observed among placed graduates).
boxplot(salary ~ satis, data = placed, horizontal = TRUE,
        xlab = "Salary", ylab = "Satisfaction(7 being the highest)")

Will salary grow as we age?

# Salary against age for placed graduates. Naming the columns in the formula
# and passing the data frame separately gives plain "age" / "salary" axis
# labels instead of the raw `placed$...` expressions.
scatterplot(salary ~ age, data = placed)

Effect of Educational factors on Salary

# Pairwise scatterplots of salary against the academic measures. Columns are
# chosen by name (the same six that positions 4-8 and 12 referred to) so the
# selection does not silently shift if the column order changes.
scatterplotMatrix(
  placed[, c("gmat_qpc", "gmat_vpc", "gmat_tpc", "s_avg", "f_avg", "salary")]
)

Effect of other factors

# Pairwise scatterplots of salary against the non-academic attributes.
# Columns are chosen by name (the same six that positions 1, 2 and 10-13
# referred to) so the selection does not silently shift if the column order
# changes.
scatterplotMatrix(
  placed[, c("age", "sex", "work_yrs", "frstlang", "salary", "satis")]
)

Correlation of Salary with Educational and Other factors

# Hmisc supplies rcorr(). No library path is hard-coded, so the package is
# found in whatever library the current R installation uses. Attaching it
# masks psych::describe.
library(Hmisc)
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
## 
##     describe
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
# Column sets are selected by name (same columns as the former positional
# indices) so they survive any reordering of the data frame.
edu <- placed[, c("gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
                  "s_avg", "f_avg", "quarter", "salary")]
# NOTE(review): gmat_tot is included here as well as in `edu`, while age is
# in neither set; confirm whether age was the intended column.
other <- placed[, c("sex", "gmat_tot", "work_yrs", "frstlang",
                    "salary", "satis")]
# rcorr() takes a matrix and returns the correlation matrix, the number of
# observations and the matrix of p-values.
c1 <- rcorr(as.matrix(edu))
c2 <- rcorr(as.matrix(other))
c1
##          gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter salary
## gmat_tot     1.00     0.67     0.78     0.97  0.17  0.12   -0.11  -0.09
## gmat_qpc     0.67     1.00     0.09     0.66  0.02  0.10    0.01   0.01
## gmat_vpc     0.78     0.09     1.00     0.78  0.16  0.02   -0.13  -0.14
## gmat_tpc     0.97     0.66     0.78     1.00  0.14  0.07   -0.10  -0.13
## s_avg        0.17     0.02     0.16     0.14  1.00  0.45   -0.84   0.10
## f_avg        0.12     0.10     0.02     0.07  0.45  1.00   -0.43  -0.11
## quarter     -0.11     0.01    -0.13    -0.10 -0.84 -0.43    1.00  -0.13
## salary      -0.09     0.01    -0.14    -0.13  0.10 -0.11   -0.13   1.00
## 
## n= 103 
## 
## 
## P
##          gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg  f_avg  quarter salary
## gmat_tot          0.0000   0.0000   0.0000   0.0824 0.2178 0.2875  0.3624
## gmat_qpc 0.0000            0.3415   0.0000   0.8767 0.3226 0.8991  0.8873
## gmat_vpc 0.0000   0.3415            0.0000   0.1095 0.8184 0.1954  0.1663
## gmat_tpc 0.0000   0.0000   0.0000            0.1603 0.4791 0.3171  0.1837
## s_avg    0.0824   0.8767   0.1095   0.1603          0.0000 0.0000  0.3065
## f_avg    0.2178   0.3226   0.8184   0.4791   0.0000        0.0000  0.2864
## quarter  0.2875   0.8991   0.1954   0.3171   0.0000 0.0000         0.1959
## salary   0.3624   0.8873   0.1663   0.1837   0.3065 0.2864 0.1959
# Print the correlations and p-values for the non-academic column set.
c2
##            sex gmat_tot work_yrs frstlang salary satis
## sex       1.00    -0.02    -0.09     0.08  -0.17 -0.09
## gmat_tot -0.02     1.00    -0.12    -0.13  -0.09  0.06
## work_yrs -0.09    -0.12     1.00     0.20   0.45  0.06
## frstlang  0.08    -0.13     0.20     1.00   0.27  0.09
## salary   -0.17    -0.09     0.45     0.27   1.00 -0.04
## satis    -0.09     0.06     0.06     0.09  -0.04  1.00
## 
## n= 103 
## 
## 
## P
##          sex    gmat_tot work_yrs frstlang salary satis 
## sex             0.8446   0.3536   0.4508   0.0932 0.3554
## gmat_tot 0.8446          0.2165   0.1850   0.3624 0.5159
## work_yrs 0.3536 0.2165            0.0469   0.0000 0.5273
## frstlang 0.4508 0.1850   0.0469            0.0064 0.3668
## salary   0.0932 0.3624   0.0000   0.0064          0.6879
## satis    0.3554 0.5159   0.5273   0.3668   0.6879

A trial for a fit model

# Linear model of salary on every available predictor, fitted on the placed
# graduates only. gmat_vpc used to be listed twice; R collapses repeated
# formula terms, so the fitted model is unchanged by removing the duplicate.
fit1 <- lm(salary ~ gmat_tot + frstlang + s_avg + f_avg + work_yrs + sex +
             gmat_vpc + gmat_qpc + gmat_tpc + age + satis + quarter,
           data = placed)
summary(fit1)
## 
## Call:
## lm(formula = salary ~ gmat_tot + frstlang + s_avg + f_avg + work_yrs + 
##     sex + gmat_vpc + gmat_vpc + gmat_qpc + gmat_tpc + age + satis + 
##     quarter, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -26489  -7983   -373   5923  70602 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 78005.66   52981.93   1.472   0.1444  
## gmat_tot       16.19     178.85   0.090   0.9281  
## frstlang     7719.42    7373.27   1.047   0.2979  
## s_avg        -931.53    8240.31  -0.113   0.9102  
## f_avg       -2222.82    3894.57  -0.571   0.5696  
## work_yrs      749.66    1135.90   0.660   0.5110  
## sex         -3584.07    3595.85  -0.997   0.3216  
## gmat_vpc      546.31     501.97   1.088   0.2794  
## gmat_qpc      796.55     496.78   1.603   0.1123  
## gmat_tpc    -1457.09     714.94  -2.038   0.0445 *
## age          1750.65    1130.92   1.548   0.1251  
## satis       -1086.54    2157.76  -0.504   0.6158  
## quarter     -2336.56    2721.89  -0.858   0.3929  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15430 on 90 degrees of freedom
## Multiple R-squared:  0.3422, Adjusted R-squared:  0.2545 
## F-statistic: 3.902 on 12 and 90 DF,  p-value: 8.086e-05

Determining the most significant factors

# leaps supplies regsubsets(); no library path is hard-coded.
library(leaps)
# Best-subset search over the same predictors as the full model, keeping the
# single best model of each size (nbest = 1). The response is named as a
# column of `data` rather than as placed$salary, and the duplicated gmat_vpc
# term is gone.
leap1 <- regsubsets(salary ~ gmat_tot + frstlang + s_avg + f_avg + work_yrs +
                      sex + gmat_vpc + gmat_qpc + gmat_tpc + age + satis +
                      quarter,
                    nbest = 1, data = placed)
# Rank the candidate subsets by adjusted R-squared.
plot(leap1, scale = "adjr2")

So, the best fit model excludes the `gmat_tot`, `s_avg`, `f_avg` and `satis` variables

A best fit regression model

# Reduced linear model without gmat_tot, s_avg, f_avg and satis. gmat_vpc
# used to be listed twice; R collapses repeated formula terms, so the fitted
# model is unchanged by removing the duplicate.
fit2 <- lm(salary ~ frstlang + work_yrs + sex + gmat_vpc + gmat_qpc +
             gmat_tpc + age + quarter,
           data = placed)
summary(fit2)
## 
## Call:
## lm(formula = salary ~ frstlang + work_yrs + sex + gmat_vpc + 
##     gmat_vpc + gmat_qpc + gmat_tpc + age + quarter, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -26192  -8279   -497   5867  70294 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  66157.6    28278.0   2.340   0.0214 *
## frstlang      7385.1     7051.4   1.047   0.2976  
## work_yrs       809.9     1112.7   0.728   0.4685  
## sex          -3740.1     3436.2  -1.088   0.2792  
## gmat_vpc       572.0      354.1   1.616   0.1095  
## gmat_qpc       824.8      352.1   2.342   0.0213 *
## gmat_tpc     -1451.7      685.9  -2.117   0.0369 *
## age           1755.4     1099.8   1.596   0.1138  
## quarter      -1824.8     1381.1  -1.321   0.1896  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15160 on 94 degrees of freedom
## Multiple R-squared:  0.337,  Adjusted R-squared:  0.2806 
## F-statistic: 5.974 on 8 and 94 DF,  p-value: 3.463e-06

Visualisation of the Coefficients

# coefplot supplies coefplot(); no library path is hard-coded.
library(coefplot)
# Plot the coefficient estimates of the reduced model. The intercept is left
# out so that it does not dominate the scale.
coefplot(fit2, intercept = FALSE)
## Warning: Ignoring unknown aesthetics: xmin, xmax

Results

  • Salary depends negatively on sex because it is seen that salary of males is higher than females and we have taken 1 for males and 2 for females, so salary goes down as “sex” goes up.
  • Similarly, we see that if you belong to a lower quarter (3-4), then chances are high that your salary is lower than that of someone in a higher quarter (1-2), so as quarter increases salary decreases.
  • Also, the trend is that the more time you work for, higher is your salary

Now, comparison between the placed and the not placed people

Creating a data frame of the people who were not placed

# Respondents whose reported salary is exactly 0.
notplaced <- sal[which(sal$salary == 0), ]
# Left disabled like the other View() calls in this script: the interactive
# data viewer has no place in a rendered report.
#View(notplaced)

Various T-Tests are run to determine the fields with significant difference in the placed and not placed population

Quarter(MBA Performance)

# Welch two-sample t-test (the t.test() default) comparing the mean of
# `quarter` between the placed and not-placed groups.
t.test(placed$quarter , notplaced$quarter)
## 
##  Welch Two Sample t-test
## 
## data:  placed$quarter and notplaced$quarter
## t = -1.7872, df = 189.39, p-value = 0.0755
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.59388972  0.02927267
## sample estimates:
## mean of x mean of y 
##  2.262136  2.544444

GMAT Performance

  • Total Score
# Welch two-sample t-test on the total GMAT score, placed vs. not placed.
t.test(placed$gmat_tot , notplaced$gmat_tot)
## 
##  Welch Two Sample t-test
## 
## data:  placed$gmat_tot and notplaced$gmat_tot
## t = 0.20321, df = 170.77, p-value = 0.8392
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -14.69189  18.06406
## sample estimates:
## mean of x mean of y 
##  616.0194  614.3333
  • Total Percentile
# Welch two-sample t-test on the overall GMAT percentile, placed vs. not placed.
t.test(placed$gmat_tpc , notplaced$gmat_tpc)
## 
##  Welch Two Sample t-test
## 
## data:  placed$gmat_tpc and notplaced$gmat_tpc
## t = 1.119, df = 155.27, p-value = 0.2649
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.710571  6.181337
## sample estimates:
## mean of x mean of y 
##  84.52427  82.28889
  • MBA Performance
# Welch two-sample t-test on the sum of the fall and spring averages
# (f_avg + s_avg), placed vs. not placed.
t.test((placed$f_avg + placed$s_avg) , (notplaced$f_avg + notplaced$s_avg))
## 
##  Welch Two Sample t-test
## 
## data:  (placed$f_avg + placed$s_avg) and (notplaced$f_avg + notplaced$s_avg)
## t = 0.78225, df = 178.77, p-value = 0.4351
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1364781  0.3157467
## sample estimates:
## mean of x mean of y 
##  6.183301  6.093667
  • Years of Experience
# Welch two-sample t-test on years of work experience, placed vs. not placed.
t.test((placed$work_yrs) , (notplaced$work_yrs))
## 
##  Welch Two Sample t-test
## 
## data:  (placed$work_yrs) and (notplaced$work_yrs)
## t = -1.6778, df = 156.44, p-value = 0.09538
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.9797552  0.1612007
## sample estimates:
## mean of x mean of y 
##  3.679612  4.588889
  • Age
# Welch two-sample t-test on age, placed vs. not placed.
t.test((placed$age) , (notplaced$age))
## 
##  Welch Two Sample t-test
## 
## data:  (placed$age) and (notplaced$age)
## t = -2.8289, df = 150.8, p-value = 0.005307
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.9457989 -0.5230252
## sample estimates:
## mean of x mean of y 
##  26.77670  28.51111
  • Satisfaction with MBA
# Welch two-sample t-test on the satisfaction rating, placed vs. not placed.
t.test((placed$satis) , (notplaced$satis))
## 
##  Welch Two Sample t-test
## 
## data:  (placed$satis) and (notplaced$satis)
## t = 2.3757, df = 189.69, p-value = 0.01851
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.04433331 0.47821254
## sample estimates:
## mean of x mean of y 
##  5.883495  5.622222
  • Mother tongue
# Welch two-sample t-test on the first-language code, placed vs. not placed.
# frstlang is a 1/2 code, so the means compared here are means of that code.
t.test((placed$frstlang) , (notplaced$frstlang))
## 
##  Welch Two Sample t-test
## 
## data:  (placed$frstlang) and (notplaced$frstlang)
## t = -0.53486, df = 179.13, p-value = 0.5934
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.09813827  0.05628282
## sample estimates:
## mean of x mean of y 
##  1.067961  1.088889

Using T-Tests, we have the parameters(Age , Satisfaction and Quarter) against which we have to conduct a logistic regression

We create a new column telling whether a person was placed or not

# Binary placement indicator: 1 when the reported salary is non-zero, 0 when
# it is zero. A single comparison-and-cast replaces the earlier round trip
# through logical and numeric recoding and gives the same integer column.
sal$pl <- as.integer(sal$salary != 0)

The Logit Model using the significant looking factors

# Logistic regression of placement on the candidate predictors singled out
# above. The formula names every variable it needs, so the whole data frame
# can be passed without selecting columns by position.
model <- glm(pl ~ age + quarter + satis + work_yrs + gmat_tpc,
             family = binomial(link = "logit"), data = sal)
summary(model)
## 
## Call:
## glm(formula = pl ~ age + quarter + satis + work_yrs + gmat_tpc, 
##     family = binomial(link = "logit"), data = sal[, c(1:11, 13, 
##         14)])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9510  -1.1631   0.7745   1.0619   1.8749  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  2.87063    2.46195   1.166   0.2436  
## age         -0.19327    0.08117  -2.381   0.0173 *
## quarter     -0.28599    0.14398  -1.986   0.0470 *
## satis        0.43376    0.20494   2.116   0.0343 *
## work_yrs     0.10858    0.09051   1.200   0.2303  
## gmat_tpc     0.00401    0.01187   0.338   0.7354  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 266.68  on 192  degrees of freedom
## Residual deviance: 246.92  on 187  degrees of freedom
## AIC: 258.92
## 
## Number of Fisher Scoring iterations: 4

The lower the AIC and higher the difference b/w Null Deviance and Residual Deviance , the better

Determining correlation b/w placement and various other factors

# Sequential analysis of deviance: each term's chi-square test is conditional
# on the terms listed before it in the formula.
anova(model , test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: pl
## 
## Terms added sequentially (first to last)
## 
## 
##          Df Deviance Resid. Df Resid. Dev Pr(>Chi)   
## NULL                       192     266.68            
## age       1   8.4714       191     258.21 0.003608 **
## quarter   1   4.1662       190     254.04 0.041238 * 
## satis     1   5.6202       189     248.42 0.017755 * 
## work_yrs  1   1.3901       188     247.03 0.238381   
## gmat_tpc  1   0.1151       187     246.91 0.734391   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

To find out how well your model fits, see the McFadden R2

# pscl supplies pR2(); no library path is hard-coded, so the package is found
# in whatever library the current R installation uses.
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
# Log-likelihoods of the fitted and null models plus pseudo-R2 measures,
# including McFadden's.
pR2(model)
##           llh       llhNull            G2      McFadden          r2ML 
## -123.45771948 -133.33925034   19.76306171    0.07410819    0.09733094 
##          r2CU 
##    0.12997175

Since the fit is not so good, we try to fit it using all the factors available

The Logit Model using all the factors

# Logistic regression of placement on every remaining column. `salary` has to
# be left out because `pl` is computed directly from it. It is excluded by
# name, so the model does not depend on the columns' positions.
model2 <- glm(pl ~ ., family = binomial(link = "logit"),
              data = sal[, setdiff(names(sal), "salary")])
summary(model2)
## 
## Call:
## glm(formula = pl ~ ., family = binomial(link = "logit"), data = sal[, 
##     c(1:11, 13, 14)])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0419  -1.1439   0.7517   1.0238   1.9665  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  4.56015    4.57567   0.997   0.3190  
## age         -0.19812    0.08562  -2.314   0.0207 *
## sex          0.14396    0.36089   0.399   0.6900  
## gmat_tot    -0.01027    0.01289  -0.797   0.4257  
## gmat_qpc    -0.02177    0.04606  -0.473   0.6364  
## gmat_vpc    -0.02058    0.04401  -0.468   0.6401  
## gmat_tpc     0.08833    0.06492   1.361   0.1736  
## s_avg        0.24369    0.67258   0.362   0.7171  
## f_avg       -0.09871    0.36617  -0.270   0.7875  
## quarter     -0.23624    0.21132  -1.118   0.2636  
## work_yrs     0.10479    0.09386   1.116   0.2642  
## frstlang     0.31939    0.64213   0.497   0.6189  
## satis        0.42640    0.21308   2.001   0.0454 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 266.68  on 192  degrees of freedom
## Residual deviance: 242.92  on 180  degrees of freedom
## AIC: 268.92
## 
## Number of Fisher Scoring iterations: 5

Determining correlation b/w placement and all the other factors

# Sequential analysis of deviance for the all-predictor model: each term's
# chi-square test is conditional on the terms entered before it.
anova(model2 , test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: pl
## 
## Terms added sequentially (first to last)
## 
## 
##          Df Deviance Resid. Df Resid. Dev Pr(>Chi)   
## NULL                       192     266.68            
## age       1   8.4714       191     258.21 0.003608 **
## sex       1   0.3867       190     257.82 0.534052   
## gmat_tot  1   0.0156       189     257.81 0.900742   
## gmat_qpc  1   0.0186       188     257.79 0.891625   
## gmat_vpc  1   1.3430       187     256.44 0.246503   
## gmat_tpc  1   4.0117       186     252.43 0.045185 * 
## s_avg     1   2.0591       185     250.37 0.151303   
## f_avg     1   0.2676       184     250.10 0.604976   
## quarter   1   1.2924       183     248.81 0.255602   
## work_yrs  1   1.6279       182     247.19 0.201987   
## frstlang  1   0.1575       181     247.03 0.691446   
## satis     1   4.1033       180     242.92 0.042799 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

To find out how well your model fits, see the McFadden R2

# pscl supplies pR2(); no library path is hard-coded.
library(pscl)
# Log-likelihoods and pseudo-R2 measures (including McFadden's) for the
# all-predictor model.
pR2(model2)
##          llh      llhNull           G2     McFadden         r2ML 
## -121.4618563 -133.3392503   23.7547880    0.0890765    0.1158087 
##         r2CU 
##    0.1546461

We see that :

  • AIC increases

  • Difference b/w Null and Residual Deviances increases

  • McFadden R2 increases

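So, this model fits slightly better in terms of deviance and McFadden R2, although its higher AIC (268.92 vs. 258.92) means the extra predictors do not justify the added complexity. To conclude, out of the given factors, Age, Satisfaction and Quarter (MBA Performance) are the ones we can use to determine whether a person is placed or not.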