This is an analysis of the Harvard Business case study "MBA Starting Salaries". Here, we analyze the factors associated with differences in the starting salaries of MBA students. The dataset contains the following columns: age, sex, gmat_tot, gmat_qpc, gmat_vpc, gmat_tpc, s_avg, f_avg, quarter, work_yrs, frstlang, salary, satis.

TASK 2A: UNDERSTANDING THE DATASET AND THE RELATIONSHIP BETWEEN VARIOUS VARIABLES WITH RESPECT TO SALARY.

# Read the raw MBA starting-salaries dataset.
# NOTE(review): the working directory is a machine-specific absolute path; it is
# kept because later chunks read the CSV relative to it.

setwd("C:/Users/GOWRI/Desktop/iim_internship/week_4/Mini_project")
# `header` is spelled out in full instead of relying on partial matching of `head`.
MBAdata <- read.csv("MBA_Starting_Salaries_Data.csv", header = TRUE, sep = ",")
View(MBAdata)

# Students with a reported salary: drop the 998/999 codes used for missing or
# unanswered salary data. Rows with salary 0 (unplaced students) are kept.
MBAdataFull <- subset(MBAdata, salary != 998 & salary != 999)

# Placed students only: additionally drop the rows with salary 0.
MBAdataRef <- subset(MBAdataFull, salary != 0)

Analysis of the dataset and subsets.

library(psych)

# Descriptive statistics for the placed students. The reported columns are
# selected by name rather than by position, so the table stays correct even if
# describe() changes the order of its output columns.
summaryOfMBAdata <- describe(MBAdataRef)
MBAdataSum <- summaryOfMBAdata[c("mean", "sd", "median", "min", "max")]
MBAdataSum
##               mean       sd   median     min    max
## age          26.78     3.27 2.60e+01    22.0     40
## sex           1.30     0.46 1.00e+00     1.0      2
## gmat_tot    616.02    50.69 6.20e+02   500.0    720
## gmat_qpc     79.73    13.39 8.20e+01    39.0     99
## gmat_vpc     78.56    16.14 8.10e+01    30.0     99
## gmat_tpc     84.52    11.01 8.70e+01    51.0     99
## s_avg         3.09     0.38 3.10e+00     2.2      4
## f_avg         3.09     0.49 3.25e+00     0.0      4
## quarter       2.26     1.12 2.00e+00     1.0      4
## work_yrs      3.68     3.01 3.00e+00     0.0     16
## frstlang      1.07     0.25 1.00e+00     1.0      2
## salary   103030.74 17868.80 1.00e+05 64000.0 220000
## satis         5.88     0.78 6.00e+00     3.0      7
# Summary of the raw MBA salaries dataset (all rows, before any filtering, so
# the 998/999 placeholder codes still enter the statistics).
summary(MBAdata)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0
# Summary of the placed students only (rows with salary 998, 999 or 0 removed).
summary(MBAdataRef)
##       age             sex           gmat_tot      gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :500   Min.   :39.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580   1st Qu.:72.00  
##  Median :26.00   Median :1.000   Median :620   Median :82.00  
##  Mean   :26.78   Mean   :1.301   Mean   :616   Mean   :79.73  
##  3rd Qu.:28.00   3rd Qu.:2.000   3rd Qu.:655   3rd Qu.:89.00  
##  Max.   :40.00   Max.   :2.000   Max.   :720   Max.   :99.00  
##     gmat_vpc        gmat_tpc         s_avg           f_avg      
##  Min.   :30.00   Min.   :51.00   Min.   :2.200   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.00   1st Qu.:2.850   1st Qu.:2.915  
##  Median :81.00   Median :87.00   Median :3.100   Median :3.250  
##  Mean   :78.56   Mean   :84.52   Mean   :3.092   Mean   :3.091  
##  3rd Qu.:92.00   3rd Qu.:93.50   3rd Qu.:3.400   3rd Qu.:3.415  
##  Max.   :99.00   Max.   :99.00   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs        frstlang         salary      
##  Min.   :1.000   Min.   : 0.00   Min.   :1.000   Min.   : 64000  
##  1st Qu.:1.000   1st Qu.: 2.00   1st Qu.:1.000   1st Qu.: 95000  
##  Median :2.000   Median : 3.00   Median :1.000   Median :100000  
##  Mean   :2.262   Mean   : 3.68   Mean   :1.068   Mean   :103031  
##  3rd Qu.:3.000   3rd Qu.: 4.00   3rd Qu.:1.000   3rd Qu.:106000  
##  Max.   :4.000   Max.   :16.00   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :3.000  
##  1st Qu.:5.000  
##  Median :6.000  
##  Mean   :5.883  
##  3rd Qu.:6.000  
##  Max.   :7.000
# Summary of all students with a reported salary (rows with salary 998 or 999
# removed; rows with salary 0, i.e. unplaced students, retained).
summary(MBAdataFull)
##       age             sex          gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.00   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.00   1st Qu.:570.0   1st Qu.:72.00  
##  Median :27.00   Median :1.00   Median :610.0   Median :82.00  
##  Mean   :27.59   Mean   :1.28   Mean   :615.2   Mean   :79.35  
##  3rd Qu.:29.00   3rd Qu.:2.00   3rd Qu.:650.0   3rd Qu.:91.00  
##  Max.   :48.00   Max.   :2.00   Max.   :760.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc         s_avg           f_avg      
##  Min.   :22.00   Min.   : 0.00   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:75.00   1st Qu.:2.800   1st Qu.:2.750  
##  Median :81.00   Median :87.00   Median :3.090   Median :3.000  
##  Mean   :78.13   Mean   :83.48   Mean   :3.064   Mean   :3.078  
##  3rd Qu.:91.00   3rd Qu.:93.00   3rd Qu.:3.300   3rd Qu.:3.330  
##  Max.   :99.00   Max.   :99.00   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.000   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median : 85000  
##  Mean   :2.394   Mean   : 4.104   Mean   :1.078   Mean   : 54985  
##  3rd Qu.:3.000   3rd Qu.: 5.000   3rd Qu.:1.000   3rd Qu.:100000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :3.000  
##  1st Qu.:5.000  
##  Median :6.000  
##  Mean   :5.762  
##  3rd Qu.:6.000  
##  Max.   :7.000

Effect of Age on salary.

library(lattice)

# Distribution of salary at each age, placed students only.
boxplot(salary ~ age, data = MBAdataRef,
        main = "Effect of Age on salary.",
        xlab = "Age",
        ylab = "salary")

# Salary tends to increase with age (r is about 0.50 in the correlation matrix computed below).

Effect of Gender on salary.

library(lattice)

# Distribution of salary for each value of `sex`, placed students only.
boxplot(salary ~ sex, data = MBAdataRef,
        main = "Effect of gender on salary.",
        xlab = "Sex",
        ylab = "salary")

# There is not much effect of gender on salary.

Effect of work years on Salary

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Salary against years of work experience, placed students only.
scatterplot(salary ~ work_yrs, data = MBAdataRef,
            main = "Scatterplot of Salary vs. work years ",
            xlab = "Work years",
            ylab = "salary",
            pch = 19,
            spread = FALSE,
            smoother.args = list(lty = 2))

# There is a moderate positive correlation (r is about 0.45) between the number of work years and the salary.

Effect of first Language on salary.

library(lattice)

# Distribution of salary for each value of `frstlang`, placed students only.
boxplot(salary ~ frstlang, data = MBAdataRef,
        main = "Effect of first Language on salary.",
        xlab = "First Language",
        ylab = "salary")

# There is no significant correlation between the first language and the salary.

Effect of satisfaction on salary.

library(lattice)

# Distribution of salary at each satisfaction score, placed students only.
boxplot(salary ~ satis, data = MBAdataRef,
        main = "Effect of satisfaction on salary.",
        xlab = "Satisfaction",
        ylab = "salary")

# There is no significant correlation between satisfaction and salary.

Effect of GMAT on Salary.

library(car)

# Pairwise scatterplots of salary and the four GMAT columns, placed students.
scatterplotMatrix(~ salary + gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc,
                  data = MBAdataRef,
                  cex = 0.6,
                  main = " Effect of GMAT on Salary.")

Effect of spring_MBA, Fall_MBA and quartile ranking on Salary.

library(car)

# Pairwise scatterplots of salary, s_avg, f_avg and quarter, placed students.
scatterplotMatrix(~ salary + s_avg + f_avg + quarter,
                  data = MBAdataRef,
                  cex = 0.6,
                  main = " Effect of spring_MBA, Fall_MBA and quartile ranking on Salary.")

Draw a Corrgram

library("corrgram") 
## Warning: replacing previous import by 'magrittr::%>%' when loading
## 'dendextend'
# Corrgram of every column of the placed-students data, in the original column
# order: panel.shade in the lower triangle, panel.pie in the upper triangle.
corrgram(MBAdataRef,
         main = "Corrgram of MBA data variables",
         order = FALSE,
         text.panel = panel.txt,
         lower.panel = panel.shade,
         upper.panel = panel.pie)

Create a Variance-Covariance Matrix.

# Pairwise covariances of all columns for the placed students. The enclosing
# parentheses display the matrix as it is assigned.
(covMBAdatRef <- cov(MBAdataRef))
##                    age           sex      gmat_tot      gmat_qpc
## age         10.7045498 -2.164477e-01 -1.305445e+01   -7.22796497
## sex         -0.2164477  2.124500e-01 -4.568818e-01   -0.90757662
## gmat_tot   -13.0544451 -4.568818e-01  2.569294e+03  452.14258519
## gmat_qpc    -7.2279650 -9.075766e-01  4.521426e+02  179.18027794
## gmat_vpc     0.9505045  3.974872e-01  6.386360e+02   20.45849990
## gmat_tpc    -3.4602132 -2.377689e-01  5.393623e+02   97.03607462
## s_avg        0.1938587  1.409575e-02  3.299562e+00    0.07838473
## f_avg       -0.3462517  3.725395e-02  3.027432e+00    0.64252142
## quarter     -0.4604988 -1.104131e-02 -6.005140e+00    0.18960594
## work_yrs     8.6728536 -1.281173e-01 -1.873882e+01   -7.36245955
## frstlang     0.2898344  8.756901e-03 -1.687607e+00    0.04806777
## salary   29210.5193223 -1.369577e+03 -8.212449e+04 3382.43784504
## satis        0.2776509 -3.321911e-02  2.570912e+00   -0.04178565
##               gmat_vpc      gmat_tpc        s_avg         f_avg
## age       9.505045e-01 -3.460213e+00   0.19385875 -3.462517e-01
## sex       3.974872e-01 -2.377689e-01   0.01409575  3.725395e-02
## gmat_tot  6.386360e+02  5.393623e+02   3.29956215  3.027432e+00
## gmat_qpc  2.045850e+01  9.703607e+01   0.07838473  6.425214e-01
## gmat_vpc  2.606602e+02  1.393882e+02   0.96945936  1.803303e-01
## gmat_tpc  1.393882e+02  1.211342e+02   0.58062916  3.785056e-01
## s_avg     9.694594e-01  5.806292e-01   0.14325138  8.231046e-02
## f_avg     1.803303e-01  3.785056e-01   0.08231046  2.378638e-01
## quarter  -2.325528e+00 -1.227013e+00  -0.35620503 -2.356492e-01
## work_yrs -1.366838e+00 -4.389206e+00   0.18604797 -3.176271e-01
## frstlang -8.915858e-01 -4.575481e-01  -0.01319912 -6.243099e-03
## salary   -3.964803e+04 -2.596339e+04 688.02042071 -9.241129e+02
## satis     1.879973e+00  1.002856e+00  -0.04256901 -4.498382e-02
##                quarter      work_yrs      frstlang        salary
## age      -4.604988e-01     8.6728536  2.898344e-01  2.921052e+04
## sex      -1.104131e-02    -0.1281173  8.756901e-03 -1.369577e+03
## gmat_tot -6.005140e+00   -18.7388159 -1.687607e+00 -8.212449e+04
## gmat_qpc  1.896059e-01    -7.3624595  4.806777e-02  3.382438e+03
## gmat_vpc -2.325528e+00    -1.3668380 -8.915858e-01 -3.964803e+04
## gmat_tpc -1.227013e+00    -4.3892062 -4.575481e-01 -2.596339e+04
## s_avg    -3.562050e-01     0.1860480 -1.319912e-02  6.880204e+02
## f_avg    -2.356492e-01    -0.3176271 -6.243099e-03 -9.241129e+02
## quarter   1.254140e+00    -0.4347992  3.102989e-02 -2.571117e+03
## work_yrs -4.347992e-01     9.0630116  1.494384e-01  2.445820e+04
## frstlang  3.102989e-02     0.1494384  6.396345e-02  1.206714e+03
## salary   -2.571117e+03 24458.1995050  1.206714e+03  3.192940e+08
## satis     1.975062e-01     0.1485818  1.779935e-02 -5.606583e+02
##                  satis
## age         0.27765087
## sex        -0.03321911
## gmat_tot    2.57091186
## gmat_qpc   -0.04178565
## gmat_vpc    1.87997335
## gmat_tpc    1.00285551
## s_avg      -0.04256901
## f_avg      -0.04498382
## quarter     0.19750619
## work_yrs    0.14858176
## frstlang    0.01779935
## salary   -560.65829050
## satis       0.61374453
# Rescale the covariance matrix into the corresponding correlation matrix and
# display it.
(covCorrMBAdataRef <- cov2cor(covMBAdatRef))
##                  age         sex    gmat_tot     gmat_qpc    gmat_vpc
## age       1.00000000 -0.14352927 -0.07871678 -0.165039057  0.01799420
## sex      -0.14352927  1.00000000 -0.01955548 -0.147099027  0.05341428
## gmat_tot -0.07871678 -0.01955548  1.00000000  0.666382266  0.78038546
## gmat_qpc -0.16503906 -0.14709903  0.66638227  1.000000000  0.09466541
## gmat_vpc  0.01799420  0.05341428  0.78038546  0.094665411  1.00000000
## gmat_tpc -0.09609156 -0.04686981  0.96680810  0.658650025  0.78443167
## s_avg     0.15654954  0.08079985  0.17198874  0.015471662  0.15865101
## f_avg    -0.21699191  0.16572186  0.12246257  0.098418869  0.02290167
## quarter  -0.12568145 -0.02139041 -0.10578964  0.012648346 -0.12862079
## work_yrs  0.88052470 -0.09233003 -0.12280018 -0.182701263 -0.02812182
## frstlang  0.35026743  0.07512009 -0.13164323  0.014198516 -0.21835333
## salary    0.49964284 -0.16628869 -0.09067141  0.014141299 -0.13743230
## satis     0.10832308 -0.09199534  0.06474206 -0.003984632  0.14863481
##             gmat_tpc       s_avg       f_avg     quarter    work_yrs
## age      -0.09609156  0.15654954 -0.21699191 -0.12568145  0.88052470
## sex      -0.04686981  0.08079985  0.16572186 -0.02139041 -0.09233003
## gmat_tot  0.96680810  0.17198874  0.12246257 -0.10578964 -0.12280018
## gmat_qpc  0.65865003  0.01547166  0.09841887  0.01264835 -0.18270126
## gmat_vpc  0.78443167  0.15865101  0.02290167 -0.12862079 -0.02812182
## gmat_tpc  1.00000000  0.13938500  0.07051391 -0.09955033 -0.13246963
## s_avg     0.13938500  1.00000000  0.44590413 -0.84038355  0.16328236
## f_avg     0.07051391  0.44590413  1.00000000 -0.43144819 -0.21633018
## quarter  -0.09955033 -0.84038355 -0.43144819  1.00000000 -0.12896722
## work_yrs -0.13246963  0.16328236 -0.21633018 -0.12896722  1.00000000
## frstlang -0.16437561 -0.13788905 -0.05061394  0.10955726  0.19627277
## salary   -0.13201783  0.10173175 -0.10603897 -0.12848526  0.45466634
## satis     0.11630842 -0.14356557 -0.11773304  0.22511985  0.06299926
##             frstlang      salary        satis
## age       0.35026743  0.49964284  0.108323083
## sex       0.07512009 -0.16628869 -0.091995338
## gmat_tot -0.13164323 -0.09067141  0.064742057
## gmat_qpc  0.01419852  0.01414130 -0.003984632
## gmat_vpc -0.21835333 -0.13743230  0.148634805
## gmat_tpc -0.16437561 -0.13201783  0.116308417
## s_avg    -0.13788905  0.10173175 -0.143565573
## f_avg    -0.05061394 -0.10603897 -0.117733043
## quarter   0.10955726 -0.12848526  0.225119851
## work_yrs  0.19627277  0.45466634  0.062999256
## frstlang  1.00000000  0.26701953  0.089834769
## salary    0.26701953  1.00000000 -0.040050600
## satis     0.08983477 -0.04005060  1.000000000

TASK 2b: WHO GOT HOW MUCH SALARY?

# Task: articulate a hypothesis that can be tested with a regression model, run
# the appropriate t-tests for it, and fit a linear regression model using lm().

# Null hypothesis: "Mean starting salary does not differ by gender."
# Two-sample t-test comparing mean salary between the two `sex` groups
# (placed students only, because MBAdataRef is used).

t.test(MBAdataRef$salary ~ MBAdataRef$sex)
## 
##  Welch Two Sample t-test
## 
## data:  MBAdataRef$salary by MBAdataRef$sex
## t = 1.3628, df = 38.115, p-value = 0.1809
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3128.55 16021.72
## sample estimates:
## mean in group 1 mean in group 2 
##       104970.97        98524.39
# As p-value > 0.05, we fail to reject the null hypothesis: there is no statistically significant difference between the mean starting salaries of male and female MBAs.
# Null hypothesis: "Mean starting salary does not differ by first language."
# Two-sample t-test comparing mean salary between the two `frstlang` groups
# (placed students only, because MBAdataRef is used).

t.test(MBAdataRef$salary ~ MBAdataRef$frstlang)
## 
##  Welch Two Sample t-test
## 
## data:  MBAdataRef$salary by MBAdataRef$frstlang
## t = -1.1202, df = 6.0863, p-value = 0.3049
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -59933.62  22202.25
## sample estimates:
## mean in group 1 mean in group 2 
##        101748.6        120614.3
#  As p-value > 0.05, we fail to reject the null hypothesis: there is no statistically significant difference between the mean starting salaries of MBAs whose first language is English and those of other MBAs.

Comparison of various regression models that predict starting salary.

# Model 1: salary of placed students regressed on age, the four GMAT columns
# (gmat_tot, gmat_qpc, gmat_vpc, gmat_tpc), quarter, s_avg, f_avg, satis and
# frstlang. work_yrs and sex are not part of this model.
MbaSalModel1<-lm(salary ~ age +gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc + quarter + s_avg+ f_avg +  satis + frstlang , data = MBAdataRef)
summary(MbaSalModel1)
## 
## Call:
## lm(formula = salary ~ age + gmat_tot + gmat_qpc + gmat_vpc + 
##     gmat_tpc + quarter + s_avg + f_avg + satis + frstlang, data = MBAdataRef)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -24137  -8244   -490   5313  68756 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 64622.144  49262.731   1.312   0.1929    
## age          2501.003    559.182   4.473  2.2e-05 ***
## gmat_tot        8.337    177.818   0.047   0.9627    
## gmat_qpc      827.849    491.659   1.684   0.0956 .  
## gmat_vpc      530.807    498.305   1.065   0.2896    
## gmat_tpc    -1436.428    711.446  -2.019   0.0464 *  
## quarter     -2647.810   2692.668  -0.983   0.3280    
## s_avg       -1805.530   8145.604  -0.222   0.8251    
## f_avg       -2741.535   3852.548  -0.712   0.4785    
## satis        -925.938   2140.124  -0.433   0.6663    
## frstlang     5156.619   6934.452   0.744   0.4590    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15370 on 92 degrees of freedom
## Multiple R-squared:  0.3328, Adjusted R-squared:  0.2603 
## F-statistic: 4.589 on 10 and 92 DF,  p-value: 2.778e-05
# Model 2: reduced version of Model 1 that keeps age, gmat_tot, quarter, satis
# and frstlang, and drops gmat_qpc, gmat_vpc, gmat_tpc, s_avg and f_avg.
MbaSalModel2<-lm(salary ~ age +gmat_tot + quarter +satis 
                 + frstlang , data = MBAdataRef)
summary(MbaSalModel2)
## 
## Call:
## lm(formula = salary ~ age + gmat_tot + quarter + satis + frstlang, 
##     data = MBAdataRef)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -28366  -9128   -892   5055  76836 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 50088.40   25880.10   1.935   0.0559 .  
## age          2487.04     517.40   4.807  5.6e-06 ***
## gmat_tot      -14.65      31.11  -0.471   0.6386    
## quarter     -1119.24    1462.40  -0.765   0.4459    
## satis       -1856.87    2056.96  -0.903   0.3689    
## frstlang     8269.35    6644.88   1.244   0.2163    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15600 on 97 degrees of freedom
## Multiple R-squared:  0.2748, Adjusted R-squared:  0.2374 
## F-statistic: 7.352 on 5 and 97 DF,  p-value: 7.014e-06
# Model 3: Model 1 without quarter, s_avg and f_avg; it keeps age, the four GMAT
# columns, satis and frstlang.
MbaSalModel3<-lm(salary ~ age +gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc + satis 
                 + frstlang , data = MBAdataRef)
summary(MbaSalModel3)
## 
## Call:
## lm(formula = salary ~ age + gmat_tot + gmat_qpc + gmat_vpc + 
##     gmat_tpc + satis + frstlang, data = MBAdataRef)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -27442  -9074    -26   5449  65805 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 51877.83   47122.04   1.101   0.2737    
## age          2720.36     507.19   5.364 5.73e-07 ***
## gmat_tot      -27.96     162.79  -0.172   0.8640    
## gmat_qpc      841.99     471.63   1.785   0.0774 .  
## gmat_vpc      567.02     477.05   1.189   0.2376    
## gmat_tpc    -1309.36     699.44  -1.872   0.0643 .  
## satis       -1688.22    2036.28  -0.829   0.4091    
## frstlang     4176.03    6703.97   0.623   0.5348    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15270 on 95 degrees of freedom
## Multiple R-squared:  0.3198, Adjusted R-squared:  0.2697 
## F-statistic:  6.38 on 7 and 95 DF,  p-value: 3.716e-06
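# Judging by the adjusted R-squared (0.2697, versus 0.2603 for Model 1 and 0.2374 for Model 2), the third model fits best; note that the multiple R-squared is highest for Model 1 (0.3328).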

Comparison of subsets of data having placed and unplaced students.

# Compare students with a job (placed) against students without one (unplaced).

# Keep every student with a reported salary, then flag placement:
# job is 0 when salary is zero and 1 otherwise.
MBAdataFull <- subset(MBAdata, salary != 998 & salary != 999)
MBAdataFull$job <- ifelse(MBAdataFull$salary == 0, 0, 1)
View(MBAdataFull)

# Null hypothesis: gender and placement are independent.
# Chi-squared test of independence: cross-tabulate `sex` against the `job` flag
# and show the counts together with their row and column totals.
mbadataschi<-xtabs(~sex + job,data = MBAdataFull)
addmargins(mbadataschi)
##      job
## sex     0   1 Sum
##   1    67  72 139
##   2    23  31  54
##   Sum  90 103 193
# Pearson chi-squared test on the sex-by-job contingency table built above.
chisq.test(mbadataschi)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mbadataschi
## X-squared = 0.29208, df = 1, p-value = 0.5889
# As p > 0.05, we fail to reject the null hypothesis: there is no evidence that gender and placement are dependent.

# Null hypothesis: first language and placement are independent.
# Chi-squared test of independence: cross-tabulate `frstlang` against the `job`
# flag and show the counts together with their row and column totals.
mbadataschi1<-xtabs(~frstlang + job, data = MBAdataFull)
addmargins(mbadataschi1)
##         job
## frstlang   0   1 Sum
##      1    82  96 178
##      2     8   7  15
##      Sum  90 103 193
# Pearson chi-squared test on the frstlang-by-job contingency table built above.
chisq.test(mbadataschi1)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mbadataschi1
## X-squared = 0.074127, df = 1, p-value = 0.7854
# As p > 0.05, we fail to reject the null hypothesis: there is no evidence that first language and placement are dependent.

LOGISTIC REGRESSION TO PREDICT WHETHER THE STUDENT WILL GET PLACED OR NOT.

# Re-read the raw dataset for the placement model (the file is resolved
# relative to the current working directory).
# `header` is spelled out in full instead of relying on partial matching of `head`.
MBATrainingdata <- read.csv("MBA_Starting_Salaries_Data.csv",
                            header = TRUE, sep = ",")
View(MBATrainingdata)

# Drop the rows whose salary is coded 998 or 999, then add the placement flag:
# job is 0 when salary is zero (unplaced) and 1 otherwise (placed).
MBATrainingdataFull <- subset(MBATrainingdata, salary != 998 & salary != 999)
MBATrainingdataFull$job <- ifelse(MBATrainingdataFull$salary == 0, 0, 1)

# Keep the predictors for the placement model plus the `job` outcome, selected
# by name instead of by column position.
# `salary` is deliberately left out: `job` is defined as salary != 0, so using
# salary as a predictor leaks the outcome into the model (perfect separation,
# a glm fit that does not converge, and a meaningless accuracy/AUC of 1).
# As before, gmat_qpc, gmat_vpc and gmat_tpc are not selected.
Data <- subset(
  MBATrainingdataFull,
  select = c(age, sex, gmat_tot, s_avg, f_avg, quarter, work_yrs, frstlang,
             satis, job)
)
View(Data)

# Ordered hold-out split: the first 160 rows train the model and every remaining
# row is held out for testing. The end of the test range comes from nrow(Data)
# instead of a hard-coded 193, so no rows are silently dropped (and no all-NA
# rows are created) if the number of filtered rows changes.
# NOTE(review): rows are taken in file order; if the file is sorted in any
# meaningful way, a random split would be more appropriate.
n_train <- 160
stopifnot(nrow(Data) > n_train)
train <- Data[seq_len(n_train), ]
test <- Data[seq(n_train + 1, nrow(Data)), ]

# Logistic regression (binomial family, logit link) of the `job` flag on every
# other column of `train`, used to predict whether a student gets placed.
# NOTE(review): the warnings recorded for this fit (non-convergence, fitted
# probabilities of 0 or 1) are the usual sign of perfect separation - confirm
# that no column of `train` determines `job` on its own.
LRmodel <- glm(job ~.,family=binomial(link='logit'),data=train)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficient table and deviance summary of the fitted logistic model.
summary(LRmodel)
## 
## Call:
## glm(formula = job ~ ., family = binomial(link = "logit"), data = train)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -1.083e-05  -1.354e-06   2.110e-08   2.110e-08   1.671e-05  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.235e+01  1.291e+06   0.000    1.000
## age          3.776e-02  1.660e+04   0.000    1.000
## sex          2.759e+00  6.361e+04   0.000    1.000
## gmat_tot    -9.237e-03  5.430e+02   0.000    1.000
## s_avg       -1.328e-02  2.842e+05   0.000    1.000
## f_avg       -2.992e-01  9.099e+04   0.000    1.000
## quarter      1.701e-01  9.579e+04   0.000    1.000
## work_yrs    -5.826e-01  2.222e+04   0.000    1.000
## frstlang    -2.021e+00  2.808e+05   0.000    1.000
## salary       5.840e-04  6.096e-01   0.001    0.999
## satis        1.429e-01  3.472e+04   0.000    1.000
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2.2141e+02  on 159  degrees of freedom
## Residual deviance: 1.3566e-09  on 149  degrees of freedom
## AIC: 22
## 
## Number of Fisher Scoring iterations: 25
# Analysis of deviance: terms are added one at a time in formula order and each
# addition is assessed with a chi-squared test.
anova(LRmodel, test="Chisq")
## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: job
## 
## Terms added sequentially (first to last)
## 
## 
##          Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                       159     221.41              
## age       1    4.148       158     217.26 0.0416846 *  
## sex       1    0.028       157     217.23 0.8682202    
## gmat_tot  1    0.074       156     217.16 0.7859236    
## s_avg     1   14.565       155     202.59 0.0001354 ***
## f_avg     1    0.082       154     202.51 0.7746748    
## quarter   1    0.482       153     202.03 0.4874415    
## work_yrs  1    0.819       152     201.21 0.3654938    
## frstlang  1    0.020       151     201.19 0.8870032    
## salary    1  201.190       150       0.00 < 2.2e-16 ***
## satis     1    0.000       149       0.00 0.9999987    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Score the held-out rows: predicted placement probabilities are cut at 0.5 to
# obtain 0/1 class labels, which are then compared with the observed job flag.
fitted.results <- ifelse(
  predict(LRmodel, newdata = test, type = "response") > 0.5, 1, 0
)
misClasificError <- mean(test$job != fitted.results)
print(paste("Accuracy", 1 - misClasificError))
## [1] "Accuracy 1"
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.2.5
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.2.5
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# ROC curve and AUC for the held-out rows.
# ROC analysis needs the continuous predicted probabilities: feeding it the 0/1
# labels obtained by thresholding at 0.5 collapses the curve to a single
# operating point and misreports the AUC. The probabilities are therefore
# recomputed here instead of reusing the thresholded `fitted.results`.
test_prob <- predict(LRmodel, newdata = test, type = "response")
pr <- prediction(test_prob, test$job)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

# Area under the ROC curve, extracted from the performance object.
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 1