This is an analysis of the Harvard Business case study "MBA Starting Salaries". Here, we analyze the factors that influence the starting salaries of MBA students. The dataset contains the following columns: age, sex, gmat_tot, gmat_qpc, gmat_vpc, gmat_tpc, s_avg, f_avg, quarter, work_yrs, frstlang, salary, satis.
# Reading the dataset.
# The original author's project folder is used when it exists; on any other
# machine the script falls back to the current working directory instead of
# aborting inside setwd().
project_dir <- "C:/Users/GOWRI/Desktop/iim_internship/week_4/Mini_project"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
data_file <- "MBA_Starting_Salaries_Data.csv"
if (!file.exists(data_file)) {
  stop("Cannot find '", data_file, "' in ", getwd(), call. = FALSE)
}
MBAdata <- read.csv(file = data_file, header = TRUE, sep = ",")
# View() needs an interactive data viewer, so skip it in batch / knit runs.
if (interactive()) {
  View(MBAdata)
}
# Salary values 998 and 999 are placeholder codes for missing / unanswered
# data, and a salary of 0 marks an unplaced student.
salary_missing_codes <- c(998, 999)
salary_reported <- !is.na(MBAdata$salary) &
  !(MBAdata$salary %in% salary_missing_codes)
# Reading the dataset without the missing or unanswered data and unplaced students.
MBAdataRef <- MBAdata[salary_reported & MBAdata$salary != 0, ]
# Reading the dataset without the missing or unanswered data.
MBAdataFull <- MBAdata[salary_reported, ]
library(psych)
summaryOfMBAdata <- describe(MBAdataRef)
# Keep only the headline statistics, selected by name rather than by position
# so the selection does not depend on the column order of describe().
MBAdataSum <- summaryOfMBAdata[c("mean", "sd", "median", "min", "max")]
MBAdataSum
## mean sd median min max
## age 26.78 3.27 2.60e+01 22.0 40
## sex 1.30 0.46 1.00e+00 1.0 2
## gmat_tot 616.02 50.69 6.20e+02 500.0 720
## gmat_qpc 79.73 13.39 8.20e+01 39.0 99
## gmat_vpc 78.56 16.14 8.10e+01 30.0 99
## gmat_tpc 84.52 11.01 8.70e+01 51.0 99
## s_avg 3.09 0.38 3.10e+00 2.2 4
## f_avg 3.09 0.49 3.25e+00 0.0 4
## quarter 2.26 1.12 2.00e+00 1.0 4
## work_yrs 3.68 3.01 3.00e+00 0.0 16
## frstlang 1.07 0.25 1.00e+00 1.0 2
## salary 103030.74 17868.80 1.00e+05 64000.0 220000
## satis 5.88 0.78 6.00e+00 3.0 7
# Summary of the raw MBA salaries dataset.
# NOTE: the 998/999 placeholder codes are still present at this point, which
# distorts the salary and satis statistics below (e.g. satis has a maximum of 998).
summary(MBAdata)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
# Summary of the dataset restricted to placed students with a reported salary
# (rows with salary 998, 999 or 0 removed).
summary(MBAdataRef)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :500 Min. :39.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580 1st Qu.:72.00
## Median :26.00 Median :1.000 Median :620 Median :82.00
## Mean :26.78 Mean :1.301 Mean :616 Mean :79.73
## 3rd Qu.:28.00 3rd Qu.:2.000 3rd Qu.:655 3rd Qu.:89.00
## Max. :40.00 Max. :2.000 Max. :720 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :30.00 Min. :51.00 Min. :2.200 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.00 1st Qu.:2.850 1st Qu.:2.915
## Median :81.00 Median :87.00 Median :3.100 Median :3.250
## Mean :78.56 Mean :84.52 Mean :3.092 Mean :3.091
## 3rd Qu.:92.00 3rd Qu.:93.50 3rd Qu.:3.400 3rd Qu.:3.415
## Max. :99.00 Max. :99.00 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.00 Min. :1.000 Min. : 64000
## 1st Qu.:1.000 1st Qu.: 2.00 1st Qu.:1.000 1st Qu.: 95000
## Median :2.000 Median : 3.00 Median :1.000 Median :100000
## Mean :2.262 Mean : 3.68 Mean :1.068 Mean :103031
## 3rd Qu.:3.000 3rd Qu.: 4.00 3rd Qu.:1.000 3rd Qu.:106000
## Max. :4.000 Max. :16.00 Max. :2.000 Max. :220000
## satis
## Min. :3.000
## 1st Qu.:5.000
## Median :6.000
## Mean :5.883
## 3rd Qu.:6.000
## Max. :7.000
# Summary of the dataset without the missing or unanswered salaries
# (rows with salary 998 or 999 removed); unplaced students (salary 0) are kept.
summary(MBAdataFull)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.00 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.00 1st Qu.:570.0 1st Qu.:72.00
## Median :27.00 Median :1.00 Median :610.0 Median :82.00
## Mean :27.59 Mean :1.28 Mean :615.2 Mean :79.35
## 3rd Qu.:29.00 3rd Qu.:2.00 3rd Qu.:650.0 3rd Qu.:91.00
## Max. :48.00 Max. :2.00 Max. :760.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :22.00 Min. : 0.00 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:75.00 1st Qu.:2.800 1st Qu.:2.750
## Median :81.00 Median :87.00 Median :3.090 Median :3.000
## Mean :78.13 Mean :83.48 Mean :3.064 Mean :3.078
## 3rd Qu.:91.00 3rd Qu.:93.00 3rd Qu.:3.300 3rd Qu.:3.330
## Max. :99.00 Max. :99.00 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.000 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 85000
## Mean :2.394 Mean : 4.104 Mean :1.078 Mean : 54985
## 3rd Qu.:3.000 3rd Qu.: 5.000 3rd Qu.:1.000 3rd Qu.:100000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. :3.000
## 1st Qu.:5.000
## Median :6.000
## Mean :5.762
## 3rd Qu.:6.000
## Max. :7.000
library(lattice)
# Salary distribution for each age among placed students.
boxplot(
  salary ~ age,
  data = MBAdataRef,
  main = "Effect of Age on salary.",
  xlab = "Age",
  ylab = "salary"
)
# Salary tends to rise with age (the correlation matrix computed further down
# gives r of about 0.50 between age and salary).
library(lattice)
# Salary distribution for each value of the sex column.
boxplot(
  salary ~ sex,
  data = MBAdataRef,
  main = "Effect of gender on salary.",
  xlab = "Sex",
  ylab = "salary"
)
# Gender appears to have little effect on salary.
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Salary against years of work experience, with filled points (pch = 19) and a
# dashed smoother line.
scatterplot(
  salary ~ work_yrs,
  data = MBAdataRef,
  main = "Scatterplot of Salary vs. work years ",
  xlab = "Work years",
  ylab = "salary",
  pch = 19,
  spread = FALSE,
  smoother.args = list(lty = 2)
)
# Salary increases with the number of work years; the correlation matrix
# computed further down gives r of about 0.45, i.e. a moderate positive relationship.
library(lattice)
# Salary distribution for each value of the first-language column.
boxplot(
  salary ~ frstlang,
  data = MBAdataRef,
  main = "Effect of first Language on salary.",
  xlab = "First Language",
  ylab = "salary"
)
# The plot shows no clear relationship between the first language and the salary.
library(lattice)
# Salary distribution for each satisfaction score.
boxplot(
  salary ~ satis,
  data = MBAdataRef,
  main = "Effect of satisfaction on salary.",
  xlab = "Satisfaction",
  ylab = "salary"
)
# The plot shows no clear relationship between satisfaction and salary.
library(car)
# Pairwise scatterplots of salary and the four gmat_* columns.
scatterplotMatrix(
  ~ salary + gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc,
  data = MBAdataRef,
  cex = 0.6,
  main = " Effect of GMAT on Salary."
)
library(car)
# Pairwise scatterplots of salary, the s_avg / f_avg averages and quarter.
scatterplotMatrix(
  ~ salary + s_avg + f_avg + quarter,
  data = MBAdataRef,
  cex = 0.6,
  main = " Effect of spring_MBA, Fall_MBA and quartile ranking on Salary."
)
library("corrgram")
## Warning: replacing previous import by 'magrittr::%>%' when loading
## 'dendextend'
# Corrgram of every column of the placed-students data: shaded cells in the
# lower triangle, pie glyphs in the upper triangle, variable names on the
# diagonal; columns are kept in their original order.
corrgram(
  MBAdataRef,
  main = "Corrgram of MBA data variables",
  order = FALSE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt
)
# Covariance matrix of all columns (converted to correlations afterwards).
covMBAdatRef <- cov(MBAdataRef)
covMBAdatRef
## age sex gmat_tot gmat_qpc
## age 10.7045498 -2.164477e-01 -1.305445e+01 -7.22796497
## sex -0.2164477 2.124500e-01 -4.568818e-01 -0.90757662
## gmat_tot -13.0544451 -4.568818e-01 2.569294e+03 452.14258519
## gmat_qpc -7.2279650 -9.075766e-01 4.521426e+02 179.18027794
## gmat_vpc 0.9505045 3.974872e-01 6.386360e+02 20.45849990
## gmat_tpc -3.4602132 -2.377689e-01 5.393623e+02 97.03607462
## s_avg 0.1938587 1.409575e-02 3.299562e+00 0.07838473
## f_avg -0.3462517 3.725395e-02 3.027432e+00 0.64252142
## quarter -0.4604988 -1.104131e-02 -6.005140e+00 0.18960594
## work_yrs 8.6728536 -1.281173e-01 -1.873882e+01 -7.36245955
## frstlang 0.2898344 8.756901e-03 -1.687607e+00 0.04806777
## salary 29210.5193223 -1.369577e+03 -8.212449e+04 3382.43784504
## satis 0.2776509 -3.321911e-02 2.570912e+00 -0.04178565
## gmat_vpc gmat_tpc s_avg f_avg
## age 9.505045e-01 -3.460213e+00 0.19385875 -3.462517e-01
## sex 3.974872e-01 -2.377689e-01 0.01409575 3.725395e-02
## gmat_tot 6.386360e+02 5.393623e+02 3.29956215 3.027432e+00
## gmat_qpc 2.045850e+01 9.703607e+01 0.07838473 6.425214e-01
## gmat_vpc 2.606602e+02 1.393882e+02 0.96945936 1.803303e-01
## gmat_tpc 1.393882e+02 1.211342e+02 0.58062916 3.785056e-01
## s_avg 9.694594e-01 5.806292e-01 0.14325138 8.231046e-02
## f_avg 1.803303e-01 3.785056e-01 0.08231046 2.378638e-01
## quarter -2.325528e+00 -1.227013e+00 -0.35620503 -2.356492e-01
## work_yrs -1.366838e+00 -4.389206e+00 0.18604797 -3.176271e-01
## frstlang -8.915858e-01 -4.575481e-01 -0.01319912 -6.243099e-03
## salary -3.964803e+04 -2.596339e+04 688.02042071 -9.241129e+02
## satis 1.879973e+00 1.002856e+00 -0.04256901 -4.498382e-02
## quarter work_yrs frstlang salary
## age -4.604988e-01 8.6728536 2.898344e-01 2.921052e+04
## sex -1.104131e-02 -0.1281173 8.756901e-03 -1.369577e+03
## gmat_tot -6.005140e+00 -18.7388159 -1.687607e+00 -8.212449e+04
## gmat_qpc 1.896059e-01 -7.3624595 4.806777e-02 3.382438e+03
## gmat_vpc -2.325528e+00 -1.3668380 -8.915858e-01 -3.964803e+04
## gmat_tpc -1.227013e+00 -4.3892062 -4.575481e-01 -2.596339e+04
## s_avg -3.562050e-01 0.1860480 -1.319912e-02 6.880204e+02
## f_avg -2.356492e-01 -0.3176271 -6.243099e-03 -9.241129e+02
## quarter 1.254140e+00 -0.4347992 3.102989e-02 -2.571117e+03
## work_yrs -4.347992e-01 9.0630116 1.494384e-01 2.445820e+04
## frstlang 3.102989e-02 0.1494384 6.396345e-02 1.206714e+03
## salary -2.571117e+03 24458.1995050 1.206714e+03 3.192940e+08
## satis 1.975062e-01 0.1485818 1.779935e-02 -5.606583e+02
## satis
## age 0.27765087
## sex -0.03321911
## gmat_tot 2.57091186
## gmat_qpc -0.04178565
## gmat_vpc 1.87997335
## gmat_tpc 1.00285551
## s_avg -0.04256901
## f_avg -0.04498382
## quarter 0.19750619
## work_yrs 0.14858176
## frstlang 0.01779935
## salary -560.65829050
## satis 0.61374453
# Transform the covariance matrix into the corresponding correlation matrix,
# which puts every pair of variables on the same -1..1 scale.
covCorrMBAdataRef <- cov2cor(covMBAdatRef)
covCorrMBAdataRef
## age sex gmat_tot gmat_qpc gmat_vpc
## age 1.00000000 -0.14352927 -0.07871678 -0.165039057 0.01799420
## sex -0.14352927 1.00000000 -0.01955548 -0.147099027 0.05341428
## gmat_tot -0.07871678 -0.01955548 1.00000000 0.666382266 0.78038546
## gmat_qpc -0.16503906 -0.14709903 0.66638227 1.000000000 0.09466541
## gmat_vpc 0.01799420 0.05341428 0.78038546 0.094665411 1.00000000
## gmat_tpc -0.09609156 -0.04686981 0.96680810 0.658650025 0.78443167
## s_avg 0.15654954 0.08079985 0.17198874 0.015471662 0.15865101
## f_avg -0.21699191 0.16572186 0.12246257 0.098418869 0.02290167
## quarter -0.12568145 -0.02139041 -0.10578964 0.012648346 -0.12862079
## work_yrs 0.88052470 -0.09233003 -0.12280018 -0.182701263 -0.02812182
## frstlang 0.35026743 0.07512009 -0.13164323 0.014198516 -0.21835333
## salary 0.49964284 -0.16628869 -0.09067141 0.014141299 -0.13743230
## satis 0.10832308 -0.09199534 0.06474206 -0.003984632 0.14863481
## gmat_tpc s_avg f_avg quarter work_yrs
## age -0.09609156 0.15654954 -0.21699191 -0.12568145 0.88052470
## sex -0.04686981 0.08079985 0.16572186 -0.02139041 -0.09233003
## gmat_tot 0.96680810 0.17198874 0.12246257 -0.10578964 -0.12280018
## gmat_qpc 0.65865003 0.01547166 0.09841887 0.01264835 -0.18270126
## gmat_vpc 0.78443167 0.15865101 0.02290167 -0.12862079 -0.02812182
## gmat_tpc 1.00000000 0.13938500 0.07051391 -0.09955033 -0.13246963
## s_avg 0.13938500 1.00000000 0.44590413 -0.84038355 0.16328236
## f_avg 0.07051391 0.44590413 1.00000000 -0.43144819 -0.21633018
## quarter -0.09955033 -0.84038355 -0.43144819 1.00000000 -0.12896722
## work_yrs -0.13246963 0.16328236 -0.21633018 -0.12896722 1.00000000
## frstlang -0.16437561 -0.13788905 -0.05061394 0.10955726 0.19627277
## salary -0.13201783 0.10173175 -0.10603897 -0.12848526 0.45466634
## satis 0.11630842 -0.14356557 -0.11773304 0.22511985 0.06299926
## frstlang salary satis
## age 0.35026743 0.49964284 0.108323083
## sex 0.07512009 -0.16628869 -0.091995338
## gmat_tot -0.13164323 -0.09067141 0.064742057
## gmat_qpc 0.01419852 0.01414130 -0.003984632
## gmat_vpc -0.21835333 -0.13743230 0.148634805
## gmat_tpc -0.16437561 -0.13201783 0.116308417
## s_avg -0.13788905 0.10173175 -0.143565573
## f_avg -0.05061394 -0.10603897 -0.117733043
## quarter 0.10955726 -0.12848526 0.225119851
## work_yrs 0.19627277 0.45466634 0.062999256
## frstlang 1.00000000 0.26701953 0.089834769
## salary 0.26701953 1.00000000 -0.040050600
## satis 0.08983477 -0.04005060 1.000000000
# Articulate a hypothesis that can be tested with a regression model, run the appropriate t-tests to test the hypotheses, and fit a linear regression model using lm().
# Null hypothesis: "Mean starting salary does not depend on gender."
# Two-sample t-test comparing the mean salary of the two sex groups (a test of a difference in means, not of a correlation).
t.test(MBAdataRef$salary ~ MBAdataRef$sex)
##
## Welch Two Sample t-test
##
## data: MBAdataRef$salary by MBAdataRef$sex
## t = 1.3628, df = 38.115, p-value = 0.1809
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3128.55 16021.72
## sample estimates:
## mean in group 1 mean in group 2
## 104970.97 98524.39
# As p-value > 0.05, we fail to reject the null hypothesis: there is no statistically significant difference between the starting salaries of male and female MBAs.
# Null hypothesis: "Mean starting salary does not depend on first language."
# Two-sample t-test comparing the mean salary of the two first-language groups (a test of a difference in means, not of a correlation).
t.test(MBAdataRef$salary ~ MBAdataRef$frstlang)
##
## Welch Two Sample t-test
##
## data: MBAdataRef$salary by MBAdataRef$frstlang
## t = -1.1202, df = 6.0863, p-value = 0.3049
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -59933.62 22202.25
## sample estimates:
## mean in group 1 mean in group 2
## 101748.6 120614.3
# As p-value > 0.05, we fail to reject the null hypothesis: there is no statistically significant difference between the starting salaries of MBAs whose first language is English and the others.
# Model 1
# Salary regressed on age, the four gmat_* columns, quarter, s_avg, f_avg,
# satis and frstlang, fitted on the placed students only.
MbaSalModel1 <- lm(
  salary ~ age + gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc +
    quarter + s_avg + f_avg + satis + frstlang,
  data = MBAdataRef
)
summary(MbaSalModel1)
##
## Call:
## lm(formula = salary ~ age + gmat_tot + gmat_qpc + gmat_vpc +
## gmat_tpc + quarter + s_avg + f_avg + satis + frstlang, data = MBAdataRef)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24137 -8244 -490 5313 68756
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64622.144 49262.731 1.312 0.1929
## age 2501.003 559.182 4.473 2.2e-05 ***
## gmat_tot 8.337 177.818 0.047 0.9627
## gmat_qpc 827.849 491.659 1.684 0.0956 .
## gmat_vpc 530.807 498.305 1.065 0.2896
## gmat_tpc -1436.428 711.446 -2.019 0.0464 *
## quarter -2647.810 2692.668 -0.983 0.3280
## s_avg -1805.530 8145.604 -0.222 0.8251
## f_avg -2741.535 3852.548 -0.712 0.4785
## satis -925.938 2140.124 -0.433 0.6663
## frstlang 5156.619 6934.452 0.744 0.4590
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15370 on 92 degrees of freedom
## Multiple R-squared: 0.3328, Adjusted R-squared: 0.2603
## F-statistic: 4.589 on 10 and 92 DF, p-value: 2.778e-05
# Model 2
# Reduced model: salary regressed on age, gmat_tot, quarter, satis and
# frstlang, fitted on the placed students only.
MbaSalModel2 <- lm(
  salary ~ age + gmat_tot + quarter + satis + frstlang,
  data = MBAdataRef
)
summary(MbaSalModel2)
##
## Call:
## lm(formula = salary ~ age + gmat_tot + quarter + satis + frstlang,
## data = MBAdataRef)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28366 -9128 -892 5055 76836
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50088.40 25880.10 1.935 0.0559 .
## age 2487.04 517.40 4.807 5.6e-06 ***
## gmat_tot -14.65 31.11 -0.471 0.6386
## quarter -1119.24 1462.40 -0.765 0.4459
## satis -1856.87 2056.96 -0.903 0.3689
## frstlang 8269.35 6644.88 1.244 0.2163
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15600 on 97 degrees of freedom
## Multiple R-squared: 0.2748, Adjusted R-squared: 0.2374
## F-statistic: 7.352 on 5 and 97 DF, p-value: 7.014e-06
# Model 3
# Salary regressed on age, the four gmat_* columns, satis and frstlang
# (Model 1 without quarter, s_avg and f_avg), fitted on the placed students only.
MbaSalModel3 <- lm(
  salary ~ age + gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc +
    satis + frstlang,
  data = MBAdataRef
)
summary(MbaSalModel3)
##
## Call:
## lm(formula = salary ~ age + gmat_tot + gmat_qpc + gmat_vpc +
## gmat_tpc + satis + frstlang, data = MBAdataRef)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27442 -9074 -26 5449 65805
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51877.83 47122.04 1.101 0.2737
## age 2720.36 507.19 5.364 5.73e-07 ***
## gmat_tot -27.96 162.79 -0.172 0.8640
## gmat_qpc 841.99 471.63 1.785 0.0774 .
## gmat_vpc 567.02 477.05 1.189 0.2376
## gmat_tpc -1309.36 699.44 -1.872 0.0643 .
## satis -1688.22 2036.28 -0.829 0.4091
## frstlang 4176.03 6703.97 0.623 0.5348
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15270 on 95 degrees of freedom
## Multiple R-squared: 0.3198, Adjusted R-squared: 0.2697
## F-statistic: 6.38 on 7 and 95 DF, p-value: 3.716e-06
# Comparing the adjusted R-squared values (0.2697 for Model 3 versus 0.2603 for Model 1 and 0.2374 for Model 2), the third model looks the most accurate.
# Compare placed (job = 1) and unplaced (job = 0) students.
# Start again from the rows with a usable salary (998 / 999 codes removed).
MBAdataFull <- MBAdata[which(MBAdata$salary != 998 & MBAdata$salary != 999), ]
# A salary of 0 marks an unplaced student; build the 0/1 flag in one vectorised
# step instead of two partial assignments into a column that does not exist yet.
MBAdataFull$job <- as.numeric(MBAdataFull$salary != 0)
# View() needs an interactive data viewer, so skip it in batch / knit runs.
if (interactive()) {
  View(MBAdataFull)
}
# Null hypothesis: gender and placement are independent.
# Chi-squared test of independence on the sex x job contingency table.
mbadataschi<-xtabs(~sex + job,data = MBAdataFull)
addmargins(mbadataschi)
## job
## sex 0 1 Sum
## 1 67 72 139
## 2 23 31 54
## Sum 90 103 193
chisq.test(mbadataschi)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mbadataschi
## X-squared = 0.29208, df = 1, p-value = 0.5889
# As p > 0.05, we fail to reject the null hypothesis: there is no evidence that placement depends on gender.
# Null hypothesis: first language and placement are independent.
# Chi-squared test of independence on the frstlang x job contingency table.
mbadataschi1<-xtabs(~frstlang + job, data = MBAdataFull)
addmargins(mbadataschi1)
## job
## frstlang 0 1 Sum
## 1 82 96 178
## 2 8 7 15
## Sum 90 103 193
chisq.test(mbadataschi1)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mbadataschi1
## X-squared = 0.074127, df = 1, p-value = 0.7854
# As p > 0.05, we fail to reject the null hypothesis: there is no evidence that placement depends on first language.
# Reading the dataset
MBATrainingdata <- read.csv(
  file = "MBA_Starting_Salaries_Data.csv",
  header = TRUE,
  sep = ","
)
# View() needs an interactive data viewer, so skip it in batch / knit runs.
if (interactive()) {
  View(MBATrainingdata)
}
# Adding a column of placed and unplaced students.
# Rows with the 998 / 999 salary codes are dropped; a salary of 0 marks an
# unplaced student (job = 0), any other salary a placed one (job = 1).
MBATrainingdataFull <- MBATrainingdata[
  which(MBATrainingdata$salary != 998 & MBATrainingdata$salary != 999),
]
MBATrainingdataFull$job <- as.numeric(MBATrainingdataFull$salary != 0)
# Making a subset of the data with relevant columns.
# `salary` is deliberately NOT a predictor: `job` is computed from `salary`, so
# keeping it lets the model read the answer directly (perfect separation,
# non-convergence warnings and a meaningless accuracy of 1).
predictor_cols <- c(
  "age", "sex", "gmat_tot", "s_avg", "f_avg",
  "quarter", "work_yrs", "frstlang", "satis"
)
Data <- MBATrainingdataFull[, c(predictor_cols, "job")]
if (interactive()) {
  View(Data)
}
# Splitting into train and test data.
# The first 160 rows (in file order) train the model; all remaining rows are
# held out for testing.
n_train <- 160
stopifnot(nrow(Data) > n_train)
train <- Data[seq_len(n_train), ]
test <- Data[(n_train + 1):nrow(Data), ]
# Logistic regression Model on the job column, to predict whether the student will get the job or not based on the various factors selected.
LRmodel <- glm(job ~ ., family = binomial(link = "logit"), data = train)
# NOTE: the console output recorded below this point was produced while
# `salary` was still among the predictors; re-run the script to refresh it.
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficient table of the fitted logistic model.
# NOTE(review): in the output recorded below (a fit that lists `salary` among
# its coefficients) every standard error is enormous and every p-value is ~1,
# alongside the non-convergence warnings. Since `job` is derived from `salary`,
# this looks like perfect separation -- confirm before interpreting any coefficient.
summary(LRmodel)
##
## Call:
## glm(formula = job ~ ., family = binomial(link = "logit"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.083e-05 -1.354e-06 2.110e-08 2.110e-08 1.671e-05
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.235e+01 1.291e+06 0.000 1.000
## age 3.776e-02 1.660e+04 0.000 1.000
## sex 2.759e+00 6.361e+04 0.000 1.000
## gmat_tot -9.237e-03 5.430e+02 0.000 1.000
## s_avg -1.328e-02 2.842e+05 0.000 1.000
## f_avg -2.992e-01 9.099e+04 0.000 1.000
## quarter 1.701e-01 9.579e+04 0.000 1.000
## work_yrs -5.826e-01 2.222e+04 0.000 1.000
## frstlang -2.021e+00 2.808e+05 0.000 1.000
## salary 5.840e-04 6.096e-01 0.001 0.999
## satis 1.429e-01 3.472e+04 0.000 1.000
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2.2141e+02 on 159 degrees of freedom
## Residual deviance: 1.3566e-09 on 149 degrees of freedom
## AIC: 22
##
## Number of Fisher Scoring iterations: 25
# Sequential analysis of deviance: terms are added one at a time in formula
# order, so each chi-squared test is conditional on the terms listed before it.
anova(LRmodel, test="Chisq")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: job
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 159 221.41
## age 1 4.148 158 217.26 0.0416846 *
## sex 1 0.028 157 217.23 0.8682202
## gmat_tot 1 0.074 156 217.16 0.7859236
## s_avg 1 14.565 155 202.59 0.0001354 ***
## f_avg 1 0.082 154 202.51 0.7746748
## quarter 1 0.482 153 202.03 0.4874415
## work_yrs 1 0.819 152 201.21 0.3654938
## frstlang 1 0.020 151 201.19 0.8870032
## salary 1 201.190 150 0.00 < 2.2e-16 ***
## satis 1 0.000 149 0.00 0.9999987
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#predictive model
# Score the held-out rows and turn each predicted probability into a 0/1
# class label using a 0.5 cut-off.
fitted.results <- ifelse(
  predict(LRmodel, newdata = test, type = "response") > 0.5,
  1,
  0
)
# Share of test rows whose predicted label differs from the observed one.
misClasificError <- mean(fitted.results != test$job)
print(paste("Accuracy", 1 - misClasificError))
## [1] "Accuracy 1"
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.2.5
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.2.5
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# ROC curve and AUC on the held-out rows.
# A ROC curve sweeps the decision threshold, so it must be built from the
# continuous predicted probabilities. `fitted.results` has already been
# thresholded to 0/1 labels, which would collapse the curve to a single
# operating point, so the probabilities are recomputed here.
test_prob <- predict(LRmodel, newdata = test, type = "response")
pr <- prediction(test_prob, test$job)
# True-positive rate against false-positive rate.
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
# Area under the ROC curve, extracted from the performance object as a number.
auc_perf <- performance(pr, measure = "auc")
auc <- auc_perf@y.values[[1]]
auc
## [1] 1