Distribution of first language among students
# Frequency of each first-language code: 1 = English, 2 = Others.
table(surveyanswered$frstlang)
##
## 1 2
## 204 24
Excluding data of students who have not filled in the survey or did not report a salary
# Keep only the rows of MBA whose salary is not one of the sentinel codes
# 998 / 999. The codes are matched as numbers with %in% instead of being
# compared against the strings '998' / '999', and rows with a missing salary
# are dropped explicitly (which() used to drop them implicitly).
salaryknown <- MBA[!is.na(MBA$salary) & !(MBA$salary %in% c(998, 999)), ]
dim(salaryknown)
## [1] 193 13
View(salaryknown)
Variation of Salary among MBA students only whose salary are known
# Box plot of salary over every row of salaryknown, then the mean salary.
boxplot(
  salaryknown$salary,
  ylim = c(-100000, 200000),
  xlim = c(1, 1),
  main = "Variation of Salary Among Students whose salary are known",
  ylab = "Salary"
)

mean(salaryknown$salary)
## [1] 54985.32
Variation of Satisfaction Rate among MBA students
# Box plot of the satisfaction rating in surveyanswered, then its mean.
boxplot(
  surveyanswered$satis,
  ylim = c(2, 10),
  xlim = c(1, 1),
  main = "Variation of Satisfaction Rate Among MBA Students"
)

mean(surveyanswered$satis)
## [1] 5.565789
Scatter Plot showing relation in Salary and GMAT Total Scores
# GMAT total score (y axis) against salary (x axis).
with(
  salaryknown,
  plot(
    x = salary, y = gmat_tot,
    xlab = "Salary", ylab = "GMAT Total Score",
    main = "Scatter Plot showing relation in Salary and GMAT Total Scores"
  )
)

Scatter Plot showing relation in Salary and GMAT Quant Percentile
# GMAT quant percentile (y axis) against salary (x axis).
with(
  salaryknown,
  plot(
    x = salary, y = gmat_qpc,
    xlab = "Salary", ylab = "GMAT Quant Percentile",
    main = "Scatter Plot showing relation in Salary and GMAT Quant Percentile"
  )
)

Scatter Plot showing relation in Salary and GMAT Verbal Percentile
# GMAT verbal percentile (y axis) against salary (x axis).
with(
  salaryknown,
  plot(
    x = salary, y = gmat_vpc,
    xlab = "Salary", ylab = "GMAT Verbal Percentile",
    main = "Scatter Plot showing relation in Salary and GMAT Verbal Percentile"
  )
)

Scatter Plot showing relation in Salary and GMAT Total Percentile
# GMAT total percentile (y axis) against salary (x axis).
with(
  salaryknown,
  plot(
    x = salary, y = gmat_tpc,
    xlab = "Salary", ylab = "GMAT Total Percentile",
    main = "Scatter Plot showing relation in Salary and GMAT Total Percentile"
  )
)

Scatter Plot showing relation in Salary and Spring MBA Average
# Spring MBA average (y axis) against salary (x axis).
with(
  salaryknown,
  plot(
    x = salary, y = s_avg,
    xlab = "Salary", ylab = "Spring MBA Average",
    main = "Scatter Plot showing relation in Salary and Spring MBA Average"
  )
)

Scatter Plot showing relation in Salary and Fall MBA Average
# Fall MBA average (y axis) against salary (x axis).
with(
  salaryknown,
  plot(
    x = salary, y = f_avg,
    xlab = "Salary", ylab = "Fall MBA Average",
    main = "Scatter Plot showing relation in Salary and Fall MBA Average"
  )
)

Scatter Plot showing relation in Salary and Quartile Ranking
# Salary (y axis) against quartile ranking (x axis).
with(
  salaryknown,
  plot(
    x = quarter, y = salary,
    xlab = "Quartile Ranking", ylab = "Salary",
    main = "Scatter Plot showing relation in Salary and Quartile Ranking"
  )
)

Scatter Plot showing relation in Salary and Working Experience
# Salary (y axis) against years of work experience (x axis).
with(
  salaryknown,
  plot(
    x = work_yrs, y = salary,
    xlab = "Years of Working Experience", ylab = "Salary",
    main = "Scatter Plot showing relation in Salary and Working Experience"
  )
)

Scatter Plot showing relation in Salary and Students First Language
# Salary (y axis) against the first-language code (x axis).
with(
  salaryknown,
  plot(
    x = frstlang, y = salary,
    xlab = "First Language, 1=English, 2=Other", ylab = "Salary",
    main = "Scatter Plot showing relation in Salary and Students First Language"
  )
)

Scatter Plot showing relation in Salary and Students Satisfaction Rate
# Salary (y axis) against the satisfaction rating (x axis).
with(
  salaryknown,
  plot(
    x = satis, y = salary,
    xlab = "Satisfaction Rating", ylab = "Salary",
    main = "Scatter Plot showing relation in Salary and Students Satisfaction Rate"
  )
)

Correlation Table
# Correlation matrix of every column of salaryknown; "complete.obs" drops
# any row that contains an NA before the correlations are computed.
cor(salaryknown, use = "complete.obs")
## age sex gmat_tot gmat_qpc gmat_vpc
## age 1.000000000 -0.031876273 -1.256220e-01 -0.220590341 -0.006721674
## sex -0.031876273 1.000000000 -4.351109e-02 -0.167904888 0.099184398
## gmat_tot -0.125622047 -0.043511095 1.000000e+00 0.743099719 0.752906719
## gmat_qpc -0.220590341 -0.167904888 7.430997e-01 1.000000000 0.175497777
## gmat_vpc -0.006721674 0.099184398 7.529067e-01 0.175497777 1.000000000
## gmat_tpc -0.131681932 -0.012849186 8.791496e-01 0.690581939 0.688039929
## s_avg 0.164342257 0.073368077 1.435675e-01 0.019038162 0.190665307
## f_avg -0.034290725 0.042895288 1.010821e-01 0.130285115 0.033106093
## quarter -0.076614994 -0.086616877 -8.407099e-02 0.008601267 -0.139400223
## work_yrs 0.871679595 -0.023832548 -1.736909e-01 -0.241384675 -0.041357878
## frstlang 0.097619028 -0.008488358 -9.557089e-02 0.094537575 -0.295162826
## salary -0.130198680 0.018516965 -5.685962e-05 0.028391635 0.003389965
## satis -0.073500580 -0.061738773 7.981946e-02 -0.020006117 0.195134711
## gmat_tpc s_avg f_avg quarter work_yrs
## age -0.13168193 0.16434226 -0.034290725 -0.076614994 0.871679595
## sex -0.01284919 0.07336808 0.042895288 -0.086616877 -0.023832548
## gmat_tot 0.87914961 0.14356746 0.101082103 -0.084070990 -0.173690863
## gmat_qpc 0.69058194 0.01903816 0.130285115 0.008601267 -0.241384675
## gmat_vpc 0.68803993 0.19066531 0.033106093 -0.139400223 -0.041357878
## gmat_tpc 1.00000000 0.18894788 0.109811857 -0.128533421 -0.166139876
## s_avg 0.18894788 1.00000000 0.520554250 -0.735421726 0.159136628
## f_avg 0.10981186 0.52055425 1.000000000 -0.382421186 -0.047951357
## quarter -0.12853342 -0.73542173 -0.382421186 1.000000000 -0.126454286
## work_yrs -0.16613988 0.15913663 -0.047951357 -0.126454286 1.000000000
## frstlang -0.10789784 -0.12631935 -0.055830525 0.089504320 -0.002916547
## salary 0.06094464 0.09632412 0.008846655 -0.147257809 -0.053266846
## satis 0.13288434 -0.04639953 -0.114704819 0.067729421 -0.007722658
## frstlang salary satis
## age 0.097619028 -1.301987e-01 -0.073500580
## sex -0.008488358 1.851696e-02 -0.061738773
## gmat_tot -0.095570885 -5.685962e-05 0.079819458
## gmat_qpc 0.094537575 2.839164e-02 -0.020006117
## gmat_vpc -0.295162826 3.389965e-03 0.195134711
## gmat_tpc -0.107897839 6.094464e-02 0.132884339
## s_avg -0.126319350 9.632412e-02 -0.046399534
## f_avg -0.055830525 8.846655e-03 -0.114704819
## quarter 0.089504320 -1.472578e-01 0.067729421
## work_yrs -0.002916547 -5.326685e-02 -0.007722658
## frstlang 1.000000000 7.125825e-03 -0.135986251
## salary 0.007125825 1.000000e+00 0.156439455
## satis -0.135986251 1.564395e-01 1.000000000
Corrgram of variables
# Load corrgram from the default library search path. The previous call
# pinned lib.loc to one user's Windows R 3.4 library directory, which fails
# on any other machine or R version.
library(corrgram)
# Corrgram of salaryknown: shaded cells below the diagonal, pie glyphs above,
# with the variables reordered (order = TRUE).
corrgram(
  salaryknown,
  lower.panel = "panel.shade",
  upper.panel = "panel.pie",
  order = TRUE
)

Creating a data frame `placed` that contains all the students who are placed, i.e. whose salary is known and > 0
# placed: rows of MBA whose salary is neither a sentinel code (998, 999) nor
# 0. The values are matched as numbers with %in% instead of being compared
# against the strings '998' / '999' / '0'; rows with a missing salary are
# dropped explicitly (which() used to drop them implicitly).
placed <- MBA[!is.na(MBA$salary) & !(MBA$salary %in% c(0, 998, 999)), ]
dim(placed)
## [1] 103 13
View(placed)
Now modelling salary as a function of only the significant variables from the above regression, i.e. age, GMAT Quant Percentile, GMAT Total Percentile, Quartile Ranking and First Language
# Linear model of salary on age, GMAT quant percentile, GMAT total
# percentile, quartile ranking and first language, fitted on placed only.
fit <- lm(
  salary ~ age + gmat_qpc + gmat_tpc + quarter + frstlang,
  data = placed
)
summary(fit)
##
## Call:
## lm(formula = salary ~ age + gmat_qpc + gmat_tpc + quarter + frstlang,
## data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25457 -8804 -679 4434 73624
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38832.1 19296.8 2.012 0.0470 *
## age 2641.4 510.2 5.177 1.22e-06 ***
## gmat_qpc 355.5 155.3 2.289 0.0242 *
## gmat_tpc -423.9 189.2 -2.240 0.0274 *
## quarter -1657.6 1383.5 -1.198 0.2338
## frstlang 4402.0 6660.0 0.661 0.5102
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15220 on 97 degrees of freedom
## Multiple R-squared: 0.3099, Adjusted R-squared: 0.2743
## F-statistic: 8.71 on 5 and 97 DF, p-value: 7.51e-07
# Per-row residuals of the model currently stored in fit.
resid(fit)
## 35 36 37 38 39
## -6842.41226 -19775.94616 -11852.13138 -9345.99118 -13487.03737
## 40 41 42 43 44
## -4146.87982 432.00320 -580.73774 -4532.13518 -279.35436
## 45 46 47 48 49
## 170.48290 4025.72794 -3674.06248 -2040.29412 3031.91597
## 50 51 52 53 54
## -98.19145 2360.76003 -278.65380 -1969.41583 -12268.22429
## 55 56 57 58 59
## 726.03459 -8258.84371 112.56121 6415.38278 1393.38325
## 60 61 62 63 64
## -23008.20649 119.66834 9868.50824 -3866.55137 18465.84608
## 65 66 67 68 69
## 12227.14786 7410.17151 8517.20412 12096.87957 61084.59218
## 115 116 117 118 119
## -17259.75340 -8720.38506 -22582.70211 -4462.75785 -5433.15646
## 120 121 122 123 124
## 4718.22360 -12503.21418 6063.43534 -2676.55670 -16272.23668
## 125 126 127 128 129
## -15304.05099 4150.48715 -15460.02872 279.61494 -5423.60805
## 130 131 132 133 134
## -6024.76681 7230.74602 2688.98422 -21077.88008 3720.53216
## 135 136 137 138 139
## 3393.98305 18698.70334 15888.89465 29181.43018 48076.52092
## 186 187 188 189 190
## -11460.98108 -15134.41365 -8118.82068 -3237.42043 -6062.78410
## 191 192 193 194 195
## -12097.52458 -3366.18621 -7757.35340 7526.61334 10356.55704
## 196 197 198 199 200
## 1166.72773 -9353.20122 -9410.72784 -7964.05678 5109.47149
## 201 202 203 204 205
## 3562.66913 3730.85108 -5848.41599 -3821.54893 4763.81285
## 206 207 208 209 256
## 3741.99386 -9553.95248 7659.78845 17328.38310 -25457.07388
## 257 258 259 260 261
## -8887.19671 -9807.65111 -10287.81666 -11352.77328 -10144.72826
## 262 263 264 265 266
## -5569.73834 -695.77211 -5030.90272 3127.46446 8231.13668
## 267 268 269 270 271
## 1785.54091 -678.62417 760.08768 -19274.22963 17951.41429
## 272 273 274
## 11825.29562 25078.20488 73624.22237
We see that the R-squared value has dropped significantly, but there is not much reduction in the adjusted R-squared value
Now modelling salary as a function of only age, GMAT Quant Percentile and GMAT Total Percentile
# Reduced linear model: salary on age, GMAT quant percentile and GMAT total
# percentile, fitted on placed only. Overwrites the previous fit object.
fit <- lm(
  salary ~ age + gmat_qpc + gmat_tpc,
  data = placed
)
summary(fit)
##
## Call:
## lm(formula = salary ~ age + gmat_qpc + gmat_tpc, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27913 -8913 -660 4973 72402
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33975.5 18197.9 1.867 0.0649 .
## age 2836.8 466.4 6.082 2.24e-08 ***
## gmat_qpc 363.0 150.8 2.407 0.0179 *
## gmat_tpc -424.1 181.7 -2.333 0.0217 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15200 on 99 degrees of freedom
## Multiple R-squared: 0.2979, Adjusted R-squared: 0.2766
## F-statistic: 14 on 3 and 99 DF, p-value: 1.117e-07
# Per-row residuals of the model currently stored in fit.
resid(fit)
## 35 36 37 38 39
## -4190.23389 -18129.85104 -9763.63397 -7220.99720 -11855.77441
## 40 41 42 43 44
## -2386.88954 2737.99372 1762.65180 -2176.05420 1867.88785
## 45 46 47 48 49
## 2290.52015 6460.02399 -1350.49439 -233.38058 5060.50091
## 50 51 52 53 54
## 2081.26589 4390.63665 1670.22977 -660.41158 -11245.20937
## 55 56 57 58 59
## 2035.81728 -7214.28841 1764.71013 8538.43065 2879.75701
## 60 61 62 63 64
## -23547.15050 1753.87132 11507.18727 1169.87750 20309.25620
## 65 66 67 68 69
## 13670.38742 8593.29700 9628.30953 11371.46905 63091.90193
## 115 116 117 118 119
## -17024.49104 -8306.07228 -23130.24132 -4091.38466 -5004.20613
## 120 121 122 123 124
## 5599.45166 -11818.69643 6993.53277 -2017.22722 -11683.40545
## 125 126 127 128 129
## -11031.15947 4884.56127 -16065.53428 693.92772 -5459.30825
## 130 131 132 133 134
## -6095.86650 7488.45122 2772.13591 -22299.95223 3975.70742
## 135 136 137 138 139
## 3013.26265 19345.53600 16038.79107 29401.47116 48693.68926
## 186 187 188 189 190
## -11977.89434 -16753.74862 -9281.89426 -4376.49427 -7306.07228
## 191 192 193 194 195
## -13940.08783 -4925.99794 -9462.63816 6525.01262 9481.15369
## 196 197 198 199 200
## 47.76141 -6642.79331 -11240.52906 -10016.43186 3989.79732
## 201 202 203 204 205
## 2356.85057 2682.61315 -7886.73735 -5995.78652 3714.21264
## 206 207 208 209 256
## 1792.86007 -7717.50066 6072.57714 16230.56794 -27913.49061
## 257 258 259 260 261
## -11336.32137 -12758.11748 -13302.17918 -14418.85375 -13435.32870
## 262 263 264 265 266
## -8543.23140 -3552.35939 -8113.17763 523.50457 5614.09614
## 267 268 269 270 271
## -1324.45221 -4234.80864 -2387.37506 -18846.83822 15625.05460
## 272 273 274
## 8994.28583 22104.44662 72401.75896
We see that the multiple R-squared value has dropped slightly again (from 0.3099 to 0.2979) while the adjusted R-squared value is essentially unchanged (0.2743 vs. 0.2766), so we can conclude that the salary of placed students depends mostly on age, GMAT Quant Percentile and GMAT Total Percentile
Creating a data frame unplaced of students whose salary is equal to 0
# unplaced: rows of MBA whose salary is exactly 0. Salary is compared with
# the number 0 instead of the string '0'; rows with a missing salary are
# dropped explicitly (which() used to drop them implicitly).
unplaced <- MBA[!is.na(MBA$salary) & MBA$salary == 0, ]
dim(unplaced)
## [1] 90 13
View(unplaced)
Introducing a new variable status_placed that records whether a student is placed or not: 1 represents placed and 0 represents unplaced
# Flag each student as placed ("1") or unplaced ("0"). A student counts as
# placed when the salary is strictly positive, which matches how the placed
# and unplaced data frames are built (salary != 0 vs. salary == 0); the
# earlier "> 1" threshold did not agree with that definition.
salaryknown$status_placed <- ifelse(salaryknown$salary > 0, "1", "0")
Contingency table of placement status and sex
# Counts of placement status (rows) by sex code (columns).
table(salaryknown$status_placed, salaryknown$sex)
##
## 1 2
## 0 67 23
## 1 72 31
# Same table as proportions within each row (each placement status sums to 1).
prop.table(
  table(salaryknown$status_placed, salaryknown$sex),
  margin = 1
)
##
## 1 2
## 0 0.7444444 0.2555556
## 1 0.6990291 0.3009709
# Same table as proportions within each column (each sex code sums to 1).
prop.table(
  table(salaryknown$status_placed, salaryknown$sex),
  margin = 2
)
##
## 1 2
## 0 0.4820144 0.4259259
## 1 0.5179856 0.5740741
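Note that a higher p-value would indicate that there is no statistically significant evidence that placement and sex are related (no test output is shown for this table)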
Comparing Mean Age of all, placed and unplaced students
# Mean age of all students with a known salary, then of the placed ones
# (salary != 0), then of the unplaced ones (salary == 0). Salary is compared
# with the number 0 rather than the string "0".
mean(salaryknown$age)
## [1] 27.58549
mean(salaryknown$age[salaryknown$salary != 0])
## [1] 26.7767
mean(salaryknown$age[salaryknown$salary == 0])
## [1] 28.51111
So the average age of placed students is the lowest while the average age of unplaced students is the highest, which suggests that a lower age helps in placement
Comparing Mean GMAT Total of all, placed and unplaced students
# Mean GMAT total of all students with a known salary, then of the placed
# ones (salary != 0), then of the unplaced ones (salary == 0). Salary is
# compared with the number 0 rather than the string "0".
mean(salaryknown$gmat_tot)
## [1] 615.2332
mean(salaryknown$gmat_tot[salaryknown$salary != 0])
## [1] 616.0194
mean(salaryknown$gmat_tot[salaryknown$salary == 0])
## [1] 614.3333
So placed students have the highest average GMAT Total, followed by the overall average GMAT Total, with the unplaced students lowest. So the higher the GMAT Total, the better the chance of placement
Comparing Mean GMAT Quant Percentile of all, placed and unplaced students
# Mean GMAT quant percentile of all students with a known salary, then of the
# placed ones (salary != 0), then of the unplaced ones (salary == 0). Salary
# is compared with the number 0 rather than the string "0".
mean(salaryknown$gmat_qpc)
## [1] 79.34715
mean(salaryknown$gmat_qpc[salaryknown$salary != 0])
## [1] 79.72816
mean(salaryknown$gmat_qpc[salaryknown$salary == 0])
## [1] 78.91111
Again, placed students have the highest average GMAT Quant Percentile, followed by the overall average GMAT Quant Percentile, with the unplaced students lowest. So the higher the Quant Percentile, the better the chance of placement.
Comparing Mean GMAT Verbal Percentile of all, placed and unplaced students
# Mean GMAT verbal percentile of all students with a known salary, then of
# the placed ones (salary != 0), then of the unplaced ones (salary == 0).
# Salary is compared with the number 0 rather than the string "0".
mean(salaryknown$gmat_vpc)
## [1] 78.12953
mean(salaryknown$gmat_vpc[salaryknown$salary != 0])
## [1] 78.56311
mean(salaryknown$gmat_vpc[salaryknown$salary == 0])
## [1] 77.63333
Again, placed students have the highest average GMAT Verbal Percentile, followed by the overall average GMAT Verbal Percentile, with the unplaced students lowest. So the higher the Verbal Percentile, the better the chance of placement.
Comparing Mean GMAT Total Percentile of all, placed and unplaced students
# Mean GMAT total percentile over all rows of surveyanswered, then over the
# rows with salary != "0", then over the rows with salary == "0".
# NOTE(review): this block reads surveyanswered, whereas the neighbouring
# mean comparisons read salaryknown. If surveyanswered still contains rows
# carrying the 998 salary code, those rows are counted on the salary != "0"
# side here — TODO confirm, switch to salaryknown and regenerate the printed
# values.
mean(surveyanswered$gmat_tpc)
## [1] 83.61404
mean(surveyanswered$gmat_tpc[surveyanswered$salary != "0" ])
## [1] 84.47826
mean(surveyanswered$gmat_tpc[surveyanswered$salary == "0" ])
## [1] 82.28889
Again, placed students have the highest average GMAT Total Percentile, followed by the overall average GMAT Total Percentile, with the unplaced students lowest. So the higher the Total Percentile, the better the chance of placement.
Comparing Mean Spring Average of all, placed and unplaced students
# Mean spring average of all students with a known salary, then of the placed
# ones (salary != 0), then of the unplaced ones (salary == 0). Salary is
# compared with the number 0 rather than the string "0".
mean(salaryknown$s_avg)
## [1] 3.063834
mean(salaryknown$s_avg[salaryknown$salary != 0])
## [1] 3.09233
mean(salaryknown$s_avg[salaryknown$salary == 0])
## [1] 3.031222
Again, placed students have the highest Spring Average, followed by the overall Spring Average, with the unplaced students lowest. So the higher the Spring Average, the better the chance of placement.
Comparing Mean Fall Average of all, placed and unplaced students
# Mean fall average of all students with a known salary, then of the placed
# ones (salary != 0), then of the unplaced ones (salary == 0). Salary is
# compared with the number 0 rather than the string "0".
mean(salaryknown$f_avg)
## [1] 3.077668
mean(salaryknown$f_avg[salaryknown$salary != 0])
## [1] 3.090971
mean(salaryknown$f_avg[salaryknown$salary == 0])
## [1] 3.062444
Again, placed students have the highest Fall Average, followed by the overall Fall Average, with the unplaced students lowest. So the higher the Fall Average, the better the chance of placement.
Comparing Mean Quartile Ranking of all, placed and unplaced students
# Mean quartile ranking of all students with a known salary, then of the
# placed ones (salary != 0), then of the unplaced ones (salary == 0). Salary
# is compared with the number 0 rather than the string "0".
mean(salaryknown$quarter)
## [1] 2.393782
mean(salaryknown$quarter[salaryknown$salary != 0])
## [1] 2.262136
mean(salaryknown$quarter[salaryknown$salary == 0])
## [1] 2.544444
Again, placed students have the best (lowest) average quartile ranking, followed by the overall average ranking, with the unplaced students having the worst average quartile ranking. So the better the ranking, the better the chance of placement.
Comparing Mean Working Experience of all, placed and unplaced students
# Mean years of work experience of all students with a known salary, then of
# the placed ones (salary != 0), then of the unplaced ones (salary == 0).
# Salary is compared with the number 0 rather than the string "0".
mean(salaryknown$work_yrs)
## [1] 4.103627
mean(salaryknown$work_yrs[salaryknown$salary != 0])
## [1] 3.679612
mean(salaryknown$work_yrs[salaryknown$salary == 0])
## [1] 4.588889
Here placed students have the lowest average work experience, followed by the overall average, and unplaced students have the highest work experience, so less experience appears to help in getting placed.
Comparing the effect of first language on all, placed and unplaced students
# Cross-tabulate placement status (rows) against first-language code
# (columns) and print the table.
mytable <- xtabs(~ status_placed + frstlang, data = salaryknown)
print(mytable)
## frstlang
## status_placed 1 2
## 0 82 8
## 1 96 7
# Same table with row and column totals appended.
addmargins(mytable)
## frstlang
## status_placed 1 2 Sum
## 0 82 8 90
## 1 96 7 103
## Sum 178 15 193
# Chi-squared test of independence between placement status and first language.
chisq.test(mytable)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mytable
## X-squared = 0.074127, df = 1, p-value = 0.7854
The high p-value (0.7854) shows that there is no statistically significant evidence that first language has an effect on placement
Comparing satisfaction rate of all, placed and unplaced students
# Mean satisfaction rating of all students with a known salary, then of the
# placed ones (salary != 0), then of the unplaced ones (salary == 0). Salary
# is compared with the number 0 rather than the string "0".
mean(salaryknown$satis)
## [1] 5.761658
mean(salaryknown$satis[salaryknown$salary != 0])
## [1] 5.883495
mean(salaryknown$satis[salaryknown$salary == 0])
## [1] 5.622222
So placed students show the highest average satisfaction while unplaced students show the lowest average satisfaction
Assigning values to the variable status_placed and converting it into a factor variable
# Derive status_placed ("1" when salary is strictly positive, otherwise "0")
# and store it as a factor so it can serve as a binary response. The factor
# conversion is done once; the second, identical factor() call was redundant.
# "> 0" replaces "> 1" to agree with the salary == 0 definition of unplaced.
salaryknown$status_placed <- factor(
  ifelse(salaryknown$salary > 0, "1", "0")
)
Challenge
Running logistic regression on dataset salary known to predict the status of placement of student
# Modelling frame: ten predictors plus the status_placed response. Columns
# are picked by name instead of by position (1:6, 9:11, 13, 14), so the
# selection no longer depends on the column order of salaryknown.
data <- subset(
  salaryknown,
  select = c(
    age, sex, gmat_tot, gmat_qpc, gmat_vpc, gmat_tpc,
    quarter, work_yrs, frstlang, satis, status_placed
  )
)
# First 150 rows train the model; every remaining row is held out for testing
# (the end of the test range follows nrow(data) instead of a literal 193).
# NOTE(review): the split is sequential, not random. If the rows of
# salaryknown are ordered in any meaningful way, train and test will not be
# comparable — consider a seeded random split and regenerate the results.
train <- data[seq_len(150), ]
test <- data[151:nrow(data), ]
# Load glm2 from the default library search path. The previous call pinned
# lib.loc to one user's Windows R 3.4 library directory, which fails on any
# other machine or R version. Note that the model below is fitted with
# glm(), not glm2().
library(glm2)
# Logistic regression of placement status on every other column of train.
model <- glm(
  status_placed ~ .,
  family = binomial(link = "logit"),
  data = train
)
summary(model)
##
## Call:
## glm(formula = status_placed ~ ., family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9080 -1.1151 0.7427 1.0386 1.5244
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.625855 4.846925 0.748 0.4544
## age -0.143999 0.101420 -1.420 0.1557
## sex -0.099873 0.392916 -0.254 0.7994
## gmat_tot -0.004329 0.014667 -0.295 0.7679
## gmat_qpc -0.055383 0.052299 -1.059 0.2896
## gmat_vpc -0.047847 0.050862 -0.941 0.3469
## gmat_tpc 0.127970 0.078713 1.626 0.1040
## quarter -0.488002 0.224710 -2.172 0.0299 *
## work_yrs 0.037768 0.110118 0.343 0.7316
## frstlang -0.101547 0.692551 -0.147 0.8834
## satis 0.281737 0.235374 1.197 0.2313
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 206.64 on 149 degrees of freedom
## Residual deviance: 191.72 on 139 degrees of freedom
## AIC: 213.72
##
## Number of Fisher Scoring iterations: 4
ANOVA Test
# Sequential analysis-of-deviance table for the logistic model, with a
# chi-squared test on each added term.
anova(model, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: status_placed
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 149 206.64
## age 1 4.9691 148 201.67 0.02580 *
## sex 1 0.0012 147 201.66 0.97186
## gmat_tot 1 0.0452 146 201.62 0.83169
## gmat_qpc 1 0.1837 145 201.44 0.66819
## gmat_vpc 1 0.3484 144 201.09 0.55502
## gmat_tpc 1 3.1229 143 197.97 0.07720 .
## quarter 1 4.4985 142 193.47 0.03393 *
## work_yrs 1 0.2539 141 193.21 0.61432
## frstlang 1 0.0460 140 193.17 0.83009
## satis 1 1.4471 139 191.72 0.22900
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Finding Accuracy of the model
# Predicted placement probabilities for the held-out rows. test is passed
# as-is: selecting its columns 1 to 11 returned the same 11-column frame.
fitted.results <- predict(model, newdata = test, type = "response")
# Classify a row as placed (1) when its predicted probability exceeds 0.5.
fitted.results <- ifelse(fitted.results > 0.5, 1, 0)
# Share of test rows whose predicted class differs from the recorded status.
misClasificError <- mean(fitted.results != test$status_placed)
print(paste("Accuracy", 1 - misClasificError))
## [1] "Accuracy 0.581395348837209"
58% is the accuracy of the above model
Final conclusions: the factors that improve the chances of placement are lower age, a bias towards female students, higher GMAT Total, higher GMAT Quant Percentile, higher GMAT Verbal Percentile, higher GMAT Total Percentile, higher Spring Average, higher Fall Average, less work experience and a higher satisfaction rating.
Higher salary is more influenced by age, GMAT Quant Percentile and GMAT total percentile