# Load the MBA starting-salaries survey from the working directory and show
# the per-column summary statistics.
mbasalary <- read.csv(file = "MBA Starting Salaries Data.csv")
summary(object = mbasalary)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
# Add text-label columns next to the numeric codes:
#   sex1      mirrors sex      (1 -> "Male",    2 -> "Female")
#   frstlang1 mirrors frstlang (1 -> "English", 2 -> "Non English")
# Any other code is carried over unchanged, as text.
mbasalary$sex1 <- ifelse(
  mbasalary$sex == 1, "Male",
  ifelse(mbasalary$sex == 2, "Female", as.character(mbasalary$sex))
)
mbasalary$frstlang1 <- ifelse(
  mbasalary$frstlang == 1, "English",
  ifelse(
    mbasalary$frstlang == 2, "Non English",
    as.character(mbasalary$frstlang)
  )
)
# Count the rows per gender label and draw the counts as a bar chart.
a <- table(mbasalary$sex1)
barplot(height = a, main = "Gender distribution in the dataset")
# One horizontal box plot per numeric measure. Each row of the spec table is
# (column name, x-axis label, plot title); plots are drawn in row order.
# local() keeps the helper variables out of the global workspace.
invisible(local({
  panel_specs <- rbind(
    c("gmat_tot", "GMAT Total Scores", "GMAT Total Distribution"),
    c("gmat_qpc", "Quantitative GMAT Scores",
      "Quantitative GMAT Score Distribution"),
    c("gmat_vpc", "Verbal GMAT Scores", "Verbal GMAT Score Distribution"),
    c("gmat_tpc", "Overall GMAT Percentile",
      "Overall GMAT Percentile Distribution"),
    c("s_avg", "Spring MBA Avg", "Spring MBA Avg Distribution"),
    c("f_avg", "Fall MBA Avg", "Fall MBA Avg Distribution"),
    c("work_yrs", "Work Ex", "Work Ex Distribution")
  )
  for (i in seq_len(nrow(panel_specs))) {
    boxplot(
      mbasalary[[panel_specs[i, 1]]],
      horizontal = TRUE,
      xlab = panel_specs[i, 2],
      main = panel_specs[i, 3],
      las = 1
    )
  }
}))
# Tabulate the `quarter` codes and plot the counts. The chart carries a title
# and axis labels so it can be read on its own, like the gender bar chart.
b <- table(mbasalary$quarter)
barplot(
  b,
  main = "Quarter distribution in the dataset",
  xlab = "Quarter",
  ylab = "Frequency"
)
# Keep the rows whose salary is at least 1000 (rows with a missing salary are
# dropped) and plot the distribution of those salaries.
yesjob <- subset(mbasalary, salary >= 1000)
boxplot(
  yesjob$salary,
  horizontal = TRUE,
  xlab = "Salary",
  main = "Salary Distribution",
  las = 1
)
# Keep the rows whose satisfaction value is below 10, which excludes the 998
# value seen in the summary, and plot the remaining satisfaction scores.
nosatis <- subset(mbasalary, satis < 10)
boxplot(
  nosatis$satis,
  horizontal = TRUE,
  xlab = "Satisfaction on a Scale of 1-7",
  main = "Satisfaction Distribution",
  las = 1
)
library(car)
## Warning: package 'car' was built under R version 3.4.3
# Scatterplots of salary (y-axis) against each candidate predictor (x-axis),
# restricted to the rows in `yesjob`.
scatterplot(yesjob$sex, yesjob$salary)
scatterplot(yesjob$age, yesjob$salary)
scatterplot(yesjob$gmat_tot, yesjob$salary)
scatterplot(yesjob$work_yrs, yesjob$salary)
scatterplot(yesjob$s_avg, yesjob$salary)
scatterplot(yesjob$f_avg, yesjob$salary)
scatterplot(yesjob$satis, yesjob$salary)
# Salary stays on the y-axis here too, matching every other plot in this
# group (the arguments were previously passed in the opposite order).
scatterplot(yesjob$frstlang, yesjob$salary)
# Scatterplots relating satisfaction to the other measures, restricted to the
# rows in `nosatis`.
# NOTE(review): the first plot puts satis on the y-axis while the remaining
# five put it on the x-axis -- confirm which orientation is intended.
scatterplot(x = nosatis$sex, y = nosatis$satis)
scatterplot(x = nosatis$satis, y = nosatis$age)
scatterplot(x = nosatis$satis, y = nosatis$gmat_tot)
scatterplot(x = nosatis$satis, y = nosatis$work_yrs)
scatterplot(x = nosatis$satis, y = nosatis$s_avg)
scatterplot(x = nosatis$satis, y = nosatis$f_avg)
# Counts of each satisfaction score split by first-language label, drawn as
# side-by-side bars (one colour per language label).
abc <- table(nosatis$frstlang1, nosatis$satis)
barplot(
  abc,
  beside = TRUE,
  col = c("darkblue", "red"),
  xlab = "Satisfaction on a scale of 1-7",
  ylab = "Frequency",
  # Spelled out in full: `legend =` only worked through partial matching of
  # barplot()'s `legend.text` argument.
  legend.text = rownames(abc)
)
# GMAT total score (x-axis) against the spring and fall averages (y-axis),
# using every row of the data.
scatterplot(x = mbasalary$gmat_tot, y = mbasalary$s_avg)
scatterplot(x = mbasalary$gmat_tot, y = mbasalary$f_avg)
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.4.3
# Correlogram and correlation matrix of the first 13 columns (the original
# survey variables, before the label columns added by this script).
# NOTE(review): salary and satis still contain the 998/999 codes visible in
# the summary output, so correlations involving them mix codes with real
# values -- confirm whether those rows should be excluded first.
corrgram(
  mbasalary[, 1:13],
  order = TRUE,
  main = "MBA SALARY CORRGRAM",
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  diag.panel = panel.minmax,
  text.panel = panel.txt
)
cor(mbasalary[, 1:13])
## age sex gmat_tot gmat_qpc gmat_vpc
## age 1.00000000 -0.028106442 -0.14593840 -0.21616985 -0.04417547
## sex -0.02810644 1.000000000 -0.05336820 -0.16377435 0.07488782
## gmat_tot -0.14593840 -0.053368202 1.00000000 0.72473781 0.74839187
## gmat_qpc -0.21616985 -0.163774346 0.72473781 1.00000000 0.15218014
## gmat_vpc -0.04417547 0.074887816 0.74839187 0.15218014 1.00000000
## gmat_tpc -0.16990307 -0.008090213 0.84779965 0.65137754 0.66621604
## s_avg 0.14970402 0.127115144 0.11311702 -0.02984873 0.20445365
## f_avg -0.01744806 0.091663891 0.10442409 0.07370455 0.07592225
## quarter -0.04967221 -0.133533171 -0.09223903 0.03636638 -0.17460736
## work_yrs 0.85829810 -0.011296374 -0.18235434 -0.23660827 -0.06639049
## frstlang 0.05692649 0.001536205 -0.13503402 0.13892774 -0.38980465
## salary -0.06257355 0.068858628 -0.05497188 -0.04403293 -0.00613934
## satis -0.12788825 -0.054602220 0.08255770 0.06060004 0.06262375
## gmat_tpc s_avg f_avg quarter work_yrs
## age -0.169903066 0.14970402 -0.01744806 -4.967221e-02 0.858298096
## sex -0.008090213 0.12711514 0.09166389 -1.335332e-01 -0.011296374
## gmat_tot 0.847799647 0.11311702 0.10442409 -9.223903e-02 -0.182354339
## gmat_qpc 0.651377538 -0.02984873 0.07370455 3.636638e-02 -0.236608270
## gmat_vpc 0.666216035 0.20445365 0.07592225 -1.746074e-01 -0.066390490
## gmat_tpc 1.000000000 0.11736245 0.07973210 -8.303535e-02 -0.173361859
## s_avg 0.117362449 1.00000000 0.55062139 -7.621166e-01 0.129292714
## f_avg 0.079732099 0.55062139 1.00000000 -4.475064e-01 -0.039056921
## quarter -0.083035351 -0.76211664 -0.44750637 1.000000e+00 -0.086026406
## work_yrs -0.173361859 0.12929271 -0.03905692 -8.602641e-02 1.000000000
## frstlang -0.103362747 -0.13631308 -0.03705695 9.949226e-02 -0.027866747
## salary 0.004930901 0.14583606 0.02944303 -1.643699e-01 0.009023407
## satis 0.092934266 -0.03268664 0.01089273 -1.267198e-05 -0.109255286
## frstlang salary satis
## age 0.056926486 -0.062573547 -1.278882e-01
## sex 0.001536205 0.068858628 -5.460222e-02
## gmat_tot -0.135034017 -0.054971880 8.255770e-02
## gmat_qpc 0.138927742 -0.044032933 6.060004e-02
## gmat_vpc -0.389804653 -0.006139340 6.262375e-02
## gmat_tpc -0.103362747 0.004930901 9.293427e-02
## s_avg -0.136313080 0.145836062 -3.268664e-02
## f_avg -0.037056954 0.029443027 1.089273e-02
## quarter 0.099492259 -0.164369865 -1.267198e-05
## work_yrs -0.027866747 0.009023407 -1.092553e-01
## frstlang 1.000000000 -0.086592096 7.932264e-02
## salary -0.086592096 1.000000000 -3.352171e-01
## satis 0.079322637 -0.335217114 1.000000e+00
# Rows whose salary is exactly 0. The earlier `nojob <- mbasalary` copy was
# dead code (overwritten on the very next line) and has been removed.
nojob <- mbasalary[which(mbasalary$salary == 0), ]
# First-language label by gender label: all rows, then `nojob`, then `yesjob`.
print("All Students")
## [1] "All Students"
table(mbasalary$frstlang1, mbasalary$sex1)
##
## Female Male
## English 60 182
## Non English 8 24
print("Not placed Students")
## [1] "Not placed Students"
table(nojob$frstlang1, nojob$sex1)
##
## Female Male
## English 22 60
## Non English 1 7
print("Placed Students")
## [1] "Placed Students"
table(yesjob$frstlang1, yesjob$sex1)
##
## Female Male
## English 28 68
## Non English 3 4
# NOTE(review): this chunk repeats the cross-tabulation that already appears
# earlier in the file -- consider deleting one of the two copies.
# Rows whose salary is exactly 0 (the dead `nojob <- mbasalary` copy that
# preceded this line has been removed).
nojob <- mbasalary[which(mbasalary$salary == 0), ]
# First-language label by gender label: all rows, then `nojob`, then `yesjob`.
print("All Students")
## [1] "All Students"
table(mbasalary$frstlang1, mbasalary$sex1)
##
## Female Male
## English 60 182
## Non English 8 24
print("Not placed Students")
## [1] "Not placed Students"
table(nojob$frstlang1, nojob$sex1)
##
## Female Male
## English 22 60
## Non English 1 7
print("Placed Students")
## [1] "Placed Students"
table(yesjob$frstlang1, yesjob$sex1)
##
## Female Male
## English 28 68
## Non English 3 4
# Placement label derived from the numeric salary column:
#   salary >= 1000   -> "Got an offer"  (same cut-off that defines `yesjob`)
#   salary == 0      -> "Job less"
#   salary 998 / 999 -> "Not Applicable"
# Any other value is carried over as text; a missing salary stays NA.
# All comparisons are made on the numeric column, so the result no longer
# depends on the column silently turning into character half-way through.
mbasalary$salary1 <- ifelse(
  mbasalary$salary >= 1000, "Got an offer",
  ifelse(
    mbasalary$salary == 0, "Job less",
    ifelse(
      mbasalary$salary %in% c(998, 999), "Not Applicable",
      as.character(mbasalary$salary)
    )
  )
)
# Placement label (salary1) cross-tabulated against the gender label.
table(mbasalary$salary1,mbasalary$sex1)
##
## Female Male
## Got an offer 31 72
## Job less 23 67
## Not Applicable 14 67
# Placement label cross-tabulated against the first-language label.
table(mbasalary$salary1,mbasalary$frstlang1)
##
## English Non English
## Got an offer 96 7
## Job less 82 8
## Not Applicable 64 17
print("now comparing job placement vs satisfaction level")
## [1] "now comparing job placement vs satisfaction level"
# Placement label cross-tabulated against the raw satisfaction value; the 998
# column in the recorded output shows that satis code is still present here.
table(mbasalary$salary1,mbasalary$satis)
##
## 1 2 3 4 5 6 7 998
## Got an offer 0 0 1 1 29 50 22 0
## Job less 0 0 0 4 36 40 10 0
## Not Applicable 1 1 4 12 9 7 1 46
# Pearson chi-squared test of independence: placement label vs gender label.
chisq.test(x = mbasalary$salary1, y = mbasalary$sex1)
##
## Pearson's Chi-squared test
##
## data: mbasalary$salary1 and mbasalary$sex1
## X-squared = 4.0288, df = 2, p-value = 0.1334
# Same test: placement label vs first-language label.
chisq.test(x = mbasalary$salary1, y = mbasalary$frstlang1)
##
## Pearson's Chi-squared test
##
## data: mbasalary$salary1 and mbasalary$frstlang1
## X-squared = 9.8645, df = 2, p-value = 0.00721
# Linear model of salary on sex, work_yrs, gmat_tot, s_avg, f_avg and
# frstlang, fitted on the rows in `yesjob`.
# The variables are supplied through `data =` instead of `yesjob$...` inside
# the formula, so the fitted model can be used with predict(newdata = ...).
# The estimates are unchanged; only the term labels lose the `yesjob$` prefix
# that appears in the recorded output.
predi <- lm(
  salary ~ sex + work_yrs + gmat_tot + s_avg + f_avg + frstlang,
  data = yesjob
)
summary(predi)
##
## Call:
## lm(formula = yesjob$salary ~ yesjob$sex + yesjob$work_yrs + yesjob$gmat_tot +
## yesjob$s_avg + yesjob$f_avg + yesjob$frstlang)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32652 -8940 -1709 5186 83182
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82356.30 24206.88 3.402 0.000976 ***
## yesjob$sex -5886.39 3462.79 -1.700 0.092388 .
## yesjob$work_yrs 2201.83 579.92 3.797 0.000257 ***
## yesjob$gmat_tot -11.90 31.77 -0.375 0.708712
## yesjob$s_avg 4851.02 4986.79 0.973 0.333110
## yesjob$f_avg -1153.74 3822.28 -0.302 0.763422
## yesjob$frstlang 15101.77 6473.46 2.333 0.021743 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15760 on 96 degrees of freedom
## Multiple R-squared: 0.2678, Adjusted R-squared: 0.2221
## F-statistic: 5.853 on 6 and 96 DF, p-value: 3.114e-05
# Reduced linear model: salary on work_yrs and frstlang only, fitted on the
# rows in `yesjob`.
# The variables are supplied through `data =` instead of `yesjob$...` inside
# the formula, so the fitted model can be used with predict(newdata = ...).
# The estimates are unchanged; only the term labels lose the `yesjob$` prefix
# that appears in the recorded output.
predi1 <- lm(salary ~ work_yrs + frstlang, data = yesjob)
summary(predi1)
##
## Call:
## lm(formula = yesjob$salary ~ yesjob$work_yrs + yesjob$frstlang)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33972 -8955 -455 4545 76681
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 79941.4 6788.8 11.775 < 2e-16 ***
## yesjob$work_yrs 2483.3 527.9 4.704 8.18e-06 ***
## yesjob$frstlang 13064.0 6283.2 2.079 0.0402 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15740 on 100 degrees of freedom
## Multiple R-squared: 0.2396, Adjusted R-squared: 0.2244
## F-statistic: 15.75 on 2 and 100 DF, p-value: 1.128e-06
# Linear model of salary on the remaining terms (sex, gmat_tot, s_avg,
# f_avg), fitted on the rows in `yesjob`.
# The variables are supplied through `data =` instead of `yesjob$...` inside
# the formula, so the fitted model can be used with predict(newdata = ...).
# The estimates are unchanged; only the term labels lose the `yesjob$` prefix
# that appears in the recorded output.
predi2 <- lm(salary ~ sex + gmat_tot + s_avg + f_avg, data = yesjob)
summary(predi2)
##
## Call:
## lm(formula = yesjob$salary ~ yesjob$sex + yesjob$gmat_tot + yesjob$s_avg +
## yesjob$f_avg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39889 -7926 -2357 4047 120521
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 122912.34 24422.85 5.033 2.2e-06 ***
## yesjob$sex -6158.11 3825.41 -1.610 0.1107
## yesjob$gmat_tot -38.61 34.87 -1.107 0.2708
## yesjob$s_avg 9603.44 5173.84 1.856 0.0664 .
## yesjob$f_avg -5752.29 4028.95 -1.428 0.1565
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17550 on 98 degrees of freedom
## Multiple R-squared: 0.07369, Adjusted R-squared: 0.03588
## F-statistic: 1.949 on 4 and 98 DF, p-value: 0.1084
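# From the above analysis we see that the best predictors of salary are gender and first language.
# NOTE(review): in the model summaries above, work_yrs and frstlang are the terms significant at the 5% level, while sex is only marginal (p = 0.09) -- confirm this conclusion.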
# NOTE(review): these two tables repeat ones that already appear earlier in
# the file.
# Placement label (salary1) cross-tabulated against the gender label.
table(mbasalary$salary1,mbasalary$sex1)
##
## Female Male
## Got an offer 31 72
## Job less 23 67
## Not Applicable 14 67
# Placement label cross-tabulated against the first-language label.
table(mbasalary$salary1,mbasalary$frstlang1)
##
## English Non English
## Got an offer 96 7
## Job less 82 8
## Not Applicable 64 17
# Pearson chi-squared test of independence: placement label vs gender label.
chisq.test(x = mbasalary$salary1, y = mbasalary$sex1)
##
## Pearson's Chi-squared test
##
## data: mbasalary$salary1 and mbasalary$sex1
## X-squared = 4.0288, df = 2, p-value = 0.1334
# Same test: placement label vs first-language label.
chisq.test(x = mbasalary$salary1, y = mbasalary$frstlang1)
##
## Pearson's Chi-squared test
##
## data: mbasalary$salary1 and mbasalary$frstlang1
## X-squared = 9.8645, df = 2, p-value = 0.00721
# Same test between the two label columns themselves (2 x 2 table).
chisq.test(x = mbasalary$frstlang1, y = mbasalary$sex1)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mbasalary$frstlang1 and mbasalary$sex1
## X-squared = 2.4871e-29, df = 1, p-value = 1
# Build the data set for the placement classifier. salary2 is the outcome
# derived from salary: rows coded 998 are dropped, every remaining value
# above 1 becomes 1, and 0 is left as 0.
mbasalary12 <- mbasalary
mbasalary12$salary2 <- mbasalary12$salary
# which() skips rows where the comparison is NA instead of turning them into
# all-NA rows, matching how `yesjob` and `nosatis` are filtered.
mbasalary12 <- mbasalary12[which(mbasalary12$salary2 != 998), ]
mbasalary12$salary2[mbasalary12$salary2 > 1] <- 1
# (The former `salary2[salary2 == 0] <- 0` line was a no-op and is removed.)
# NOTE(review): rows with salary code 999 are kept and become 1 here, while
# the salary1 recode labels 999 as "Not Applicable" -- confirm that treating
# them as placed is intended.
# We are using a 90:10 train/test split of the data frame.
# Drop the raw salary column by name (it was column 12); salary2 carries the
# outcome. Removing it by name is safe to re-run, whereas a positional drop
# would delete a different column the second time.
mbasalary12$salary <- NULL
# 90:10 split in row order: the first 90% of rows (rounded down) form the
# training set and the rest the test set. With 228 rows this gives rows
# 1-205 and 206-228.
# NOTE(review): the split is not randomised -- confirm the rows are not
# ordered in a way that biases the hold-out set.
n_train <- floor(0.9 * nrow(mbasalary12))
in_train <- seq_len(nrow(mbasalary12)) <= n_train
train <- mbasalary12[in_train, ]
test <- mbasalary12[!in_train, ]
# Logistic regression of the binary placement outcome (salary2) on the twelve
# survey predictors.
# The model is fitted on `train` only. It was previously fitted on the whole
# of mbasalary12, so the rows in `test` had already been seen by the model
# and the reported accuracy was not a genuine hold-out figure.
# NOTE: the summary output recorded below this call predates this fix and
# must be regenerated.
model <- glm(
  salary2 ~ age + sex + gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc +
    s_avg + f_avg + quarter + work_yrs + frstlang + satis,
  family = binomial(link = "logit"),
  data = train
)
summary(model)
##
## Call:
## glm(formula = salary2 ~ age + sex + gmat_tot + gmat_qpc + gmat_vpc +
## gmat_tpc + s_avg + f_avg + quarter + work_yrs + frstlang +
## satis, family = binomial(link = "logit"), data = mbasalary12)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9069 -1.2617 0.8233 0.9796 1.5543
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.229484 3.907020 1.850 0.0643 .
## age -0.182398 0.075593 -2.413 0.0158 *
## sex -0.071435 0.332632 -0.215 0.8300
## gmat_tot -0.006845 0.010361 -0.661 0.5088
## gmat_qpc 0.012925 0.028238 0.458 0.6472
## gmat_vpc 0.016111 0.027631 0.583 0.5598
## gmat_tpc 0.011001 0.019087 0.576 0.5644
## s_avg 0.032437 0.621421 0.052 0.9584
## f_avg -0.190403 0.340350 -0.559 0.5759
## quarter -0.141159 0.192870 -0.732 0.4642
## work_yrs 0.097279 0.083493 1.165 0.2440
## frstlang 0.608889 0.578531 1.052 0.2926
## satis -0.158727 0.157510 -1.008 0.3136
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 305.89 on 227 degrees of freedom
## Residual deviance: 291.96 on 215 degrees of freedom
## AIC: 317.96
##
## Number of Fisher Scoring iterations: 4
# Score the hold-out rows: predicted probabilities are thresholded at 0.5 and
# compared with the observed salary2 values.
# `test` is passed whole: predict() looks the predictors up by name, so no
# positional column selection is needed (the old `select = c(1:12)` silently
# depended on column order).
fitted.results <- predict(model, newdata = test, type = "response")
fitted.results <- ifelse(fitted.results > 0.5, 1, 0)
# Share of hold-out rows whose predicted class differs from the observed one.
misClasificError <- mean(fitted.results != test$salary2)
print(paste("Accuracy", 1 - misClasificError))
## [1] "Accuracy 0.869565217391304"