# Load the MBA starting-salary survey from the working directory and preview
# its first rows.
myData <- read.csv("MBA Starting Salaries.csv")
head(myData)
## age sex sex_2 gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 1 23 2 Female 620 77 87 87 3.4 3.00 1
## 2 24 1 Male 610 90 71 87 3.5 4.00 1
## 3 24 1 Male 670 99 78 95 3.3 3.25 1
## 4 24 1 Male 570 56 81 75 3.3 2.67 1
## 5 24 2 Female 710 93 98 98 3.6 3.75 1
## 6 24 1 Male 640 82 89 91 3.9 3.75 1
## work_yrs frstlang frstlang_2 salary satis
## 1 2 1 English 0 7
## 2 2 1 English 0 6
## 3 2 1 English 0 6
## 4 1 1 English 0 7
## 5 2 1 English 999 5
## 6 2 1 English 0 6
# NOTE(review): attach() copies the columns of myData onto the search path as
# they are right now. Bare names such as `salary` used later keep referring to
# this snapshot even after myData is filtered or reassigned, so prefer
# myData$col or `data = myData` in anything that runs after a modification.
attach(myData)
# Compact overview of each column's type and first few values.
str(myData)
## 'data.frame': 274 obs. of 15 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : int 2 1 1 1 2 1 1 2 1 1 ...
## $ sex_2 : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 2 2 1 2 2 ...
## $ gmat_tot : int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc : int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc : int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc : int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs : int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang : int 1 1 1 1 1 1 1 1 2 1 ...
## $ frstlang_2: Factor w/ 2 levels "English","Other": 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
# Exploratory plots. Every call names myData explicitly instead of relying on
# the attached column copies, so the plots always reflect the data frame.
library(lattice)

# Salary histograms (counts), one panel per sex code / per satisfaction value.
histogram(~ salary | sex, data = myData, type = "count", breaks = 5,
          col = c("purple", "green", "pink"))
histogram(~ salary | satis, data = myData, type = "count", breaks = 5,
          col = c("purple", "green", "pink"))

# Satisfaction against salary, with the y axis on a log scale.
with(myData, plot(salary, satis, cex = 0.6, log = "y"))

library(car)
# Pairwise scatterplots of salary with experience/age and with the GMAT scores.
scatterplotMatrix(formula = ~ salary + work_yrs + age, data = myData,
                  cex = 0.6, diagonal = "histogram")
scatterplotMatrix(formula = ~ salary + gmat_qpc + gmat_tot + gmat_tpc + gmat_vpc,
                  data = myData, cex = 0.6, diagonal = "histogram")

library("corrgram")
# Correlogram of the data set. The previous title ("Correlation of relative
# price!!") did not describe this data and has been replaced.
corrgram(myData, order = TRUE, lower.panel = panel.shade,
         upper.panel = panel.pie, text.panel = panel.txt,
         main = "Correlogram of MBA starting salary data")
# Cross-tabulate satisfaction (rows) against the sex code (columns) and show the
# table together with its row and column totals.
# NOTE(review): the printed table has a 998 row next to the values 1-7, so the
# 998 code is counted as if it were a satisfaction level -- confirm whether it
# should be excluded before the chi-squared test.
gender <- xtabs(~ satis + sex, data = myData)
addmargins(gender)
## sex
## satis 1 2 Sum
## 1 1 0 1
## 2 0 1 1
## 3 4 1 5
## 4 12 5 17
## 5 53 21 74
## 6 79 18 97
## 7 20 13 33
## 998 37 9 46
## Sum 206 68 274
prop.table(gender,2)
## sex
## satis 1 2
## 1 0.004854369 0.000000000
## 2 0.000000000 0.014705882
## 3 0.019417476 0.014705882
## 4 0.058252427 0.073529412
## 5 0.257281553 0.308823529
## 6 0.383495146 0.264705882
## 7 0.097087379 0.191176471
## 998 0.179611650 0.132352941
chisq.test(gender)
## Warning in chisq.test(gender): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: gender
## X-squared = 10.593, df = 7, p-value = 0.1574
quat <- xtabs(~satis+quarter,data=myData)
addmargins(quat)
## quarter
## satis 1 2 3 4 Sum
## 1 0 1 0 0 1
## 2 0 0 1 0 1
## 3 1 1 0 3 5
## 4 4 3 2 8 17
## 5 20 24 15 15 74
## 6 23 25 27 22 97
## 7 10 5 10 8 33
## 998 11 11 15 9 46
## Sum 69 70 70 65 274
prop.table(quat,1)
## quarter
## satis 1 2 3 4
## 1 0.0000000 1.0000000 0.0000000 0.0000000
## 2 0.0000000 0.0000000 1.0000000 0.0000000
## 3 0.2000000 0.2000000 0.0000000 0.6000000
## 4 0.2352941 0.1764706 0.1176471 0.4705882
## 5 0.2702703 0.3243243 0.2027027 0.2027027
## 6 0.2371134 0.2577320 0.2783505 0.2268041
## 7 0.3030303 0.1515152 0.3030303 0.2424242
## 998 0.2391304 0.2391304 0.3260870 0.1956522
prop.table(quat,2)
## quarter
## satis 1 2 3 4
## 1 0.00000000 0.01428571 0.00000000 0.00000000
## 2 0.00000000 0.00000000 0.01428571 0.00000000
## 3 0.01449275 0.01428571 0.00000000 0.04615385
## 4 0.05797101 0.04285714 0.02857143 0.12307692
## 5 0.28985507 0.34285714 0.21428571 0.23076923
## 6 0.33333333 0.35714286 0.38571429 0.33846154
## 7 0.14492754 0.07142857 0.14285714 0.12307692
## 998 0.15942029 0.15714286 0.21428571 0.13846154
chisq.test(quat)
## Warning in chisq.test(quat): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: quat
## X-squared = 21.92, df = 21, p-value = 0.4042
lang <- xtabs(~satis+frstlang,data=myData)
addmargins(lang)
## frstlang
## satis 1 2 Sum
## 1 1 0 1
## 2 0 1 1
## 3 3 2 5
## 4 10 7 17
## 5 67 7 74
## 6 92 5 97
## 7 31 2 33
## 998 38 8 46
## Sum 242 32 274
prop.table(lang,1)
## frstlang
## satis 1 2
## 1 1.00000000 0.00000000
## 2 0.00000000 1.00000000
## 3 0.60000000 0.40000000
## 4 0.58823529 0.41176471
## 5 0.90540541 0.09459459
## 6 0.94845361 0.05154639
## 7 0.93939394 0.06060606
## 998 0.82608696 0.17391304
prop.table(lang,2)
## frstlang
## satis 1 2
## 1 0.004132231 0.000000000
## 2 0.000000000 0.031250000
## 3 0.012396694 0.062500000
## 4 0.041322314 0.218750000
## 5 0.276859504 0.218750000
## 6 0.380165289 0.156250000
## 7 0.128099174 0.062500000
## 998 0.157024793 0.250000000
chisq.test(lang)
## Warning in chisq.test(lang): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: lang
## X-squared = 32.744, df = 7, p-value = 2.954e-05
t.test(salary~frstlang)
##
## Welch Two Sample t-test
##
## data: salary by frstlang
## t = 1.3595, df = 38.488, p-value = 0.1819
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6698.237 34122.797
## sample estimates:
## mean in group 1 mean in group 2
## 40627.12 26914.84
t.test(salary~sex)
##
## Welch Two Sample t-test
##
## data: salary by sex
## t = -1.123, df = 111.89, p-value = 0.2638
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -22412.277 6197.373
## sample estimates:
## mean in group 1 mean in group 2
## 37013.62 45121.07
# NOTE(review): this call and the three that follow (gmat_tot, work_yrs, age)
# are two-sample Welch tests of mean(salary) against the mean of a variable
# measured on a different scale. A small p-value here only says the two means
# differ; it does not measure association with salary. Presumably cor.test()
# or a regression was intended -- confirm.
t.test(salary,satis)
##
## Welch Two Sample t-test
##
## data: salary and satis
## t = 12.622, df = 273.03, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 32793.53 44913.49
## sample estimates:
## mean of x mean of y
## 39025.6898 172.1788
t.test(salary,gmat_tot)
##
## Welch Two Sample t-test
##
## data: salary and gmat_tot
## t = 12.477, df = 273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 32346.41 44466.06
## sample estimates:
## mean of x mean of y
## 39025.6898 619.4526
t.test(salary,work_yrs)
##
## Welch Two Sample t-test
##
## data: salary and work_yrs
## t = 12.677, df = 273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 32961.99 45081.64
## sample estimates:
## mean of x mean of y
## 39025.689781 3.872263
t.test(salary,age)
##
## Welch Two Sample t-test
##
## data: salary and age
## t = 12.67, df = 273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 32938.51 45058.15
## sample estimates:
## mean of x mean of y
## 39025.68978 27.35766
# Regression models below:
model1 <- lm(salary~work_yrs)
summary(model1)
##
## Call:
## lm(formula = salary ~ work_yrs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41604 -38759 -37903 58063 179392
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38474.9 4817.1 7.987 3.92e-14 ***
## work_yrs 142.2 955.7 0.149 0.882
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 51040 on 272 degrees of freedom
## Multiple R-squared: 8.142e-05, Adjusted R-squared: -0.003595
## F-statistic: 0.02215 on 1 and 272 DF, p-value: 0.8818
model2 <- lm(salary~quarter)
summary(model2)
##
## Call:
## lm(formula = salary ~ quarter)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50178 -41635 -27543 55367 192457
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 57723 7452 7.746 1.89e-13 ***
## quarter -7545 2745 -2.748 0.00639 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50350 on 272 degrees of freedom
## Multiple R-squared: 0.02702, Adjusted R-squared: 0.02344
## F-statistic: 7.553 on 1 and 272 DF, p-value: 0.006392
model3 <- lm(salary~sex)
summary(model3)
##
## Call:
## lm(formula = salary ~ sex)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45121 -37014 -36016 55879 174879
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28906 9407 3.073 0.00234 **
## sex 8108 7122 1.138 0.25598
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50920 on 272 degrees of freedom
## Multiple R-squared: 0.004742, Adjusted R-squared: 0.001082
## F-statistic: 1.296 on 1 and 272 DF, p-value: 0.256
model4 <- lm(salary~frstlang)
summary(model4)
##
## Call:
## lm(formula = salary ~ frstlang)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40627 -40627 -39628 56373 193085
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 54339 11116 4.889 1.74e-06 ***
## frstlang -13712 9566 -1.433 0.153
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50850 on 272 degrees of freedom
## Multiple R-squared: 0.007498, Adjusted R-squared: 0.003849
## F-statistic: 2.055 on 1 and 272 DF, p-value: 0.1529
model5 <- lm(salary~gmat_tot)
summary(model5)
##
## Call:
## lm(formula = salary ~ gmat_tot)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47274 -38999 -36053 56932 175160
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 69179.83 33352.12 2.074 0.039 *
## gmat_tot -48.68 53.61 -0.908 0.365
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50970 on 272 degrees of freedom
## Multiple R-squared: 0.003022, Adjusted R-squared: -0.0006435
## F-statistic: 0.8245 on 1 and 272 DF, p-value: 0.3647
# Multiple regression of salary on quarter, satisfaction, work experience and
# age, followed by its coefficient table.
# NOTE(review): at this point salary still contains 0 and the 998/999 codes that
# are filtered out further down, and satis still contains the 998 code seen in
# the cross-tabs above. This model and models 1-5 therefore fit those codes as
# real numeric values -- confirm whether they should be fitted on cleaned data.
model6 <- lm(salary~quarter+satis+work_yrs+age)
summary(model6)
##
## Call:
## lm(formula = salary ~ quarter + satis + work_yrs + age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69411 -41662 -5287 43706 198650
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 164432.615 35974.583 4.571 7.41e-06 ***
## quarter -7385.029 2570.857 -2.873 0.00440 **
## satis -47.999 7.705 -6.229 1.80e-09 ***
## work_yrs 3347.801 1718.224 1.948 0.05241 .
## age -4086.799 1496.400 -2.731 0.00673 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 46920 on 269 degrees of freedom
## Multiple R-squared: 0.1643, Adjusted R-squared: 0.1519
## F-statistic: 13.23 on 4 and 269 DF, p-value: 7.531e-10
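# Among the models above, the salary of the MBA graduates is best predicted by quarter, job satisfaction, work years and age combined, although even this model explains only about 16% of the variance (adjusted R-squared 0.15).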
# Drop the rows whose salary is one of the placeholder codes 998 / 999.
myData <- myData[which(myData$salary != 998 & myData$salary != 999), ]

# Split into not-placed (salary 0) and placed (salary > 0) graduates.
# The masks are built from myData$salary, i.e. from the frame that was just
# filtered. The bare name `salary` would resolve to the attached, unfiltered
# 274-row copy, whose positions no longer line up with myData's rows; that put
# salary-0 rows into `placed`.
# NOTE(review): the echoed head(placed)/head(notPlaced) output and the gender
# percentages further down predate this fix and should be re-generated.
notPlaced <- myData[which(myData$salary == 0), ]
placed <- myData[which(myData$salary != 0), ]

# 0/1 placement indicator used as the response of the logistic model.
myData$placed <- 0
myData$placed[which(myData$salary != 0)] <- 1

# Hold-out split by row position.
# NOTE(review): only the first 81 rows are used and the file looks ordered
# (quarter, then age), so a seeded random split over all rows is presumably
# more appropriate -- confirm.
train <- myData[1:60, ]
test <- myData[61:81, ]
head(myData)
## age sex sex_2 gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 1 23 2 Female 620 77 87 87 3.4 3.00 1
## 2 24 1 Male 610 90 71 87 3.5 4.00 1
## 3 24 1 Male 670 99 78 95 3.3 3.25 1
## 4 24 1 Male 570 56 81 75 3.3 2.67 1
## 6 24 1 Male 640 82 89 91 3.9 3.75 1
## 7 25 1 Male 610 89 74 87 3.4 3.50 1
## work_yrs frstlang frstlang_2 salary satis placed
## 1 2 1 English 0 7 0
## 2 2 1 English 0 6 0
## 3 2 1 English 0 6 0
## 4 1 1 English 0 7 0
## 6 2 1 English 0 6 0
## 7 2 1 English 0 5 0
head(placed)
## age sex sex_2 gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 6 24 1 Male 640 82 89 91 3.90 3.75 1
## 23 27 1 Male 750 99 98 99 3.40 3.50 1
## 24 28 2 Female 540 75 50 65 3.60 4.00 1
## 25 29 1 Male 580 56 87 78 3.64 3.33 1
## 27 31 2 Female 560 60 78 72 3.30 3.75 1
## 28 32 1 Male 760 99 99 99 3.40 3.00 1
## work_yrs frstlang frstlang_2 salary satis
## 6 2 1 English 0 6
## 23 1 2 Other 0 5
## 24 5 1 English 0 5
## 25 3 1 English 0 5
## 27 10 1 English 0 7
## 28 5 1 English 0 5
head(notPlaced)
## age sex sex_2 gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 1 23 2 Female 620 77 87 87 3.4 3.00 1
## 2 24 1 Male 610 90 71 87 3.5 4.00 1
## 3 24 1 Male 670 99 78 95 3.3 3.25 1
## 4 24 1 Male 570 56 81 75 3.3 2.67 1
## 7 25 1 Male 610 89 74 87 3.4 3.50 1
## 8 25 2 Female 650 88 89 92 3.3 3.75 1
## work_yrs frstlang frstlang_2 salary satis
## 1 2 1 English 0 7
## 2 2 1 English 0 6
## 3 2 1 English 0 6
## 4 1 1 English 0 7
## 7 2 1 English 0 5
## 8 2 1 English 0 6
# Counts per sex code among the not-placed and the placed graduates.
npGender <- table(notPlaced$sex)
pGender <- table(placed$sex)
# Share of each sex code among the not-placed graduates, in percent.
100 * prop.table(npGender)
##
## 1 2
## 76.47059 23.52941
prop.table(pGender)*100
##
## 1 2
## 69.6 30.4
library(lattice)
# lattice draws through grid, so base-graphics par(mfrow = ...) settings do not
# arrange these plots. The former par() calls (one of them duplicated) had no
# effect on the output and were removed; each call below draws its own page.

# Box plot of GMAT total for the not-placed graduates.
bwplot(notPlaced$gmat_tot, horizontal = TRUE,
       xlab = "Not placed GMAT score", col = "yellow")

# Age distributions (counts) for the not-placed and the placed graduates.
histogram(~ notPlaced$age, type = "count", breaks = 5,
          col = c("purple", "green", "pink"),
          main = "Not placed age distribution")
histogram(~ placed$age, type = "count", breaks = 5,
          col = c("purple", "green", "pink"),
          main = "Placed age distribution")
# Logistic regression: modelling whether a graduate was placed
# Logit model of the 0/1 placement indicator on GMAT total, first-language code
# and age, fitted on the training rows only; then print its summary.
modelp <- glm(
  placed ~ gmat_tot + frstlang + age,
  family = binomial(link = "logit"),
  data = train
)
summary(modelp)
##
## Call:
## glm(formula = placed ~ gmat_tot + frstlang + age, family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4377 -1.3199 0.9556 1.0207 1.5529
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.187993 3.631206 0.327 0.744
## gmat_tot 0.001581 0.004980 0.318 0.751
## frstlang -1.107950 1.261055 -0.879 0.380
## age -0.024319 0.052268 -0.465 0.642
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 81.503 on 59 degrees of freedom
## Residual deviance: 80.375 on 56 degrees of freedom
## AIC: 88.375
##
## Number of Fisher Scoring iterations: 4
anova(modelp, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: placed
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 59 81.503
## gmat_tot 1 0.07283 58 81.430 0.7873
## frstlang 1 0.83876 57 80.592 0.3598
## age 1 0.21627 56 80.375 0.6419
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
pR2(modelp)
## llh llhNull G2 McFadden r2ML
## -40.18766470 -40.75159596 1.12786251 0.01383826 0.01862213
## r2CU
## 0.02506597
# Predicted placement probabilities for the hold-out rows, turned into 0/1
# class labels with a 0.5 cut-off.
fitted.results <- predict(modelp, newdata = test, type = "response")
fitted.results <- ifelse(fitted.results > 0.5, 1, 0)

# Share of hold-out rows whose predicted label matches the observed one.
accuracy <- mean(fitted.results == test$placed)
# Previously this variable held the accuracy despite its name; it now holds the
# misclassification rate. The printed line is unchanged.
misClasificError <- 1 - accuracy
print(paste('Accuracy', accuracy))
## [1] "Accuracy 0.0952380952380952"
bwplot(placed$gmat_tot,horizontal = TRUE,xlab="Placed GMAT score",col = "yellow")