# Load the raw MBA starting-salary data from the working directory.
mba <- read.csv("MBAStartingSalariesData.csv")
# Drop rows whose salary is 998 or 999
# (presumably missing-data codes -- TODO confirm against the data dictionary).
sal <- subset(mba, !(salary == 998 | salary == 999))
# Split the retained rows by whether a non-zero salary was recorded.
job <- subset(sal, salary != 0)
nojob <- subset(sal, salary == 0)
# Put sal's columns on the search path; later statements use bare column names.
attach(sal)
head(sal)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter work_yrs
## 1 23 2 620 77 87 87 3.4 3.00 1 2
## 2 24 1 610 90 71 87 3.5 4.00 1 2
## 3 24 1 670 99 78 95 3.3 3.25 1 2
## 4 24 1 570 56 81 75 3.3 2.67 1 1
## 6 24 1 640 82 89 91 3.9 3.75 1 2
## 7 25 1 610 89 74 87 3.4 3.50 1 2
## frstlang salary satis
## 1 1 0 7
## 2 1 0 6
## 3 1 0 6
## 4 1 0 7
## 6 1 0 6
## 7 1 0 5
# Show the structure of the filtered data: column names, types, first values.
str(sal)
## 'data.frame': 193 obs. of 13 variables:
## $ age : int 23 24 24 24 24 25 25 27 27 28 ...
## $ sex : int 2 1 1 1 1 1 2 1 1 2 ...
## $ gmat_tot: int 620 610 670 570 640 610 650 740 750 540 ...
## $ gmat_qpc: int 77 90 99 56 82 89 88 99 99 75 ...
## $ gmat_vpc: int 87 71 78 81 89 74 89 96 98 50 ...
## $ gmat_tpc: int 87 87 95 75 91 87 92 99 99 65 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.9 3.4 3.3 3.5 3.4 3.6 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.5 3.75 3.5 3.5 4 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 3 1 5 ...
## $ frstlang: int 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 0 0 0 0 0 0 ...
## $ satis : int 7 6 6 7 6 5 6 6 5 5 ...
# Per-column summary (min, quartiles, mean, max) of the filtered data.
summary(sal)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.00 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.00 1st Qu.:570.0 1st Qu.:72.00
## Median :27.00 Median :1.00 Median :610.0 Median :82.00
## Mean :27.59 Mean :1.28 Mean :615.2 Mean :79.35
## 3rd Qu.:29.00 3rd Qu.:2.00 3rd Qu.:650.0 3rd Qu.:91.00
## Max. :48.00 Max. :2.00 Max. :760.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :22.00 Min. : 0.00 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:75.00 1st Qu.:2.800 1st Qu.:2.750
## Median :81.00 Median :87.00 Median :3.090 Median :3.000
## Mean :78.13 Mean :83.48 Mean :3.064 Mean :3.078
## 3rd Qu.:91.00 3rd Qu.:93.00 3rd Qu.:3.300 3rd Qu.:3.330
## Max. :99.00 Max. :99.00 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.000 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 85000
## Mean :2.394 Mean : 4.104 Mean :1.078 Mean : 54985
## 3rd Qu.:3.000 3rd Qu.: 5.000 3rd Qu.:1.000 3rd Qu.:100000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. :3.000
## 1st Qu.:5.000
## Median :6.000
## Mean :5.762
## 3rd Qu.:6.000
## Max. :7.000
# describe() reports extended per-column statistics
# (n, mean, sd, median, trimmed mean, mad, range, skew, kurtosis, se).
library(psych)
describe(sal)
## vars n mean sd median trimmed mad min max
## age 1 193 27.59 4.22 27.00 26.86 2.97 22 48
## sex 2 193 1.28 0.45 1.00 1.23 0.00 1 2
## gmat_tot 3 193 615.23 56.54 610.00 614.19 59.30 450 760
## gmat_qpc 4 193 79.35 15.15 82.00 80.92 14.83 28 99
## gmat_vpc 5 193 78.13 16.10 81.00 79.87 14.83 22 99
## gmat_tpc 6 193 83.48 13.53 87.00 85.08 11.86 0 99
## s_avg 7 193 3.06 0.38 3.09 3.08 0.43 2 4
## f_avg 8 193 3.08 0.52 3.00 3.11 0.37 0 4
## quarter 9 193 2.39 1.10 2.00 2.37 1.48 1 4
## work_yrs 10 193 4.10 3.69 3.00 3.37 1.48 0 22
## frstlang 11 193 1.08 0.27 1.00 1.00 0.00 1 2
## salary 12 193 54985.32 53152.39 85000.00 52726.81 51891.00 0 220000
## satis 13 193 5.76 0.77 6.00 5.75 1.48 3 7
## range skew kurtosis se
## age 26 1.93 4.55 0.30
## sex 1 0.97 -1.06 0.03
## gmat_tot 310 0.08 -0.31 4.07
## gmat_qpc 71 -0.88 0.23 1.09
## gmat_vpc 77 -0.90 0.36 1.16
## gmat_tpc 99 -1.87 7.03 0.97
## s_avg 2 -0.27 -0.15 0.03
## f_avg 4 -2.17 11.03 0.04
## quarter 3 0.13 -1.32 0.08
## work_yrs 22 2.47 7.02 0.27
## frstlang 1 3.13 7.84 0.02
## salary 220000 0.10 -1.45 3825.99
## satis 4 -0.17 -0.06 0.06
# Frequency counts for the categorical columns of sal
# (bare names resolve through the earlier attach(sal)).
table(sex)
## sex
## 1 2
## 139 54
table(quarter)
## quarter
## 1 2 3 4
## 53 52 47 41
table(frstlang)
## frstlang
## 1 2
## 178 15
# Salaries of the respondents with a non-zero salary
# (salary resolves to the attached sal column).
jobsalary <- salary[which(salary != 0)]
summary(jobsalary)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 64000 95000 100000 103031 106000 220000
# Extended descriptive statistics for the same vector.
describe(jobsalary)
## vars n mean sd median trimmed mad min max range skew
## X1 1 103 103030.7 17868.8 1e+05 101065.1 7413 64000 220000 156000 3.18
## kurtosis se
## X1 17.16 1760.67
# Distribution of age.
hist(age, breaks = 13, col = "lightblue")

# Each two-panel figure below saves the graphics settings returned by par()
# and restores them afterwards, so the 1x2 layout and the outer margin do not
# leak into later plots.

# GMAT quantitative vs verbal percentile under a shared outer title.
old_par <- par(mfrow = c(1, 2), oma = c(0, 0, 2, 0))
boxplot(gmat_qpc, ylim = c(20, 100), main = "Quantitative")
boxplot(gmat_vpc, main = "Verbal")
title("GMAT Percentile", outer = TRUE)
par(old_par)

# Spring vs fall MBA average under a shared outer title.
# The fall panel's ylim of 2-4 hides values below 2 (f_avg has a minimum of 0).
old_par <- par(mfrow = c(1, 2), oma = c(0, 0, 2, 0))
boxplot(s_avg, main = "Spring")
boxplot(f_avg, ylim = c(2, 4), main = "Fall")
title("MBA Average", outer = TRUE)
par(old_par)

# GMAT total score vs overall percentile.
old_par <- par(mfrow = c(1, 2))
boxplot(gmat_tot, main = "Total Score")
boxplot(gmat_tpc, main = "Overall Percentile")
par(old_par)

# Distribution of work experience, drawn full-width again.
hist(work_yrs, breaks = 11, col = "lightblue")
library(lattice)
# Box-and-whisker plots of the non-zero salaries and of satisfaction.
bwplot(jobsalary, xlab = "Salary")
bwplot(satis, xlab = "Satisfaction")

# Switch the search path from the full sample to the job subset, so bare
# column names below resolve to job's columns.
detach(sal)
attach(job)

library(car)
# Scatterplots for the job subset.
scatterplot(work_yrs ~ age, pch = 19, cex = 0.9,
            main = " Work experience vs Age")
scatterplot(salary ~ age, pch = 19, cex = 0.9,
            main = " Salary vs Age")
scatterplot(salary ~ gmat_tpc, pch = 19, cex = 0.9,
            main = " Salary vs GMAT Percentile")

# Pairwise correlations among all columns of job, rounded to two decimals.
round(cor(job), digits = 2)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## age 1.00 -0.14 -0.08 -0.17 0.02 -0.10 0.16 -0.22
## sex -0.14 1.00 -0.02 -0.15 0.05 -0.05 0.08 0.17
## gmat_tot -0.08 -0.02 1.00 0.67 0.78 0.97 0.17 0.12
## gmat_qpc -0.17 -0.15 0.67 1.00 0.09 0.66 0.02 0.10
## gmat_vpc 0.02 0.05 0.78 0.09 1.00 0.78 0.16 0.02
## gmat_tpc -0.10 -0.05 0.97 0.66 0.78 1.00 0.14 0.07
## s_avg 0.16 0.08 0.17 0.02 0.16 0.14 1.00 0.45
## f_avg -0.22 0.17 0.12 0.10 0.02 0.07 0.45 1.00
## quarter -0.13 -0.02 -0.11 0.01 -0.13 -0.10 -0.84 -0.43
## work_yrs 0.88 -0.09 -0.12 -0.18 -0.03 -0.13 0.16 -0.22
## frstlang 0.35 0.08 -0.13 0.01 -0.22 -0.16 -0.14 -0.05
## salary 0.50 -0.17 -0.09 0.01 -0.14 -0.13 0.10 -0.11
## satis 0.11 -0.09 0.06 0.00 0.15 0.12 -0.14 -0.12
## quarter work_yrs frstlang salary satis
## age -0.13 0.88 0.35 0.50 0.11
## sex -0.02 -0.09 0.08 -0.17 -0.09
## gmat_tot -0.11 -0.12 -0.13 -0.09 0.06
## gmat_qpc 0.01 -0.18 0.01 0.01 0.00
## gmat_vpc -0.13 -0.03 -0.22 -0.14 0.15
## gmat_tpc -0.10 -0.13 -0.16 -0.13 0.12
## s_avg -0.84 0.16 -0.14 0.10 -0.14
## f_avg -0.43 -0.22 -0.05 -0.11 -0.12
## quarter 1.00 -0.13 0.11 -0.13 0.23
## work_yrs -0.13 1.00 0.20 0.45 0.06
## frstlang 0.11 0.20 1.00 0.27 0.09
## salary -0.13 0.45 0.27 1.00 -0.04
## satis 0.23 0.06 0.09 -0.04 1.00
library(corrgram)
# Correlogram of job's columns with pie glyphs in the upper panel.
corrgram(job, upper.panel = panel.pie, order = TRUE)
# Welch two-sample t-test of salary between the two sex groups.
t.test(salary ~ sex)
##
## Welch Two Sample t-test
##
## data: salary by sex
## t = 1.3628, df = 38.115, p-value = 0.1809
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3128.55 16021.72
## sample estimates:
## mean in group 1 mean in group 2
## 104970.97 98524.39
# Welch two-sample t-test of salary between the two first-language groups.
t.test(salary~frstlang)
##
## Welch Two Sample t-test
##
## data: salary by frstlang
## t = -1.1202, df = 6.0863, p-value = 0.3049
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -59933.62 22202.25
## sample estimates:
## mean in group 1 mean in group 2
## 101748.6 120614.3
As the t-tests show (p = 0.18 for sex, p = 0.30 for first language), we cannot conclude that mean salary differs by sex or by first language.
# Chi-squared test of independence between sex and quarter
# (bare names resolve to job's columns while job is attached).
chisq.test(table(sex,quarter))
##
## Pearson's Chi-squared test
##
## data: table(sex, quarter)
## X-squared = 0.76332, df = 3, p-value = 0.8582
# Chi-squared test of independence between sex and satisfaction.
# NOTE(review): R warns that the chi-squared approximation may be incorrect
# for this table; an exact test may be more appropriate -- confirm.
chisq.test(table(sex,satis))
## Warning in chisq.test(table(sex, satis)): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: table(sex, satis)
## X-squared = 7.3413, df = 4, p-value = 0.1189
# Chi-squared test of independence between first language and satisfaction.
# NOTE(review): R warns that the chi-squared approximation may be incorrect
# for this table; an exact test may be more appropriate -- confirm.
chisq.test(table(frstlang,satis))
## Warning in chisq.test(table(frstlang, satis)): Chi-squared approximation
## may be incorrect
##
## Pearson's Chi-squared test
##
## data: table(frstlang, satis)
## X-squared = 0.95627, df = 4, p-value = 0.9164
As the chi-square tests show, we find no evidence of an association between sex and quartile, between sex and satisfaction, or between first language and satisfaction.
Because age and work_yrs are highly correlated (r = 0.88), and the GMAT total score is highly correlated with both the GMAT total percentile (r = 0.97) and the GMAT verbal percentile (r = 0.78), only one variable from each group should be used in a model.
# Linear model of salary on every column of job except those subtracted in the
# formula, which leaves age, gmat_qpc, gmat_tpc, s_avg and f_avg.
fit1 <- lm(
  salary ~ . - (satis + sex + frstlang + quarter + work_yrs + gmat_tot + gmat_vpc),
  data = job
)
summary(fit1)
##
## Call:
## lm(formula = salary ~ . - (satis + sex + frstlang + quarter +
## work_yrs + gmat_tot + gmat_vpc), data = job)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29470 -8566 -211 4624 74865
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31568.0 22091.8 1.429 0.1562
## age 2725.7 501.6 5.434 4.09e-07 ***
## gmat_qpc 376.5 153.1 2.459 0.0157 *
## gmat_tpc -450.7 186.5 -2.416 0.0175 *
## s_avg 3569.4 4763.3 0.749 0.4555
## f_avg -1452.4 3689.5 -0.394 0.6947
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15310 on 97 degrees of freedom
## Multiple R-squared: 0.3019, Adjusted R-squared: 0.2659
## F-statistic: 8.39 on 5 and 97 DF, p-value: 1.263e-06
# Alternative linear model: same exclusions as fit1 but dropping age and
# gmat_tpc instead of work_yrs and gmat_vpc, which leaves gmat_qpc, gmat_vpc,
# s_avg, f_avg and work_yrs.
fit2 <- lm(
  salary ~ . - (satis + sex + frstlang + quarter + age + gmat_tot + gmat_tpc),
  data = job
)
summary(fit2)
##
## Call:
## lm(formula = salary ~ . - (satis + sex + frstlang + quarter +
## age + gmat_tot + gmat_tpc), data = job)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32873 -8428 -1373 3678 85757
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 88871.2 17521.6 5.072 1.89e-06 ***
## gmat_qpc 151.8 121.0 1.254 0.213
## gmat_vpc -161.3 100.2 -1.611 0.111
## s_avg 3371.6 4977.7 0.677 0.500
## f_avg -1779.3 3855.2 -0.462 0.645
## work_yrs 2666.1 573.1 4.652 1.04e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15990 on 97 degrees of freedom
## Multiple R-squared: 0.2383, Adjusted R-squared: 0.199
## F-statistic: 6.069 on 5 and 97 DF, p-value: 6.246e-05
The first model fits better than the second: its R-squared is higher (0.30 versus 0.24). In the first model, salary depends strongly on age and only slightly on the quantitative percentile and the total percentile.
# Extract the estimated coefficients of fit1 as a named vector.
coefficients(fit1)
## (Intercept) age gmat_qpc gmat_tpc s_avg f_avg
## 31568.0328 2725.7455 376.5486 -450.6843 3569.4119 -1452.3981
# Flag each respondent as placed (1) when the salary is non-zero, otherwise 0.
# Vectorised so it covers however many rows sal has, instead of assuming
# exactly 193.
sal$placed <- as.numeric(sal$salary != 0)
# Attach sal again so the new placed column is reachable by bare name; this
# masks the columns of the still-attached job.
attach(sal)
## The following objects are masked from job:
##
## age, f_avg, frstlang, gmat_qpc, gmat_tot, gmat_tpc, gmat_vpc,
## quarter, s_avg, salary, satis, sex, work_yrs
# Chi-squared test of independence between sex and placement
# (2x2 table, so Yates' continuity correction is applied).
chisq.test(table(sex,placed))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(sex, placed)
## X-squared = 0.29208, df = 1, p-value = 0.5889
# Chi-squared test of independence between quarter and placement.
chisq.test(table(quarter,placed))
##
## Pearson's Chi-squared test
##
## data: table(quarter, placed)
## X-squared = 4.9172, df = 3, p-value = 0.178
# Chi-squared test of independence between first language and placement
# (2x2 table, so Yates' continuity correction is applied).
chisq.test(table(frstlang,placed))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(frstlang, placed)
## X-squared = 0.074127, df = 1, p-value = 0.7854
# Chi-squared test of independence between satisfaction and placement.
# NOTE(review): R warns that the chi-squared approximation may be incorrect
# for this table; an exact test may be more appropriate -- confirm.
chisq.test(table(satis,placed))
## Warning in chisq.test(table(satis, placed)): Chi-squared approximation may
## be incorrect
##
## Pearson's Chi-squared test
##
## data: table(satis, placed)
## X-squared = 8.3271, df = 4, p-value = 0.08031
Thus, at the 5% significance level, we cannot say that sex, quartile, first language, or satisfaction is related to whether a student was placed.
# Logistic regression of the placed flag on age, work experience, the GMAT
# scores/percentiles and the fall and spring averages.
temp <- placed ~ age + work_yrs + gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc +
  f_avg + s_avg
fit <- glm(formula = temp, family = binomial(link = "logit"), data = sal)
summary(fit)
##
## Call:
## glm(formula = temp, family = binomial(link = "logit"), data = sal)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7445 -1.1746 0.8156 1.0280 2.1857
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.69268 3.82908 1.748 0.08049 .
## age -0.21627 0.08083 -2.676 0.00746 **
## work_yrs 0.12450 0.08961 1.389 0.16472
## gmat_tot -0.01314 0.01259 -1.044 0.29648
## gmat_qpc -0.01876 0.04496 -0.417 0.67650
## gmat_vpc -0.01273 0.04218 -0.302 0.76274
## gmat_tpc 0.09306 0.06431 1.447 0.14790
## f_avg -0.14026 0.35626 -0.394 0.69381
## s_avg 0.68314 0.50185 1.361 0.17344
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 266.68 on 192 degrees of freedom
## Residual deviance: 248.23 on 184 degrees of freedom
## AIC: 266.23
##
## Number of Fisher Scoring iterations: 5
# Analysis-of-deviance table for fit with chi-squared tests. Terms are added
# sequentially, so each p-value depends on the order of terms in the formula.
anova(fit, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: placed
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 192 266.68
## age 1 8.4714 191 258.21 0.003608 **
## work_yrs 1 2.6614 190 255.55 0.102809
## gmat_tot 1 0.0010 189 255.54 0.975126
## gmat_qpc 1 0.0490 188 255.50 0.824850
## gmat_vpc 1 1.1592 187 254.34 0.281637
## gmat_tpc 1 4.0546 186 250.28 0.044052 *
## f_avg 1 0.1732 185 250.11 0.677243
## s_avg 1 1.8820 184 248.23 0.170104
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
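Thus age and the GMAT total percentile are significantly related to whether a student was placed. Note that these are sequential tests, so the result depends on the order in which terms enter the model; in the coefficient table above, only age is individually significant.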