This is my R Markdown document for the case study on MBA starting salaries.
# Read the data
salary.df <- read.csv("MBASalData.csv")
# View() opens a data viewer window; that is only useful in an interactive
# session, not when the document is knitted or run with Rscript.
if (interactive()) {
  View(salary.df)
}
# Summarize the data
# NOTE(review): attach() puts the columns of salary.df on the search path, so
# any later bare reference to a column name resolves to this unfiltered data
# frame. Prefer data= / with() and drop this call once nothing depends on it.
attach(salary.df)
library(psych)
# Namespace-qualified so this keeps calling psych's describe() even when a
# package attached later (Hmisc also exports describe) masks the name.
psych::describe(salary.df)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
# Draw one horizontal boxplot per survey variable. The names of this vector
# are columns of salary.df and the values are the corresponding plot titles.
boxplot_titles <- c(
  gmat_tot = "GMAT Total score",
  gmat_qpc = "quantitative GMAT percentile",
  gmat_vpc = "verbal GMAT percentile",
  gmat_tpc = "total GMAT percentile",
  s_avg    = "spring MBA average",
  f_avg    = "fall MBA average",
  work_yrs = "years of work experience",
  salary   = "Starting salary",
  satis    = "degree of satisfaction with MBA program "
)
for (column in names(boxplot_titles)) {
  boxplot(salary.df[[column]], horizontal = TRUE,
          main = boxplot_titles[[column]])
}
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
##
## describe
## The following objects are masked from 'package:base':
##
## format.pval, units
# Pairwise correlations (with p-values) between salary and age, sex and years
# of work experience.
colsalary <- c("salary", "age", "sex", "work_yrs")
# This script later removes salary values 998, 999 and 0 before modelling
# (presumably non-response / not-placed codes -- confirm against the codebook).
# Correlating the raw column treats those codes as real amounts, so restrict
# the rows here as well. Rows with a missing salary are dropped too.
# NOTE: any correlation output recorded below this call was computed on all
# 274 rows; re-knit to refresh it.
is_real_salary <- !is.na(salary.df$salary) &
  !(salary.df$salary %in% c(0, 998, 999))
corMatrix <- rcorr(as.matrix(salary.df[is_real_salary, colsalary]))
corMatrix
## salary age sex work_yrs
## salary 1.00 -0.06 0.07 0.01
## age -0.06 1.00 -0.03 0.86
## sex 0.07 -0.03 1.00 -0.01
## work_yrs 0.01 0.86 -0.01 1.00
##
## n= 274
##
##
## P
## salary age sex work_yrs
## salary 0.3020 0.2560 0.8818
## age 0.3020 0.6432 0.0000
## sex 0.2560 0.6432 0.8523
## work_yrs 0.8818 0.0000 0.8523
library(corrgram)
# Correlogram of every column of salary.df, columns kept in data-frame order.
# Shaded cells below the diagonal, pie glyphs above it, min/max on the
# diagonal together with the variable names.
# NOTE(review): salary.df is the unfiltered data; salary (and, judging by its
# maximum of 998, probably satis) still contains coded values -- confirm
# whether the filtered data should be plotted instead.
corrgram(
  salary.df,
  order       = FALSE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  diag.panel  = panel.minmax,
  text.panel  = panel.txt,
  main        = "Corrgram of MBA Salary Data"
)
# Drop respondents whose salary is one of the codes 998 / 999 (presumably
# "did not answer" style codes -- confirm against the codebook). Rows with a
# missing salary are dropped as well, exactly as which() did before.
salary_codes <- c(998, 999)
answered <- !is.na(salary.df$salary) & !(salary.df$salary %in% salary_codes)
job.df <- salary.df[answered, ]
# View() is only useful interactively; skip it when knitting / using Rscript.
if (interactive()) {
  View(job.df)
}
# Of the remaining respondents keep those with a non-zero salary.
has_salary <- !is.na(job.df$salary) & job.df$salary != 0
gotjob.df <- job.df[has_salary, ]
if (interactive()) {
  View(gotjob.df)
}
Here the response variable y is salary. Salary may depend on several factors, such as GMAT percentile, sex, first language, work experience, spring MBA average and fall MBA average, which are treated here as the predictor variables x.
# Contingency table: number of respondents at each salary value, by sex.
mytable <- xtabs(~ salary + sex, data = gotjob.df)
print(mytable)
## sex
## salary 1 2
## 64000 0 1
## 77000 1 0
## 78256 0 1
## 82000 0 1
## 85000 1 3
## 86000 0 2
## 88000 0 1
## 88500 1 0
## 90000 3 0
## 92000 2 1
## 93000 2 1
## 95000 4 3
## 96000 3 1
## 96500 1 0
## 97000 2 0
## 98000 6 4
## 99000 0 1
## 100000 4 5
## 100400 1 0
## 101000 0 2
## 101100 1 0
## 101600 1 0
## 102500 1 0
## 103000 1 0
## 104000 2 0
## 105000 11 0
## 106000 2 1
## 107000 1 0
## 107300 1 0
## 107500 1 0
## 108000 2 0
## 110000 0 1
## 112000 3 0
## 115000 5 0
## 118000 1 0
## 120000 3 1
## 126710 1 0
## 130000 1 0
## 145800 1 0
## 146000 1 0
## 162000 1 0
## 220000 0 1
# Contingency table: first-language code (rows) against salary value (columns).
mytable <- xtabs(~ frstlang + salary, data = gotjob.df)
print(mytable)
## salary
## frstlang 64000 77000 78256 82000 85000 86000 88000 88500 90000 92000 93000
## 1 1 1 1 1 4 2 1 1 3 3 3
## 2 0 0 0 0 0 0 0 0 0 0 0
## salary
## frstlang 95000 96000 96500 97000 98000 99000 100000 100400 101000 101100
## 1 7 4 1 2 8 0 9 1 2 1
## 2 0 0 0 0 2 1 0 0 0 0
## salary
## frstlang 101600 102500 103000 104000 105000 106000 107000 107300 107500
## 1 1 1 1 1 11 3 1 0 1
## 2 0 0 0 1 0 0 0 1 0
## salary
## frstlang 108000 110000 112000 115000 118000 120000 126710 130000 145800
## 1 2 1 3 5 0 4 1 1 1
## 2 0 0 0 0 1 0 0 0 0
## salary
## frstlang 146000 162000 220000
## 1 1 1 0
## 2 0 0 1
# Contingency table: salary value (rows) against years of work experience
# (columns).
mytable <- xtabs(~ salary + work_yrs, data = gotjob.df)
print(mytable)
## work_yrs
## salary 0 1 2 3 4 5 6 7 8 10 15 16
## 64000 0 0 1 0 0 0 0 0 0 0 0 0
## 77000 0 0 1 0 0 0 0 0 0 0 0 0
## 78256 0 1 0 0 0 0 0 0 0 0 0 0
## 82000 0 1 0 0 0 0 0 0 0 0 0 0
## 85000 0 1 2 1 0 0 0 0 0 0 0 0
## 86000 0 0 1 1 0 0 0 0 0 0 0 0
## 88000 0 0 0 1 0 0 0 0 0 0 0 0
## 88500 0 0 0 1 0 0 0 0 0 0 0 0
## 90000 0 0 2 0 0 1 0 0 0 0 0 0
## 92000 0 0 3 0 0 0 0 0 0 0 0 0
## 93000 0 0 0 0 1 1 0 0 1 0 0 0
## 95000 1 1 2 2 0 1 0 0 0 0 0 0
## 96000 0 1 2 0 1 0 0 0 0 0 0 0
## 96500 0 0 1 0 0 0 0 0 0 0 0 0
## 97000 0 0 0 1 1 0 0 0 0 0 0 0
## 98000 0 0 7 1 1 0 0 1 0 0 0 0
## 99000 0 0 0 0 0 1 0 0 0 0 0 0
## 100000 0 0 6 1 1 0 1 0 0 0 0 0
## 100400 0 0 0 1 0 0 0 0 0 0 0 0
## 101000 0 0 2 0 0 0 0 0 0 0 0 0
## 101100 0 0 0 0 0 0 0 0 1 0 0 0
## 101600 0 0 0 1 0 0 0 0 0 0 0 0
## 102500 0 0 0 0 0 0 1 0 0 0 0 0
## 103000 0 0 0 1 0 0 0 0 0 0 0 0
## 104000 0 0 0 0 2 0 0 0 0 0 0 0
## 105000 0 0 4 4 0 1 1 0 0 0 0 1
## 106000 0 0 0 0 0 0 2 0 1 0 0 0
## 107000 0 0 1 0 0 0 0 0 0 0 0 0
## 107300 0 0 1 0 0 0 0 0 0 0 0 0
## 107500 0 0 0 1 0 0 0 0 0 0 0 0
## 108000 0 0 0 1 1 0 0 0 0 0 0 0
## 110000 0 0 0 0 0 0 1 0 0 0 0 0
## 112000 0 0 1 0 0 0 1 0 0 0 0 1
## 115000 0 2 0 1 2 0 0 0 0 0 0 0
## 118000 0 0 0 0 0 0 0 0 0 1 0 0
## 120000 0 0 0 1 0 2 0 0 1 0 0 0
## 126710 0 0 0 1 0 0 0 0 0 0 0 0
## 130000 0 0 0 0 1 0 0 0 0 0 0 0
## 145800 0 0 1 0 0 0 0 0 0 0 0 0
## 146000 0 0 0 0 0 0 0 0 0 0 1 0
## 162000 0 1 0 0 0 0 0 0 0 0 0 0
## 220000 0 0 0 0 0 0 0 0 0 0 1 0
# t.test()'s default (x, y) method has no `data` argument, so `data = gotjob.df`
# was silently absorbed by `...` and the bare names resolved to the attached,
# unfiltered salary.df. That is why the output recorded below (which predates
# this fix) shows df = 273 and a mean salary of 39025.69; re-knit to refresh.
# Evaluating inside gotjob.df makes the test use the filtered rows.
# NOTE(review): this still compares the mean of salary with the mean of age,
# two variables on different scales; cor.test(~ salary + age, data = gotjob.df)
# may be what was intended -- confirm.
with(gotjob.df, t.test(salary, age))
##
## Welch Two Sample t-test
##
## data: salary and age
## t = 12.67, df = 273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 32938.51 45058.15
## sample estimates:
## mean of x mean of y
## 39025.68978 27.35766
# t.test()'s default (x, y) method has no `data` argument, so `data = gotjob.df`
# was silently absorbed by `...` and the bare names resolved to the attached,
# unfiltered salary.df. That is why the output recorded below (which predates
# this fix) shows df = 273 and a mean salary of 39025.69; re-knit to refresh.
# Evaluating inside gotjob.df makes the test use the filtered rows.
# NOTE(review): this still compares the mean of salary with the mean of
# work_yrs, two variables on different scales;
# cor.test(~ salary + work_yrs, data = gotjob.df) may be what was intended --
# confirm.
with(gotjob.df, t.test(salary, work_yrs))
##
## Welch Two Sample t-test
##
## data: salary and work_yrs
## t = 12.677, df = 273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 32961.99 45081.64
## sample estimates:
## mean of x mean of y
## 39025.689781 3.872263
# Model 1: salary on the academic measures only (total GMAT percentile,
# spring MBA average, fall MBA average).
m1 <- lm(salary ~ gmat_tpc + s_avg + f_avg, data = gotjob.df)
summary(m1)
##
## Call:
## lm(formula = salary ~ gmat_tpc + s_avg + f_avg, data = gotjob.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45140 -7934 -1887 3623 112357
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 114442.5 19076.0 5.999 3.26e-08 ***
## gmat_tpc -239.5 159.7 -1.499 0.1370
## s_avg 9719.3 5176.8 1.877 0.0634 .
## f_avg -6867.3 3988.1 -1.722 0.0882 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17580 on 99 degrees of freedom
## Multiple R-squared: 0.06029, Adjusted R-squared: 0.03182
## F-statistic: 2.117 on 3 and 99 DF, p-value: 0.1029
# Model 2: salary on the background variables only (years of work experience,
# first language, sex, age).
m2 <- lm(salary ~ work_yrs + frstlang + sex + age, data = gotjob.df)
summary(m2)
##
## Call:
## lm(formula = salary ~ work_yrs + frstlang + sex + age, data = gotjob.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29056 -9070 -1211 5858 79078
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51623.0 24708.1 2.089 0.0393 *
## work_yrs 916.9 1119.1 0.819 0.4146
## frstlang 10017.9 6785.8 1.476 0.1431
## sex -4655.5 3418.8 -1.362 0.1764
## age 1620.5 1089.7 1.487 0.1402
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15510 on 98 degrees of freedom
## Multiple R-squared: 0.2763, Adjusted R-squared: 0.2468
## F-statistic: 9.355 on 4 and 98 DF, p-value: 1.907e-06
# Model 3: background variables plus the spring and fall MBA averages.
# The formula previously began with a stray unary `+` (`salary ~ + frstlang`).
# It is a no-op for the fit, so coefficients are unchanged; only the "Call:"
# line echoed by summary() loses the leading `+`.
m3 <- lm(
  salary ~ frstlang + sex + s_avg + f_avg + age + work_yrs,
  data = gotjob.df
)
summary(m3)
##
## Call:
## lm(formula = salary ~ +frstlang + sex + s_avg + f_avg + age +
## work_yrs, data = gotjob.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30413 -8757 -1921 5966 81580
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45104.8 27753.7 1.625 0.107
## frstlang 11264.2 7061.9 1.595 0.114
## sex -4954.4 3490.7 -1.419 0.159
## s_avg 3456.5 4953.5 0.698 0.487
## f_avg -603.8 3812.6 -0.158 0.874
## age 1499.4 1113.9 1.346 0.181
## work_yrs 915.9 1130.2 0.810 0.420
##
## Residual standard error: 15620 on 96 degrees of freedom
## Multiple R-squared: 0.2804, Adjusted R-squared: 0.2354
## F-statistic: 6.233 on 6 and 96 DF, p-value: 1.481e-05
# Model 4: salary on years of work experience and first language.
m4 <- lm(salary ~ work_yrs + frstlang, data = gotjob.df)
summary(m4)
##
## Call:
## lm(formula = salary ~ work_yrs + frstlang, data = gotjob.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33972 -8955 -455 4545 76681
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 79941.4 6788.8 11.775 < 2e-16 ***
## work_yrs 2483.3 527.9 4.704 8.18e-06 ***
## frstlang 13064.0 6283.2 2.079 0.0402 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15740 on 100 degrees of freedom
## Multiple R-squared: 0.2396, Adjusted R-squared: 0.2244
## F-statistic: 15.75 on 2 and 100 DF, p-value: 1.128e-06
# Model 5: salary on years of work experience, sex and age.
m5 <- lm(salary ~ work_yrs + sex + age, data = gotjob.df)
summary(m5)
##
## Call:
## lm(formula = salary ~ work_yrs + sex + age, data = gotjob.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29250 -9239 -1146 5429 84318
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45674.9 24522.2 1.863 0.0655 .
## work_yrs 478.2 1085.3 0.441 0.6604
## sex -3852.2 3395.2 -1.135 0.2593
## age 2263.4 1004.8 2.253 0.0265 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15600 on 99 degrees of freedom
## Multiple R-squared: 0.2602, Adjusted R-squared: 0.2378
## F-statistic: 11.61 on 3 and 99 DF, p-value: 1.389e-06
# Model 6: salary on years of work experience and age.
m6 <- lm(salary ~ work_yrs + age, data = gotjob.df)
summary(m6)
##
## Call:
## lm(formula = salary ~ work_yrs + age, data = gotjob.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31675 -8099 -2108 4411 80650
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36967.5 23323.8 1.585 0.1161
## work_yrs 388.8 1084.0 0.359 0.7206
## age 2413.8 997.4 2.420 0.0173 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15620 on 100 degrees of freedom
## Multiple R-squared: 0.2506, Adjusted R-squared: 0.2356
## F-statistic: 16.72 on 2 and 100 DF, p-value: 5.438e-07
# Model 7: simple regression of salary on age alone.
m7 <- lm(
  salary ~ age,
  data = gotjob.df
)
summary(m7)
##
## Call:
## lm(formula = salary ~ age, data = gotjob.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31454 -8533 -2182 4546 80886
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29962.6 12697.8 2.360 0.0202 *
## age 2728.8 470.7 5.797 7.75e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15550 on 101 degrees of freedom
## Multiple R-squared: 0.2496, Adjusted R-squared: 0.2422
## F-statistic: 33.6 on 1 and 101 DF, p-value: 7.748e-08
# Model 8: salary on the spring and fall MBA averages plus age.
m8 <- lm(salary ~ s_avg + f_avg + age, data = gotjob.df)
summary(m8)
##
## Call:
## lm(formula = salary ~ s_avg + f_avg + age, data = gotjob.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32079 -8073 -2362 4671 82120
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28077.5 18901.4 1.485 0.141
## s_avg 1435.6 4795.8 0.299 0.765
## f_avg -469.6 3765.6 -0.125 0.901
## age 2687.6 508.7 5.283 7.55e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15700 on 99 degrees of freedom
## Multiple R-squared: 0.2503, Adjusted R-squared: 0.2276
## F-statistic: 11.02 on 3 and 99 DF, p-value: 2.635e-06
# Model 9: the full model -- every background and academic predictor used in
# the earlier models.
m9 <- lm(
  salary ~ work_yrs + frstlang + gmat_tpc + sex + age + s_avg + f_avg,
  data = gotjob.df
)
summary(m9)
##
## Call:
## lm(formula = salary ~ work_yrs + frstlang + gmat_tpc + sex +
## age + s_avg + f_avg, data = gotjob.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32354 -9144 -1995 6557 78985
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53884.7 29504.9 1.826 0.0709 .
## work_yrs 775.0 1142.7 0.678 0.4993
## frstlang 10442.9 7130.6 1.465 0.1464
## gmat_tpc -128.9 145.8 -0.885 0.3786
## sex -5095.5 3498.3 -1.457 0.1485
## age 1578.8 1118.7 1.411 0.1614
## s_avg 4030.7 5001.4 0.806 0.4223
## f_avg -669.5 3817.7 -0.175 0.8612
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15640 on 95 degrees of freedom
## Multiple R-squared: 0.2862, Adjusted R-squared: 0.2336
## F-statistic: 5.442 on 7 and 95 DF, p-value: 2.828e-05
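From the above analysis of all the models, I found that the adjusted R-squared value is highest for model 2 (m2), at 0.2468. So model 2 is the best-fitting of the models considered. Although 24.68% is not a high share of explained variance, we can still conclude that starting salary mostly depends upon work experience, first language, sex and age. Note, however, that none of the individual predictors in m2 is significant at the 5% level, and the age-only model (m7) reaches almost the same adjusted R-squared (0.2422), so age appears to carry most of the explanatory power.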