# Load the MBA starting-salaries survey (path is relative to the working
# directory) and print each column's type.
# The file name is passed directly: wrapping a single string in
# paste(..., sep = "") returns it unchanged.
MBASalaryData.df <- read.csv("MBA Starting Salaries Data.csv")
str(MBASalaryData.df)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : int 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: int 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
# sex, quarter and frstlang are read in as integer codes; recode all three as
# factors in one step, then re-inspect the structure.
MBASalaryData.df <- transform(
  MBASalaryData.df,
  sex = as.factor(sex),
  quarter = as.factor(quarter),
  frstlang = as.factor(frstlang)
)
str(MBASalaryData.df)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : Factor w/ 2 levels "1","2": 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
# Per-column summary of the full data set: quartiles and mean for the numeric
# columns, level counts for the three factors.
# NOTE(review): salary and satis take the values 998/999 in the str() output;
# these look like survey response codes rather than measurements, so the means
# reported for those two columns are probably not meaningful -- confirm
# against the data dictionary.
summary(MBASalaryData.df)
## age sex gmat_tot gmat_qpc gmat_vpc
## Min. :22.00 1:206 Min. :450.0 Min. :28.00 Min. :16.00
## 1st Qu.:25.00 2: 68 1st Qu.:580.0 1st Qu.:72.00 1st Qu.:71.00
## Median :27.00 Median :620.0 Median :83.00 Median :81.00
## Mean :27.36 Mean :619.5 Mean :80.64 Mean :78.32
## 3rd Qu.:29.00 3rd Qu.:660.0 3rd Qu.:93.00 3rd Qu.:91.00
## Max. :48.00 Max. :790.0 Max. :99.00 Max. :99.00
## gmat_tpc s_avg f_avg quarter work_yrs
## Min. : 0.0 Min. :2.000 Min. :0.000 1:69 Min. : 0.000
## 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750 2:70 1st Qu.: 2.000
## Median :87.0 Median :3.000 Median :3.000 3:70 Median : 3.000
## Mean :84.2 Mean :3.025 Mean :3.062 4:65 Mean : 3.872
## 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250 3rd Qu.: 4.000
## Max. :99.0 Max. :4.000 Max. :4.000 Max. :22.000
## frstlang salary satis
## 1:242 Min. : 0 Min. : 1.0
## 2: 32 1st Qu.: 0 1st Qu.: 5.0
## Median : 999 Median : 6.0
## Mean : 39026 Mean :172.2
## 3rd Qu.: 97000 3rd Qu.: 7.0
## Max. :220000 Max. :998.0
# Extended descriptive statistics for every column via psych::describe():
# n, mean, sd, median, trimmed mean, mad, min, max, range, skew, kurtosis, se.
library(psych)
describe(MBASalaryData.df)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex* 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter* 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang* 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex* 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter* 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang* 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
# Placed students: every row whose salary field is 999 or more. This keeps the
# respondents recorded as 999 (placed, salary not disclosed) alongside those
# with an actual salary figure.
Job.df <- subset(MBASalaryData.df, salary >= 999)
View(Job.df)
# Students who were not placed: salary recorded as 0.
noJob.df <- subset(MBASalaryData.df, salary == 0)
View(noJob.df)
# Mean and median starting salary of placed students who disclosed a figure.
# Job.df also holds rows with salary == 999 (placed, salary not disclosed);
# that value is a placeholder, not pay, so it is excluded from both statistics.
disclosed_salary <- Job.df$salary[Job.df$salary > 999]
avg_job <- mean(disclosed_salary)
avg_job
## (re-run to refresh; the previously recorded 77153.12 included the 999 rows)
median_job <- median(disclosed_salary)
median_job
## (re-run to refresh; the previously recorded 97000 included the 999 rows)
# Scatterplot of starting salary against years of prior work experience for
# the students in Job.df.
# `horizontal` is a boxplot() argument, not a graphical parameter, so passing
# it to plot() only produced "not a graphical parameter" warnings; it is
# dropped here and the figure is unchanged.
plot(salary ~ work_yrs, data = Job.df,
     main = "Visualization of Salary and Work Experience",
     xlab = "Years of Work Experience", ylab = "Starting Salary")
# Horizontal boxplots of starting salary, one box per work_yrs value. With
# horizontal = TRUE the work_yrs groups run along the y axis and salary along
# the x axis, which is why the axis labels are assigned this way round.
boxplot(
  salary ~ work_yrs,
  data = Job.df,
  horizontal = TRUE,
  col = c("red", "blue", "peachpuff", "yellow", "green", "pink"),
  main = "Plot of Salary with Work Experience",
  xlab = "Starting Salary",
  ylab = "Years of Work Experience"
)
# Scatterplot of starting salary (y) against gmat_tpc (x) for the students in
# Job.df.
with(
  Job.df,
  plot(gmat_tpc, salary,
       main = "Visualization of Salary with Overall GMAT score",
       xlab = "GMAT Overall Score",
       ylab = "Starting Salary")
)
# Correlogram of the Job.df variables in their original column order
# (order = FALSE): shaded cells in the lower panel, pie glyphs in the upper
# panel, and panel.txt for the text panel.
# `Job.df[, names(Job.df)]` selected every column and is simply `Job.df`;
# the title typo "places students" is corrected to "placed students".
library(corrgram)
corrgram(Job.df, order = FALSE,
         main = "Corrgram of dataset variables of placed students",
         lower.panel = panel.shade, upper.panel = panel.pie,
         text.panel = panel.txt)
# Salary distributions for the students in Job.df, drawn with lattice as one
# panel per factor level. lattice only needs attaching once.
library(lattice)
# Panels by sex (1 = Male; 2 = Female).
histogram(~ salary | sex, data = Job.df)
# Panels by first language (1 = English; 2 = other).
histogram(~ salary | frstlang, data = Job.df)
# Four scatterplots of starting salary (y axis) for the students in Job.df,
# against age, satis, s_avg and f_avg in turn.
with(
  Job.df,
  plot(age, salary,
       main = "Visualization of Salary with Age",
       xlab = "Age",
       ylab = "Starting Salary")
)
with(
  Job.df,
  plot(satis, salary,
       main = "Visualization of Salary and satisfaction",
       xlab = "Satisfies/not satisfied",
       ylab = "Starting Salary")
)
with(
  Job.df,
  plot(s_avg, salary,
       main = "Visualization of Salary and spring average performance",
       xlab = "Spring average performance",
       ylab = "Starting Salary")
)
with(
  Job.df,
  plot(f_avg, salary,
       main = "Visualization of Salary and fall average performance",
       xlab = "Fall average performance",
       ylab = "Starting Salary")
)
# Mean recorded salary in Job.df for each level of sex.
# NOTE(review): Job.df keeps the salary == 999 rows, so each mean mixes that
# value in with actual salaries -- confirm this is intended.
aggregate(salary ~ sex, data = Job.df, mean)
## sex salary
## 1 1 74390.98
## 2 2 84979.19
## 1=Male; 2=Female
# Mean recorded salary in Job.df for each distinct value of work_yrs.
# NOTE(review): Job.df keeps the salary == 999 rows; a group mean of exactly
# 999 would mean the group contains only such rows -- confirm whether they
# should be excluded before averaging.
aggregate(salary ~ work_yrs, data = Job.df, mean)
## work_yrs salary
## 1 0 47999.50
## 2 1 83025.40
## 3 2 84490.77
## 4 3 73886.28
## 5 4 53226.77
## 6 5 80444.22
## 7 6 82610.89
## 8 7 33332.67
## 9 8 105025.00
## 10 9 999.00
## 11 10 118000.00
## 12 15 183000.00
## 13 16 108500.00
# Mean recorded salary in Job.df for each distinct age.
# NOTE(review): as with the other salary means, rows with salary == 999 are
# included -- confirm whether they should be excluded before averaging.
aggregate(salary ~ age, data = Job.df, mean)
## age salary
## 1 22 85000.00
## 2 23 91651.20
## 3 24 90349.89
## 4 25 78792.90
## 5 26 75173.95
## 6 27 68475.86
## 7 28 75636.09
## 8 29 56135.91
## 9 30 73610.78
## 10 31 67333.00
## 11 32 36432.67
## 12 33 118000.00
## 13 34 105000.00
## 14 39 112000.00
## 15 40 183000.00
# Mean work_yrs and mean age in Job.df for each level of sex; cbind() on the
# left-hand side aggregates both columns in one call.
aggregate(cbind(work_yrs, age) ~ sex, data = Job.df, mean)
## sex work_yrs age
## 1 1 3.803922 27.24510
## 2 2 3.277778 26.13889
## 1=Male; 2=Female
# Cross-tabulate every distinct salary value in Job.df against sex, then run
# Pearson's chi-squared test on that contingency table.
# NOTE(review): salary is treated as a category here, so the table has one row
# per distinct salary value and its cells are presumably very sparse; a
# two-sample comparison (e.g. t.test() or wilcox.test()) may be the more
# appropriate way to compare salaries between the two groups -- confirm.
sal_sex <- xtabs(~salary+sex, data=Job.df)
chisq.test(sal_sex)
##
## Pearson's Chi-squared test
##
## data: sal_sex
## X-squared = 60.869, df = 42, p-value = 0.02987
# Cross-tabulate every distinct salary value in Job.df against frstlang, then
# run Pearson's chi-squared test on that contingency table.
# NOTE(review): salary is treated as a category here, so the table has one row
# per distinct salary value and its cells are presumably very sparse; a
# two-sample comparison (e.g. t.test() or wilcox.test()) may be the more
# appropriate way to compare salaries between the two groups -- confirm.
sal_lang <- xtabs(~salary+frstlang, data=Job.df)
chisq.test(sal_lang)
##
## Pearson's Chi-squared test
##
## data: sal_lang
## X-squared = 52.285, df = 42, p-value = 0.1328
## first language (1=English; 2=other)
In this model we consider the largest set of potentially important variables. 1) Independent variables: work_yrs, sex, gmat_qpc, gmat_vpc, gmat_tpc, s_avg, f_avg, frstlang, satis. 2) Dependent variable: salary.
# Model A: linear regression of salary on work_yrs, sex, gmat_qpc, gmat_vpc,
# gmat_tpc, s_avg, f_avg, frstlang and satis, fitted to the rows of Job.df.
# summary() prints the coefficient table, R-squared and overall F-test.
# NOTE(review): Job.df keeps the salary == 999 rows; if 999 marks an
# undisclosed salary rather than pay, those rows distort the fit -- confirm.
fit_a <- lm(salary ~ work_yrs+sex+gmat_qpc+gmat_vpc+gmat_tpc+s_avg+f_avg+frstlang+satis, data = Job.df)
summary(fit_a)
##
## Call:
## lm(formula = salary ~ work_yrs + sex + gmat_qpc + gmat_vpc +
## gmat_tpc + s_avg + f_avg + frstlang + satis, data = Job.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -102066 -21162 7023 24234 129791
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -94218.4 42592.5 -2.212 0.02873 *
## work_yrs 1486.8 1341.7 1.108 0.26986
## sex2 6582.5 7832.9 0.840 0.40227
## gmat_qpc -404.5 364.8 -1.109 0.26955
## gmat_vpc -713.1 326.3 -2.185 0.03067 *
## gmat_tpc 594.5 486.8 1.221 0.22422
## s_avg 32113.7 10752.2 2.987 0.00338 **
## f_avg -1726.0 9014.2 -0.191 0.84845
## frstlang2 -15675.1 12231.1 -1.282 0.20231
## satis 20286.7 3133.3 6.475 1.85e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38810 on 128 degrees of freedom
## Multiple R-squared: 0.3669, Adjusted R-squared: 0.3223
## F-statistic: 8.241 on 9 and 128 DF, p-value: 1.321e-09
In this model we consider only the variables whose effect Daer especially wants to see, as listed in the last paragraph of the case study. 1) Independent variables: age, sex, gmat_tpc, frstlang, satis. 2) Dependent variable: salary.
# Model B: linear regression of salary on the smaller predictor set age, sex,
# gmat_tpc, frstlang and satis, fitted to the rows of Job.df.
# summary() prints the coefficient table, R-squared and overall F-test.
# NOTE(review): Job.df keeps the salary == 999 rows; if 999 marks an
# undisclosed salary rather than pay, those rows distort the fit -- confirm.
fit_b <- lm(salary ~ age+sex+gmat_tpc+frstlang+satis, data = Job.df)
summary(fit_b)
##
## Call:
## lm(formula = salary ~ age + sex + gmat_tpc + frstlang + satis,
## data = Job.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -112089 -21032 9688 24751 110605
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -64941.4 45251.6 -1.435 0.154
## age 1618.5 1208.4 1.339 0.183
## sex2 11308.8 7995.9 1.414 0.160
## gmat_tpc -213.2 270.3 -0.789 0.432
## frstlang2 -16054.1 11596.5 -1.384 0.169
## satis 20868.8 3219.8 6.481 1.66e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40490 on 132 degrees of freedom
## Multiple R-squared: 0.2893, Adjusted R-squared: 0.2623
## F-statistic: 10.74 on 5 and 132 DF, p-value: 1.113e-08
The first model is the better one: it has more statistically significant variables and a higher adjusted R-squared (0.3223 versus 0.2623).
The explanatory variables whose beta-coefficients are statistically significant (p < 0.05) in the first model are gmat_vpc, s_avg and satis.
The p-value of the whole model is 1.321e-09, which is much less than 0.05; therefore, the model as a whole is a good model for the prediction of salary.
The model passes the overall F-test (F = 8.241 on 9 and 128 degrees of freedom).
According to the adjusted R-squared, the predictor variables together explain approximately 32.23% of the variance in salary. Since this is only about a third, the variables considered capture a limited part of what drives salary, and it would be good to incorporate more factors.
There is a strong positive relationship between student satisfaction and starting salary.
It can be said that the students who were not very satisfied with the MBA Program did not get placed and students who were satisfied with the MBA Program got placements with reported starting salaries. Therefore, satisfaction with the MBA Program is a big factor which drives placement after MBA.
Also, performance in the GMAT verbal section is a significant variable affecting the placement process. Therefore, a student whose score in the GMAT verbal section is above a certain threshold is assumed to have good communication skills as well, and this becomes an important factor in getting a placement after the MBA.
It is seen from the first graph that students with roughly 2 - 4 years of prior work experience earn a higher starting salary than others. Therefore, prior work experience might play a role in getting a placement after the MBA.
# Check for missing values: count the NA entries in each column of the data
# frame. (The number of unique values per column is computed further down.)
# vapply() with integer(1) is used instead of sapply() so the result is
# guaranteed to be a named integer vector whatever the input looks like.
vapply(MBASalaryData.df, function(x) sum(is.na(x)), integer(1))
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## 0 0 0 0 0 0 0 0
## quarter work_yrs frstlang salary satis
## 0 0 0 0 0
# Number of distinct values in each column. vapply() with integer(1) is used
# instead of sapply() so the result is always a named integer vector.
vapply(MBASalaryData.df, function(x) length(unique(x)), integer(1))
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## 21 2 31 48 34 42 36 21
## quarter work_yrs frstlang salary satis
## 4 18 2 45 8
# A visual check of missing values: the Amelia package's missmap() plots the
# data set and highlights the cells that are missing (NA).
library(Amelia)
missmap(MBASalaryData.df, main = "Missing values vs observed")
# The map shows no NA cells, in line with the all-zero NA counts per column.
# NOTE(review): salary and satis contain the values 998/999, which look like
# codes for unanswered or undisclosed responses; if so, they are missing data
# that neither is.na() nor missmap() can detect -- confirm against the data
# dictionary.
# Split the rows by position into two chunks: a training set (rows 1 to 194)
# intended for fitting the model and a test set (rows 195 to 274) intended for
# evaluating it.
train <- MBASalaryData.df[seq_len(194), ]
test <- MBASalaryData.df[seq(from = 195, to = 274), ]
# Fit a logistic regression: glm() with family = binomial and a logit link.
# frstlang (a two-level factor) was chosen as the response because this model
# needs a categorical dependent variable; `~ .` uses every other column as a
# predictor.
# NOTE(review): the model is fitted on the full MBASalaryData.df, not on the
# `train` subset, so the `test` rows are not held out from the fit -- confirm
# whether data = train was intended.
# NOTE(review): salary and satis enter as numeric predictors with their
# 998/999 values included; if those are response codes they should presumably
# be recoded first -- confirm.
model <- glm(frstlang ~.,family=binomial(link='logit'),data=MBASalaryData.df)
# summary() prints the coefficient table, the null and residual deviance and
# the AIC of the fitted model.
summary(model)
##
## Call:
## glm(formula = frstlang ~ ., family = binomial(link = "logit"),
## data = MBASalaryData.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9514 -0.3900 -0.2215 -0.1128 3.1951
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.508e+00 7.077e+00 -1.061 0.288690
## age 4.865e-01 1.385e-01 3.514 0.000442 ***
## sex2 7.613e-01 5.815e-01 1.309 0.190482
## gmat_tot 5.945e-03 1.999e-02 0.297 0.766157
## gmat_qpc 1.712e-02 7.747e-02 0.221 0.825118
## gmat_vpc -1.231e-01 5.732e-02 -2.148 0.031725 *
## gmat_tpc 7.036e-02 8.761e-02 0.803 0.421932
## s_avg -3.573e+00 1.926e+00 -1.855 0.063586 .
## f_avg 9.554e-01 9.388e-01 1.018 0.308807
## quarter2 -6.544e-01 9.648e-01 -0.678 0.497614
## quarter3 -8.922e-01 1.292e+00 -0.690 0.489971
## quarter4 -2.653e+00 1.716e+00 -1.546 0.121997
## work_yrs -4.621e-01 1.596e-01 -2.896 0.003780 **
## salary 4.061e-07 5.305e-06 0.077 0.938978
## satis 1.131e-03 6.317e-04 1.791 0.073287 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 197.54 on 273 degrees of freedom
## Residual deviance: 126.81 on 259 degrees of freedom
## AIC: 156.81
##
## Number of Fisher Scoring iterations: 7