# Load the MBA starting-salary survey and print descriptive statistics
# (psych::describe) for every column. The file name is a plain literal, so it
# is passed to read.csv() directly instead of through a no-op paste().
library(psych)
start_salary <- read.csv("MBA Starting Salaries Data.csv")
describe(start_salary)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
**The summary values for Salary and Satisfaction are not correct due to lack of data: unavailable responses are stored as the placeholder codes 998 and 999.
# Split the survey by what is known about each student's starting salary.
# The filters below treat the `salary` column as a mix of real salaries and codes:
#   0   = the student did not get placed
#   998 = the student did not answer the survey
#   999 = the student did not disclose the starting salary
#   anything above 999 = an actual starting salary
# which() is kept so that rows with a missing salary are dropped rather than
# returned as all-NA rows.

# Dataframe for MBA students who got placed
Placed <- start_salary[which(start_salary$salary > 999), ]
# Dataframe for MBA students who did not get placed
Not_Placed <- start_salary[which(start_salary$salary == 0), ]
# Dataframe for MBA students who did not disclose their starting salary
Salary_not_disclosed <- start_salary[which(start_salary$salary == 999), ]
# Dataframe for MBA students who did not answer the survey
Not_answered <- start_salary[which(start_salary$salary == 998), ]
# Dataframe for MBA students whose starting salary is known
# (not placed, i.e. 0, or an actual salary)
Salary_known <- start_salary[
  which((start_salary$salary == 0) | (start_salary$salary > 999)),
]
# Dataframe for MBA students whose satisfaction is known
# (keeps satisfaction values below 8, which drops the large placeholder codes)
Satis_known <- start_salary[which(start_salary$satis < 8), ]

# View() opens a data viewer, so it is only called in an interactive session;
# non-interactive runs (e.g. Rscript) skip it instead of failing when no
# viewer is available.
if (interactive()) {
  View(Placed)
  View(Not_Placed)
  View(Salary_not_disclosed)
  View(Not_answered)
  View(Salary_known)
  View(Satis_known)
}
______________________________________________________________________________________________________ ** Students who did not participate in the survey or did not disclose their starting salary have no usable salary data, so they are not included in the Salary_known dataframe.
** Students who did not participate in the survey have no satisfaction level, so they are not included in the Satis_known dataframe. ________________________________________________________________________________________________________
# Horizontal box plot of student age across the whole survey.
boxplot(
  start_salary[["age"]],
  col = "tomato",
  xlab = "Age",
  main = "Box Plot for Age",
  horizontal = TRUE
)
## Bar plot for Sex
# Counts of students per sex code, drawn as two bars.
# The upper y-limit is derived from the counts: the describe() output reports
# 274 rows split over two sex codes, so a fixed limit of 105 is lower than the
# tallest bar. The limit never drops below the previous value of 105.
# NOTE(review): names.arg assumes code 1 = Male and 2 = Female; confirm
# against the data dictionary.
counts_sex <- table(start_salary$sex)
barplot(
  counts_sex,
  width = 1,
  space = 1,
  main = "Gender Distribution",
  xlab = "Gender",
  col = c("darkblue", "pink"),
  names.arg = c("Male", "Female"),
  ylim = c(0, max(105, 1.05 * max(counts_sex))),
  xlim = c(0, 10)
)
## Box plots for GMAT scores, MBA averages and work experience
# One horizontal box plot per column of start_salary, drawn in the order
# listed. Titles are reproduced verbatim, including trailing spaces.
# local() keeps the helper table and loop variable out of the workspace.
local({
  box_specs <- list(
    list(column = "gmat_tot", title = "Box Plot for GMAT Total",
         axis = "Score", fill = "maroon"),
    list(column = "gmat_qpc", title = "Box Plot for Quantitative GMAT Percentile",
         axis = "Percentile", fill = "gold"),
    list(column = "gmat_vpc", title = "Box Plot for Verbal GMAT Percentile ",
         axis = "Percentile", fill = "dodgerblue"),
    list(column = "gmat_tpc", title = "Box Plot for Overall GMAT Percentile ",
         axis = "Percentile", fill = "tan"),
    list(column = "s_avg", title = "Box Plot for Spring MBA Average ",
         axis = "Average", fill = "navy"),
    list(column = "f_avg", title = "Box Plot for Fall MBA Average ",
         axis = "Average", fill = "peachpuff"),
    list(column = "work_yrs", title = "Box Plot for Years of Work Experience ",
         axis = "Years", fill = "chocolate")
  )
  for (spec in box_specs) {
    boxplot(
      start_salary[[spec$column]],
      horizontal = TRUE,
      main = spec$title,
      xlab = spec$axis,
      col = spec$fill
    )
  }
})
# Bar plot of students per first-language code.
# The upper y-limit is derived from the counts: the describe() output reports
# 274 rows with a mean frstlang of 1.12, so most rows fall in one category and
# a fixed limit of 105 is lower than the tallest bar. The limit never drops
# below the previous value of 105.
# NOTE(review): names.arg assumes code 1 = English and 2 = Other; confirm
# against the data dictionary.
counts_lang <- table(start_salary$frstlang)
barplot(
  counts_lang,
  width = 1,
  space = 1,
  main = "First Language of Students ",
  xlab = "Language",
  col = c("purple", "darkgreen"),
  names.arg = c("English", "Other"),
  ylim = c(0, max(105, 1.05 * max(counts_lang))),
  xlim = c(0, 10)
)
# Horizontal box plot of starting salary, restricted to students whose
# salary is known (Salary_known).
boxplot(
  Salary_known[["salary"]],
  col = "darksalmon",
  xlab = "Salary ",
  main = "Box Plot for Starting Salary ",
  horizontal = TRUE
)
# Histogram of satisfaction, restricted to students whose satisfaction is
# known (Satis_known).
hist(
  Satis_known[["satis"]],
  main = "Satisfaction distribution",
  xlab = "Satisfaction level,1=low 7=high",
  col = "orange",
  breaks = 5
)
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Scatterplot matrix of salary against age, GMAT total, MBA averages and work
# experience, grouped by sex. It is drawn from Salary_known rather than
# start_salary so that the 998/999 placeholder codes in the salary column are
# not plotted as if they were real salaries.
scatterplotMatrix(
  ~ salary + age + gmat_tot + s_avg + f_avg + work_yrs | sex,
  data = Salary_known,
  main = "Variation of Salary with other variables"
)
library(corrgram)
## Warning: replacing previous import by 'magrittr::%>%' when loading
## 'dendextend'
# Correlogram of every column of Salary_known: shaded cells below the
# diagonal, pie glyphs above it, variable names on the diagonal.
corrgram(
  Salary_known,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "MBA starting salary analysis Corrgram"
)
library(corrplot)
## corrplot 0.84 loaded
# Correlation plot for the selected columns of Salary_known
# (sex, first language and salary are left out).
corr <- subset(
  Salary_known,
  select = c(
    age, work_yrs, gmat_tot, gmat_qpc, gmat_vpc, gmat_tpc,
    s_avg, f_avg, quarter, satis
  )
)
corr_final <- cor(corr)
corrplot(corr_final, method = "circle")
# Chi-squared test of independence between starting salary and sex.
# xtabs() cross-tabulates each distinct salary value against the sex code.
chi1 <- xtabs(formula = ~ salary + sex, data = Salary_known)
chisq.test(x = chi1)
## Warning in chisq.test(chi1): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: chi1
## X-squared = 55.494, df = 42, p-value = 0.07929
-> The p-value is 0.07929, which is above 0.05; this signifies that Salary has no significant dependency on the Sex of the students at the 5% level (the dependency is significant only at the 10% level).
# Chi-squared test of independence between starting salary and quartile ranking.
# xtabs() cross-tabulates each distinct salary value against the quarter code.
chi2 <- xtabs(formula = ~ salary + quarter, data = Salary_known)
chisq.test(x = chi2)
## Warning in chisq.test(chi2): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: chi2
## X-squared = 132.64, df = 126, p-value = 0.3252
-> The p-value is 0.3252 (above 0.05), which signifies that Salary has no significant dependency on the Quartile Ranking.
# Chi-squared test of independence between starting salary and first language.
# xtabs() cross-tabulates each distinct salary value against the language code.
chi3 <- xtabs(formula = ~ salary + frstlang, data = Salary_known)
chisq.test(x = chi3)
## Warning in chisq.test(chi3): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: chi3
## X-squared = 62.016, df = 42, p-value = 0.02384
-> The p-value is 0.02384 (below 0.05), which signifies that Salary has a significant dependency on the First Language spoken by the student.
# Pairwise correlation matrix of all columns of Salary_known, rounded to
# three decimal places.
round(
  cor(Salary_known),
  digits = 3
)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## age 1.000 -0.032 -0.126 -0.221 -0.007 -0.132 0.164 -0.034
## sex -0.032 1.000 -0.044 -0.168 0.099 -0.013 0.073 0.043
## gmat_tot -0.126 -0.044 1.000 0.743 0.753 0.879 0.144 0.101
## gmat_qpc -0.221 -0.168 0.743 1.000 0.175 0.691 0.019 0.130
## gmat_vpc -0.007 0.099 0.753 0.175 1.000 0.688 0.191 0.033
## gmat_tpc -0.132 -0.013 0.879 0.691 0.688 1.000 0.189 0.110
## s_avg 0.164 0.073 0.144 0.019 0.191 0.189 1.000 0.521
## f_avg -0.034 0.043 0.101 0.130 0.033 0.110 0.521 1.000
## quarter -0.077 -0.087 -0.084 0.009 -0.139 -0.129 -0.735 -0.382
## work_yrs 0.872 -0.024 -0.174 -0.241 -0.041 -0.166 0.159 -0.048
## frstlang 0.098 -0.008 -0.096 0.095 -0.295 -0.108 -0.126 -0.056
## salary -0.130 0.019 0.000 0.028 0.003 0.061 0.096 0.009
## satis -0.074 -0.062 0.080 -0.020 0.195 0.133 -0.046 -0.115
## quarter work_yrs frstlang salary satis
## age -0.077 0.872 0.098 -0.130 -0.074
## sex -0.087 -0.024 -0.008 0.019 -0.062
## gmat_tot -0.084 -0.174 -0.096 0.000 0.080
## gmat_qpc 0.009 -0.241 0.095 0.028 -0.020
## gmat_vpc -0.139 -0.041 -0.295 0.003 0.195
## gmat_tpc -0.129 -0.166 -0.108 0.061 0.133
## s_avg -0.735 0.159 -0.126 0.096 -0.046
## f_avg -0.382 -0.048 -0.056 0.009 -0.115
## quarter 1.000 -0.126 0.090 -0.147 0.068
## work_yrs -0.126 1.000 -0.003 -0.053 -0.008
## frstlang 0.090 -0.003 1.000 0.007 -0.136
## salary -0.147 -0.053 0.007 1.000 0.156
## satis 0.068 -0.008 -0.136 0.156 1.000
# Model 1: linear regression of starting salary on age, sex, the GMAT
# percentiles, both MBA averages, work experience, first language and
# satisfaction, fitted on students whose salary is known.
regress <- lm(
  formula = salary ~ age + sex + gmat_qpc + gmat_vpc + gmat_tpc + s_avg +
    f_avg + work_yrs + frstlang + satis,
  data = Salary_known
)
summary(regress)
##
## Call:
## lm(formula = salary ~ age + sex + gmat_qpc + gmat_vpc + gmat_tpc +
## s_avg + f_avg + work_yrs + frstlang + satis, data = Salary_known)
##
## Residuals:
## Min 1Q Median 3Q Max
## -93924 -48912 20019 44376 179796
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40696.2 70338.2 0.579 0.5636
## age -4376.4 1916.4 -2.284 0.0235 *
## sex 1749.0 8733.1 0.200 0.8415
## gmat_qpc -246.8 442.3 -0.558 0.5775
## gmat_vpc -381.8 406.9 -0.938 0.3494
## gmat_tpc 571.8 652.3 0.876 0.3819
## s_avg 22054.8 12502.0 1.764 0.0794 .
## f_avg -5868.9 8794.0 -0.667 0.5054
## work_yrs 3252.9 2171.5 1.498 0.1359
## frstlang 13151.7 15529.4 0.847 0.3982
## satis 9967.1 5137.4 1.940 0.0539 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 52430 on 182 degrees of freedom
## Multiple R-squared: 0.07776, Adjusted R-squared: 0.02709
## F-statistic: 1.535 on 10 and 182 DF, p-value: 0.1301
# Model 2: same regression as Model 1 without gmat_vpc and s_avg.
# This reuses the name `regress`, so the Model 1 fit is overwritten here.
regress <- lm(
  formula = salary ~ age + sex + gmat_qpc + gmat_tpc + f_avg + work_yrs +
    frstlang + satis,
  data = Salary_known
)
summary(regress)
##
## Call:
## lm(formula = salary ~ age + sex + gmat_qpc + gmat_tpc + f_avg +
## work_yrs + frstlang + satis, data = Salary_known)
##
## Residuals:
## Min 1Q Median 3Q Max
## -80262 -52304 20427 44021 169869
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 68519.7 67901.0 1.009 0.3142
## age -4385.4 1915.6 -2.289 0.0232 *
## sex 1776.0 8764.8 0.203 0.8397
## gmat_qpc -153.5 382.5 -0.401 0.6886
## gmat_tpc 294.5 417.2 0.706 0.4811
## f_avg 2627.6 7477.6 0.351 0.7257
## work_yrs 3671.7 2171.7 1.691 0.0926 .
## frstlang 14674.3 15160.5 0.968 0.3343
## satis 9335.3 5152.1 1.812 0.0716 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 52720 on 184 degrees of freedom
## Multiple R-squared: 0.05736, Adjusted R-squared: 0.01637
## F-statistic: 1.4 on 8 and 184 DF, p-value: 0.1992
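-> The best model that can be considered for estimation is Model 1. It has Multiple R-squared = 0.07776 (adjusted 0.02709) and an overall p-value = 0.1301, both better than Model 2 (Multiple R-squared = 0.05736, adjusted 0.01637, p-value = 0.1992), although neither model is significant at the 5% level.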
# Flag every student with a known salary as placed (1) or not placed (0):
# a salary of exactly 0 means not placed, any other value means placed.
Salary_known$placed <- as.numeric(Salary_known$salary != 0)
# Bar plot of the two placement groups.
counts_placed <- table(Salary_known$placed)
barplot(
  counts_placed,
  width = 1,
  space = 1,
  main = "Placed vs Not-placed",
  xlab = "Status",
  col = c("red", "green"),
  names.arg = c("Not Placed", "Placed"),
  ylim = c(0, 105),
  xlim = c(0, 10)
)
# Placement status versus sex: contingency table with row/column totals,
# followed by a chi-squared test of independence.
chi4 <- xtabs(formula = ~ placed + sex, data = Salary_known)
addmargins(chi4)
## sex
## placed 1 2 Sum
## 0 67 23 90
## 1 72 31 103
## Sum 139 54 193
chisq.test(x = chi4)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: chi4
## X-squared = 0.29208, df = 1, p-value = 0.5889
-> The p-value is 0.5889, which signifies that Placement Status has no significant dependency on the Sex of the students.
# Placement status versus age: contingency table with row/column totals,
# followed by a chi-squared test of independence.
chi5 <- xtabs(formula = ~ placed + age, data = Salary_known)
addmargins(chi5)
## age
## placed 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 39
## 0 1 3 13 9 10 14 6 11 2 2 5 0 3 3 2 1 1
## 1 1 5 16 23 14 14 8 6 6 4 1 1 1 0 0 0 1
## Sum 2 8 29 32 24 28 14 17 8 6 6 1 4 3 2 1 2
## age
## placed 40 42 43 48 Sum
## 0 0 1 2 1 90
## 1 2 0 0 0 103
## Sum 2 1 2 1 193
chisq.test(x = chi5)
## Warning in chisq.test(chi5): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: chi5
## X-squared = 27.943, df = 20, p-value = 0.1108
-> The p-value is 0.1108, which signifies that Placement Status has no significant dependency on the Age of the students.
# Placement status versus years of work experience: contingency table with
# row/column totals, followed by a chi-squared test of independence.
chi6 <- xtabs(formula = ~ placed + work_yrs, data = Salary_known)
addmargins(chi6)
## work_yrs
## placed 0 1 2 3 4 5 6 7 8 9 10 11 12 13 15 16 18
## 0 1 12 22 14 9 12 2 5 2 1 1 2 2 1 0 1 1
## 1 1 8 38 21 11 7 7 1 4 0 1 0 0 0 2 2 0
## Sum 2 20 60 35 20 19 9 6 6 1 2 2 2 1 2 3 1
## work_yrs
## placed 22 Sum
## 0 2 90
## 1 0 103
## Sum 2 193
chisq.test(x = chi6)
## Warning in chisq.test(chi6): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: chi6
## X-squared = 24.663, df = 17, p-value = 0.1025
-> The p-value is 0.1025, which signifies that Placement Status has no significant dependency on the Years of Work Experience of the students.
# Placement status versus first language: contingency table with row/column
# totals, followed by a chi-squared test of independence.
chi7 <- xtabs(formula = ~ placed + frstlang, data = Salary_known)
addmargins(chi7)
## frstlang
## placed 1 2 Sum
## 0 82 8 90
## 1 96 7 103
## Sum 178 15 193
chisq.test(x = chi7)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: chi7
## X-squared = 0.074127, df = 1, p-value = 0.7854
-> The p-value is 0.7854, which signifies that Placement Status has no significant dependency on the First Language of the students.
The analysis was carried out in order to know how salary is dependent on other variables:
Salary shows the strongest dependency on the variables satis, s_avg and age: in Model 1, age is significant at the 5% level, while s_avg and satis are significant only at the 10% level.
As the age variable has a negative coefficient (about -4376), it implies that older students are offered less in comparison to younger students.
s_avg contributes highly to salary (coefficient of about 22055), implying that the starting salary offered to students is higher if the student has a higher Spring MBA average; the Fall MBA average (f_avg) shows no comparable effect.
There is a difference of about 10000 in salary for each increment in the satisfaction level of the students.
The status of being placed or unplaced shows no significant dependency on any of the variables tested (sex, age, years of work experience and first language).