# Load the MBA starting-salary survey from the working directory and print a
# per-column summary. The file name is passed directly: wrapping a single
# string literal in paste(..., sep = "") did nothing.
MBA_start <- read.csv("MBA Starting Salaries Data.csv")
summary(MBA_start)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
library(psych)
# Extended descriptive statistics for every column of the survey data.
psych::describe(MBA_start)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
# Split the survey rows according to how the salary column is coded:
#   0 = not placed, 998 = survey not answered, 999 = salary not disclosed,
#   anything above 999 = an actual starting salary.

# Students who were placed
Placed <- subset(MBA_start, salary > 999)
View(Placed)

# Students who were not placed
Not_Placed <- subset(MBA_start, salary == 0)
View(Not_Placed)

# Students who did not disclose their starting salary
Salary_not_disclosed <- subset(MBA_start, salary == 999)
View(Salary_not_disclosed)

# Students who did not answer the survey
Not_answered <- subset(MBA_start, salary == 998)
View(Not_answered)

# Students whose starting salary is known (not placed, or an actual figure)
Salary_known <- subset(MBA_start, salary == 0 | salary > 999)
View(Salary_known)

# Students whose satisfaction rating is known (values below 8)
Satis_known <- subset(MBA_start, satis < 8)
View(Satis_known)
# Draw one horizontal box plot per numeric variable. Each position across the
# four argument vectors below describes a single plot: the values, the title,
# the x-axis label and the fill colour. The salary plot uses only the students
# whose salary is known; all other plots use the full survey data.
invisible(Map(
  function(values, title, axis_label, fill) {
    boxplot(values, horizontal = TRUE, main = title, xlab = axis_label, col = fill)
  },
  list(
    MBA_start$age,
    MBA_start$gmat_tot,
    MBA_start$gmat_qpc,
    MBA_start$gmat_vpc,
    MBA_start$gmat_tpc,
    MBA_start$s_avg,
    MBA_start$f_avg,
    MBA_start$work_yrs,
    Salary_known$salary
  ),
  c(
    "Box Plot for Age",
    "Box Plot for GMAT Total",
    "Box Plot for GMAT Percentile for Quants Section",
    "Box Plot for GMAT Percentile for Verbal Section ",
    "Box Plot for GMAT Percentile (OVERALL) ",
    "Box Plot for Spring MBA Average ",
    "Box Plot for Fall MBA Average ",
    "Box Plot for Years of Work Experience ",
    "Box Plot for Starting Salary "
  ),
  c(
    "Age", "GMAT Scores", "Percentile", "Percentile", "Percentile",
    "Average", "Average", "Years", "Salary "
  ),
  c(
    "blue", "green", "Yellow", "darkgreen", "black",
    "purple", "tan", "magenta", "lightgreen"
  )
))
# Bar chart of the number of students per gender code, labelled Male / Female.
# The y-axis limit is derived from the counts: the earlier fixed limit of 105
# is smaller than the larger group (n = 274 with mean sex of about 1.25, so
# roughly 200 students carry the first code), which made that bar overshoot
# the axis.
cnt_gender <- table(MBA_start$sex)
barplot(cnt_gender,
  width = 1, space = 1,
  main = "Gender Distribution", xlab = "Gender",
  col = c("blue", "red"), names.arg = c("Male", "Female"),
  ylim = c(0, max(cnt_gender) * 1.1), xlim = c(0, 10)
)

# Bar chart of the number of students per first-language code, labelled
# English / Other. The y-axis limit is data-driven for the same reason (mean
# frstlang of about 1.12 puts well over 105 students in the first group).
first_lang <- table(MBA_start$frstlang)
barplot(first_lang,
  width = 1, space = 1,
  main = "First Language of Students ", xlab = "Language",
  col = c("yellow", "green"), names.arg = c("English", "Other"),
  ylim = c(0, max(first_lang) * 1.1), xlim = c(0, 10)
)
# Histogram of satisfaction ratings for students whose rating is known.
hist(
  Satis_known$satis,
  main = "Satisfaction distribution",
  xlab = "Satisfaction level,1=low 7=high",
  col = "brown",
  breaks = 5
)
# Add a 0/1 indicator to the known-salary data: 0 when the recorded salary is
# zero (not placed), 1 otherwise (placed), then chart the two group sizes.
Salary_known[["placed"]] <- ifelse(Salary_known[["salary"]] == 0, 0, 1)
cnt_placed <- table(Salary_known[["placed"]])
barplot(
  cnt_placed,
  main = "Placed vs Not-placed",
  xlab = "Status",
  names.arg = c("Not Placed", "Placed"),
  col = c("green", "red"),
  width = 1,
  space = 1,
  xlim = c(0, 10),
  ylim = c(0, 105)
)
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Scatterplot matrix of salary against the main numeric predictors, grouped by
# sex. Uses Salary_known rather than the raw survey data so that the 998 / 999
# placeholder codes in the salary column are not plotted as if they were real
# salaries; this matches the data used for the corrgram and the regression.
scatterplotMatrix(
  ~ salary + gmat_tot + s_avg + age + f_avg + work_yrs | sex,
  data = Salary_known,
  main = "Variation of Salary with different variables"
)
library(corrgram)
# Correlogram of all columns for students with a known salary: shaded cells in
# the lower triangle, pie glyphs in the upper triangle, variable names on the
# diagonal, with variables reordered by corrgram (order = TRUE).
corrgram(
  Salary_known,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram for MBA starting salary analysis"
)
# Chi-squared test of independence between starting salary and sex, for
# students whose salary is known.
# NOTE(review): salary is cross-tabulated as one category per distinct value
# (df = 42 in the output below) and chisq.test() warns that the approximation
# may be incorrect, so treat this p-value with caution.
c1 <- xtabs (~ salary + sex, data=Salary_known)
chisq.test(c1)
## Warning in chisq.test(c1): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: c1
## X-squared = 55.494, df = 42, p-value = 0.07929
# Chi-squared test of independence between starting salary and quarter, for
# students whose salary is known.
# NOTE(review): salary is cross-tabulated as one category per distinct value
# (df = 126 in the output below) and chisq.test() warns that the approximation
# may be incorrect, so treat this p-value with caution.
c2 <- xtabs (~ salary + quarter, data=Salary_known)
chisq.test(c2)
## Warning in chisq.test(c2): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: c2
## X-squared = 132.64, df = 126, p-value = 0.3252
# Chi-squared test of independence between starting salary and first language,
# for students whose salary is known.
# NOTE(review): salary is cross-tabulated as one category per distinct value
# (df = 42 in the output below) and chisq.test() warns that the approximation
# may be incorrect, so treat this p-value with caution.
c3 <- xtabs (~ salary + frstlang, data=Salary_known)
chisq.test(c3)
## Warning in chisq.test(c3): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: c3
## X-squared = 62.016, df = 42, p-value = 0.02384
# Chi-squared test of independence between placement status (the 0/1 `placed`
# column added to Salary_known earlier in this script) and sex. The output
# below reports a single degree of freedom with Yates' continuity correction.
c4 <- xtabs(~ placed + sex, data=Salary_known)
chisq.test(c4)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: c4
## X-squared = 0.29208, df = 1, p-value = 0.5889
# Chi-squared test of independence between placement status and age.
# NOTE(review): age is cross-tabulated as one category per distinct value
# (df = 20 in the output below) and chisq.test() warns that the approximation
# may be incorrect, so treat this p-value with caution.
c5 <- xtabs(~ placed + age, data=Salary_known)
chisq.test(c5)
## Warning in chisq.test(c5): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: c5
## X-squared = 27.943, df = 20, p-value = 0.1108
# Chi-squared test of independence between placement status and years of work
# experience.
# NOTE(review): work_yrs is cross-tabulated as one category per distinct value
# (df = 17 in the output below) and chisq.test() warns that the approximation
# may be incorrect, so treat this p-value with caution.
c6 <- xtabs(~ placed + work_yrs, data=Salary_known)
chisq.test(c6)
## Warning in chisq.test(c6): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: c6
## X-squared = 24.663, df = 17, p-value = 0.1025
# Chi-squared test of independence between placement status and first
# language. The output below reports a single degree of freedom with Yates'
# continuity correction.
c7 <- xtabs(~ placed + frstlang, data=Salary_known)
chisq.test(c7)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: c7
## X-squared = 0.074127, df = 1, p-value = 0.7854
# Print the pairwise correlation matrix of every column in the known-salary
# data (including the derived `placed` indicator).
print(cor(Salary_known))
## age sex gmat_tot gmat_qpc gmat_vpc
## age 1.000000000 -0.031876273 -1.256220e-01 -0.220590341 -0.006721674
## sex -0.031876273 1.000000000 -4.351109e-02 -0.167904888 0.099184398
## gmat_tot -0.125622047 -0.043511095 1.000000e+00 0.743099719 0.752906719
## gmat_qpc -0.220590341 -0.167904888 7.430997e-01 1.000000000 0.175497777
## gmat_vpc -0.006721674 0.099184398 7.529067e-01 0.175497777 1.000000000
## gmat_tpc -0.131681932 -0.012849186 8.791496e-01 0.690581939 0.688039929
## s_avg 0.164342257 0.073368077 1.435675e-01 0.019038162 0.190665307
## f_avg -0.034290725 0.042895288 1.010821e-01 0.130285115 0.033106093
## quarter -0.076614994 -0.086616877 -8.407099e-02 0.008601267 -0.139400223
## work_yrs 0.871679595 -0.023832548 -1.736909e-01 -0.241384675 -0.041357878
## frstlang 0.097619028 -0.008488358 -9.557089e-02 0.094537575 -0.295162826
## salary -0.130198680 0.018516965 -5.685962e-05 0.028391635 0.003389965
## satis -0.073500580 -0.061738773 7.981946e-02 -0.020006117 0.195134711
## placed -0.205697192 0.050470540 1.491495e-02 0.026982025 0.028880982
## gmat_tpc s_avg f_avg quarter work_yrs
## age -0.13168193 0.16434226 -0.034290725 -0.076614994 0.871679595
## sex -0.01284919 0.07336808 0.042895288 -0.086616877 -0.023832548
## gmat_tot 0.87914961 0.14356746 0.101082103 -0.084070990 -0.173690863
## gmat_qpc 0.69058194 0.01903816 0.130285115 0.008601267 -0.241384675
## gmat_vpc 0.68803993 0.19066531 0.033106093 -0.139400223 -0.041357878
## gmat_tpc 1.00000000 0.18894788 0.109811857 -0.128533421 -0.166139876
## s_avg 0.18894788 1.00000000 0.520554250 -0.735421726 0.159136628
## f_avg 0.10981186 0.52055425 1.000000000 -0.382421186 -0.047951357
## quarter -0.12853342 -0.73542173 -0.382421186 1.000000000 -0.126454286
## work_yrs -0.16613988 0.15913663 -0.047951357 -0.126454286 1.000000000
## frstlang -0.10789784 -0.12631935 -0.055830525 0.089504320 -0.002916547
## salary 0.06094464 0.09632412 0.008846655 -0.147257809 -0.053266846
## satis 0.13288434 -0.04639953 -0.114704819 0.067729421 -0.007722658
## placed 0.08264631 0.08063913 0.027460510 -0.127882161 -0.123303946
## frstlang salary satis placed
## age 0.097619028 -1.301987e-01 -0.073500580 -0.20569719
## sex -0.008488358 1.851696e-02 -0.061738773 0.05047054
## gmat_tot -0.095570885 -5.685962e-05 0.079819458 0.01491495
## gmat_qpc 0.094537575 2.839164e-02 -0.020006117 0.02698202
## gmat_vpc -0.295162826 3.389965e-03 0.195134711 0.02888098
## gmat_tpc -0.107897839 6.094464e-02 0.132884339 0.08264631
## s_avg -0.126319350 9.632412e-02 -0.046399534 0.08063913
## f_avg -0.055830525 8.846655e-03 -0.114704819 0.02746051
## quarter 0.089504320 -1.472578e-01 0.067729421 -0.12788216
## work_yrs -0.002916547 -5.326685e-02 -0.007722658 -0.12330395
## frstlang 1.000000000 7.125825e-03 -0.135986251 -0.03899476
## salary 0.007125825 1.000000e+00 0.156439455 0.96951510
## satis -0.135986251 1.564395e-01 1.000000000 0.16882557
## placed -0.038994758 9.695151e-01 0.168825569 1.00000000
# Linear model of starting salary on age, sex, GMAT percentiles, Spring and
# Fall averages, work experience, first language and satisfaction, fitted on
# the students whose salary is known; the coefficient table is printed below.
reg <- lm(
  formula = salary ~ age + sex + gmat_qpc + gmat_vpc + gmat_tpc +
    s_avg + f_avg + work_yrs + frstlang + satis,
  data = Salary_known
)
summary(reg)
##
## Call:
## lm(formula = salary ~ age + sex + gmat_qpc + gmat_vpc + gmat_tpc +
## s_avg + f_avg + work_yrs + frstlang + satis, data = Salary_known)
##
## Residuals:
## Min 1Q Median 3Q Max
## -93924 -48912 20019 44376 179796
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40696.2 70338.2 0.579 0.5636
## age -4376.4 1916.4 -2.284 0.0235 *
## sex 1749.0 8733.1 0.200 0.8415
## gmat_qpc -246.8 442.3 -0.558 0.5775
## gmat_vpc -381.8 406.9 -0.938 0.3494
## gmat_tpc 571.8 652.3 0.876 0.3819
## s_avg 22054.8 12502.0 1.764 0.0794 .
## f_avg -5868.9 8794.0 -0.667 0.5054
## work_yrs 3252.9 2171.5 1.498 0.1359
## frstlang 13151.7 15529.4 0.847 0.3982
## satis 9967.1 5137.4 1.940 0.0539 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 52430 on 182 degrees of freedom
## Multiple R-squared: 0.07776, Adjusted R-squared: 0.02709
## F-statistic: 1.535 on 10 and 182 DF, p-value: 0.1301
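Based on the analysis of the MBA starting salary data, the following insights can be drawn (the overall regression fit is weak, with R-squared ≈ 0.08 and an F-test p-value ≈ 0.13, so these should be read as tentative):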
Students with a higher Spring MBA average tend to be offered a higher starting salary (coefficient ≈ 22,055 per grade point, p ≈ 0.08).
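The age coefficient is negative (≈ -4,376, p ≈ 0.02), implying that, holding work experience and the other variables constant, older students received lower starting salaries; more experienced students, by contrast, tended to receive higher salaries.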
With a one-unit rise in a student's satisfaction level, the predicted salary rises by almost 10,000 units (p ≈ 0.054), holding the other variables constant.
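Whether a student's first language is English is significantly associated with starting salary in the chi-squared test (p ≈ 0.024), although its coefficient in the regression is not statistically significant (p ≈ 0.40).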
With each additional year of work experience, the predicted salary rises by roughly 3,250 units, although this effect is not statistically significant (p ≈ 0.14).