This is an R Markdown document which contains the analysis of the case MBA Starting Salaries. The MBA Starting Salaries Dataset has been used here.
setwd("D:/R Internship")
mba.df<-read.csv(paste("MBA Starting Salaries Data.csv",sep = ""))
View(mba.df)
summary(mba.df)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
library(psych)
describe(mba.df)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
dim(mba.df)
## [1] 274 13
mba_sal.df1<-mba.df[(mba.df$salary!=0),]
dim(mba_sal.df1)
## [1] 184 13
mba_sal.df2<-mba_sal.df1[(mba_sal.df1$salary!=998),]
dim(mba_sal.df2)
## [1] 138 13
mba_sal.df<-mba_sal.df2[(mba_sal.df2$salary!=999),]
dim(mba_sal.df)
## [1] 103 13
hist(mba_sal.df$salary,main ="Frequency count of Disclosed Salaries",
xlab = "Salary",ylab = "No.of Students",col = "Grey")
boxplot(salary~sex,data = mba_sal.df,
main="Boxplot of disclosed salaries and Gender",
horizontal=TRUE,xlab="Salary",ylab="Gender",yaxt="n")
axis(side=2,at=c(1,2),labels=c("Male","Female"))
boxplot(salary~frstlang,data = mba_sal.df,
main="Boxplot of disclosed salaries and First Language",
horizontal=TRUE,xlab="Salary",ylab="First Language",yaxt="n")
axis(side=2,at=c(1,2),labels=c("English","Other"))
library(lattice)
## Warning: package 'lattice' was built under R version 3.3.3
sal_mean_quart<-aggregate(salary~quarter,data = mba_sal.df,mean)
sal_mean_quart
## quarter salary
## 1 1 106328.6
## 2 2 103612.0
## 3 3 98319.0
## 4 4 102142.6
barchart(quarter~salary,data = sal_mean_quart,col="Grey",
main="Barchart of mean disclosed salary by quartile ranking",
xlab="Mean Salary",ylab="Quartile Ranking")
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
scatterplotMatrix(formula=~gmat_tpc+s_avg+f_avg+salary,cex=0.6,data=mba_sal.df)
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.3.3
corrgram(mba_sal.df,order = TRUE,lower.panel = panel.shade,
upper.panel = panel.pie,text.panel = panel.txt,
main="Corrgram of variables in the MBA Starting Salaries dataset")
table1<-ftable(xtabs(~sex+frstlang+quarter,data = mba_sal.df1))
table1
## quarter 1 2 3 4
## sex frstlang
## 1 1 30 32 31 29
## 2 3 2 7 5
## 2 1 18 5 8 7
## 2 0 4 1 2
table2<-xtabs(~sex+frstlang,data=mba_sal.df1)
table2
## frstlang
## sex 1 2
## 1 122 17
## 2 38 7
chisq.test(table2)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table2
## X-squared = 0.10308, df = 1, p-value = 0.7482
table3<-xtabs(~sex+quarter,data=mba_sal.df1)
table3
## quarter
## sex 1 2 3 4
## 1 33 34 38 34
## 2 18 9 9 9
chisq.test(table3)
##
## Pearson's Chi-squared test
##
## data: table3
## X-squared = 4.5377, df = 3, p-value = 0.209
SO, there is no correlation between Sex and First Language as well as between Sex and Quartile Ranking.
t.test(mba_sal.df1$age,mba_sal.df1$salary)
##
## Welch Two Sample t-test
##
## data: mba_sal.df1$age and mba_sal.df1$salary
## t = -15.005, df = 183, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -65725.43 -50449.67
## sample estimates:
## mean of x mean of y
## 26.79348 58114.34239
t.test(mba_sal.df1$sex,mba_sal.df1$salary)
##
## Welch Two Sample t-test
##
## data: mba_sal.df1$sex and mba_sal.df1$salary
## t = -15.012, df = 183, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -65750.98 -50475.22
## sample estimates:
## mean of x mean of y
## 1.244565 58114.342391
So, there is significant difference in salary for different genders and ages.
fit1<-lm(salary~age+sex+quarter+frstlang,data = mba_sal.df1)
summary(fit1)
##
## Call:
## lm(formula = salary ~ age + sex + quarter + frstlang, data = mba_sal.df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -82795 -51156 16846 40981 149776
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14703 41367 0.355 0.7227
## age 2820 1396 2.020 0.0449 *
## sex 14121 8865 1.593 0.1130
## quarter -7472 3378 -2.212 0.0282 *
## frstlang -27806 11435 -2.432 0.0160 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50640 on 179 degrees of freedom
## Multiple R-squared: 0.09018, Adjusted R-squared: 0.06985
## F-statistic: 4.436 on 4 and 179 DF, p-value: 0.001924
fit2<-lm(salary~s_avg+f_avg+work_yrs+satis,data = mba_sal.df1)
summary(fit2)
##
## Call:
## lm(formula = salary ~ s_avg + f_avg + work_yrs + satis, data = mba_sal.df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -90334 -7908 9167 23072 136462
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2606.510 24534.451 -0.106 0.91551
## s_avg 28894.347 9196.268 3.142 0.00196 **
## f_avg -4587.269 6939.022 -0.661 0.50941
## work_yrs 1798.036 1218.902 1.475 0.14193
## satis -74.529 6.834 -10.906 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39610 on 179 degrees of freedom
## Multiple R-squared: 0.4434, Adjusted R-squared: 0.431
## F-statistic: 35.65 on 4 and 179 DF, p-value: < 2.2e-16
fit3<-lm(salary~gmat_tot+gmat_qpc+gmat_vpc+gmat_tpc,data = mba_sal.df1)
summary(fit3)
##
## Call:
## lm(formula = salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc,
## data = mba_sal.df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -74505 -55243 26417 41678 154546
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 224287.4 76794.0 2.921 0.00394 **
## gmat_tot -488.9 287.6 -1.700 0.09091 .
## gmat_qpc 423.2 794.1 0.533 0.59471
## gmat_vpc 814.7 722.3 1.128 0.26086
## gmat_tpc 462.1 622.7 0.742 0.45904
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 52100 on 179 degrees of freedom
## Multiple R-squared: 0.03695, Adjusted R-squared: 0.01542
## F-statistic: 1.717 on 4 and 179 DF, p-value: 0.1482
fit_final<-lm(salary~frstlang+s_avg+satis,data = mba_sal.df1)
summary(fit_final)
##
## Call:
## lm(formula = salary ~ frstlang + s_avg + satis, data = mba_sal.df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -92001 -10717 10528 22444 168786
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18051.389 26863.106 0.672 0.5025
## frstlang -14319.747 8777.099 -1.631 0.1045
## s_avg 24900.867 7722.454 3.224 0.0015 **
## satis -74.961 6.808 -11.010 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39570 on 180 degrees of freedom
## Multiple R-squared: 0.4416, Adjusted R-squared: 0.4323
## F-statistic: 47.45 on 3 and 180 DF, p-value: < 2.2e-16
So, based on R-squared values, the second and the final models are the most relaible ones.
mba_notplaced<-mba.df[mba.df$salary==0,]
dim(mba_notplaced)
## [1] 90 13
table4<-ftable(xtabs(~sex+frstlang+quarter,data = mba_notplaced))
table4
## quarter 1 2 3 4
## sex frstlang
## 1 1 10 19 14 17
## 2 1 2 2 2
## 2 1 7 6 6 3
## 2 0 0 1 0
table5<-xtabs(~sex+frstlang,data=mba_notplaced)
table5
## frstlang
## sex 1 2
## 1 60 7
## 2 22 1
chisq.test(table5)
## Warning in chisq.test(table5): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table5
## X-squared = 0.21376, df = 1, p-value = 0.6438
table6<-xtabs(~sex+quarter,data=mba_notplaced)
table6
## quarter
## sex 1 2 3 4
## 1 11 21 16 19
## 2 7 6 7 3
chisq.test(table6)
## Warning in chisq.test(table6): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table6
## X-squared = 3.7704, df = 3, p-value = 0.2874
There isn’t any correlation between Sex and First Language as well as between Sex and Quartile ranking