Read

# Load the MBA starting-salary survey from the working directory.
saldata <- read.csv("MBA Starting Salaries Data.csv")
# NOTE(review): attach() is generally discouraged; it is kept here because
# later statements may refer to columns such as satis and salary by bare name.
attach(saldata)

View Dataset

# Open the raw data in the interactive data viewer.
View(x = saldata)

Check

# Number of rows (students) and columns (variables).
c(nrow(saldata), ncol(saldata))
## [1] 274  13

Data Types

# Compact overview of every column's type and first few values.
str(object = saldata)
## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : int  2 1 1 1 2 1 1 2 1 1 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: int  1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : int  7 6 6 7 5 6 5 6 4 998 ...

Convert Sex and frstlang into factor variables

# Recode the integer-coded columns as labelled factors in one step each.
# Mapping the codes through factor(levels =, labels =) avoids writing strings
# into an integer column, which would coerce the whole column to character and
# leave later comparisons relying on implicit "2" == 2 coercion.
# Levels are ordered alphabetically (English/Other, Female/Male), so English
# and Female are the reference levels in any model fitted on these columns.
# Any code other than 1 or 2 becomes NA instead of a stray extra level.
saldata$frstlang <- factor(
  saldata$frstlang,
  levels = c(1, 2),
  labels = c("English", "Other")
)
saldata$sex <- factor(
  saldata$sex,
  levels = c(2, 1),
  labels = c("Female", "Male")
)

#View(saldata)

str(saldata)
## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 2 2 1 2 2 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: Factor w/ 2 levels "English","Other": 1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : int  7 6 6 7 5 6 5 6 4 998 ...

Remove those people who have not filled the survey

# Keep only students who filled in the survey (satis == 998 marks a missing
# survey). The column is read from saldata itself so the filter does not
# depend on the snapshot that attach() placed on the search path.
surveydone <- saldata[saldata$satis != 998, ]
#View(surveydone)

Remove those who have not disclosed salary

# Drop rows whose salary holds one of the placeholder codes 998 or 999, leaving
# students with a real salary figure (including 0). salary is read from
# saldata explicitly rather than from an attach()ed copy.
saltold <- saldata[which(saldata$salary != 999 & saldata$salary != 998), ]
#View(saltold)

Dataframe of placed students

# Placed students: salary above the 998/999 placeholder codes. salary is read
# from saldata explicitly rather than from an attach()ed copy.
placed <- saldata[which(saldata$salary > 999), ]
#View(placed)

People who have not disclosed salary

# Students carrying the salary code 999 (salary not disclosed). salary is read
# from saldata explicitly rather than from an attach()ed copy.
salnot <- saldata[which(saldata$salary == 999), ]
#View(salnot)

Not placed students

# Students with a recorded salary of 0 (not placed). salary is read from
# saldata explicitly rather than from an attach()ed copy.
notplaced <- saldata[which(saldata$salary == 0), ]
View(notplaced)

Summary

# Per-column summary statistics for the students who answered the survey.
summary(object = surveydone)
##       age            sex         gmat_tot        gmat_qpc    
##  Min.   :22.00   Female: 59   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   Male  :169   1st Qu.:570.0   1st Qu.:72.00  
##  Median :27.00                Median :620.0   Median :83.00  
##  Mean   :27.57                Mean   :617.3   Mean   :80.24  
##  3rd Qu.:29.00                3rd Qu.:660.0   3rd Qu.:92.25  
##  Max.   :48.00                Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc         s_avg           f_avg      
##  Min.   :22.00   Min.   : 0.00   Min.   :2.000   Min.   :0.000  
##  1st Qu.:69.25   1st Qu.:75.00   1st Qu.:2.800   1st Qu.:2.750  
##  Median :81.00   Median :87.00   Median :3.000   Median :3.000  
##  Mean   :77.85   Mean   :83.61   Mean   :3.031   Mean   :3.059  
##  3rd Qu.:91.00   3rd Qu.:94.00   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.00   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang       salary      
##  Min.   :1.000   Min.   : 0.000   English:204   Min.   :     0  
##  1st Qu.:1.000   1st Qu.: 2.000   Other  : 24   1st Qu.:     0  
##  Median :2.000   Median : 3.000                 Median :   999  
##  Mean   :2.478   Mean   : 4.031                 Mean   : 46698  
##  3rd Qu.:3.000   3rd Qu.: 5.000                 3rd Qu.: 99250  
##  Max.   :4.000   Max.   :22.000                 Max.   :220000  
##      satis      
##  Min.   :1.000  
##  1st Qu.:5.000  
##  Median :6.000  
##  Mean   :5.566  
##  3rd Qu.:6.000  
##  Max.   :7.000
# Average salary over everyone with a real salary figure (zeros included).
with(saltold, mean(salary))
## [1] 54985.32
# Average salary among placed students only.
with(placed, mean(salary))
## [1] 103030.7

Visualisations

# Single-panel layout for the salary distribution of placed students.
par(mfrow = c(1, 1))
hist(
  placed$salary,
  main = "Salary of Placed Students",
  col = "Green",
  breaks = 20
)

# Distribution of programme-satisfaction scores among placed students.
hist(
  placed$satis,
  main = "Satisfaction of Placed Students",
  col = "navy blue"
)

# Fall and spring GPA distributions, drawn side by side.
par(mfrow = c(1, 2))
hist(
  saldata$f_avg,
  main = "Fall average of all students",
  col = "brown",
  breaks = 20
)
hist(
  saldata$s_avg,
  main = "Spring average of all students",
  col = "brown",
  breaks = 20
)

# One boxplot per GMAT measure on a 2 x 2 grid; the vector maps each column
# name to the title drawn above its panel.
par(mfrow = c(2, 2))
gmat_titles <- c(
  gmat_tot = "boxplot of Total GMAT score",
  gmat_qpc = "boxplot of quantitative GMAT percentile score",
  gmat_vpc = "boxplot of verbal GMAT percentile score",
  gmat_tpc = "boxplot of overall GMAT percentile score"
)
for (gmat_col in names(gmat_titles)) {
  boxplot(saldata[[gmat_col]], col = "orange", main = gmat_titles[[gmat_col]])
}

# Spread of prior work experience (in years) across all students.
boxplot(
  x = saldata$work_yrs,
  main = "Boxplot of Work Experience",
  col = "purple"
)

Effect of age on salary

# Salary distribution of placed students at each age.
boxplot(
  placed$salary ~ placed$age,
  main = "Boxplot of Salary vs age of placed students"
)
# car provides scatterplot(), used for the plot below.
library(car)
## Warning: package 'car' was built under R version 3.4.3

scatterplot(
  placed$salary ~ placed$age,
  main = "ScatterPlot of salary vs age of placed students"
)

From the above visualisations, we can observe that the salary of placed MBA graduates tends to increase with age.

To check if there is any correlation between salary and age, let us run the following correlation test

# Pearson correlation test between salary and age for placed students.
cor.test(x = placed$salary, y = placed$age)
## 
##  Pearson's product-moment correlation
## 
## data:  placed$salary and placed$age
## t = 5.7968, df = 101, p-value = 7.748e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3388862 0.6320523
## sample estimates:
##       cor 
## 0.4996428

The test shows a positive correlation between salary and age which is statistically significant.

To check the effect of gender on salary

# Salary of placed students split by gender. sex is a labelled factor, so the
# axis shows the level names (Female, Male) and the colours are applied in
# that order; the axis title therefore carries no numeric legend.
boxplot(
  placed$salary ~ placed$sex,
  col = c("blue", "red"),
  ylab = "Salary",
  xlab = "Gender"
)

From this we can observe that men are paid slightly higher than women.

# NOTE(review): chisq.test() cross-tabulates its two arguments, so every
# distinct salary value becomes its own row and most cells hold very few
# observations (hence the approximation warning). For a continuous outcome a
# two-sample comparison such as t.test(salary ~ sex, data = placed) is probably
# what was intended; confirm before relying on this result.
chisq.test(placed$salary,placed$sex)
## Warning in chisq.test(placed$salary, placed$sex): Chi-squared approximation
## may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  placed$salary and placed$sex
## X-squared = 52.681, df = 41, p-value = 0.1045

Since the p-value (0.1045) is above 0.05, we cannot reject the null hypothesis that salary and sex are independent of each other.

Effect of GMAT score on Salary

# Salary of placed students against each GMAT measure.
par(mfrow = c(2, 2))
scatterplot(
  placed$salary ~ placed$gmat_tot,
  main = "scatterplot of Total GMAT score"
)

scatterplot(
  placed$salary ~ placed$gmat_qpc,
  main = "scatter plot of Quantitative GMAT percentile score"
)

scatterplot(
  placed$salary ~ placed$gmat_vpc,
  main = "scatter plot of Verbal GMAT percentile score"
)

scatterplot(
  placed$salary ~ placed$gmat_tpc,
  main = "scatter plot of Total GMAT percentile score"
)

# Welch t-test: total GMAT score, placed vs. not-placed students.
t.test(x = placed$gmat_tot, y = notplaced$gmat_tot)
## 
##  Welch Two Sample t-test
## 
## data:  placed$gmat_tot and notplaced$gmat_tot
## t = 0.20321, df = 170.77, p-value = 0.8392
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -14.69189  18.06406
## sample estimates:
## mean of x mean of y 
##  616.0194  614.3333
# Welch t-test: overall GMAT percentile, placed vs. not-placed students.
t.test(x = placed$gmat_tpc, y = notplaced$gmat_tpc)
## 
##  Welch Two Sample t-test
## 
## data:  placed$gmat_tpc and notplaced$gmat_tpc
## t = 1.119, df = 155.27, p-value = 0.2649
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.710571  6.181337
## sample estimates:
## mean of x mean of y 
##  84.52427  82.28889

Therefore, there is no statistically significant difference between the GMAT scores of placed and not placed students.

Effect of f-avg and s-avg on salary

# Pairwise scatterplots of salary and the two semester averages (placed only).
library(car)
scatterplotMatrix(
  ~ placed$salary + placed$f_avg + placed$s_avg,
  main = "ScatterPlot Matrix of salary, f_avg, s_avg of placed students"
)

To check if there is any difference between the average semester scores of placed and not placed students

# Welch t-test: spring average, placed vs. not-placed students.
t.test(x = placed$s_avg, y = notplaced$s_avg)
## 
##  Welch Two Sample t-test
## 
## data:  placed$s_avg and notplaced$s_avg
## t = 1.118, df = 187.46, p-value = 0.265
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.04671895  0.16893470
## sample estimates:
## mean of x mean of y 
##  3.092330  3.031222
# Welch t-test: fall average, placed vs. not-placed students.
t.test(x = placed$f_avg, y = notplaced$f_avg)
## 
##  Welch Two Sample t-test
## 
## data:  placed$f_avg and notplaced$f_avg
## t = 0.37631, df = 178.47, p-value = 0.7071
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1210636  0.1781164
## sample estimates:
## mean of x mean of y 
##  3.090971  3.062444

There is no significant difference between the scores of placed and not placed students

Effect of First Language

#plot(placed$frstlang,placed$salary)

# First-language counts among placed and among not-placed students.
table(placed$frstlang)
## 
## English   Other 
##      96       7
table(notplaced$frstlang)
## 
## English   Other 
##      82       8
# Cross-tabulate placement (salary above 1000) against first language, using
# only the students whose salary figure is known.
tab3 <- xtabs(~ (salary > 1000) + frstlang, data = saltold)
tab3
##              frstlang
## salary > 1000 English Other
##         FALSE      82     8
##         TRUE       96     7
# Column-wise proportions: share placed / not placed within each language group.
ptab3 <- prop.table(tab3, margin = 2)
ptab3
##              frstlang
## salary > 1000   English     Other
##         FALSE 0.4606742 0.5333333
##         TRUE  0.5393258 0.4666667
# Test independence of placement and first language on the raw counts.
# chisq.test() expects frequencies; given the column proportions in ptab3 it
# treats values that sum to 2 as the whole sample, so the statistic collapses
# to ~0 with p-value = 1 regardless of the data. Any recorded output following
# this call that names ptab3 predates this correction and must be regenerated.
chisq.test(tab3)
## Warning in chisq.test(ptab3): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  ptab3
## X-squared = 3.6905e-32, df = 1, p-value = 1

From the above Chisquare test, we can conclude that getting placed is independent of your first language

Effect of Work Experience

# Salary of placed students against years of work experience, first as a
# scatterplot and then as one box per experience level.
scatterplot(placed$salary ~ placed$work_yrs)

boxplot(placed$salary ~ placed$work_yrs)

To check if there is any difference between the work experience of placed and not placed students

# Welch t-test: years of work experience, placed vs. not-placed students.
t.test(x = placed$work_yrs, y = notplaced$work_yrs)
## 
##  Welch Two Sample t-test
## 
## data:  placed$work_yrs and notplaced$work_yrs
## t = -1.6778, df = 156.44, p-value = 0.09538
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.9797552  0.1612007
## sample estimates:
## mean of x mean of y 
##  3.679612  4.588889

There is no statistically significant difference between the work experience of placed and not placed students

Correlations

# Pearson correlation test between age and work experience (placed students).
cor.test(x = placed$age, y = placed$work_yrs)
## 
##  Pearson's product-moment correlation
## 
## data:  placed$age and placed$work_yrs
## t = 18.669, df = 101, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8281132 0.9176746
## sample estimates:
##       cor 
## 0.8805247

There is a strong positive correlation between Age and Work Experience of an MBA graduate

# Pearson correlation test between salary and satisfaction (placed students).
cor.test(x = placed$salary, y = placed$satis)
## 
##  Pearson's product-moment correlation
## 
## data:  placed$salary and placed$satis
## t = -0.40283, df = 101, p-value = 0.6879
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2317788  0.1546729
## sample estimates:
##        cor 
## -0.0400506
# Pearson correlation test between satisfaction and fall average.
cor.test(x = placed$satis, y = placed$f_avg)
## 
##  Pearson's product-moment correlation
## 
## data:  placed$satis and placed$f_avg
## t = -1.1915, df = 101, p-value = 0.2363
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.30432392  0.07755874
## sample estimates:
##       cor 
## -0.117733
# Pearson correlation test between satisfaction and spring average.
cor.test(x = placed$satis, y = placed$s_avg)
## 
##  Pearson's product-moment correlation
## 
## data:  placed$satis and placed$s_avg
## t = -1.4579, df = 101, p-value = 0.148
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3279779  0.0513868
## sample estimates:
##        cor 
## -0.1435656
# Pearson correlation test between satisfaction and quartile number.
cor.test(x = placed$satis, y = placed$quarter)
## 
##  Pearson's product-moment correlation
## 
## data:  placed$satis and placed$quarter
## t = 2.322, df = 101, p-value = 0.02224
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.03303437 0.40116717
## sample estimates:
##       cor 
## 0.2251199

There is no significant correlation between satisfaction with the MBA program and the average GPAs. However, there is a weak but statistically significant positive correlation (r ≈ 0.23, p ≈ 0.02) between the quartile number and satisfaction.

# Correlation of each semester average with the quartile number, among
# students who answered the survey.
with(surveydone, cor(s_avg, quarter))
## [1] -0.7602882
with(surveydone, cor(f_avg, quarter))
## [1] -0.4141323

As the Spring and Fall average GPAs rise, the quartile number decreases (students with higher GPAs fall in lower-numbered quartiles).

Correlation Matrix

# Correlogram of the placed-students data frame.
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.4.3
corrgram(x = placed)

Regression Model

# Linear model of salary for placed students on experience, satisfaction,
# semester averages, total GMAT score, gender and first language.
model <- lm(
  formula = salary ~ work_yrs + satis + f_avg + s_avg + gmat_tot +
    sex + frstlang,
  data = placed
)
summary(model)
## 
## Call:
## lm(formula = salary ~ work_yrs + satis + f_avg + s_avg + gmat_tot + 
##     sex + frstlang, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -29707  -8168  -2187   5698  82749 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   97193.01   25417.81   3.824 0.000235 ***
## work_yrs       2238.46     581.26   3.851 0.000213 ***
## satis         -2000.50    2043.62  -0.979 0.330116    
## f_avg         -1274.09    3825.09  -0.333 0.739803    
## s_avg          4265.73    5023.59   0.849 0.397938    
## gmat_tot         -8.48      31.96  -0.265 0.791361    
## sexMale        6128.86    3472.39   1.765 0.080773 .  
## frstlangOther 15563.87    6492.06   2.397 0.018469 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15760 on 95 degrees of freedom
## Multiple R-squared:  0.2752, Adjusted R-squared:  0.2217 
## F-statistic: 5.152 on 7 and 95 DF,  p-value: 5.365e-05
# Model-predicted salary for every placed student, named by original row.
fitted.values(model)
##        35        36        37        38        39        40        41 
##  93984.38  93667.03  97429.25  94755.56  99392.57 102850.84  99574.39 
##        42        43        44        45        46        47        48 
##  94297.76 104093.20  95662.41 101077.25  92835.62 100874.55 109194.40 
##        49        50        51        52        53        54        55 
##  98537.91 103840.97 104951.09 103135.33 107359.26 110813.32 113076.20 
##        56        57        58        59        60        61        62 
## 104050.82 104628.45 103735.50 106091.36 130942.99 108126.44 100434.34 
##        63        64        65        66        67        68        69 
## 132479.94 106054.52 109689.07 108212.41 107352.17 136616.23 100200.61 
##       115       116       117       118       119       120       121 
##  88727.40  95174.27 114048.81  99305.20 102669.58  98556.02 100849.22 
##       122       123       124       125       126       127       128 
## 100302.12 100471.72 113929.59 116429.25  95405.42 104882.73  95174.27 
##       129       130       131       132       133       134       135 
## 102604.70 107033.94 101777.38 103752.06 133445.60 101430.07 108469.50 
##       136       137       138       139       186       187       188 
##  96956.12 104758.36 102269.73  99939.67  93886.23 100790.35  97106.70 
##       189       190       191       192       193       194       195 
## 100786.12 103113.61  98181.33 101881.45 101237.61  90214.63  91465.96 
##       196       197       198       199       200       201       202 
##  92551.77 114460.11 100794.59 111197.59 100561.78  92764.81  92700.69 
##       203       204       205       206       207       208       209 
## 111751.49 109508.80 101685.48 107531.55 111673.64 103987.44  98469.67 
##       256       257       258       259       260       261       262 
##  93706.87  97041.03  96143.10  98227.74  96793.30 106370.70  98561.16 
##       263       264       265       266       267       268       269 
##  95356.47  98186.87  97588.13  91847.54  89423.04  98597.00 100278.98 
##       270       271       272       273       274 
## 118076.16  97713.80 103361.21 100957.38 137251.31
# Put the fitted salaries next to the observed ones and open the result in
# the data viewer for a row-by-row comparison.
predicted.salary <- data.frame(fitted(model))
Compare <- cbind(predicted.salary, placed$salary)
View(Compare)

Conclusions

  1. Salary depends highly on the work experience of the student. Higher the work experience, higher the salary
  2. Students in higher-numbered quartiles (i.e. with lower GPAs) tend to report slightly higher satisfaction with the MBA program
  3. The GMAT performance of a student does not significantly affect his placement.
  4. The semester performance also does not significantly affect the placements
  5. Students whose first language is English do not have a significant advantage in getting placed.