# Load the MBA starting-salaries survey. The CSV is looked up relative to the
# current working directory.
MBA.df <- read.csv("MBA Starting Salaries Data.csv")

# NOTE(review): attach() is retained only so that code referring to bare column
# names keeps working. The attached copy is a snapshot taken *before* the
# recoding below, so it still holds the numeric codes. Prefer `data =` or
# `MBA.df$col` in new code.
attach(MBA.df)

# Recode the numeric category codes as labelled factors in one step instead of
# overwriting a numeric column with strings (which silently coerces the whole
# column to character). Levels are listed in alphabetical label order
# (Female < Male, English < Other) so the reference level used by summaries
# and models is the same as before. Any code other than 1 or 2 becomes NA.
MBA.df$sex <- factor(
  MBA.df$sex,
  levels = c(2, 1),
  labels = c("Female", "Male")
)
MBA.df$frstlang <- factor(
  MBA.df$frstlang,
  levels = c(1, 2),
  labels = c("English", "Other")
)

library(psych)

# Per-column five-number summaries (counts per level for the two factors).
summary(MBA.df)
##       age            sex         gmat_tot        gmat_qpc    
##  Min.   :22.00   Female: 68   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   Male  :206   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00                Median :620.0   Median :83.00  
##  Mean   :27.36                Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00                3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00                Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang       salary      
##  Min.   :1.000   Min.   : 0.000   English:242   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   Other  : 32   1st Qu.:     0  
##  Median :2.000   Median : 3.000                 Median :   999  
##  Mean   :2.478   Mean   : 3.872                 Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000                 3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000                 Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0
# Extended descriptive statistics (n, mean, sd, median, trimmed mean, mad,
# range, skew, kurtosis, se) for every column, via psych's describe().
# Factor columns are reported on their integer codes and marked with `*`.
psych::describe(MBA.df)
##           vars   n     mean       sd median  trimmed     mad min    max
## age          1 274    27.36     3.71     27    26.76    2.97  22     48
## sex*         2 274     1.75     0.43      2     1.81    0.00   1      2
## gmat_tot     3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc     4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc     5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc     6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg        7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg        8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter      9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs    10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang*   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary      12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis       13 274   172.18   371.61      6    91.50    1.48   1    998
##            range  skew kurtosis      se
## age           26  2.16     6.45    0.22
## sex*           1 -1.16    -0.66    0.03
## gmat_tot     340 -0.01     0.06    3.48
## gmat_qpc      71 -0.92     0.30    0.90
## gmat_vpc      83 -1.04     0.74    1.02
## gmat_tpc      99 -2.28     9.02    0.85
## s_avg          2 -0.06    -0.38    0.02
## f_avg          4 -2.08    10.85    0.03
## quarter        3  0.02    -1.35    0.07
## work_yrs      22  2.78     9.80    0.20
## frstlang*      1  2.37     3.65    0.02
## salary    220000  0.70    -1.05 3078.10
## satis        997  1.77     1.13   22.45

Splitting the data by salary: students who disclosed a salary versus students who were not placed:

# Rows with a reported salary above 1000. which() drops rows where the
# comparison is NA instead of returning all-NA rows.
# NOTE(review): salary values 0, 998 and 999 presumably encode "not placed" /
# "did not answer" / "did not disclose" -- confirm against the data codebook.
placed.df <- MBA.df[which(MBA.df$salary > 1000), ]

# Rows whose salary is exactly 0.
notPlaced.df <- MBA.df[which(MBA.df$salary == 0), ]

# View() opens a GUI data viewer, which is only meaningful (and only reliably
# available) in an interactive session; skip it under Rscript / knitting.
if (interactive()) {
  View(placed.df)
  View(notPlaced.df)
}

Effect of Gender of student on salary

library(lattice)
# Horizontal box plot of starting salary by sex.
# - Uses placed.df (salary > 1000) rather than MBA.df: the full data set
#   contains 0 / 998 / 999 entries in `salary` (its median is 999), which are
#   not real salaries and would dominate the boxes.
# - With horizontal = TRUE the response (salary) runs along the x axis and the
#   groups along the y axis, so the y label is the grouping variable.
boxplot(
  salary ~ sex,
  data = placed.df,
  main = "Salary",
  xlab = "MBA's Starting Salaries",
  ylab = "Sex",
  horizontal = TRUE
)

Combined effect of gender and first language on salary:

# Mean starting salary for each sex x first-language cell.
# Computed on placed.df (salary > 1000) so that the 0 / 998 / 999 entries found
# in MBA.df$salary are not averaged in as if they were salaries.
mba.agg <- aggregate(salary ~ sex + frstlang, data = placed.df, FUN = mean)

# Grouped bar chart: one cluster per sex, one bar per first language.
barchart(
  salary ~ sex,
  data = mba.agg,
  groups = frstlang,
  auto.key = TRUE,
  par.settings = simpleTheme(col = c("gray95", "gray50"))
)

Effect of work experience on salary:

# Scatter plot of salary against years of work experience for placed students.
# The formula interface is used because plot.default() has no `data` argument:
# passing `data = placed.df` to plot(x, y, ...) is ignored (with '"data" is not
# a graphical parameter' warnings) and the bare names resolve to whatever is
# on the search path instead of to placed.df.
plot(
  salary ~ work_yrs,
  data = placed.df,
  main = "Work exp & salary",
  xlab = "Work Experience ",
  ylab = "Salary",
  pch = 19
)

Combined effects of GMAT score, spring and fall average scores, and work experience on the salary of an MBA graduate:

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Pairwise scatter plots of salary, total GMAT, spring/fall averages and work
# experience for placed students; `| sex` conditions the panels on sex.
car::scatterplotMatrix(
  ~ salary + gmat_tot + s_avg + f_avg + work_yrs | sex,
  data = placed.df,
  main = "ScatterPlotMatrix"
)

Correlation Matrix:-

library(corrplot)
## corrplot 0.84 loaded
# Correlation matrix (cor()'s default method) over every non-factor column of
# the full data set, drawn as pie glyphs.
# - vapply() guarantees a logical vector, unlike sapply().
# - The result is stored as `cor_matrix`, not `cor`, so the stats::cor()
#   function is not shadowed by a matrix of the same name.
# NOTE(review): MBA.df$salary and MBA.df$satis appear to contain placeholder
# codes (998 / 999); confirm whether they should be excluded here.
numeric_cols <- vapply(MBA.df, function(x) !is.factor(x), logical(1))
cor_matrix <- cor(MBA.df[numeric_cols])
corrplot(cor_matrix, method = "pie")

# Mean satisfaction score for every distinct (salary, sex) combination among
# placed students.
mba2.agg <- aggregate(
  satis ~ salary + sex,
  data = placed.df,
  FUN = mean
)

Studying Salary Distribution:-

# Lattice histogram of the salary column of the full data set.
histogram(
  ~ salary,
  data = MBA.df,
  col = "red",
  main = "Salary distribution",
  xlab = "Salary"
)

# Numeric columns of interest for the placed-student correlation plot.
columns <- c(
  "salary", "work_yrs", "gmat_qpc", "gmat_vpc", "s_avg", "f_avg", "satis"
)

# Subset once under a name that does not collide with the `salary` column
# (a global data frame called `salary` would mask the column of the same name
# for any code that refers to it unqualified).
placed_subset <- placed.df[, columns]

# Correlation matrix of the selected columns. It is computed a single time;
# `mba3` is kept as an alias of `M` because both names were assigned the same
# matrix before.
M <- cor(placed_subset)
mba3 <- M

corrplot(M, method = "pie")

Study the impact of gender, first language, prior work experience, GMAT performance, and MBA performance on starting salary: \[\text{Salary} = \alpha_0 + \alpha_1\,\text{Sex} + \alpha_2\,\text{GMATScore} + \alpha_3\,\text{WorkExperience} + \alpha_4\,\text{FirstLanguage} + \alpha_5\,\text{SpringScore} + \alpha_6\,\text{FallScore} + \epsilon\]

# Linear model of starting salary on work experience, spring/fall averages,
# total GMAT score, sex and first language, fitted on placed students only.
# The formula is kept in `Model1` so the fitted call reads `formula = Model1`.
Model1 <- salary ~ work_yrs + s_avg + f_avg + gmat_tot + sex + frstlang
fit1 <- lm(formula = Model1, data = placed.df)

# Coefficient table, residual summary and overall fit statistics.
print(summary(fit1))
## 
## Call:
## lm(formula = Model1, data = placed.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -32652  -8940  -1709   5186  83182 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   85685.28   22530.99   3.803 0.000251 ***
## work_yrs       2201.83     579.92   3.797 0.000257 ***
## s_avg          4851.02    4986.79   0.973 0.333110    
## f_avg         -1153.74    3822.28  -0.302 0.763422    
## gmat_tot        -11.90      31.77  -0.375 0.708712    
## sexMale        5886.39    3462.79   1.700 0.092388 .  
## frstlangOther 15101.77    6473.46   2.333 0.021743 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15760 on 96 degrees of freedom
## Multiple R-squared:  0.2678, Adjusted R-squared:  0.2221 
## F-statistic: 5.853 on 6 and 96 DF,  p-value: 3.114e-05

Solve Maer’s question of whether her GMAT score made a difference in marks. Since her native language was not English, Daer had a relatively low GMAT.

library(polycor)
## 
## Attaching package: 'polycor'
## The following object is masked from 'package:psych':
## 
##     polyserial
# Heterogeneous correlation between total GMAT score (numeric) and first
# language (factor) on the full data set. For this numeric/factor pair the
# printed result below reports a polyserial correlation, its standard error,
# and a p-value for a test of bivariate normality.
# The call is left exactly as written because the variable labels in the
# printed output follow the way the arguments are spelled here.
hetcor(MBA.df$gmat_tot, MBA.df$frstlang)
## 
## Two-Step Estimates
## 
## Correlations/Type of Correlation:
##                 MBA.df$gmat_tot MBA.df.frstlang
## MBA.df$gmat_tot               1      Polyserial
## MBA.df.frstlang         -0.2172               1
## 
## Standard Errors:
## MBA.df$gmat_tot MBA.df.frstlang 
##                         0.09433 
## Levels:  0.09433
## 
## n = 274 
## 
## P-values for Tests of Bivariate Normality:
## MBA.df$gmat_tot MBA.df.frstlang 
##                         0.04903 
## Levels:  0.04903