# Load the MBA starting-salaries survey and recode its categorical columns.
MBA.df <- read.csv("MBA Starting Salaries Data.csv")
# NOTE(review): attach() is kept only so that code referring to bare column
# names keeps working. It snapshots the columns *before* the recoding below,
# so prefer MBA.df$col or a `data =` argument in new code.
attach(MBA.df)
# Map the numeric codes to labelled factors in a single step. Levels are
# listed so the resulting level order is alphabetical (Female < Male,
# English < Other), i.e. the order factor() gives for the label strings.
# Any code other than 1/2 becomes NA instead of turning into a stray level.
MBA.df$sex <- factor(MBA.df$sex,
                     levels = c(2, 1),
                     labels = c("Female", "Male"))
MBA.df$frstlang <- factor(MBA.df$frstlang,
                          levels = c(1, 2),
                          labels = c("English", "Other"))
library(psych)
# Per-column five-number summaries / level counts.
summary(MBA.df)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Female: 68 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 Male :206 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :620.0 Median :83.00
## Mean :27.36 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 English:242 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 Other : 32 1st Qu.: 0
## Median :2.000 Median : 3.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
# Extended descriptive statistics (psych package) for every column.
psych::describe(MBA.df)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex* 2 274 1.75 0.43 2 1.81 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang* 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex* 1 -1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang* 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
Splitting the data by salary: students who disclosed a salary (placed) and students with a salary of 0 (not placed):
# Rows whose salary is above the 1000 cut-off.
placed.df <- subset(MBA.df, salary > 1000)
View(placed.df)
# Rows whose salary is exactly 0.
notPlaced.df <- subset(MBA.df, salary == 0)
View(notPlaced.df)
Effect of Gender of student on salary
library(lattice)
# Horizontal boxplot of starting salary by gender.
# Uses placed.df so that salaries of 0 and the 998/999 entries in the full
# data do not distort the distribution.
# With horizontal = TRUE the bottom axis (xlab) carries the salary values and
# the left axis (ylab) carries the groups, so ylab must name the grouping
# variable (gender), not work experience.
boxplot(salary ~ sex,
        data = placed.df,
        main = "Salary",
        xlab = "MBA's Starting Salaries",
        ylab = "Gender",
        horizontal = TRUE)
Combined effect of gender and first language on salary:
# Mean starting salary for every gender x first-language combination.
# Computed on placed.df: averaging over the full data would mix real salaries
# with the 0 and 998/999 entries and make the bar heights meaningless.
mba.agg <- aggregate(salary ~ sex + frstlang, data = placed.df, FUN = mean)
# Grouped bars: one cluster per gender, one bar per first language.
barchart(salary ~ sex,
         data = mba.agg,
         groups = frstlang,
         auto.key = TRUE,
         par.settings = simpleTheme(col = c("gray95", "gray50")))
Effect of work experience on salary:
# Work experience vs. starting salary for placed students only.
# The formula interface is used so that `data = placed.df` is honoured:
# plot(x, y, data = ...) ignores `data`, plots whatever `work_yrs` / `salary`
# resolve to on the search path, and warns that "data" is not a graphical
# parameter.
plot(salary ~ work_yrs,
     data = placed.df,
     main = "Work exp & salary",
     xlab = "Work Experience ",
     ylab = "Salary",
     pch = 19)
Combined effects of GMAT score, spring and fall average scores, and work experience on the salary of an MBA graduate:
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Pairwise scatterplots of salary, GMAT total, spring/fall averages and work
# experience for placed students, with points grouped by sex.
scatterplotMatrix(
  ~ salary + gmat_tot + s_avg + f_avg + work_yrs | sex,
  data = placed.df,
  main = "ScatterPlotMatrix"
)
Correlation Matrix:-
library(corrplot)
## corrplot 0.84 loaded
# Correlation matrix of all numeric survey variables, drawn as pie glyphs.
# NOTE(review): salary values 998/999 and satis value 998 look like
# "not answered / not disclosed" codes rather than measurements (the summary
# of MBA.df shows a median salary of 999 and a maximum satis of 998) --
# confirm against the data dictionary. They are treated as missing here so
# they do not distort the correlations.
numeric_cols <- vapply(MBA.df, is.numeric, logical(1))
mba_num <- MBA.df[numeric_cols]
mba_num$salary[mba_num$salary %in% c(998, 999)] <- NA
mba_num$satis[mba_num$satis %in% 998] <- NA
# Stored as cor_all (not `cor`) so the stats::cor() function is not shadowed.
cor_all <- cor(mba_num, use = "pairwise.complete.obs")
corrplot(cor_all, method = "pie")
# Mean satisfaction for each salary x sex combination among placed students.
mba2.agg <- aggregate(satis ~ salary + sex, data = placed.df, FUN = mean)
Studying Salary Distribution:-
# Lattice histogram of the salary column over all students.
histogram(~ salary,
          data = MBA.df,
          col = "red",
          main = "Salary distribution",
          xlab = "Salary")
# Columns of the placed students that enter the correlation plot.
columns <- c("salary", "work_yrs", "gmat_qpc", "gmat_vpc",
             "s_avg", "f_avg", "satis")
# NOTE(review): `salary` here is a data frame, not the salary column.
salary <- placed.df[columns]
mba3 <- cor(salary)
M <- cor(placed.df[columns])
corrplot(M, method = "pie")
Study the impact of gender, first language, prior work experience, GMAT performance and MBA performance on the starting salary: \[\text{Salary} = \alpha_0 + \alpha_1\,\text{Sex} + \alpha_2\,\text{GMATScore} + \alpha_3\,\text{WorkExperience} + \alpha_4\,\text{FirstLanguage} + \alpha_5\,\text{SpringScore} + \alpha_6\,\text{FallScore} + \epsilon\]
# Linear model of starting salary on work experience, spring/fall averages,
# GMAT total, sex and first language, fitted to the placed students.
Model1 <- reformulate(
  c("work_yrs", "s_avg", "f_avg", "gmat_tot", "sex", "frstlang"),
  response = "salary"
)
fit1 <- lm(formula = Model1, data = placed.df)
# Coefficient table, residual summary and overall fit statistics.
summary(fit1)
##
## Call:
## lm(formula = Model1, data = placed.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32652 -8940 -1709 5186 83182
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 85685.28 22530.99 3.803 0.000251 ***
## work_yrs 2201.83 579.92 3.797 0.000257 ***
## s_avg 4851.02 4986.79 0.973 0.333110
## f_avg -1153.74 3822.28 -0.302 0.763422
## gmat_tot -11.90 31.77 -0.375 0.708712
## sexMale 5886.39 3462.79 1.700 0.092388 .
## frstlangOther 15101.77 6473.46 2.333 0.021743 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15760 on 96 degrees of freedom
## Multiple R-squared: 0.2678, Adjusted R-squared: 0.2221
## F-statistic: 5.853 on 6 and 96 DF, p-value: 3.114e-05
Address Daer’s question of whether her GMAT score made a difference to her marks. Because her native language was not English, Daer had a relatively low GMAT score.
library(polycor)
##
## Attaching package: 'polycor'
## The following object is masked from 'package:psych':
##
## polyserial
# Heterogeneous correlation (polycor) between the numeric GMAT total and the
# first-language factor; the printed result reports it as a polyserial
# correlation. The call is left exactly as written because the row/column
# labels in the printed output are derived from these argument expressions.
hetcor(MBA.df$gmat_tot, MBA.df$frstlang)
##
## Two-Step Estimates
##
## Correlations/Type of Correlation:
## MBA.df$gmat_tot MBA.df.frstlang
## MBA.df$gmat_tot 1 Polyserial
## MBA.df.frstlang -0.2172 1
##
## Standard Errors:
## MBA.df$gmat_tot MBA.df.frstlang
## 0.09433
## Levels: 0.09433
##
## n = 274
##
## P-values for Tests of Bivariate Normality:
## MBA.df$gmat_tot MBA.df.frstlang
## 0.04903
## Levels: 0.04903