Inspecting the data
# Sex: swap the numeric codes for labels (1 -> "Male", 2 -> "Female"),
# then store the column as a factor
MBA.df$sex <- replace(MBA.df$sex, MBA.df$sex == 1, "Male")
MBA.df$sex <- replace(MBA.df$sex, MBA.df$sex == 2, "Female")
MBA.df$sex <- factor(MBA.df$sex)
# First language: same recoding (1 -> "English", 2 -> "Other")
MBA.df$frstlang <- replace(MBA.df$frstlang, MBA.df$frstlang == 1, "English")
MBA.df$frstlang <- replace(MBA.df$frstlang, MBA.df$frstlang == 2, "Other")
MBA.df$frstlang <- factor(MBA.df$frstlang)
# Split MBA.df into groups according to the coded salary column.
# Placed and disclosed a salary (salary above 1000)
placed.df <- subset(MBA.df, salary > 1000)
View(placed.df)
# Not placed (salary coded 0)
notPlaced.df <- subset(MBA.df, salary == 0)
View(notPlaced.df)
# Placed, but salary not disclosed (salary coded 999)
notDisclosedSalary.df <- subset(MBA.df, salary == 999)
View(notDisclosedSalary.df)
# Did not answer the survey (salary coded 998)
notAnsweredSurvey.df <- subset(MBA.df, salary == 998)
View(notAnsweredSurvey.df)
# avgSalary: mean salary of the students who were placed and disclosed it
avgSalary <- mean(placed.df$salary)
avgSalary
## [1] 103030.7
# Use avgSalary as the salary of the students who were placed but did not
# disclose their salary (notDisclosedSalary.df)
notDisclosedSalary.df$salary <- avgSalary
# allPlaced.df: every student who was placed -- those who disclosed their
# salary (placed.df) plus those who did not (notDisclosedSalary.df), whose
# salary is assumed to be the sample average of the disclosed salaries
allPlaced.df <- rbind(placed.df, notDisclosedSalary.df)
Summary Statistics
library(psych)
describe(allPlaced.df)[,c(1:5)]
## vars n mean sd median
## age 1 138 26.96 3.05 26.0
## sex* 2 138 1.74 0.44 2.0
## gmat_tot 3 138 619.28 53.47 620.0
## gmat_qpc 4 138 81.10 13.59 83.5
## gmat_vpc 5 138 77.99 17.10 81.5
## gmat_tpc 6 138 84.48 13.08 87.0
## s_avg 7 138 3.03 0.38 3.0
## f_avg 8 138 3.06 0.46 3.0
## quarter 9 138 2.43 1.15 2.0
## work_yrs 10 138 3.67 2.75 3.0
## frstlang* 11 138 1.12 0.32 1.0
## salary 12 138 103030.74 15418.25 103030.7
## satis 13 138 5.53 1.11 6.0
View(allPlaced.df)
Review the Distribution of Salary
library(lattice)
# Salary distribution: students who were placed and disclosed a salary
histogram(
  ~salary,
  data = placed.df,
  main = "Distribution of Starting Salary of placed and disclosed",
  xlab = "Starting Salary",
  col = "yellow"
)

# Salary distribution: all placed students (allPlaced.df)
histogram(
  ~salary,
  data = allPlaced.df,
  main = "Distribution of Starting Salary of all those who are placed",
  xlab = "Starting Salary",
  col = "green"
)

Comparison of salary with the given variables
# Mean salary, work experience and age by sex: placed students who disclosed
# a salary
aggregate(cbind(salary, work_yrs, age) ~ sex, data = placed.df, FUN = mean)
## sex salary work_yrs age
## 1 Female 98524.39 3.258065 26.06452
## 2 Male 104970.97 3.861111 27.08333
# Same summary over all placed students (allPlaced.df)
aggregate(cbind(salary, work_yrs, age) ~ sex, data = allPlaced.df, FUN = mean)
## sex salary work_yrs age
## 1 Female 99150.27 3.277778 26.13889
## 2 Male 104400.32 3.803922 27.24510
Comparison of Salary with Work Experience
library(car)
# Scatter plot of salary (y-axis) against work experience (x-axis).
# `horizontal` is a boxplot() argument with no documented meaning for
# scatterplot(), so it is not passed here.
scatterplot(
  salary ~ work_yrs,
  data = placed.df,
  main = "Scatterplot of Salary with Work Experience",
  xlab = "Work Experience",
  ylab = "MBA's Starting Salaries"
)

# Box plot of salary for each value of work experience.
# horizontal = TRUE draws the boxes sideways: salary runs along the x-axis and
# the work-experience groups along the y-axis, which is what the labels say.
boxplot(
  salary ~ work_yrs,
  data = placed.df,
  main = "Distribution of Salary with Work Experience",
  xlab = "MBA's Starting Salaries",
  ylab = "Work Experience",
  horizontal = TRUE
)

library(lattice)
histogram(~salary, data = placed.df,
main = "Frequency of Starting Salary", xlab="Starting Salary", col='grey' )

# Mean salary for each number of years of work experience
salaryWorkEx <- aggregate(salary ~ work_yrs, data = placed.df, FUN = mean)
salaryWorkEx
## work_yrs salary
## 1 0 95000.00
## 2 1 103532.00
## 3 2 97673.68
## 4 3 101652.86
## 5 4 105454.55
## 6 5 103142.86
## 7 6 105928.57
## 8 7 98000.00
## 9 8 105025.00
## 10 10 118000.00
## 11 15 183000.00
## 12 16 108500.00
Comparison of Salary with GMAT total score
# Salary against GMAT total score; row names are supplied as point labels
scatterplot(
  salary ~ gmat_tot,
  data = placed.df,
  main = "Comparison of Salary with Total GMAT score",
  xlab = "GMAT Total",
  ylab = "Salary",
  labels = row.names(placed.df)
)

# Same scatter plot, with the points grouped by sex
scatterplot(
  salary ~ gmat_tot | sex,
  data = placed.df,
  main = "Comparison of Salary with Total GMAT score",
  xlab = "GMAT Total",
  ylab = "Salary",
  labels = row.names(placed.df)
)

# Horizontal box plot of salary for each distinct GMAT total score
boxplot(
  salary ~ gmat_tot,
  data = placed.df,
  main = "Comparison of Salary with Total GMAT score",
  xlab = "Salary",
  ylab = "GMAT Total",
  horizontal = TRUE,
  labels = row.names(placed.df)
)

colnames(placed.df)
## [1] "age" "sex" "gmat_tot" "gmat_qpc" "gmat_vpc" "gmat_tpc"
## [7] "s_avg" "f_avg" "quarter" "work_yrs" "frstlang" "salary"
## [13] "satis"
library(car)
# Scatterplot matrix of salary against GMAT total and the s_avg / f_avg
# columns. scatterplotMatrix() is the current car name for this plot and the
# one the SCATTER PLOTS section of this file already uses; the dotted
# scatterplot.matrix() alias is deprecated/defunct in car.
scatterplotMatrix(~ salary + gmat_tot + s_avg + f_avg, data = placed.df,
                  main = "Salary versus other variables")

# Same matrix, with the points grouped by sex
scatterplotMatrix(~ salary + gmat_tot + s_avg + f_avg | sex, data = placed.df,
                  main = "Salary versus other variables")

Number of males and females in the data frame, by age
ageTable <- table(placed.df$sex, placed.df$age)
ageTable
##
## 22 23 24 25 26 27 28 29 30 31 32 33 34 39 40
## Female 1 2 5 10 5 1 3 1 2 0 0 0 0 0 1
## Male 0 3 11 13 9 13 5 5 4 4 1 1 1 1 1
Effect of Sex on Salary
# Mean salary, work experience and age by sex over the full data set.
# NOTE(review): MBA.df still contains the coded salary values (0 = not placed,
# 998 = survey not answered, 999 = salary not disclosed), so these salary
# means mix real salaries with codes -- confirm this is intended, or use
# placed.df instead.
aggregate(cbind(salary, work_yrs, age) ~ sex,
data = MBA.df, mean)
## sex salary work_yrs age
## 1 Female 45121.07 3.808824 27.17647
## 2 Male 37013.62 3.893204 27.41748
Effect of Age on Salary
# Mean salary and work experience for each age over the full data set.
# NOTE(review): MBA.df still contains the coded salary values (0 = not placed,
# 998 = survey not answered, 999 = salary not disclosed), so these salary
# means mix real salaries with codes -- confirm this is intended, or use
# placed.df instead.
aggregate(cbind(salary, work_yrs) ~ age, data = MBA.df, mean)
## age salary work_yrs
## 1 22 42500.00 1.000000
## 2 23 57282.00 1.750000
## 3 24 49342.24 1.727273
## 4 25 43395.55 2.264151
## 5 26 35982.07 2.875000
## 6 27 31499.37 3.130435
## 7 28 39809.00 4.666667
## 8 29 28067.95 4.500000
## 9 30 55291.25 5.583333
## 10 31 40599.40 5.800000
## 11 32 13662.25 5.625000
## 12 33 118000.00 10.000000
## 13 34 26250.00 11.500000
## 14 35 0.00 9.333333
## 15 36 0.00 12.500000
## 16 37 0.00 9.000000
## 17 39 56000.00 10.500000
## 18 40 183000.00 15.000000
## 19 42 0.00 13.000000
## 20 43 0.00 19.000000
## 21 48 0.00 22.000000
Effect of Satisfaction level on Salary
# Mean salary and work experience for each satisfaction level over the full
# data set.
# NOTE(review): MBA.df still contains the coded salary values (0, 998, 999),
# and satis itself takes the value 998 in the recorded output, so these means
# mix real values with codes -- confirm this is intended, or use placed.df.
aggregate(cbind(salary, work_yrs) ~ satis , data = MBA.df, mean)
## satis salary work_yrs
## 1 1 999.000 3.000000
## 2 2 999.000 2.000000
## 3 3 19799.200 4.200000
## 4 4 6293.412 2.941176
## 5 5 40476.311 4.243243
## 6 6 54383.536 4.185567
## 7 7 65718.152 3.727273
## 8 998 998.000 3.086957
Effect of Work Experience on MBA’s Starting Salary
# Box plot of salary for each value of work experience (vertical boxes).
# With horizontal = FALSE the work_yrs groups are on the x-axis and salary on
# the y-axis, so the axis labels are assigned accordingly.
# NOTE(review): MBA.df still holds the coded salary values (0, 998, 999);
# confirm they are meant to be plotted as salaries.
boxplot(salary ~ work_yrs, data = MBA.df,
        main = "Effect of Work Experience on Salary",
        xlab = "Work Experience", ylab = "MBA's Starting Salaries",
        horizontal = FALSE)

Effect of Gender on MBA’s Starting Salary
# Box plot of salary for each sex (vertical boxes).
# With horizontal = FALSE the sex groups are on the x-axis and salary on the
# y-axis; the previous labels ("Work Experience" on the y-axis, salary on the
# x-axis) did not describe either axis of this plot.
# NOTE(review): MBA.df still holds the coded salary values (0, 998, 999);
# confirm they are meant to be plotted as salaries.
boxplot(salary ~ sex, data = MBA.df,
        main = "Effect of Gender on Salary",
        xlab = "Gender", ylab = "MBA's Starting Salaries",
        horizontal = FALSE)

Distribution of MBA’s Starting Salary
library(lattice)
# Histogram of the salary column over the full MBA.df data set.
# The x-axis label previously read "Slariy"; the spelling is corrected.
histogram(~salary, data = MBA.df,
          main = "Distribution of MBA's Starting Salary",
          xlab = "MBA's Starting Salary", col = "grey")

Merge placed.df, notDisclosedSalary.df and notPlaced.df into knownMBA.df
knownMBA.df <- rbind(placed.df, notDisclosedSalary.df, notPlaced.df)
View(knownMBA.df)
“GotPlaced” = TRUE (got a job) or FALSE (did not get a job)
# Flag each student as placed (salary above 1000) or not
knownMBA.df[["GotPlaced"]] <- knownMBA.df[["salary"]] > 1000
View(knownMBA.df)
# Store the logical flag as a factor, then inspect the resulting structure
knownMBA.df[["GotPlaced"]] <- factor(knownMBA.df[["GotPlaced"]])
str(knownMBA.df)
## 'data.frame': 228 obs. of 14 variables:
## $ age : int 22 27 25 25 27 28 24 25 25 25 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 1 1 1 2 1 2 1 1 2 ...
## $ gmat_tot : int 660 700 680 650 710 620 670 560 530 650 ...
## $ gmat_qpc : int 90 94 87 82 96 52 84 52 50 79 ...
## $ gmat_vpc : int 92 98 96 91 96 98 96 81 62 93 ...
## $ gmat_tpc : int 94 98 96 93 98 87 95 72 61 93 ...
## $ s_avg : num 3.5 3.3 3.5 3.4 3.3 3.4 3.3 3.3 3.6 3.3 ...
## $ f_avg : num 3.75 3.25 2.67 3.25 3.5 3.75 3.25 3.5 3.67 3.5 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs : int 1 2 2 3 2 5 0 1 3 1 ...
## $ frstlang : Factor w/ 2 levels "English","Other": 1 1 1 1 1 1 1 1 1 1 ...
## $ salary : num 85000 85000 86000 88000 92000 93000 95000 95000 95000 96000 ...
## $ satis : int 5 6 5 7 6 5 4 5 3 7 ...
## $ GotPlaced: Factor w/ 2 levels "FALSE","TRUE": 2 2 2 2 2 2 2 2 2 2 ...
#GotPlaced = factor(year)
#dummies = model.matrix(~year.f)
Create contingency tables, counting allPlaced / notPlaced versus Sex: Male / Female
Number of Placed and Not Placed candidates
allplaced <- table(knownMBA.df$GotPlaced == 'TRUE')
allplaced
##
## FALSE TRUE
## 90 138
allPlaced / notPlaced versus Sex: Male / Female
placedbySex <- xtabs(~ knownMBA.df$GotPlaced + knownMBA.df$sex , data=knownMBA.df)
placedbySex
## knownMBA.df$sex
## knownMBA.df$GotPlaced Female Male
## FALSE 23 67
## TRUE 36 102
addmargins(placedbySex)
## knownMBA.df$sex
## knownMBA.df$GotPlaced Female Male Sum
## FALSE 23 67 90
## TRUE 36 102 138
## Sum 59 169 228
Percentage of Male / Female candidates who got Placed
prop.table(placedbySex, 2)
## knownMBA.df$sex
## knownMBA.df$GotPlaced Female Male
## FALSE 0.3898305 0.3964497
## TRUE 0.6101695 0.6035503
allPlaced / notPlaced versus First Language: English / Other
placedbyLanguage <- xtabs(~ knownMBA.df$GotPlaced + knownMBA.df$frstlang, data=knownMBA.df)
placedbyLanguage
## knownMBA.df$frstlang
## knownMBA.df$GotPlaced English Other
## FALSE 82 8
## TRUE 122 16
addmargins(placedbyLanguage)
## knownMBA.df$frstlang
## knownMBA.df$GotPlaced English Other Sum
## FALSE 82 8 90
## TRUE 122 16 138
## Sum 204 24 228
Percentage of First Language candidates who got Placed
prop.table(placedbyLanguage, 2)
## knownMBA.df$frstlang
## knownMBA.df$GotPlaced English Other
## FALSE 0.4019608 0.3333333
## TRUE 0.5980392 0.6666667
H1: The percentage of Females placed is more than Males
Chi Square Test : percentage of female who got placed is higher than percentage of male who got placed
chisq.test(placedbySex)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: placedbySex
## X-squared = 3.5816e-30, df = 1, p-value = 1
H2: The percentage of people placed whose first language is English is higher than the percentage of people placed whose first language is not English
Chi Square Test
chisq.test(placedbyLanguage)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: placedbyLanguage
## X-squared = 0.18479, df = 1, p-value = 0.6673
MODEL SELECTION
library(corrplot)
colnames(placed.df)
## [1] "age" "sex" "gmat_tot" "gmat_qpc" "gmat_vpc" "gmat_tpc"
## [7] "s_avg" "f_avg" "quarter" "work_yrs" "frstlang" "salary"
## [13] "satis"
# Numeric columns of placed.df to check for pairwise correlation
dataColumns <- placed.df[, c(
  "age", "work_yrs",
  "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
  "s_avg", "f_avg", "quarter", "satis"
)]
# Correlation matrix: computed once, used for both the plot and the table
N <- cor(dataColumns)
corrplot(N, method = "circle")

res <- N
round(res, 2)
## age work_yrs gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## age 1.00 0.88 -0.08 -0.17 0.02 -0.10 0.16 -0.22
## work_yrs 0.88 1.00 -0.12 -0.18 -0.03 -0.13 0.16 -0.22
## gmat_tot -0.08 -0.12 1.00 0.67 0.78 0.97 0.17 0.12
## gmat_qpc -0.17 -0.18 0.67 1.00 0.09 0.66 0.02 0.10
## gmat_vpc 0.02 -0.03 0.78 0.09 1.00 0.78 0.16 0.02
## gmat_tpc -0.10 -0.13 0.97 0.66 0.78 1.00 0.14 0.07
## s_avg 0.16 0.16 0.17 0.02 0.16 0.14 1.00 0.45
## f_avg -0.22 -0.22 0.12 0.10 0.02 0.07 0.45 1.00
## quarter -0.13 -0.13 -0.11 0.01 -0.13 -0.10 -0.84 -0.43
## satis 0.11 0.06 0.06 0.00 0.15 0.12 -0.14 -0.12
## quarter satis
## age -0.13 0.11
## work_yrs -0.13 0.06
## gmat_tot -0.11 0.06
## gmat_qpc 0.01 0.00
## gmat_vpc -0.13 0.15
## gmat_tpc -0.10 0.12
## s_avg -0.84 -0.14
## f_avg -0.43 -0.12
## quarter 1.00 0.23
## satis 0.23 1.00
# MBA PERFORMANCE
# The variables tracking performance during the MBA are heavily correlated
mbaPerformance <- placed.df[, c("s_avg", "f_avg", "quarter")]
N <- cor(mbaPerformance)
corrplot(N, method="circle")

res <- cor(mbaPerformance)
round(res, 2)
## s_avg f_avg quarter
## s_avg 1.00 0.45 -0.84
## f_avg 0.45 1.00 -0.43
## quarter -0.84 -0.43 1.00
# The overall performance quartile (quarter) is highly correlated with the Spring (s_avg) and Fall (f_avg) GPA
# We will include 's_avg' and 'f_avg' in our regression, but exclude 'quarter' from our regression.
# Identifying DEPENDENT and INDEPENDENT Variables
# The DEPENDENT variable is "salary"
# Identifying INDEPENDENT Variables:
#1a. Variables related to GMAT are highly correlated: "gmat_tot" "gmat_qpc" "gmat_vpc" "gmat_tpc"
#1b. Variables related to MBA performance are highly correlated: "s_avg" "f_avg" "quarter"
#1c. Variables related to age and work experience are highly correlated: "age", "work_yrs"
#1d. Other variables: "sex" , "frstlang" ; "satis"
# 1e. GMAT
# The GMAT-related columns are strongly correlated with each other
gmat <- placed.df[, c("gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc")]
res <- cor(gmat)
round(res, 2)
## gmat_tot gmat_qpc gmat_vpc gmat_tpc
## gmat_tot 1.00 0.67 0.78 0.97
## gmat_qpc 0.67 1.00 0.09 0.66
## gmat_vpc 0.78 0.09 1.00 0.78
## gmat_tpc 0.97 0.66 0.78 1.00
library(corrplot)
M <- cor(gmat)
corrplot(M, method="circle")

# However, GMAT verbal and quantitative scores are very weakly correlated
cor(placed.df$gmat_qpc, placed.df$gmat_vpc)
## [1] 0.09466541
# Therefore, in our regression we will include gmat_qpc and gmat_vpc , but exclude "gmat_tot" and "gmat_tpc"
# 1f. MBA PERFORMANCE
# The variables tracking performance during the MBA are heavily correlated
mbaPerformance <- placed.df[, c("s_avg", "f_avg", "quarter")]
N <- cor(mbaPerformance)
corrplot(N, method="circle")

res <- cor(mbaPerformance)
round(res, 2)
## s_avg f_avg quarter
## s_avg 1.00 0.45 -0.84
## f_avg 0.45 1.00 -0.43
## quarter -0.84 -0.43 1.00
# The overall performance quartile (quarter) is highly correlated with the Spring (s_avg) and Fall (f_avg) GPA
# We will include 's_avg' and 'f_avg' in our regression, but exclude 'quarter' from our regression.
# 1g. WORK EXPERIENCE AND AGE
# The variables 'age' and 'work_yrs' are highly correlated. The older the person, the greater the work experience.
cor(placed.df$age,placed.df$work_yrs)
## [1] 0.8805247
# Therefore we will include 'work_yrs' in our regression, but exclude 'age' from our regression
# SUMMARY OF MODEL SELECTION
# Given the above discussion, the independent variables we will include in the regression are {work_yrs,s_avg,f_avg,gmat_qpc,gmat_vpc,sex,frstlang,satis}
CORRELATION MATRIX
# Salary plus the numeric variables chosen for the regression
columns <- c(
  "salary", "work_yrs", "gmat_qpc", "gmat_vpc",
  "s_avg", "f_avg", "satis"
)
placedVariables <- placed.df[, columns]
# Pairwise correlations, rounded to two decimals for display
res <- cor(placedVariables)
round(res, 2)
## salary work_yrs gmat_qpc gmat_vpc s_avg f_avg satis
## salary 1.00 0.45 0.01 -0.14 0.10 -0.11 -0.04
## work_yrs 0.45 1.00 -0.18 -0.03 0.16 -0.22 0.06
## gmat_qpc 0.01 -0.18 1.00 0.09 0.02 0.10 0.00
## gmat_vpc -0.14 -0.03 0.09 1.00 0.16 0.02 0.15
## s_avg 0.10 0.16 0.02 0.16 1.00 0.45 -0.14
## f_avg -0.11 -0.22 0.10 0.02 0.45 1.00 -0.12
## satis -0.04 0.06 0.00 0.15 -0.14 -0.12 1.00
library(corrplot)
M <- cor(placed.df[, columns])
corrplot(M, method="circle")

SCATTER PLOTS
library(car)
# Salary against the two GPA columns and the satisfaction score
scatterplotMatrix(
  ~ salary + s_avg + f_avg + satis,
  data = placed.df,
  main = "Salary versus MBA Performance and MBA Satisfaction"
)

library(car)
# Salary against work experience and the GMAT quantitative / verbal percentiles
scatterplotMatrix(
  ~ salary + work_yrs + gmat_qpc + gmat_vpc,
  data = placed.df,
  main = "Salary versus Work Experience; GMAT Performance"
)

REGRESSION
Formulating multivariate linear regression model to fit salary with respect to the model selection
Dependent Variable: Salary
# Full model: salary regressed on every predictor kept by the model selection
Model1 <- salary ~ work_yrs + s_avg + f_avg + gmat_qpc + gmat_vpc +
  sex + frstlang + satis
# Ordinary least squares fit on the placed-and-disclosed students
fit1 <- lm(formula = Model1, data = placed.df)
summary(fit1)
##
## Call:
## lm(formula = Model1, data = placed.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29800 -7822 -1742 4869 82341
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 90136.22 21739.22 4.146 7.4e-05 ***
## work_yrs 2331.12 585.99 3.978 0.000137 ***
## s_avg 4659.05 5015.66 0.929 0.355320
## f_avg -1698.83 3834.70 -0.443 0.658773
## gmat_qpc 98.72 121.85 0.810 0.419884
## gmat_vpc -95.80 102.99 -0.930 0.354699
## sexMale 5289.24 3545.91 1.492 0.139140
## frstlangOther 13994.76 6641.66 2.107 0.037770 *
## satis -1671.20 2070.62 -0.807 0.421643
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15740 on 94 degrees of freedom
## Multiple R-squared: 0.285, Adjusted R-squared: 0.2241
## F-statistic: 4.683 on 8 and 94 DF, p-value: 7.574e-05
library(leaps)
leap <- regsubsets(Model1, data = placed.df, nbest=1)
summary(leap)
## Subset selection object
## Call: regsubsets.formula(Model1, data = placed.df, nbest = 1)
## 8 Variables (and intercept)
## Forced in Forced out
## work_yrs FALSE FALSE
## s_avg FALSE FALSE
## f_avg FALSE FALSE
## gmat_qpc FALSE FALSE
## gmat_vpc FALSE FALSE
## sexMale FALSE FALSE
## frstlangOther FALSE FALSE
## satis FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## work_yrs s_avg f_avg gmat_qpc gmat_vpc sexMale frstlangOther
## 1 ( 1 ) "*" " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " "*"
## 3 ( 1 ) "*" " " " " " " " " "*" "*"
## 4 ( 1 ) "*" " " " " " " " " "*" "*"
## 5 ( 1 ) "*" "*" " " " " " " "*" "*"
## 6 ( 1 ) "*" "*" " " " " "*" "*" "*"
## 7 ( 1 ) "*" "*" " " "*" "*" "*" "*"
## 8 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## satis
## 1 ( 1 ) " "
## 2 ( 1 ) " "
## 3 ( 1 ) " "
## 4 ( 1 ) "*"
## 5 ( 1 ) "*"
## 6 ( 1 ) "*"
## 7 ( 1 ) "*"
## 8 ( 1 ) "*"
plot(leap, scale="adjr2")

# Reduced model: keeps work_yrs, sex, frstlang and satis only.
# Left out of this model: age, s_avg, f_avg, quarter, gmat_qpc, gmat_vpc,
# gmat_tot, gmat_tpc.
Model2 <- salary ~ work_yrs + sex + frstlang + satis
# Ordinary least squares fit on the placed-and-disclosed students
fit2 <- lm(formula = Model2, data = placed.df)
summary(fit2)
##
## Call:
## lm(formula = Model2, data = placed.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30492 -8055 -1744 5362 80436
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 102214.0 11827.8 8.642 1.06e-13 ***
## work_yrs 2409.4 526.1 4.579 1.37e-05 ***
## sexMale 5949.5 3392.2 1.754 0.0826 .
## frstlangOther 14675.7 6274.0 2.339 0.0214 *
## satis -2244.4 1988.4 -1.129 0.2618
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15580 on 98 degrees of freedom
## Multiple R-squared: 0.2695, Adjusted R-squared: 0.2397
## F-statistic: 9.038 on 4 and 98 DF, p-value: 2.953e-06
library(leaps)
leap <- regsubsets(Model2, data = placed.df, nbest=1)
summary(leap)
## Subset selection object
## Call: regsubsets.formula(Model2, data = placed.df, nbest = 1)
## 4 Variables (and intercept)
## Forced in Forced out
## work_yrs FALSE FALSE
## sexMale FALSE FALSE
## frstlangOther FALSE FALSE
## satis FALSE FALSE
## 1 subsets of each size up to 4
## Selection Algorithm: exhaustive
## work_yrs sexMale frstlangOther satis
## 1 ( 1 ) "*" " " " " " "
## 2 ( 1 ) "*" " " "*" " "
## 3 ( 1 ) "*" "*" "*" " "
## 4 ( 1 ) "*" "*" "*" "*"
plot(leap, scale="adjr2")
