Inspecting the data

# Sex 
MBA.df$sex[MBA.df$sex == 1] <- 'Male'
MBA.df$sex[MBA.df$sex == 2] <- 'Female'
MBA.df$sex <- factor(MBA.df$sex)

#First Language
MBA.df$frstlang[MBA.df$frstlang == 1] <- 'English'
MBA.df$frstlang[MBA.df$frstlang == 2] <- 'Other'
MBA.df$frstlang <- factor(MBA.df$frstlang)

# MBAs who got placed and who disclosed their salaries
placed.df <- MBA.df[which (MBA.df$salary > 1000)  , ]
View(placed.df)

# MBAs who were not placed
notPlaced.df <- MBA.df[which(MBA.df$salary==0), ]
View(notPlaced.df)

# MBAs who were placed but did not disclose their salary
notDisclosedSalary.df  <- MBA.df[which (MBA.df$salary == 999)  , ]
View(notDisclosedSalary.df)

# MBAs who did not answer the survey
notAnsweredSurvey.df  <- MBA.df[which (MBA.df$salary == 998)  , ]
View(notAnsweredSurvey.df)

# Let avgSalary = Average Salary of students who were placed and who disclosed their salary
avgSalary = mean(placed.df$salary)
avgSalary

## [1] 103030.7

# Assume avgSalary to be the salary of Students who did not disclose their salary (notDisclosedSalary.df) 
notDisclosedSalary.df$salary = avgSalary

# allPlaced.df:  A dataframe containing all students who were placed. 
# It includes those students who disclosed their salary (placed.df) and those who did not disclose their salary (notDisclosed.df), where the salary of students who did not disclose their salary (notDisclosed.df) is assumed to be the sample average of the students who disclosed their salary (placed.df)
allPlaced.df <- rbind(placed.df, notDisclosedSalary.df)

Summary Statistics

library(psych)
describe(allPlaced.df)[,c(1:5)]

##           vars   n      mean       sd   median
## age          1 138     26.96     3.05     26.0
## sex*         2 138      1.74     0.44      2.0
## gmat_tot     3 138    619.28    53.47    620.0
## gmat_qpc     4 138     81.10    13.59     83.5
## gmat_vpc     5 138     77.99    17.10     81.5
## gmat_tpc     6 138     84.48    13.08     87.0
## s_avg        7 138      3.03     0.38      3.0
## f_avg        8 138      3.06     0.46      3.0
## quarter      9 138      2.43     1.15      2.0
## work_yrs    10 138      3.67     2.75      3.0
## frstlang*   11 138      1.12     0.32      1.0
## salary      12 138 103030.74 15418.25 103030.7
## satis       13 138      5.53     1.11      6.0

View(allPlaced.df)

Review the Distribution of Salary

library(lattice)
histogram(~salary, data = placed.df,
 main = "Distribution of Starting Salary of placed and disclosed", xlab="Starting Salary", col='yellow' )

histogram(~salary, data = allPlaced.df,
 main = "Distribution of Starting Salary of all those who are placed", xlab="Starting Salary", col='green' )

Comparison of salaray with the given variables

aggregate(cbind(salary, work_yrs, age) ~ sex, 
                   data = placed.df, mean)

##      sex    salary work_yrs      age
## 1 Female  98524.39 3.258065 26.06452
## 2   Male 104970.97 3.861111 27.08333

aggregate(cbind(salary, work_yrs, age) ~ sex, 
                   data = allPlaced.df, mean)

##      sex    salary work_yrs      age
## 1 Female  99150.27 3.277778 26.13889
## 2   Male 104400.32 3.803922 27.24510

Comparison of Salary with Work Experience

library(car)
# Scatter plot of Salary with Work Experience

scatterplot(salary ~ work_yrs ,data=placed.df, main="Scatterplot of Salary with Work Experience", xlab="Work Experience", ylab="MBA's Starting Salaries", horizontal=TRUE)

# Box plot distribution of Salary with Work Experience

boxplot(salary ~ work_yrs ,data=placed.df, main="Distribution of Salary with Work Experience", ylab="Work Experience", xlab="MBA's Starting Salaries", horizontal=TRUE)

library(lattice)
histogram(~salary, data = placed.df,
 main = "Frequency of Starting Salary", xlab="Starting Salary", col='grey' )

# Average Salary by Work Experience
salaryWorkEx = aggregate(salary ~ work_yrs, data = placed.df, mean)
salaryWorkEx

##    work_yrs    salary
## 1         0  95000.00
## 2         1 103532.00
## 3         2  97673.68
## 4         3 101652.86
## 5         4 105454.55
## 6         5 103142.86
## 7         6 105928.57
## 8         7  98000.00
## 9         8 105025.00
## 10       10 118000.00
## 11       15 183000.00
## 12       16 108500.00

Comparison of Salary with GMAT total score

scatterplot(salary ~ gmat_tot , data=placed.df, 
    xlab="GMAT Total", ylab="Salary", 
   main="Comparison of Salary with Total GMAT score", 
   labels=row.names(placed.df))

scatterplot(salary ~ gmat_tot | sex, data=placed.df, 
    xlab="GMAT Total", ylab="Salary", 
   main="Comparison of Salary with Total GMAT score", 
   labels=row.names(placed.df))

boxplot(salary ~ gmat_tot , data=placed.df, 
    ylab="GMAT Total", xlab="Salary", 
   main="Comparison of Salary with Total GMAT score", 
   horizontal=TRUE,
   labels=row.names(placed.df))

colnames(placed.df)

##  [1] "age"      "sex"      "gmat_tot" "gmat_qpc" "gmat_vpc" "gmat_tpc"
##  [7] "s_avg"    "f_avg"    "quarter"  "work_yrs" "frstlang" "salary"  
## [13] "satis"

library(car)
scatterplot.matrix(~salary+gmat_tot+s_avg+f_avg, data=placed.df,
    main="Salary versus other variables")

scatterplot.matrix(~salary+gmat_tot+s_avg+f_avg |sex, data=placed.df,
    main="Salary versus other variables")

Number of male and females in dataframe age-wise

ageTable <- table(placed.df$sex, placed.df$age)
ageTable

##         
##          22 23 24 25 26 27 28 29 30 31 32 33 34 39 40
##   Female  1  2  5 10  5  1  3  1  2  0  0  0  0  0  1
##   Male    0  3 11 13  9 13  5  5  4  4  1  1  1  1  1

Effect of Sex on the on Salary

aggregate(cbind(salary, work_yrs, age) ~ sex, 
                   data = MBA.df, mean)

##      sex   salary work_yrs      age
## 1 Female 45121.07 3.808824 27.17647
## 2   Male 37013.62 3.893204 27.41748

Effect of Age on the on Salary

aggregate(cbind(salary, work_yrs) ~ age, data = MBA.df, mean)

##    age    salary  work_yrs
## 1   22  42500.00  1.000000
## 2   23  57282.00  1.750000
## 3   24  49342.24  1.727273
## 4   25  43395.55  2.264151
## 5   26  35982.07  2.875000
## 6   27  31499.37  3.130435
## 7   28  39809.00  4.666667
## 8   29  28067.95  4.500000
## 9   30  55291.25  5.583333
## 10  31  40599.40  5.800000
## 11  32  13662.25  5.625000
## 12  33 118000.00 10.000000
## 13  34  26250.00 11.500000
## 14  35      0.00  9.333333
## 15  36      0.00 12.500000
## 16  37      0.00  9.000000
## 17  39  56000.00 10.500000
## 18  40 183000.00 15.000000
## 19  42      0.00 13.000000
## 20  43      0.00 19.000000
## 21  48      0.00 22.000000

Effect of Satisfaction level on the on Salary

aggregate(cbind(salary, work_yrs) ~ satis , data = MBA.df, mean)

##   satis    salary work_yrs
## 1     1   999.000 3.000000
## 2     2   999.000 2.000000
## 3     3 19799.200 4.200000
## 4     4  6293.412 2.941176
## 5     5 40476.311 4.243243
## 6     6 54383.536 4.185567
## 7     7 65718.152 3.727273
## 8   998   998.000 3.086957

Effect of MBA’s Starting salary based on Work Experience

boxplot(salary ~ work_yrs ,data=MBA.df, main="Effect of Work Experience on Salary", ylab="Work Experience", xlab="MBA's Starting Salaries", horizontal=FALSE)

Effect of MBA’s Starting salary based on Gender

boxplot(salary ~ sex ,data=MBA.df, main="Effect of Gender on Salary", ylab="Work Experience", xlab="MBA's Starting Salaries", horizontal=FALSE)

Distribution of MBA’s Starting Salary

library(lattice)
histogram(~salary, data = MBA.df,
 main = "Distribution of MBA's Starting Salary", xlab="MBA's Starting Slariy", col='grey' )

Merge placed.df ; notDisclosed.df ; notPlaced = knownMBA.df

knownMBA.df <- rbind(placed.df, notDisclosedSalary.df, notPlaced.df)
View(knownMBA.df)

“GotPlaced” = 1 (got a job) or 0 (did not get a job)

knownMBA.df$GotPlaced = (knownMBA.df$salary >1000)
View(knownMBA.df)

knownMBA.df$GotPlaced <- factor(knownMBA.df$GotPlaced)
str(knownMBA.df)

## 'data.frame':    228 obs. of  14 variables:
##  $ age      : int  22 27 25 25 27 28 24 25 25 25 ...
##  $ sex      : Factor w/ 2 levels "Female","Male": 1 1 1 1 2 1 2 1 1 2 ...
##  $ gmat_tot : int  660 700 680 650 710 620 670 560 530 650 ...
##  $ gmat_qpc : int  90 94 87 82 96 52 84 52 50 79 ...
##  $ gmat_vpc : int  92 98 96 91 96 98 96 81 62 93 ...
##  $ gmat_tpc : int  94 98 96 93 98 87 95 72 61 93 ...
##  $ s_avg    : num  3.5 3.3 3.5 3.4 3.3 3.4 3.3 3.3 3.6 3.3 ...
##  $ f_avg    : num  3.75 3.25 2.67 3.25 3.5 3.75 3.25 3.5 3.67 3.5 ...
##  $ quarter  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs : int  1 2 2 3 2 5 0 1 3 1 ...
##  $ frstlang : Factor w/ 2 levels "English","Other": 1 1 1 1 1 1 1 1 1 1 ...
##  $ salary   : num  85000 85000 86000 88000 92000 93000 95000 95000 95000 96000 ...
##  $ satis    : int  5 6 5 7 6 5 4 5 3 7 ...
##  $ GotPlaced: Factor w/ 2 levels "FALSE","TRUE": 2 2 2 2 2 2 2 2 2 2 ...

#GotPlaced = factor(year)
#dummies = model.matrix(~year.f)

Create contingency tables, counting allPlaced / notPlaced versus Sex: Male / Female

Number of Placed and Not Placed candiadtes

allplaced <- table(knownMBA.df$GotPlaced == 'TRUE')
allplaced

## 
## FALSE  TRUE 
##    90   138

allPlaced / notPlaced versus Sex: Male / Female

placedbySex <- xtabs(~ knownMBA.df$GotPlaced + knownMBA.df$sex , data=knownMBA.df)
placedbySex

##                      knownMBA.df$sex
## knownMBA.df$GotPlaced Female Male
##                 FALSE     23   67
##                 TRUE      36  102

addmargins(placedbySex)

##                      knownMBA.df$sex
## knownMBA.df$GotPlaced Female Male Sum
##                 FALSE     23   67  90
##                 TRUE      36  102 138
##                 Sum       59  169 228

Percentage of Male / Female candidates who got Placed

prop.table(placedbySex, 2)

##                      knownMBA.df$sex
## knownMBA.df$GotPlaced    Female      Male
##                 FALSE 0.3898305 0.3964497
##                 TRUE  0.6101695 0.6035503

allPlaced / notPlaced versus First Language: Enlish / Other

placedbyLanguage <- xtabs(~ knownMBA.df$GotPlaced + knownMBA.df$frstlang, data=knownMBA.df)
placedbyLanguage

##                      knownMBA.df$frstlang
## knownMBA.df$GotPlaced English Other
##                 FALSE      82     8
##                 TRUE      122    16

addmargins(placedbyLanguage)

##                      knownMBA.df$frstlang
## knownMBA.df$GotPlaced English Other Sum
##                 FALSE      82     8  90
##                 TRUE      122    16 138
##                 Sum       204    24 228

Percentage of First Language candidates who got Placed

prop.table(placedbyLanguage, 2)

##                      knownMBA.df$frstlang
## knownMBA.df$GotPlaced   English     Other
##                 FALSE 0.4019608 0.3333333
##                 TRUE  0.5980392 0.6666667

H1: The percentage of Females placed is more than Males

Chi Square Test : percentage of female who got placed is higher than percentage of male who got placed

chisq.test(placedbySex)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  placedbySex
## X-squared = 3.5816e-30, df = 1, p-value = 1

H2: The percentage of people placed whose first language is English is higher than the percentage of people placed whose first language is not English

Chi Square Test

chisq.test(placedbyLanguage)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  placedbyLanguage
## X-squared = 0.18479, df = 1, p-value = 0.6673

MODEL SELECTION

library(corrplot)
colnames(placed.df)

##  [1] "age"      "sex"      "gmat_tot" "gmat_qpc" "gmat_vpc" "gmat_tpc"
##  [7] "s_avg"    "f_avg"    "quarter"  "work_yrs" "frstlang" "salary"  
## [13] "satis"

dataColumns <- placed.df[, c("age","work_yrs", "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc", "s_avg", "f_avg", "quarter", "satis")]

N <- cor(dataColumns)
corrplot(N, method="circle")

res <- cor(dataColumns)
round(res, 2)

##            age work_yrs gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## age       1.00     0.88    -0.08    -0.17     0.02    -0.10  0.16 -0.22
## work_yrs  0.88     1.00    -0.12    -0.18    -0.03    -0.13  0.16 -0.22
## gmat_tot -0.08    -0.12     1.00     0.67     0.78     0.97  0.17  0.12
## gmat_qpc -0.17    -0.18     0.67     1.00     0.09     0.66  0.02  0.10
## gmat_vpc  0.02    -0.03     0.78     0.09     1.00     0.78  0.16  0.02
## gmat_tpc -0.10    -0.13     0.97     0.66     0.78     1.00  0.14  0.07
## s_avg     0.16     0.16     0.17     0.02     0.16     0.14  1.00  0.45
## f_avg    -0.22    -0.22     0.12     0.10     0.02     0.07  0.45  1.00
## quarter  -0.13    -0.13    -0.11     0.01    -0.13    -0.10 -0.84 -0.43
## satis     0.11     0.06     0.06     0.00     0.15     0.12 -0.14 -0.12
##          quarter satis
## age        -0.13  0.11
## work_yrs   -0.13  0.06
## gmat_tot   -0.11  0.06
## gmat_qpc    0.01  0.00
## gmat_vpc   -0.13  0.15
## gmat_tpc   -0.10  0.12
## s_avg      -0.84 -0.14
## f_avg      -0.43 -0.12
## quarter     1.00  0.23
## satis       0.23  1.00

# MBA PERFORMANCE
# The variables tracking performance during the MBA are heavily correlated
mbaPerformance <- placed.df[, c("s_avg", "f_avg", "quarter")]

N <- cor(mbaPerformance)
corrplot(N, method="circle")

res <- cor(mbaPerformance)
round(res, 2)

##         s_avg f_avg quarter
## s_avg    1.00  0.45   -0.84
## f_avg    0.45  1.00   -0.43
## quarter -0.84 -0.43    1.00

# The overall performance quartile (quarter) is highly correlated with the Spring (s_avg) and Fall (f_avg) GPA
# We will include 's_avg' and 'f_avg' in our regression, but exclude 'quarter' from our regression.

# Identifying DEPENDENT and INDEPENDENT Variables
# The DEPENDENT variable is "salary"
# Identifying INDEPENDENT Variables:
#1a.  Variables related to GMAT are highly correlated:   "gmat_tot" "gmat_qpc" "gmat_vpc" "gmat_tpc"
#1b.  Variables relatd to MBA performance are highly correlated:    "s_avg"    "f_avg"    "quarter"
#1c.  Variables related to age and work experience are highly correlated: "age", "work_yrs"
#1d.  Other variables: "sex"   , "frstlang" ;   "satis"   
# 1e. GMAT 
# The GMAT related columns are heavily strongly correlated with each other
gmat <- placed.df[, c("gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc")]
res <- cor(gmat)
round(res, 2)

##          gmat_tot gmat_qpc gmat_vpc gmat_tpc
## gmat_tot     1.00     0.67     0.78     0.97
## gmat_qpc     0.67     1.00     0.09     0.66
## gmat_vpc     0.78     0.09     1.00     0.78
## gmat_tpc     0.97     0.66     0.78     1.00

library(corrplot)
M <- cor(gmat)
corrplot(M, method="circle")

# However, GMAT verbal and quantitative scores are very weakly correlated
cor(placed.df$gmat_qpc, placed.df$gmat_vpc)

## [1] 0.09466541

# Therefore, in our regression we will include gmat_qpc and gmat_vpc ,  but exclude "gmat_tot" and "gmat_tpc"

# 1f. MBA PERFORMANCE
# The variables tracking performance during the MBA are heavily correlated
mbaPerformance <- placed.df[, c("s_avg", "f_avg", "quarter")]

N <- cor(mbaPerformance)
corrplot(N, method="circle")

res <- cor(mbaPerformance)
round(res, 2)

##         s_avg f_avg quarter
## s_avg    1.00  0.45   -0.84
## f_avg    0.45  1.00   -0.43
## quarter -0.84 -0.43    1.00

# The overall performance quartile (quarter) is highly correlated with the Spring (s_avg) and Fall (f_avg) GPA
# We will include 's_avg' and 'f_avg' in our regression, but exclude 'quarter' from our regression.

# 1g. WORK EXPERIENCE AND AGE
# The variables 'age' and 'work_years' are highly correlated. The older the person, the greater the work experience.
cor(placed.df$age,placed.df$work_yrs)

## [1] 0.8805247

# Therefore we will include 'work_years' in our regression, but exclude 'age' from our regression

# SUMMARY OF MODEL SELECTION
# Given the above discussion, the indepedent variables we will include in the regression are {work_yrs,s_avg,f_avg,gmat_qpc,gmat_vpc,sex,frstlang,satis}

VARIANCE - COVARIANCE MATRIX

columns = c("salary", "work_yrs", "gmat_qpc", "gmat_vpc", "s_avg", "f_avg", "satis")
placedVariables <- placed.df[, columns]
res <- cor(placedVariables)
round(res, 2)

##          salary work_yrs gmat_qpc gmat_vpc s_avg f_avg satis
## salary     1.00     0.45     0.01    -0.14  0.10 -0.11 -0.04
## work_yrs   0.45     1.00    -0.18    -0.03  0.16 -0.22  0.06
## gmat_qpc   0.01    -0.18     1.00     0.09  0.02  0.10  0.00
## gmat_vpc  -0.14    -0.03     0.09     1.00  0.16  0.02  0.15
## s_avg      0.10     0.16     0.02     0.16  1.00  0.45 -0.14
## f_avg     -0.11    -0.22     0.10     0.02  0.45  1.00 -0.12
## satis     -0.04     0.06     0.00     0.15 -0.14 -0.12  1.00

library(corrplot)
M <- cor(placed.df[, columns])
corrplot(M, method="circle")

SCATTER PLOTS

library(car)
scatterplotMatrix(~salary + s_avg + f_avg + satis, data=placed.df,
    main="Salary versus MBA Performance and MBA Satisfaction")

library(car)
scatterplotMatrix(~salary + work_yrs + gmat_qpc + gmat_vpc, data=placed.df,
    main="Salary versus Work Experience; GMAT Performance")

REGRESSION

Formulating multivariate linear regression model to fit salary with respect to the model selection

Independent Variables: {work_yrs,s_avg,f_avg,gmat_qpc,gmat_vpc,sex,frstlang,satis}

Dependent Variable: Salary

Model1 <- salary ~ 
             work_yrs + s_avg + f_avg + gmat_qpc + gmat_vpc + sex + frstlang + satis 
fit1 <- lm(Model1, data = placed.df)
summary(fit1)

## 
## Call:
## lm(formula = Model1, data = placed.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -29800  -7822  -1742   4869  82341 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   90136.22   21739.22   4.146  7.4e-05 ***
## work_yrs       2331.12     585.99   3.978 0.000137 ***
## s_avg          4659.05    5015.66   0.929 0.355320    
## f_avg         -1698.83    3834.70  -0.443 0.658773    
## gmat_qpc         98.72     121.85   0.810 0.419884    
## gmat_vpc        -95.80     102.99  -0.930 0.354699    
## sexMale        5289.24    3545.91   1.492 0.139140    
## frstlangOther 13994.76    6641.66   2.107 0.037770 *  
## satis         -1671.20    2070.62  -0.807 0.421643    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15740 on 94 degrees of freedom
## Multiple R-squared:  0.285,  Adjusted R-squared:  0.2241 
## F-statistic: 4.683 on 8 and 94 DF,  p-value: 7.574e-05

library(leaps)
leap <- regsubsets(Model1, data = placed.df, nbest=1)
summary(leap)

## Subset selection object
## Call: regsubsets.formula(Model1, data = placed.df, nbest = 1)
## 8 Variables  (and intercept)
##               Forced in Forced out
## work_yrs          FALSE      FALSE
## s_avg             FALSE      FALSE
## f_avg             FALSE      FALSE
## gmat_qpc          FALSE      FALSE
## gmat_vpc          FALSE      FALSE
## sexMale           FALSE      FALSE
## frstlangOther     FALSE      FALSE
## satis             FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          work_yrs s_avg f_avg gmat_qpc gmat_vpc sexMale frstlangOther
## 1  ( 1 ) "*"      " "   " "   " "      " "      " "     " "          
## 2  ( 1 ) "*"      " "   " "   " "      " "      " "     "*"          
## 3  ( 1 ) "*"      " "   " "   " "      " "      "*"     "*"          
## 4  ( 1 ) "*"      " "   " "   " "      " "      "*"     "*"          
## 5  ( 1 ) "*"      "*"   " "   " "      " "      "*"     "*"          
## 6  ( 1 ) "*"      "*"   " "   " "      "*"      "*"     "*"          
## 7  ( 1 ) "*"      "*"   " "   "*"      "*"      "*"     "*"          
## 8  ( 1 ) "*"      "*"   "*"   "*"      "*"      "*"     "*"          
##          satis
## 1  ( 1 ) " "  
## 2  ( 1 ) " "  
## 3  ( 1 ) " "  
## 4  ( 1 ) "*"  
## 5  ( 1 ) "*"  
## 6  ( 1 ) "*"  
## 7  ( 1 ) "*"  
## 8  ( 1 ) "*"

plot(leap, scale="adjr2")

Model2 <- salary ~ 
             work_yrs + 
             # age +
             # s_avg +
             # f_avg +
             # quarter +
             # gmat_qpc +
             # gmat_vpc +
             # gmat_tot +
             # gmat_tpc +
             sex +
             frstlang +
             satis 
fit2 <- lm(Model2, data = placed.df)
summary(fit2)

## 
## Call:
## lm(formula = Model2, data = placed.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -30492  -8055  -1744   5362  80436 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   102214.0    11827.8   8.642 1.06e-13 ***
## work_yrs        2409.4      526.1   4.579 1.37e-05 ***
## sexMale         5949.5     3392.2   1.754   0.0826 .  
## frstlangOther  14675.7     6274.0   2.339   0.0214 *  
## satis          -2244.4     1988.4  -1.129   0.2618    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15580 on 98 degrees of freedom
## Multiple R-squared:  0.2695, Adjusted R-squared:  0.2397 
## F-statistic: 9.038 on 4 and 98 DF,  p-value: 2.953e-06

library(leaps)
leap <- regsubsets(Model2, data = placed.df, nbest=1)
summary(leap)

## Subset selection object
## Call: regsubsets.formula(Model2, data = placed.df, nbest = 1)
## 4 Variables  (and intercept)
##               Forced in Forced out
## work_yrs          FALSE      FALSE
## sexMale           FALSE      FALSE
## frstlangOther     FALSE      FALSE
## satis             FALSE      FALSE
## 1 subsets of each size up to 4
## Selection Algorithm: exhaustive
##          work_yrs sexMale frstlangOther satis
## 1  ( 1 ) "*"      " "     " "           " "  
## 2  ( 1 ) "*"      " "     "*"           " "  
## 3  ( 1 ) "*"      "*"     "*"           " "  
## 4  ( 1 ) "*"      "*"     "*"           "*"

plot(leap, scale="adjr2")

Analysis of MBA Starting Salaries

Dheepika C

17 February 2018

Inspecting the data

Summary Statistics

Review the Distribution of Salary

Comparison of salaray with the given variables

Comparison of Salary with Work Experience

Comparison of Salary with GMAT total score

Number of male and females in dataframe age-wise

Effect of Sex on the on Salary

Effect of Age on the on Salary

Effect of Satisfaction level on the on Salary

Effect of MBA’s Starting salary based on Work Experience

Effect of MBA’s Starting salary based on Gender

Distribution of MBA’s Starting Salary

Merge placed.df ; notDisclosed.df ; notPlaced = knownMBA.df

“GotPlaced” = 1 (got a job) or 0 (did not get a job)

Create contingency tables, counting allPlaced / notPlaced versus Sex: Male / Female

Number of Placed and Not Placed candiadtes

allPlaced / notPlaced versus Sex: Male / Female

Percentage of Male / Female candidates who got Placed

allPlaced / notPlaced versus First Language: Enlish / Other

Percentage of First Language candidates who got Placed

H1: The percentage of Females placed is more than Males

Chi Square Test : percentage of female who got placed is higher than percentage of male who got placed

H2: The percentage of people placed whose first language is English is higher than the percentage of people placed whose first language is not English

Chi Square Test

MODEL SELECTION

VARIANCE - COVARIANCE MATRIX

SCATTER PLOTS

REGRESSION

Formulating multivariate linear regression model to fit salary with respect to the model selection

Independent Variables: {work_yrs,s_avg,f_avg,gmat_qpc,gmat_vpc,sex,frstlang,satis}

Dependent Variable: Salary