Reading the raw data into a dataframe

# Load the raw survey data from the working directory and report its
# dimensions (rows x columns).
MBA.df <- read.csv("MBA Starting Salaries Data.csv")
# attach() copies the columns onto the search path as they are right now;
# later modifications of MBA.df are NOT reflected in the attached copy.
attach(MBA.df)
dim(MBA.df)

## [1] 274  13

Generate Summary Statistics

# psych::describe() summarises every column; keep only the first five
# summary columns (vars, n, mean, sd, median).
library(psych)
describe(MBA.df)[, 1:5]

##          vars   n     mean       sd median
## age         1 274    27.36     3.71     27
## sex         2 274     1.25     0.43      1
## gmat_tot    3 274   619.45    57.54    620
## gmat_qpc    4 274    80.64    14.87     83
## gmat_vpc    5 274    78.32    16.86     81
## gmat_tpc    6 274    84.20    14.02     87
## s_avg       7 274     3.03     0.38      3
## f_avg       8 274     3.06     0.53      3
## quarter     9 274     2.48     1.11      2
## work_yrs   10 274     3.87     3.23      3
## frstlang   11 274     1.12     0.32      1
## salary     12 274 39025.69 50951.56    999
## satis      13 274   172.18   371.61      6

There are lots of missing values and special cases in the salary data above

Inspect the datatypes. Convert the data type of some columns

Sex and First Language

# Show the storage type of every column before any recoding.
str(MBA.df)

## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : int  2 1 1 1 2 1 1 2 1 1 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: int  1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : int  7 6 6 7 5 6 5 6 4 998 ...

# Recode the integer-coded categorical columns as labelled factors:
#   sex:      1 = Male,    2 = Female
#   frstlang: 1 = English, 2 = Other
#
# The mapping is passed to factor() explicitly instead of overwriting the
# integer codes with strings (which silently coerces the column to character).
# Levels are listed so the level order stays "Female", "Male" and
# "English", "Other"; any code other than 1 or 2 becomes NA rather than a
# stray level.
MBA.df$sex <- factor(MBA.df$sex,
                     levels = c(2, 1),
                     labels = c("Female", "Male"))

MBA.df$frstlang <- factor(MBA.df$frstlang,
                          levels = c(1, 2),
                          labels = c("English", "Other"))

# Verify the data types: sex and frstlang should now be Factor variables.
str(MBA.df)

## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 2 2 1 2 2 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: Factor w/ 2 levels "English","Other": 1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : int  7 6 6 7 5 6 5 6 4 998 ...

# Split the data into four data frames, because the "salary" column mixes
# real salaries with special codes:
#
#   999    : answered the survey but did not disclose salary
#   998    : did not answer the survey
#   0      : not yet placed
#   > 1000 : answered the survey, got placed and disclosed salary
#
# subset() keeps only rows where the condition is TRUE (rows where it is NA
# are dropped) and preserves the original row names.

# MBAs who got placed and who disclosed their salaries
placed.df <- subset(MBA.df, salary > 1000)
View(placed.df)

# MBAs who were not placed
notPlaced.df <- subset(MBA.df, salary == 0)
View(notPlaced.df)

# MBAs who were placed but did not disclose their salary
notDisclosedSalary.df <- subset(MBA.df, salary == 999)
View(notDisclosedSalary.df)

# MBAs who did not answer the survey
notAnsweredSurvey.df <- subset(MBA.df, salary == 998)
View(notAnsweredSurvey.df)

# avgSalary: mean salary of the students who were placed AND disclosed
# their salary (placed.df).
avgSalary <- mean(placed.df[["salary"]])
avgSalary

## [1] 103030.7

# Impute the undisclosed salaries: every student in notDisclosedSalary.df is
# assigned avgSalary (the scalar is recycled over all rows).
notDisclosedSalary.df[["salary"]] <- avgSalary


# allPlaced.df: all students who were placed, i.e. those who disclosed their
# salary (placed.df) stacked on top of those who did not
# (notDisclosedSalary.df, now carrying the imputed sample average).
allPlaced.df <- rbind(placed.df, notDisclosedSalary.df)

Summary Statistics of allPlaced.df

# Summary statistics (vars, n, mean, sd, median) for all placed students,
# including those with an imputed salary.
library(psych)
describe(allPlaced.df)[, 1:5]

##           vars   n      mean       sd   median
## age          1 138     26.96     3.05     26.0
## sex*         2 138      1.74     0.44      2.0
## gmat_tot     3 138    619.28    53.47    620.0
## gmat_qpc     4 138     81.10    13.59     83.5
## gmat_vpc     5 138     77.99    17.10     81.5
## gmat_tpc     6 138     84.48    13.08     87.0
## s_avg        7 138      3.03     0.38      3.0
## f_avg        8 138      3.06     0.46      3.0
## quarter      9 138      2.43     1.15      2.0
## work_yrs    10 138      3.67     2.75      3.0
## frstlang*   11 138      1.12     0.32      1.0
## salary      12 138 103030.74 15418.25 103030.7
## satis       13 138      5.53     1.11      6.0

# Open the combined data frame in the data viewer for manual inspection.
View(allPlaced.df)

Review the Distribution of Salary

library(lattice)

# Disclosed salaries only.
histogram(~ salary,
          data = placed.df,
          col  = "grey",
          xlab = "Starting Salary",
          main = "Distribution of Starting Salary")

# Disclosed salaries plus the mean-imputed undisclosed ones.
histogram(~ salary,
          data = allPlaced.df,
          col  = "grey",
          xlab = "Starting Salary",
          main = "Distribution of Starting Salary")

Comparison of Salary with sex and the other given variables

# Mean salary, work experience and age per sex (disclosed salaries only).
aggregate(cbind(salary, work_yrs, age) ~ sex, data = placed.df, FUN = mean)

##      sex    salary work_yrs      age
## 1 Female  98524.39 3.258065 26.06452
## 2   Male 104970.97 3.861111 27.08333

# Same comparison on allPlaced.df (includes the mean-imputed salaries).
aggregate(cbind(salary, work_yrs, age) ~ sex, data = allPlaced.df, FUN = mean)

##      sex    salary work_yrs      age
## 1 Female  99150.27 3.277778 26.13889
## 2   Male 104400.32 3.803922 27.24510

Comparison of Salary with Work Experience

# Scatter plot of Salary with Work Experience.
# `horizontal` is a boxplot() option, not a scatterplot() argument, so it is
# not passed here.
library(car)
scatterplot(salary ~ work_yrs, data = placed.df,
            main = "Scatterplot of Salary with Work Experience",
            xlab = "Work Experience",
            ylab = "MBA's Starting Salaries")

# Distribution of Salary with Work Experience.
# horizontal = TRUE draws salary along the x-axis and one box per work_yrs
# value along the y-axis, hence the xlab/ylab assignment below.
boxplot(salary ~ work_yrs, data = placed.df,
        main = "Distribution of Salary with Work Experience",
        ylab = "Work Experience",
        xlab = "MBA's Starting Salaries",
        horizontal = TRUE)

library(lattice)
histogram(~ salary, data = placed.df,
          main = "Frequency of Starting Salary",
          xlab = "Starting Salary", col = "grey")

# Average Salary by Work Experience
salaryWorkEx <- aggregate(salary ~ work_yrs, data = placed.df, FUN = mean)
salaryWorkEx

##    work_yrs    salary
## 1         0  95000.00
## 2         1 103532.00
## 3         2  97673.68
## 4         3 101652.86
## 5         4 105454.55
## 6         5 103142.86
## 7         6 105928.57
## 8         7  98000.00
## 9         8 105025.00
## 10       10 118000.00
## 11       15 183000.00
## 12       16 108500.00

Comparison of Salary with GMAT total score

# Salary against total GMAT score, points labelled by row name.
scatterplot(salary ~ gmat_tot, data = placed.df,
            xlab = "GMAT Total", ylab = "Salary",
            main = "Comparison of Salary with Total GMAT score",
            labels = row.names(placed.df))

# Same plot, with separate groups per sex.
scatterplot(salary ~ gmat_tot | sex, data = placed.df,
            xlab = "GMAT Total", ylab = "Salary",
            main = "Comparison of Salary with Total GMAT score",
            labels = row.names(placed.df))

# Horizontal boxplot: salary on the x-axis, one box per GMAT total on the
# y-axis. `labels` is not a boxplot() argument, so it is not passed here.
boxplot(salary ~ gmat_tot, data = placed.df,
        ylab = "GMAT Total", xlab = "Salary",
        main = "Comparison of Salary with Total GMAT score",
        horizontal = TRUE)

# Pairwise scatterplots of salary against GMAT total and the two GPA columns.
# scatterplotMatrix() is the current car name (the dotted
# scatterplot.matrix alias is deprecated), matching its use elsewhere in
# this file.
library(car)
scatterplotMatrix(~ salary + gmat_tot + s_avg + f_avg, data = placed.df,
                  main = "Salary versus other variables")

# Same matrix, grouped by sex.
scatterplotMatrix(~ salary + gmat_tot + s_avg + f_avg | sex, data = placed.df,
                  main = "Salary versus other variables")

VISUALIZATION

Number of male and females in dataframe age-wise

# Cross-tabulate sex (rows) against age (columns) for students with a
# disclosed salary.
ageTable <- table(placed.df[["sex"]], placed.df[["age"]])
ageTable

##         
##          22 23 24 25 26 27 28 29 30 31 32 33 34 39 40
##   Female  1  2  5 10  5  1  3  1  2  0  0  0  0  0  1
##   Male    0  3 11 13  9 13  5  5  4  4  1  1  1  1  1

Effect of Sex on the Salary

# Mean salary, work experience and age per sex over the FULL data set.
# NOTE(review): MBA.df$salary still contains the special codes 0, 998 and
# 999, so these "mean salaries" mix real salaries with codes - confirm this
# is intended, or use placed.df instead.
aggregate(cbind(salary, work_yrs, age) ~ sex, data = MBA.df, FUN = mean)

##      sex   salary work_yrs      age
## 1 Female 45121.07 3.808824 27.17647
## 2   Male 37013.62 3.893204 27.41748

Effect of Age on the Salary

# Mean salary and work experience per age over the FULL data set.
# NOTE(review): salary here still includes the special codes 0 / 998 / 999,
# which distorts the means - confirm this is intended.
aggregate(cbind(salary, work_yrs) ~ age,
          data = MBA.df,
          FUN = mean)

##    age    salary  work_yrs
## 1   22  42500.00  1.000000
## 2   23  57282.00  1.750000
## 3   24  49342.24  1.727273
## 4   25  43395.55  2.264151
## 5   26  35982.07  2.875000
## 6   27  31499.37  3.130435
## 7   28  39809.00  4.666667
## 8   29  28067.95  4.500000
## 9   30  55291.25  5.583333
## 10  31  40599.40  5.800000
## 11  32  13662.25  5.625000
## 12  33 118000.00 10.000000
## 13  34  26250.00 11.500000
## 14  35      0.00  9.333333
## 15  36      0.00 12.500000
## 16  37      0.00  9.000000
## 17  39  56000.00 10.500000
## 18  40 183000.00 15.000000
## 19  42      0.00 13.000000
## 20  43      0.00 19.000000
## 21  48      0.00 22.000000

Effect of Satisfaction level on the Salary

# Mean salary and work experience per satisfaction level, FULL data set.
# NOTE(review): both salary (0 / 998 / 999) and satis (998) still contain
# special codes here, so several rows of the result are codes rather than
# real values - confirm this is intended.
aggregate(cbind(salary, work_yrs) ~ satis,
          data = MBA.df,
          FUN = mean)

##   satis    salary work_yrs
## 1     1   999.000 3.000000
## 2     2   999.000 2.000000
## 3     3 19799.200 4.200000
## 4     4  6293.412 2.941176
## 5     5 40476.311 4.243243
## 6     6 54383.536 4.185567
## 7     7 65718.152 3.727273
## 8   998   998.000 3.086957

Effect of MBA’s Starting salary based on Work Experience

# Vertical boxplot (horizontal = FALSE): one box per work_yrs value along the
# x-axis, salary on the y-axis - so xlab names work experience and ylab names
# salary (the labels were previously swapped).
# NOTE(review): MBA.df$salary still contains the special codes 0 / 998 / 999.
boxplot(salary ~ work_yrs, data = MBA.df,
        main = "Effect of Work Experience on Salary",
        xlab = "Work Experience",
        ylab = "MBA's Starting Salaries",
        horizontal = FALSE)

Effect of MBA’s Starting salary based on Gender

# Vertical boxplot (horizontal = FALSE): one box per sex along the x-axis,
# salary on the y-axis. The axis labels now describe what is plotted
# (previously the y-axis was labelled "Work Experience" and the x-axis
# carried the salary label).
# NOTE(review): MBA.df$salary still contains the special codes 0 / 998 / 999.
boxplot(salary ~ sex, data = MBA.df,
        main = "Effect of Gender on Salary",
        xlab = "Gender",
        ylab = "MBA's Starting Salaries",
        horizontal = FALSE)

Distribution of MBA’s Starting Salary

# Histogram of the raw salary column for every respondent.
# NOTE(review): includes the special codes 0 / 998 / 999 - confirm intended.
library(lattice)
histogram(~ salary,
          data = MBA.df,
          col  = "grey",
          xlab = "MBA's Starting Salary",
          main = "Distribution of MBA's Starting Salary")

Merge placed.df ; notDisclosed.df ; notPlaced = knownMBA.df

# Stack the three groups whose placement status is known: disclosed salary,
# undisclosed salary, and not placed (in that row order).
knownMBA.df <- do.call(
  rbind,
  list(placed.df, notDisclosedSalary.df, notPlaced.df)
)
View(knownMBA.df)

Create a dummy variable called “GotPlaced” = 1 (got a job) or 0 (did not get a job)

# GotPlaced is TRUE when the salary value exceeds 1000 and FALSE otherwise.
knownMBA.df[["GotPlaced"]] <- knownMBA.df[["salary"]] > 1000
View(knownMBA.df)

# Store the flag as a factor with levels "FALSE" / "TRUE", then re-inspect.
knownMBA.df[["GotPlaced"]] <- factor(knownMBA.df[["GotPlaced"]])
str(knownMBA.df)

## 'data.frame':    228 obs. of  14 variables:
##  $ age      : int  22 27 25 25 27 28 24 25 25 25 ...
##  $ sex      : Factor w/ 2 levels "Female","Male": 1 1 1 1 2 1 2 1 1 2 ...
##  $ gmat_tot : int  660 700 680 650 710 620 670 560 530 650 ...
##  $ gmat_qpc : int  90 94 87 82 96 52 84 52 50 79 ...
##  $ gmat_vpc : int  92 98 96 91 96 98 96 81 62 93 ...
##  $ gmat_tpc : int  94 98 96 93 98 87 95 72 61 93 ...
##  $ s_avg    : num  3.5 3.3 3.5 3.4 3.3 3.4 3.3 3.3 3.6 3.3 ...
##  $ f_avg    : num  3.75 3.25 2.67 3.25 3.5 3.75 3.25 3.5 3.67 3.5 ...
##  $ quarter  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs : int  1 2 2 3 2 5 0 1 3 1 ...
##  $ frstlang : Factor w/ 2 levels "English","Other": 1 1 1 1 1 1 1 1 1 1 ...
##  $ salary   : num  85000 85000 86000 88000 92000 93000 95000 95000 95000 96000 ...
##  $ satis    : int  5 6 5 7 6 5 4 5 3 7 ...
##  $ GotPlaced: Factor w/ 2 levels "FALSE","TRUE": 2 2 2 2 2 2 2 2 2 2 ...

#GotPlaced = factor(year)
#dummies = model.matrix(~year.f)

Create contingency tables, counting allPlaced / notPlaced versus Sex: Male / Female

Number of Placed and Not Placed candidates

# Count how many rows have GotPlaced equal to "TRUE" versus not.
allplaced <- table(knownMBA.df[["GotPlaced"]] == "TRUE")
allplaced

## 
## FALSE  TRUE 
##    90   138

allPlaced / notPlaced versus Sex: Male / Female

# 2 x 2 contingency table: GotPlaced (rows) by sex (columns).
# The fully qualified column names in the formula become the dimension
# labels of the printed table.
placedbySex <- xtabs(~ knownMBA.df$GotPlaced + knownMBA.df$sex , data=knownMBA.df)
placedbySex

##                      knownMBA.df$sex
## knownMBA.df$GotPlaced Female Male
##                 FALSE     23   67
##                 TRUE      36  102

# Append row and column totals ("Sum") to the contingency table.
addmargins(placedbySex)

##                      knownMBA.df$sex
## knownMBA.df$GotPlaced Female Male Sum
##                 FALSE     23   67  90
##                 TRUE      36  102 138
##                 Sum       59  169 228

Percentage of Male / Female candidates who got Placed

# Column proportions (margin = 2): within each sex, the share placed and
# not placed; each column sums to 1.
prop.table(placedbySex, 2)

##                      knownMBA.df$sex
## knownMBA.df$GotPlaced    Female      Male
##                 FALSE 0.3898305 0.3964497
##                 TRUE  0.6101695 0.6035503

allPlaced / notPlaced versus First Language: English / Other

# 2 x 2 contingency table: GotPlaced (rows) by first language (columns).
placedbyLanguage <- xtabs(~ knownMBA.df$GotPlaced + knownMBA.df$frstlang, data=knownMBA.df)
placedbyLanguage

##                      knownMBA.df$frstlang
## knownMBA.df$GotPlaced English Other
##                 FALSE      82     8
##                 TRUE      122    16

# Append row and column totals ("Sum") to the contingency table.
addmargins(placedbyLanguage)

##                      knownMBA.df$frstlang
## knownMBA.df$GotPlaced English Other Sum
##                 FALSE      82     8  90
##                 TRUE      122    16 138
##                 Sum       204    24 228

Percentage of First Language candidates who got Placed

# Column proportions (margin = 2): within each first-language group, the
# share placed and not placed; each column sums to 1.
prop.table(placedbyLanguage, 2)

##                      knownMBA.df$frstlang
## knownMBA.df$GotPlaced   English     Other
##                 FALSE 0.4019608 0.3333333
##                 TRUE  0.5980392 0.6666667

H1: The percentage of Females placed is more than Males

Chi Square Test : percentage of female who got placed is higher than percentage of male who got placed

# Pearson chi-squared test (with Yates' continuity correction for this
# 2 x 2 table) of whether placement is independent of sex.
# A p-value above 0.05 means the null hypothesis of independence is NOT
# rejected.
chisq.test(placedbySex)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  placedbySex
## X-squared = 3.5816e-30, df = 1, p-value = 1

Since the p-value (1) is greater than 0.05, we fail to reject the null hypothesis that the percentage of females who got placed and the percentage of males who got placed are equal. So, from the Chi-square test, there is no evidence to support hypothesis H1.

H2: The percentage of people placed whose first language is English is higher than the percentage of people placed whose first language is not English

Chi Square Test

# Pearson chi-squared test (with Yates' continuity correction for this
# 2 x 2 table) of whether placement is independent of first language.
# A p-value above 0.05 means the null hypothesis of independence is NOT
# rejected.
chisq.test(placedbyLanguage)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  placedbyLanguage
## X-squared = 0.18479, df = 1, p-value = 0.6673

Since the p-value (0.6673) is greater than 0.05, we fail to reject the null hypothesis that the percentage of people who got placed whose first language is English and the percentage of people who got placed whose first language is not English are equal. So, from the Chi-square test, there is no evidence to support hypothesis H2.

MODEL SELECTION

# corrplot is used for the correlation plots below.
library(corrplot)

# List the columns available in placed.df.
names(placed.df)

##  [1] "age"      "sex"      "gmat_tot" "gmat_qpc" "gmat_vpc" "gmat_tpc"
##  [7] "s_avg"    "f_avg"    "quarter"  "work_yrs" "frstlang" "salary"  
## [13] "satis"

# Numeric columns of placed.df to be checked for pairwise correlation.
dataColumns <- placed.df[c("age", "work_yrs",
                           "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
                           "s_avg", "f_avg", "quarter", "satis")]

# Compute the correlation matrix once; plot it, then print it rounded.
N <- cor(dataColumns)
corrplot(N, method = "circle")

res <- N
round(res, 2)

##            age work_yrs gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## age       1.00     0.88    -0.08    -0.17     0.02    -0.10  0.16 -0.22
## work_yrs  0.88     1.00    -0.12    -0.18    -0.03    -0.13  0.16 -0.22
## gmat_tot -0.08    -0.12     1.00     0.67     0.78     0.97  0.17  0.12
## gmat_qpc -0.17    -0.18     0.67     1.00     0.09     0.66  0.02  0.10
## gmat_vpc  0.02    -0.03     0.78     0.09     1.00     0.78  0.16  0.02
## gmat_tpc -0.10    -0.13     0.97     0.66     0.78     1.00  0.14  0.07
## s_avg     0.16     0.16     0.17     0.02     0.16     0.14  1.00  0.45
## f_avg    -0.22    -0.22     0.12     0.10     0.02     0.07  0.45  1.00
## quarter  -0.13    -0.13    -0.11     0.01    -0.13    -0.10 -0.84 -0.43
## satis     0.11     0.06     0.06     0.00     0.15     0.12 -0.14 -0.12
##          quarter satis
## age        -0.13  0.11
## work_yrs   -0.13  0.06
## gmat_tot   -0.11  0.06
## gmat_qpc    0.01  0.00
## gmat_vpc   -0.13  0.15
## gmat_tpc   -0.10  0.12
## s_avg      -0.84 -0.14
## f_avg      -0.43 -0.12
## quarter     1.00  0.23
## satis       0.23  1.00

# MBA PERFORMANCE
# Correlations among the three columns tracking performance during the MBA
# (s_avg, f_avg, quarter).
mbaPerformance <- placed.df[c("s_avg", "f_avg", "quarter")]

# Compute the correlation matrix once; plot it, then print it rounded.
N <- cor(mbaPerformance)
corrplot(N, method = "circle")

res <- N
round(res, 2)

##         s_avg f_avg quarter
## s_avg    1.00  0.45   -0.84
## f_avg    0.45  1.00   -0.43
## quarter -0.84 -0.43    1.00

# The overall performance quartile (quarter) is highly correlated with the Spring (s_avg) and Fall (f_avg) GPA
# We will include 's_avg' and 'f_avg' in our regression, but exclude 'quarter' from our regression.

# Identifying DEPENDENT and INDEPENDENT Variables

# The DEPENDENT variable is "salary"
# Identifying INDEPENDENT Variables:
#1a.  Variables related to GMAT are highly correlated:   "gmat_tot" "gmat_qpc" "gmat_vpc" "gmat_tpc"
#1b.  Variables related to MBA performance are highly correlated:    "s_avg"    "f_avg"    "quarter"
#1c.  Variables related to age and work experience are highly correlated: "age", "work_yrs"
#1d.  Other variables: "sex"   , "frstlang" ;   "satis"   

# 1e. GMAT
# Correlations among the four GMAT-related columns of placed.df.
gmat <- placed.df[c("gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc")]
res <- cor(gmat)
round(res, 2)

##          gmat_tot gmat_qpc gmat_vpc gmat_tpc
## gmat_tot     1.00     0.67     0.78     0.97
## gmat_qpc     0.67     1.00     0.09     0.66
## gmat_vpc     0.78     0.09     1.00     0.78
## gmat_tpc     0.97     0.66     0.78     1.00

library(corrplot)
M <- cor(gmat)
corrplot(M, method = "circle")

# However, GMAT verbal and quantitative scores are very weakly correlated.
# The columns are referenced through MBA.df explicitly (all respondents)
# instead of relying on bare names placed on the search path by attach().
cor(MBA.df$gmat_qpc, MBA.df$gmat_vpc)

## [1] 0.1521801

# Therefore, in our regression we will include gmat_qpc and gmat_vpc ,  but exclude "gmat_tot" and "gmat_tpc"

# 1f. MBA PERFORMANCE
# Correlations among the three columns tracking performance during the MBA
# (s_avg, f_avg, quarter).
mbaPerformance <- placed.df[c("s_avg", "f_avg", "quarter")]

# Compute the correlation matrix once; plot it, then print it rounded.
N <- cor(mbaPerformance)
corrplot(N, method = "circle")

res <- N
round(res, 2)

##         s_avg f_avg quarter
## s_avg    1.00  0.45   -0.84
## f_avg    0.45  1.00   -0.43
## quarter -0.84 -0.43    1.00

# The overall performance quartile (quarter) is highly correlated with the Spring (s_avg) and Fall (f_avg) GPA
# We will include 's_avg' and 'f_avg' in our regression, but exclude 'quarter' from our regression.

# 1g. WORK EXPERIENCE AND AGE
# The variables 'age' and 'work_yrs' are highly correlated: the older the
# person, the greater the work experience.
# The columns are referenced through MBA.df explicitly (all respondents)
# instead of relying on bare names placed on the search path by attach().
cor(MBA.df$age, MBA.df$work_yrs)

## [1] 0.8582981

# Therefore we will include 'work_yrs' in our regression, but exclude 'age' from our regression

# SUMMARY OF MODEL SELECTION
# Given the above discussion, the independent variables we will include in the regression are {work_yrs,s_avg,f_avg,gmat_qpc,gmat_vpc,sex,frstlang,satis}

CORRELATION MATRIX

# Salary plus the numeric predictors kept after model selection.
columns <- c("salary", "work_yrs", "gmat_qpc", "gmat_vpc",
             "s_avg", "f_avg", "satis")
placedVariables <- placed.df[columns]

# Pairwise correlations, printed to two decimals.
res <- cor(placedVariables)
round(res, 2)

##          salary work_yrs gmat_qpc gmat_vpc s_avg f_avg satis
## salary     1.00     0.45     0.01    -0.14  0.10 -0.11 -0.04
## work_yrs   0.45     1.00    -0.18    -0.03  0.16 -0.22  0.06
## gmat_qpc   0.01    -0.18     1.00     0.09  0.02  0.10  0.00
## gmat_vpc  -0.14    -0.03     0.09     1.00  0.16  0.02  0.15
## s_avg      0.10     0.16     0.02     0.16  1.00  0.45 -0.14
## f_avg     -0.11    -0.22     0.10     0.02  0.45  1.00 -0.12
## satis     -0.04     0.06     0.00     0.15 -0.14 -0.12  1.00

# Plot the correlation matrix of the selected columns of placed.df.
library(corrplot)
M <- cor(placed.df[columns])
corrplot(M, method = "circle")

SCATTER PLOTS

library(car)

# Salary against the GPA columns and satisfaction.
scatterplotMatrix(
  ~ salary + s_avg + f_avg + satis,
  data = placed.df,
  main = "Salary versus MBA Performance and MBA Satisfaction"
)

# Salary against work experience and the two GMAT sub-scores.
scatterplotMatrix(
  ~ salary + work_yrs + gmat_qpc + gmat_vpc,
  data = placed.df,
  main = "Salary versus Work Experience; GMAT Performance"
)

REGRESSION

Formulating multivariate linear regression model to fit salary with respect to the model selection

Independent Variables: {work_yrs,s_avg,f_avg,gmat_qpc,gmat_vpc,sex,frstlang,satis}

Dependent Variable: Salary

# Model1: salary regressed on all eight selected predictors, fitted by
# ordinary least squares on the students with a disclosed salary.
Model1 <- salary ~ work_yrs + s_avg + f_avg + gmat_qpc + gmat_vpc +
  sex + frstlang + satis
fit1 <- lm(Model1, data = placed.df)
summary(fit1)

## 
## Call:
## lm(formula = Model1, data = placed.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -29800  -7822  -1742   4869  82341 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   90136.22   21739.22   4.146  7.4e-05 ***
## work_yrs       2331.12     585.99   3.978 0.000137 ***
## s_avg          4659.05    5015.66   0.929 0.355320    
## f_avg         -1698.83    3834.70  -0.443 0.658773    
## gmat_qpc         98.72     121.85   0.810 0.419884    
## gmat_vpc        -95.80     102.99  -0.930 0.354699    
## sexMale        5289.24    3545.91   1.492 0.139140    
## frstlangOther 13994.76    6641.66   2.107 0.037770 *  
## satis         -1671.20    2070.62  -0.807 0.421643    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15740 on 94 degrees of freedom
## Multiple R-squared:  0.285,  Adjusted R-squared:  0.2241 
## F-statistic: 4.683 on 8 and 94 DF,  p-value: 7.574e-05

library(leaps)

## Warning: package 'leaps' was built under R version 3.4.3

# Best-subset search over the Model1 predictors, keeping the single best
# subset (nbest = 1) of each size.
leap <- regsubsets(Model1, data = placed.df, nbest = 1)
summary(leap)

## Subset selection object
## Call: regsubsets.formula(Model1, data = placed.df, nbest = 1)
## 8 Variables  (and intercept)
##               Forced in Forced out
## work_yrs          FALSE      FALSE
## s_avg             FALSE      FALSE
## f_avg             FALSE      FALSE
## gmat_qpc          FALSE      FALSE
## gmat_vpc          FALSE      FALSE
## sexMale           FALSE      FALSE
## frstlangOther     FALSE      FALSE
## satis             FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          work_yrs s_avg f_avg gmat_qpc gmat_vpc sexMale frstlangOther
## 1  ( 1 ) "*"      " "   " "   " "      " "      " "     " "          
## 2  ( 1 ) "*"      " "   " "   " "      " "      " "     "*"          
## 3  ( 1 ) "*"      " "   " "   " "      " "      "*"     "*"          
## 4  ( 1 ) "*"      " "   " "   " "      " "      "*"     "*"          
## 5  ( 1 ) "*"      "*"   " "   " "      " "      "*"     "*"          
## 6  ( 1 ) "*"      "*"   " "   " "      "*"      "*"     "*"          
## 7  ( 1 ) "*"      "*"   " "   "*"      "*"      "*"     "*"          
## 8  ( 1 ) "*"      "*"   "*"   "*"      "*"      "*"     "*"          
##          satis
## 1  ( 1 ) " "  
## 2  ( 1 ) " "  
## 3  ( 1 ) " "  
## 4  ( 1 ) "*"  
## 5  ( 1 ) "*"  
## 6  ( 1 ) "*"  
## 7  ( 1 ) "*"  
## 8  ( 1 ) "*"

# Rank the candidate subsets by adjusted R-squared.
plot(leap, scale = "adjr2")

# Model2: reduced model. Left out on purpose: age, s_avg, f_avg, quarter,
# gmat_qpc, gmat_vpc, gmat_tot, gmat_tpc.
Model2 <- salary ~ work_yrs + sex + frstlang + satis
fit2 <- lm(Model2, data = placed.df)
summary(fit2)

## 
## Call:
## lm(formula = Model2, data = placed.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -30492  -8055  -1744   5362  80436 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   102214.0    11827.8   8.642 1.06e-13 ***
## work_yrs        2409.4      526.1   4.579 1.37e-05 ***
## sexMale         5949.5     3392.2   1.754   0.0826 .  
## frstlangOther  14675.7     6274.0   2.339   0.0214 *  
## satis          -2244.4     1988.4  -1.129   0.2618    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15580 on 98 degrees of freedom
## Multiple R-squared:  0.2695, Adjusted R-squared:  0.2397 
## F-statistic: 9.038 on 4 and 98 DF,  p-value: 2.953e-06

# Best-subset search over the Model2 predictors, keeping the single best
# subset (nbest = 1) of each size.
library(leaps)
leap <- regsubsets(Model2, data = placed.df, nbest = 1)
summary(leap)

## Subset selection object
## Call: regsubsets.formula(Model2, data = placed.df, nbest = 1)
## 4 Variables  (and intercept)
##               Forced in Forced out
## work_yrs          FALSE      FALSE
## sexMale           FALSE      FALSE
## frstlangOther     FALSE      FALSE
## satis             FALSE      FALSE
## 1 subsets of each size up to 4
## Selection Algorithm: exhaustive
##          work_yrs sexMale frstlangOther satis
## 1  ( 1 ) "*"      " "     " "           " "  
## 2  ( 1 ) "*"      " "     "*"           " "  
## 3  ( 1 ) "*"      "*"     "*"           " "  
## 4  ( 1 ) "*"      "*"     "*"           "*"

# Rank the Model2 subsets by adjusted R-squared.
plot(leap, scale="adjr2")

Analysis of MBA Starting Salaries

Tamoghno Das