R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Load the MBA starting-salaries survey from the working directory.
# (The former paste(..., sep = "") around a single literal was a no-op.)
data.df <- read.csv("MBA Starting Salaries Data.csv")
# NOTE(review): attach() is kept only because later chunks call
# cor(gmat_qpc, gmat_vpc) and cor(age, work_yrs) with bare column names.
# The attached copy is a snapshot taken here; it does not see later changes
# made to data.df (e.g. the factor recoding of sex and frstlang).
attach(data.df)
# List the entries of the sex column that are coded 1.
data.df$sex[data.df$sex == 1]

##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [141] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

# List the entries of the sex column that are coded 2.
with(data.df, sex[sex == 2])

##  [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2

# Treat sex as a categorical variable rather than a number.
data.df$sex <- as.factor(data.df$sex)
# List the entries of the frstlang column that are coded 1.
with(data.df, frstlang[frstlang == 1])

##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [141] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [211] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

# List the entries of the frstlang column that are coded 2.
with(data.df, frstlang[frstlang == 2])

##  [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2

# Treat frstlang as a categorical variable rather than a number.
data.df$frstlang <- as.factor(data.df$frstlang)
library(car)

# Special codes used in the "salary" column:
#   999:    Answered survey but did not disclose salary
#   998:    Did not answer survey
#   0:      Not yet placed
#   > 1000: Answered survey, got placed, disclosed salary


# Split the respondents into four groups according to the salary code.

# Placed and disclosed salary (real salaries lie above the 0/998/999 codes).
placed.df <- data.df[which(data.df$salary > 1000), ]

# Not placed (salary coded 0).
notPlaced.df <- data.df[which(data.df$salary == 0), ]

# Placed but salary not disclosed (salary coded 999).
notDisclosedSalary.df <- data.df[which(data.df$salary == 999), ]

# Skipped the survey (salary coded 998).
notAnsweredSurvey.df <- data.df[which(data.df$salary == 998), ]

# View() opens a data viewer window, which is only meaningful in an
# interactive session; skip it when the document is knitted.
if (interactive()) {
  View(placed.df)
  View(notPlaced.df)
  View(notDisclosedSalary.df)
  View(notAnsweredSurvey.df)
}

# Total number of rows in the full data set.
nrow(data.df)

## [1] 274

# Row counts of the four subsets; their sum should equal nrow(data.df),
# i.e. every respondent falls into exactly one subset.
c1 <- nrow(placed.df)
c2 <- nrow(notPlaced.df)
c3 <- nrow(notDisclosedSalary.df)
c4 <- nrow(notAnsweredSurvey.df)
sum(c1, c2, c3, c4)

## [1] 274

library(corrplot)

# Column names available in the placed subset.
names(placed.df)

##  [1] "age"      "sex"      "gmat_tot" "gmat_qpc" "gmat_vpc" "gmat_tpc"
##  [7] "s_avg"    "f_avg"    "quarter"  "work_yrs" "frstlang" "salary"  
## [13] "satis"

# Numeric columns of the placed subset used for the correlation overview.
dataColumns <- placed.df[
  c(
    "age", "work_yrs",
    "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
    "s_avg", "f_avg", "quarter",
    "satis"
  )
]

# Pairwise correlations: N feeds the plot, res is printed rounded.
N <- cor(dataColumns)
res <- N
corrplot(N, method = "circle")
round(res, digits = 2)

##            age work_yrs gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## age       1.00     0.88    -0.08    -0.17     0.02    -0.10  0.16 -0.22
## work_yrs  0.88     1.00    -0.12    -0.18    -0.03    -0.13  0.16 -0.22
## gmat_tot -0.08    -0.12     1.00     0.67     0.78     0.97  0.17  0.12
## gmat_qpc -0.17    -0.18     0.67     1.00     0.09     0.66  0.02  0.10
## gmat_vpc  0.02    -0.03     0.78     0.09     1.00     0.78  0.16  0.02
## gmat_tpc -0.10    -0.13     0.97     0.66     0.78     1.00  0.14  0.07
## s_avg     0.16     0.16     0.17     0.02     0.16     0.14  1.00  0.45
## f_avg    -0.22    -0.22     0.12     0.10     0.02     0.07  0.45  1.00
## quarter  -0.13    -0.13    -0.11     0.01    -0.13    -0.10 -0.84 -0.43
## satis     0.11     0.06     0.06     0.00     0.15     0.12 -0.14 -0.12
##          quarter satis
## age        -0.13  0.11
## work_yrs   -0.13  0.06
## gmat_tot   -0.11  0.06
## gmat_qpc    0.01  0.00
## gmat_vpc   -0.13  0.15
## gmat_tpc   -0.10  0.12
## s_avg      -0.84 -0.14
## f_avg      -0.43 -0.12
## quarter     1.00  0.23
## satis       0.23  1.00

# MBA performance measures of the placed subset.
Performance <- placed.df[c("s_avg", "f_avg", "quarter")]

# Pairwise correlations: N feeds the plot, res is printed rounded.
N <- cor(Performance)
res <- N
corrplot(N, method = "circle")
round(res, digits = 2)

##         s_avg f_avg quarter
## s_avg    1.00  0.45   -0.84
## f_avg    0.45  1.00   -0.43
## quarter -0.84 -0.43    1.00

# High correlation between overall performance quartile & the "s_avg" & "f_avg" GPA

Males & Females (age-wise)

# Cross-tabulate sex (rows) against age (columns) for placed graduates.
ageTable <- table(placed.df[["sex"]], placed.df[["age"]])
print(ageTable)

##    
##     22 23 24 25 26 27 28 29 30 31 32 33 34 39 40
##   1  0  3 11 13  9 13  5  5  4  4  1  1  1  1  1
##   2  1  2  5 10  5  1  3  1  2  0  0  0  0  0  1

Satisfaction level

# Mean salary and mean work experience per satisfaction level.
# NOTE(review): this runs on the full data.df, so the salary means include the
# 0/998/999 placeholder codes and satis itself contains the code 998.
aggregate(cbind(salary, work_yrs) ~ satis, data = data.df, FUN = mean)

##   satis    salary work_yrs
## 1     1   999.000 3.000000
## 2     2   999.000 2.000000
## 3     3 19799.200 4.200000
## 4     4  6293.412 2.941176
## 5     5 40476.311 4.243243
## 6     6 54383.536 4.185567
## 7     7 65718.152 3.727273
## 8   998   998.000 3.086957

Effect of Work Experience on Salary

# Salary distribution for each number of years of work experience.
# With horizontal = FALSE the groups (work_yrs) are on the x axis and salary
# is on the y axis; the axis labels were previously swapped.
# NOTE(review): data.df still contains the 0/998/999 salary codes.
boxplot(
  salary ~ work_yrs,
  data = data.df,
  main = "Effect of Work Experience on Salary",
  xlab = "Work Experience",
  ylab = "MBA's Starting Salaries",
  horizontal = FALSE
)

Effect of Age on Salary

# Mean salary and mean work experience per age.
# NOTE(review): this runs on the full data.df, so the salary means include the
# 0/998/999 placeholder codes.
aggregate(cbind(salary, work_yrs) ~ age, data = data.df, FUN = mean)

##    age    salary  work_yrs
## 1   22  42500.00  1.000000
## 2   23  57282.00  1.750000
## 3   24  49342.24  1.727273
## 4   25  43395.55  2.264151
## 5   26  35982.07  2.875000
## 6   27  31499.37  3.130435
## 7   28  39809.00  4.666667
## 8   29  28067.95  4.500000
## 9   30  55291.25  5.583333
## 10  31  40599.40  5.800000
## 11  32  13662.25  5.625000
## 12  33 118000.00 10.000000
## 13  34  26250.00 11.500000
## 14  35      0.00  9.333333
## 15  36      0.00 12.500000
## 16  37      0.00  9.000000
## 17  39  56000.00 10.500000
## 18  40 183000.00 15.000000
## 19  42      0.00 13.000000
## 20  43      0.00 19.000000
## 21  48      0.00 22.000000

MBA’s Starting salary based on Gender

# Salary distribution for each sex code.
# With horizontal = FALSE the groups (sex) are on the x axis and salary is on
# the y axis; the y axis was previously mislabelled "Work Experience" and the
# x axis carried the salary label.
# NOTE(review): data.df still contains the 0/998/999 salary codes.
boxplot(
  salary ~ sex,
  data = data.df,
  main = "MBA's Starting salary based on Gender",
  xlab = "Gender",
  ylab = "MBA's Starting Salaries",
  horizontal = FALSE
)

Effect of Sex on Salary

# Mean salary, work experience and age per sex code.
# NOTE(review): this runs on the full data.df, so the salary means include the
# 0/998/999 placeholder codes.
aggregate(cbind(salary, work_yrs, age) ~ sex, data = data.df, FUN = mean)

##   sex   salary work_yrs      age
## 1   1 37013.62 3.893204 27.41748
## 2   2 45121.07 3.808824 27.17647

Distribution of MBA’s Starting Salary

library(lattice)
# Histogram of the salary column over all rows of data.df.
histogram(
  ~ salary,
  data = data.df,
  main = "Distribution of Starting Salary",
  xlab = "MBA's Starting Salary",
  col = "red"
)

Distribution of MBA’s Starting Salary

library(lattice)
# Same histogram of data.df$salary, drawn with the longer title.
histogram(
  ~ salary,
  data = data.df,
  main = "Distribution of MBA's Starting Salary",
  xlab = "MBA's Starting Salary",
  col = "red"
)

Merge placed.df, notDisclosedSalary.df and notPlaced.df into MBA.df

# Stack the three groups of survey respondents (placed, salary not disclosed,
# not placed); rows coded 998 (did not answer the survey) are left out.
MBA.df <- rbind(placed.df, notDisclosedSalary.df, notPlaced.df)
# Open the data viewer only in an interactive session, not while knitting.
if (interactive()) {
  View(MBA.df)
}

Creating dummy variable: “GotPlaced” = TRUE (got a job) or FALSE (did not get a job)

# Flag rows whose salary is a real value (above the 0/998/999 codes).
# NOTE(review): rows coded 999 (salary not disclosed) come out as FALSE here
# even though they are described elsewhere as placed -- confirm this is the
# intended definition of "GotPlaced".
MBA.df$GotPlaced <- (MBA.df$salary > 1000)
# Open the data viewer only in an interactive session, not while knitting.
if (interactive()) {
  View(MBA.df)
}

# Store the flag as a two-level factor ("FALSE", "TRUE") and inspect the result.
MBA.df$GotPlaced <- factor(MBA.df$GotPlaced)
str(MBA.df)

## 'data.frame':    228 obs. of  14 variables:
##  $ age      : int  22 27 25 25 27 28 24 25 25 25 ...
##  $ sex      : Factor w/ 2 levels "1","2": 2 2 2 2 1 2 1 2 2 1 ...
##  $ gmat_tot : int  660 700 680 650 710 620 670 560 530 650 ...
##  $ gmat_qpc : int  90 94 87 82 96 52 84 52 50 79 ...
##  $ gmat_vpc : int  92 98 96 91 96 98 96 81 62 93 ...
##  $ gmat_tpc : int  94 98 96 93 98 87 95 72 61 93 ...
##  $ s_avg    : num  3.5 3.3 3.5 3.4 3.3 3.4 3.3 3.3 3.6 3.3 ...
##  $ f_avg    : num  3.75 3.25 2.67 3.25 3.5 3.75 3.25 3.5 3.67 3.5 ...
##  $ quarter  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs : int  1 2 2 3 2 5 0 1 3 1 ...
##  $ frstlang : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ salary   : int  85000 85000 86000 88000 92000 93000 95000 95000 95000 96000 ...
##  $ satis    : int  5 6 5 7 6 5 4 5 3 7 ...
##  $ GotPlaced: Factor w/ 2 levels "FALSE","TRUE": 2 2 2 2 2 2 2 2 2 2 ...

#GotPlaced = factor(year)
#dummies = model.matrix(~year.f)

# Identifying DEPENDENT and INDEPENDENT Variables

# The DEPENDENT variable is "salary"
# Identifying INDEPENDENT Variables:
#1a.  Variables related to GMAT are highly correlated:   "gmat_tot" "gmat_qpc" "gmat_vpc" "gmat_tpc"
#1b.  Variables related to MBA performance are highly correlated:    "s_avg"    "f_avg"    "quarter"
#1c.  Variables related to age and work experience are highly correlated: "age", "work_yrs"
#1d.  Other variables: "sex"   , "frstlang" ;   "satis"   

# 1e. GMAT
# Correlations among the four GMAT columns of the placed subset.
gmat <- placed.df[c("gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc")]
res <- cor(gmat)
round(res, digits = 2)

##          gmat_tot gmat_qpc gmat_vpc gmat_tpc
## gmat_tot     1.00     0.67     0.78     0.97
## gmat_qpc     0.67     1.00     0.09     0.66
## gmat_vpc     0.78     0.09     1.00     0.78
## gmat_tpc     0.97     0.66     0.78     1.00

library(corrplot)
# Plot the GMAT correlation matrix of the placed subset.
M <- cor(gmat)
corrplot(M, method = "circle")

# GMAT verbal and quantitative scores are very weakly correlated.
# The columns are taken explicitly from data.df (all rows) instead of relying
# on the attach()ed search path; the value is the same as before.
with(data.df, cor(gmat_qpc, gmat_vpc))

## [1] 0.1521801

# We include gmat_qpc and gmat_vpc , but exclude "gmat_tot" and "gmat_tpc" in our Regression

# 1f. Performance
# Correlations among the MBA performance columns of the placed subset.
mbaPerformance <- placed.df[c("s_avg", "f_avg", "quarter")]

# N feeds the plot, res is printed rounded.
N <- cor(mbaPerformance)
res <- N
corrplot(N, method = "circle")
round(res, digits = 2)

##         s_avg f_avg quarter
## s_avg    1.00  0.45   -0.84
## f_avg    0.45  1.00   -0.43
## quarter -0.84 -0.43    1.00

# Highly Correlated: The overall performance quartile , the Spring (s_avg) & Fall (f_avg) GPA
# We include 's_avg' and 'f_avg' , but exclude 'quarter' in our regression.

# 1g. WORK EXPERIENCE AND AGE
# Correlation between age and work experience over all rows of data.df.
# The columns are taken explicitly from data.df instead of relying on the
# attach()ed search path; the value is the same as before.
with(data.df, cor(age, work_yrs))

## [1] 0.8582981

# Highly correlated, hence we include 'work_yrs', but exclude 'age' from our regression

# In conclusion, the independent variables we will include in the regression are {work_yrs,s_avg,f_avg,gmat_qpc,gmat_vpc,sex,frstlang,satis}

CORRELATION MATRIX

# Salary plus the numeric predictors retained for the regression.
columns <- c(
  "salary", "work_yrs",
  "gmat_qpc", "gmat_vpc",
  "s_avg", "f_avg",
  "satis"
)
placedVariables <- placed.df[columns]

# Pairwise correlations within the placed subset, rounded for printing.
res <- cor(placedVariables)
round(res, digits = 2)

##          salary work_yrs gmat_qpc gmat_vpc s_avg f_avg satis
## salary     1.00     0.45     0.01    -0.14  0.10 -0.11 -0.04
## work_yrs   0.45     1.00    -0.18    -0.03  0.16 -0.22  0.06
## gmat_qpc   0.01    -0.18     1.00     0.09  0.02  0.10  0.00
## gmat_vpc  -0.14    -0.03     0.09     1.00  0.16  0.02  0.15
## s_avg      0.10     0.16     0.02     0.16  1.00  0.45 -0.14
## f_avg     -0.11    -0.22     0.10     0.02  0.45  1.00 -0.12
## satis     -0.04     0.06     0.00     0.15 -0.14 -0.12  1.00

library(corrplot)
# Plot the correlations of the selected columns of the placed subset.
M <- cor(placed.df[columns])
corrplot(M, method = "circle")

Comparison of Salary with Work Experience

# Salary against work experience for placed graduates.
# `horizontal` is a boxplot() argument, not a scatterplot() one, so it has
# been dropped from this call.
scatterplot(
  salary ~ work_yrs,
  data = placed.df,
  main = "Scatterplot of Salary with Work Experience",
  xlab = "Work Experience",
  ylab = "MBA's Starting Salaries"
)

# Horizontal boxplots: the work_yrs groups run along the y axis and salary
# along the x axis, which is what the axis labels say.
boxplot(
  salary ~ work_yrs,
  data = placed.df,
  main = "Distribution of Salary with Work Experience",
  ylab = "Work Experience",
  xlab = "MBA's Starting Salaries",
  horizontal = TRUE
)

library(lattice)
# Histogram of disclosed starting salaries.
histogram(
  ~ salary,
  data = placed.df,
  main = "Frequency of Starting Salary",
  xlab = "Starting Salary",
  col = "red"
)

# Mean disclosed salary for each number of years of work experience.
salaryWorkEx <- aggregate(salary ~ work_yrs, data = placed.df, FUN = mean)
salaryWorkEx

##    work_yrs    salary
## 1         0  95000.00
## 2         1 103532.00
## 3         2  97673.68
## 4         3 101652.86
## 5         4 105454.55
## 6         5 103142.86
## 7         6 105928.57
## 8         7  98000.00
## 9         8 105025.00
## 10       10 118000.00
## 11       15 183000.00
## 12       16 108500.00

Comparison of Salary with GMAT total score

# Salary against total GMAT score for placed graduates; `labels` supplies the
# point labels (row names) to scatterplot().
scatterplot(
  salary ~ gmat_tot,
  data = placed.df,
  xlab = "GMAT Total",
  ylab = "Salary",
  main = "Comparison of Salary with Total GMAT score",
  labels = row.names(placed.df)
)

# Same plot, split by sex.
scatterplot(
  salary ~ gmat_tot | sex,
  data = placed.df,
  xlab = "GMAT Total",
  ylab = "Salary",
  main = "Comparison of Salary with Total GMAT score",
  labels = row.names(placed.df)
)

# Horizontal boxplots: the gmat_tot groups run along the y axis and salary
# along the x axis. `labels` is not a boxplot() argument, so it has been
# dropped from this call.
boxplot(
  salary ~ gmat_tot,
  data = placed.df,
  ylab = "GMAT Total",
  xlab = "Salary",
  main = "Comparison of Salary with Total GMAT score",
  horizontal = TRUE
)

# Column names available in the placed subset.
colnames(placed.df)

##  [1] "age"      "sex"      "gmat_tot" "gmat_qpc" "gmat_vpc" "gmat_tpc"
##  [7] "s_avg"    "f_avg"    "quarter"  "work_yrs" "frstlang" "salary"  
## [13] "satis"

library(car)
# Scatterplot matrix of salary, total GMAT and the two GPA averages.
# scatterplotMatrix() is the current car name; the older dotted alias
# scatterplot.matrix() is no longer available in recent car releases.
scatterplotMatrix(
  ~ salary + gmat_tot + s_avg + f_avg,
  data = placed.df,
  main = "Salary versus other variables"
)

# Same matrix with the points grouped by sex.
scatterplotMatrix(
  ~ salary + gmat_tot + s_avg + f_avg | sex,
  data = placed.df,
  main = "Salary versus other variables"
)

# Cross-tabulate sex (rows) against age (columns) for placed graduates.
ageTable <- table(placed.df$sex, placed.df$age)
ageTable

##    
##     22 23 24 25 26 27 28 29 30 31 32 33 34 39 40
##   1  0  3 11 13  9 13  5  5  4  4  1  1  1  1  1
##   2  1  2  5 10  5  1  3  1  2  0  0  0  0  0  1

# Mean salary, work experience and age per sex code.
# NOTE(review): this runs on the full data.df, so the salary means include the
# 0/998/999 placeholder codes.
aggregate(cbind(salary, work_yrs, age) ~ sex, data = data.df, FUN = mean)

##   sex   salary work_yrs      age
## 1   1 37013.62 3.893204 27.41748
## 2   2 45121.07 3.808824 27.17647

# Mean salary and mean work experience per age.
# NOTE(review): this runs on the full data.df, so the salary means include the
# 0/998/999 placeholder codes.
aggregate(cbind(salary, work_yrs) ~ age, data = data.df, FUN = mean)

##    age    salary  work_yrs
## 1   22  42500.00  1.000000
## 2   23  57282.00  1.750000
## 3   24  49342.24  1.727273
## 4   25  43395.55  2.264151
## 5   26  35982.07  2.875000
## 6   27  31499.37  3.130435
## 7   28  39809.00  4.666667
## 8   29  28067.95  4.500000
## 9   30  55291.25  5.583333
## 10  31  40599.40  5.800000
## 11  32  13662.25  5.625000
## 12  33 118000.00 10.000000
## 13  34  26250.00 11.500000
## 14  35      0.00  9.333333
## 15  36      0.00 12.500000
## 16  37      0.00  9.000000
## 17  39  56000.00 10.500000
## 18  40 183000.00 15.000000
## 19  42      0.00 13.000000
## 20  43      0.00 19.000000
## 21  48      0.00 22.000000

# Share of each sex code within every age column (each column sums to 1).
prop.table(ageTable, margin = 2)

##    
##             22         23         24         25         26         27
##   1 0.00000000 0.60000000 0.68750000 0.56521739 0.64285714 0.92857143
##   2 1.00000000 0.40000000 0.31250000 0.43478261 0.35714286 0.07142857
##    
##             28         29         30         31         32         33
##   1 0.62500000 0.83333333 0.66666667 1.00000000 1.00000000 1.00000000
##   2 0.37500000 0.16666667 0.33333333 0.00000000 0.00000000 0.00000000
##    
##             34         39         40
##   1 1.00000000 1.00000000 0.50000000
##   2 0.00000000 0.00000000 0.50000000

# Linear model of salary on work experience, sex, first language and
# satisfaction.
# NOTE(review): the model is fit on the full data.df, so the response includes
# the 0/998/999 salary codes and satis includes the code 998; the satis
# coefficient is therefore likely driven by those codes. Consider refitting on
# placed.df.
demo <- lm(salary ~ work_yrs + sex + frstlang + satis, data = data.df)
summary(demo)

## 
## Call:
## lm(formula = salary ~ work_yrs + sex + frstlang + satis, data = data.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -53048 -46140  -1073  47952 182479 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 48210.336   5292.917   9.108  < 2e-16 ***
## work_yrs     -445.138    907.603  -0.490    0.624    
## sex2         5955.031   6747.840   0.883    0.378    
## frstlang2   -9695.438   9090.930  -1.066    0.287    
## satis         -45.340      7.928  -5.719 2.85e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48170 on 269 degrees of freedom
## Multiple R-squared:  0.1194, Adjusted R-squared:  0.1063 
## F-statistic: 9.117 on 4 and 269 DF,  p-value: 6.396e-07

MBA Salaries Project

Tushar Upasani

24 January 2018

R Markdown

Males & Females (age-wise)

Satisfaction level

Effect of Work Experience on Salary

Effect of Age on Salary

MBA’s Starting salary based on Gender

Effect of Sex on Salary

Distribution of MBA’s Starting Salary

Distribution of MBA’s Starting Salary

Merge placed.df ; notDisclosed.df ; notPlaced = knownMBA.df

Creating dummy variable: “GotPlaced” = 1 (got a job) or 0 (did not get a job)

VARIANCE - COVARIANCE MATRIX

Comparison of Salary with Work Experience

Comparison of Salary with GMAT total score