R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Load the raw survey data from the CSV file in the working directory.
data.df <- read.csv("MBA Starting Salaries Data.csv")
# NOTE(review): attach() is discouraged; it is kept here only because code
# further down may refer to columns by bare name. Prefer data.df$col in new
# code. The attached copy is a snapshot taken before the factor conversions
# below.
attach(data.df)

# Print the entries of `sex` coded 1, then those coded 2, before converting
# the column to a factor.
data.df$sex[data.df$sex == 1]
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [141] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
data.df$sex[data.df$sex == 2]
##  [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
data.df$sex <- factor(data.df$sex)

# Same inspection and conversion for `frstlang`.
data.df$frstlang[data.df$frstlang == 1]
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [141] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [211] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
data.df$frstlang[data.df$frstlang == 2]
##  [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
data.df$frstlang <- factor(data.df$frstlang)
library(car)
# Special codes used in the "salary" column:
#   999    : Answered survey but did not disclose salary
#   998    : Did not answer survey
#   0      : Not yet placed
#   > 1000 : Answered survey, got placed, disclosed salary


# Split data.df into four subsets according to the value in `salary`.
# which() is kept so that any NA in `salary` is dropped rather than turned
# into an all-NA row. View() only makes sense in an interactive session, so
# it is skipped when the document is knitted or run as a script.

# Placed and disclosed salary (salary > 1000)
placed.df <- data.df[which(data.df$salary > 1000), ]
if (interactive()) {
  View(placed.df)
}

# Not placed (salary == 0)
notPlaced.df <- data.df[which(data.df$salary == 0), ]
if (interactive()) {
  View(notPlaced.df)
}

# Answered the survey but did not disclose salary (salary == 999)
notDisclosedSalary.df <- data.df[which(data.df$salary == 999), ]
if (interactive()) {
  View(notDisclosedSalary.df)
}

# Skipped the survey (salary == 998)
notAnsweredSurvey.df <- data.df[which(data.df$salary == 998), ]
if (interactive()) {
  View(notAnsweredSurvey.df)
}

# Sanity check: the row counts of the four subsets should add up to the
# number of rows in the full data set.
nrow(data.df)
## [1] 274
c1 <- nrow(placed.df)
c2 <- nrow(notPlaced.df)
c3 <- nrow(notDisclosedSalary.df)
c4 <- nrow(notAnsweredSurvey.df)
sum(c1, c2, c3, c4)
## [1] 274
library(corrplot)

# Columns available in the placed subset.
names(placed.df)
##  [1] "age"      "sex"      "gmat_tot" "gmat_qpc" "gmat_vpc" "gmat_tpc"
##  [7] "s_avg"    "f_avg"    "quarter"  "work_yrs" "frstlang" "salary"  
## [13] "satis"
# Columns to correlate; sex, frstlang and salary are left out.
dataColumns <- placed.df[, c(
  "age", "work_yrs",
  "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
  "s_avg", "f_avg", "quarter",
  "satis"
)]

# Pairwise correlations: drawn as a circle plot, then printed rounded to
# two decimals.
N <- cor(dataColumns)
corrplot(N, method = "circle")

res <- N
round(res, digits = 2)
##            age work_yrs gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## age       1.00     0.88    -0.08    -0.17     0.02    -0.10  0.16 -0.22
## work_yrs  0.88     1.00    -0.12    -0.18    -0.03    -0.13  0.16 -0.22
## gmat_tot -0.08    -0.12     1.00     0.67     0.78     0.97  0.17  0.12
## gmat_qpc -0.17    -0.18     0.67     1.00     0.09     0.66  0.02  0.10
## gmat_vpc  0.02    -0.03     0.78     0.09     1.00     0.78  0.16  0.02
## gmat_tpc -0.10    -0.13     0.97     0.66     0.78     1.00  0.14  0.07
## s_avg     0.16     0.16     0.17     0.02     0.16     0.14  1.00  0.45
## f_avg    -0.22    -0.22     0.12     0.10     0.02     0.07  0.45  1.00
## quarter  -0.13    -0.13    -0.11     0.01    -0.13    -0.10 -0.84 -0.43
## satis     0.11     0.06     0.06     0.00     0.15     0.12 -0.14 -0.12
##          quarter satis
## age        -0.13  0.11
## work_yrs   -0.13  0.06
## gmat_tot   -0.11  0.06
## gmat_qpc    0.01  0.00
## gmat_vpc   -0.13  0.15
## gmat_tpc   -0.10  0.12
## s_avg      -0.84 -0.14
## f_avg      -0.43 -0.12
## quarter     1.00  0.23
## satis       0.23  1.00
# Correlations among the three academic-performance columns of placed.df.
Performance <- placed.df[, c("s_avg", "f_avg", "quarter")]

N <- cor(Performance)
corrplot(N, method = "circle")

# Same matrix, printed rounded to two decimals.
res <- N
round(res, digits = 2)
##         s_avg f_avg quarter
## s_avg    1.00  0.45   -0.84
## f_avg    0.45  1.00   -0.43
## quarter -0.84 -0.43    1.00
# The overall performance quartile ("quarter") is strongly negatively correlated with "s_avg" (-0.84) and moderately with "f_avg" (-0.43)

Males & Females (age-wise)

# Cross-tabulate sex (rows) against age (columns) for the placed subset.
ageTable <- table(placed.df[["sex"]], placed.df[["age"]])
print(ageTable)
##    
##     22 23 24 25 26 27 28 29 30 31 32 33 34 39 40
##   1  0  3 11 13  9 13  5  5  4  4  1  1  1  1  1
##   2  1  2  5 10  5  1  3  1  2  0  0  0  0  0  1

Satisfaction level

# Mean salary and mean work experience for each value of `satis`.
# NOTE(review): this uses data.df, so the means include the special salary
# codes (0 / 998 / 999) and `satis` itself contains a 998 code; the salary
# column below is therefore not an average of real salaries.
aggregate(cbind(salary, work_yrs) ~ satis, data = data.df, FUN = mean)
##   satis    salary work_yrs
## 1     1   999.000 3.000000
## 2     2   999.000 2.000000
## 3     3 19799.200 4.200000
## 4     4  6293.412 2.941176
## 5     5 40476.311 4.243243
## 6     6 54383.536 4.185567
## 7     7 65718.152 3.727273
## 8   998   998.000 3.086957

Effect of Work Experience on Salary

# Vertical boxplot of salary grouped by years of work experience.
# With horizontal = FALSE the groups (work_yrs) are on the x-axis and the
# values (salary) are on the y-axis, so the axis labels must follow that.
boxplot(
  salary ~ work_yrs,
  data = data.df,
  main = "Effect of Work Experience on Salary",
  xlab = "Work Experience",
  ylab = "MBA's Starting Salaries",
  horizontal = FALSE
)

Effect of Age on Salary

# Mean salary and mean work experience for each age.
# NOTE(review): computed on data.df, so rows carrying the special salary
# codes (0 / 998 / 999) are averaged in as if they were salaries.
aggregate(
  cbind(salary, work_yrs) ~ age,
  data = data.df,
  FUN = mean
)
##    age    salary  work_yrs
## 1   22  42500.00  1.000000
## 2   23  57282.00  1.750000
## 3   24  49342.24  1.727273
## 4   25  43395.55  2.264151
## 5   26  35982.07  2.875000
## 6   27  31499.37  3.130435
## 7   28  39809.00  4.666667
## 8   29  28067.95  4.500000
## 9   30  55291.25  5.583333
## 10  31  40599.40  5.800000
## 11  32  13662.25  5.625000
## 12  33 118000.00 10.000000
## 13  34  26250.00 11.500000
## 14  35      0.00  9.333333
## 15  36      0.00 12.500000
## 16  37      0.00  9.000000
## 17  39  56000.00 10.500000
## 18  40 183000.00 15.000000
## 19  42      0.00 13.000000
## 20  43      0.00 19.000000
## 21  48      0.00 22.000000

MBA’s Starting salary based on Gender

# Vertical boxplot of salary grouped by sex.
# The groups (sex) are on the x-axis and salary is on the y-axis; the
# labels are set accordingly.
boxplot(
  salary ~ sex,
  data = data.df,
  main = "MBA's Starting salary based on Gender",
  xlab = "Sex",
  ylab = "MBA's Starting Salaries",
  horizontal = FALSE
)

Effect of Sex on Salary

# Mean salary, work experience and age for each level of `sex`.
# NOTE(review): computed on data.df, so the special salary codes
# (0 / 998 / 999) are included in the salary means.
aggregate(cbind(salary, work_yrs, age) ~ sex, data = data.df, FUN = mean)
##   sex   salary work_yrs      age
## 1   1 37013.62 3.893204 27.41748
## 2   2 45121.07 3.808824 27.17647

Distribution of MBA’s Starting Salary

library(lattice)
# Lattice histogram of the `salary` column over all rows of data.df.
histogram(
  ~ salary,
  data = data.df,
  main = "Distribution of Starting Salary",
  xlab = "MBA's Starting Salary",
  col = "red"
)

Distribution of MBA’s Starting Salary

library(lattice)
# Same histogram of `salary` over data.df, with the longer title.
histogram(
  ~ salary,
  data = data.df,
  main = "Distribution of MBA's Starting Salary",
  xlab = "MBA's Starting Salary",
  col = "red"
)

Merge placed.df, notDisclosedSalary.df and notPlaced.df into MBA.df

# Stack the three subsets of survey respondents into one data frame.
MBA.df <- rbind(placed.df, notDisclosedSalary.df, notPlaced.df)
# View() needs an interactive session; skip it when knitting.
if (interactive()) {
  View(MBA.df)
}

Creating dummy variable: “GotPlaced” = 1 (got a job) or 0 (did not get a job)

# Flag rows whose salary is an actual disclosed salary (> 1000).
# NOTE(review): rows with the 999 code (did not disclose salary) end up as
# FALSE here, together with the not-placed rows (0) - confirm that this is
# the intended meaning of "GotPlaced".
MBA.df$GotPlaced <- MBA.df$salary > 1000
# View() needs an interactive session; skip it when knitting.
if (interactive()) {
  View(MBA.df)
}

# Store the flag as a two-level factor and show the resulting structure.
MBA.df$GotPlaced <- factor(MBA.df$GotPlaced)
str(MBA.df)
## 'data.frame':    228 obs. of  14 variables:
##  $ age      : int  22 27 25 25 27 28 24 25 25 25 ...
##  $ sex      : Factor w/ 2 levels "1","2": 2 2 2 2 1 2 1 2 2 1 ...
##  $ gmat_tot : int  660 700 680 650 710 620 670 560 530 650 ...
##  $ gmat_qpc : int  90 94 87 82 96 52 84 52 50 79 ...
##  $ gmat_vpc : int  92 98 96 91 96 98 96 81 62 93 ...
##  $ gmat_tpc : int  94 98 96 93 98 87 95 72 61 93 ...
##  $ s_avg    : num  3.5 3.3 3.5 3.4 3.3 3.4 3.3 3.3 3.6 3.3 ...
##  $ f_avg    : num  3.75 3.25 2.67 3.25 3.5 3.75 3.25 3.5 3.67 3.5 ...
##  $ quarter  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs : int  1 2 2 3 2 5 0 1 3 1 ...
##  $ frstlang : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ salary   : int  85000 85000 86000 88000 92000 93000 95000 95000 95000 96000 ...
##  $ satis    : int  5 6 5 7 6 5 4 5 3 7 ...
##  $ GotPlaced: Factor w/ 2 levels "FALSE","TRUE": 2 2 2 2 2 2 2 2 2 2 ...
#GotPlaced = factor(year)
#dummies = model.matrix(~year.f)
# Identifying DEPENDENT and INDEPENDENT Variables

# The DEPENDENT variable is "salary"
# Identifying INDEPENDENT Variables:
#1a.  Variables related to GMAT are highly correlated:   "gmat_tot" "gmat_qpc" "gmat_vpc" "gmat_tpc"
#1b.  Variables related to MBA performance are highly correlated:    "s_avg"    "f_avg"    "quarter"
#1c.  Variables related to age and work experience are highly correlated: "age", "work_yrs"
#1d.  Other variables: "sex"   , "frstlang" ;   "satis"   

# 1e. GMAT 
# The GMAT related columns are strongly correlated
# Correlation matrix of the four GMAT columns in the placed subset,
# printed rounded to two decimals.
gmat <- placed.df[, c(
  "gmat_tot",
  "gmat_qpc",
  "gmat_vpc",
  "gmat_tpc"
)]
res <- cor(gmat)
round(res, digits = 2)
##          gmat_tot gmat_qpc gmat_vpc gmat_tpc
## gmat_tot     1.00     0.67     0.78     0.97
## gmat_qpc     0.67     1.00     0.09     0.66
## gmat_vpc     0.78     0.09     1.00     0.78
## gmat_tpc     0.97     0.66     0.78     1.00
library(corrplot)
# Circle plot of the GMAT correlation matrix.
M <- cor(gmat)
corrplot(
  M,
  method = "circle"
)

#GMAT verbal and quantitative scores are very weakly correlated
# Correlation between the quantitative and verbal GMAT percentiles over the
# full data set. The columns are referenced through data.df explicitly
# instead of relying on bare names being found on the search path.
cor(data.df$gmat_qpc, data.df$gmat_vpc)
## [1] 0.1521801
# We include gmat_qpc and gmat_vpc , but exclude "gmat_tot" and "gmat_tpc" in our Regression
# 1f. Performance

# Correlations among s_avg, f_avg and quarter for the placed subset:
# first as a circle plot, then as a table rounded to two decimals.
mbaPerformance <- placed.df[, c("s_avg", "f_avg", "quarter")]

N <- cor(mbaPerformance)
corrplot(N, method = "circle")

res <- N
round(res, digits = 2)
##         s_avg f_avg quarter
## s_avg    1.00  0.45   -0.84
## f_avg    0.45  1.00   -0.43
## quarter -0.84 -0.43    1.00
# Highly correlated: the overall performance quartile and the Spring (s_avg) and Fall (f_avg) GPAs
# We include 's_avg' and 'f_avg' , but exclude 'quarter' in our regression.
# 1g. WORK EXPERIENCE AND AGE

# Correlation between age and years of work experience over the full data
# set. The columns are referenced through data.df explicitly instead of
# relying on bare names being found on the search path.
cor(data.df$age, data.df$work_yrs)
## [1] 0.8582981
# Highly correlated, hence we include 'work_yrs', but exclude 'age' from our regression
# In conclusion, the independent variables we will include in the regression are {work_yrs,s_avg,f_avg,gmat_qpc,gmat_vpc,sex,frstlang,satis}

CORRELATION MATRIX

# Salary plus the numeric predictors kept for the regression; print their
# correlation matrix rounded to two decimals.
columns <- c(
  "salary", "work_yrs",
  "gmat_qpc", "gmat_vpc",
  "s_avg", "f_avg",
  "satis"
)
placedVariables <- placed.df[, columns]
res <- cor(placedVariables)
round(res, digits = 2)
##          salary work_yrs gmat_qpc gmat_vpc s_avg f_avg satis
## salary     1.00     0.45     0.01    -0.14  0.10 -0.11 -0.04
## work_yrs   0.45     1.00    -0.18    -0.03  0.16 -0.22  0.06
## gmat_qpc   0.01    -0.18     1.00     0.09  0.02  0.10  0.00
## gmat_vpc  -0.14    -0.03     0.09     1.00  0.16  0.02  0.15
## s_avg      0.10     0.16     0.02     0.16  1.00  0.45 -0.14
## f_avg     -0.11    -0.22     0.10     0.02  0.45  1.00 -0.12
## satis     -0.04     0.06     0.00     0.15 -0.14 -0.12  1.00
library(corrplot)
# Circle plot of the correlations among the selected placed.df columns.
selected <- placed.df[, columns]
M <- cor(selected)
corrplot(M, method = "circle")

Comparison of Salary with Work Experience

# Scatterplot of salary against work experience for the placed subset.
# scatterplot() has no `horizontal` argument (it would only be forwarded to
# the underlying plotting call), so it is not passed here.
scatterplot(
  salary ~ work_yrs,
  data = placed.df,
  main = "Scatterplot of Salary with Work Experience",
  xlab = "Work Experience",
  ylab = "MBA's Starting Salaries"
)

# Horizontal boxplot: salary runs along the x-axis, one box per work_yrs
# value on the y-axis.
boxplot(
  salary ~ work_yrs,
  data = placed.df,
  main = "Distribution of Salary with Work Experience",
  ylab = "Work Experience",
  xlab = "MBA's Starting Salaries",
  horizontal = TRUE
)

library(lattice)
# Histogram of the disclosed salaries.
histogram(
  ~ salary,
  data = placed.df,
  main = "Frequency of Starting Salary",
  xlab = "Starting Salary",
  col = "red"
)

# Mean disclosed salary for each number of years of work experience.
salaryWorkEx <- aggregate(salary ~ work_yrs, data = placed.df, FUN = mean)
print(salaryWorkEx)
##    work_yrs    salary
## 1         0  95000.00
## 2         1 103532.00
## 3         2  97673.68
## 4         3 101652.86
## 5         4 105454.55
## 6         5 103142.86
## 7         6 105928.57
## 8         7  98000.00
## 9         8 105025.00
## 10       10 118000.00
## 11       15 183000.00
## 12       16 108500.00

Comparison of Salary with GMAT total score

# Salary against total GMAT score for the placed subset.
scatterplot(
  salary ~ gmat_tot,
  data = placed.df,
  xlab = "GMAT Total",
  ylab = "Salary",
  main = "Comparison of Salary with Total GMAT score",
  labels = row.names(placed.df)
)

# Same scatterplot, conditioned on sex.
scatterplot(
  salary ~ gmat_tot | sex,
  data = placed.df,
  xlab = "GMAT Total",
  ylab = "Salary",
  main = "Comparison of Salary with Total GMAT score",
  labels = row.names(placed.df)
)

# Horizontal boxplot: salary along the x-axis, one box per GMAT total.
boxplot(
  salary ~ gmat_tot,
  data = placed.df,
  ylab = "GMAT Total",
  xlab = "Salary",
  main = "Comparison of Salary with Total GMAT score",
  horizontal = TRUE,
  labels = row.names(placed.df)
)

# List the column names of the placed subset.
names(placed.df)
##  [1] "age"      "sex"      "gmat_tot" "gmat_qpc" "gmat_vpc" "gmat_tpc"
##  [7] "s_avg"    "f_avg"    "quarter"  "work_yrs" "frstlang" "salary"  
## [13] "satis"
library(car)
# Scatterplot matrix of salary against GMAT total and the two GPA columns.
# scatterplotMatrix() is the current name in car; the dotted
# scatterplot.matrix() alias is deprecated and absent from newer releases.
scatterplotMatrix(
  ~ salary + gmat_tot + s_avg + f_avg,
  data = placed.df,
  main = "Salary versus other variables"
)

# Same matrix, conditioned on sex.
scatterplotMatrix(
  ~ salary + gmat_tot + s_avg + f_avg | sex,
  data = placed.df,
  main = "Salary versus other variables"
)

# Cross-tabulate sex (rows) against age (columns) for the placed subset.
ageTable <- table(placed.df[["sex"]], placed.df[["age"]])
print(ageTable)
##    
##     22 23 24 25 26 27 28 29 30 31 32 33 34 39 40
##   1  0  3 11 13  9 13  5  5  4  4  1  1  1  1  1
##   2  1  2  5 10  5  1  3  1  2  0  0  0  0  0  1
# Mean salary, work experience and age for each level of `sex`.
# NOTE(review): computed on data.df, so the special salary codes
# (0 / 998 / 999) are included in the salary means.
aggregate(cbind(salary, work_yrs, age) ~ sex, data = data.df, FUN = mean)
##   sex   salary work_yrs      age
## 1   1 37013.62 3.893204 27.41748
## 2   2 45121.07 3.808824 27.17647
# Mean salary and mean work experience for each age.
# NOTE(review): computed on data.df, so the special salary codes
# (0 / 998 / 999) are included in the salary means.
aggregate(cbind(salary, work_yrs) ~ age, data = data.df, FUN = mean)
##    age    salary  work_yrs
## 1   22  42500.00  1.000000
## 2   23  57282.00  1.750000
## 3   24  49342.24  1.727273
## 4   25  43395.55  2.264151
## 5   26  35982.07  2.875000
## 6   27  31499.37  3.130435
## 7   28  39809.00  4.666667
## 8   29  28067.95  4.500000
## 9   30  55291.25  5.583333
## 10  31  40599.40  5.800000
## 11  32  13662.25  5.625000
## 12  33 118000.00 10.000000
## 13  34  26250.00 11.500000
## 14  35      0.00  9.333333
## 15  36      0.00 12.500000
## 16  37      0.00  9.000000
## 17  39  56000.00 10.500000
## 18  40 183000.00 15.000000
## 19  42      0.00 13.000000
## 20  43      0.00 19.000000
## 21  48      0.00 22.000000
# Share of each sex within every age column (columns sum to 1).
prop.table(ageTable, margin = 2)
##    
##             22         23         24         25         26         27
##   1 0.00000000 0.60000000 0.68750000 0.56521739 0.64285714 0.92857143
##   2 1.00000000 0.40000000 0.31250000 0.43478261 0.35714286 0.07142857
##    
##             28         29         30         31         32         33
##   1 0.62500000 0.83333333 0.66666667 1.00000000 1.00000000 1.00000000
##   2 0.37500000 0.16666667 0.33333333 0.00000000 0.00000000 0.00000000
##    
##             34         39         40
##   1 1.00000000 1.00000000 0.50000000
##   2 0.00000000 0.00000000 0.50000000
# Linear model of salary on work experience, sex, first language and
# satisfaction.
# NOTE(review): the model is fitted on data.df (all 274 rows), so `salary`
# still contains the special codes 0 / 998 / 999 and `satis` contains the
# 998 code; the coefficients therefore do not describe real salaries.
# Consider refitting on placed.df.
demo <- lm(
  salary ~ work_yrs + sex + frstlang + satis,
  data = data.df
)
summary(demo)
## 
## Call:
## lm(formula = salary ~ work_yrs + sex + frstlang + satis, data = data.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -53048 -46140  -1073  47952 182479 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 48210.336   5292.917   9.108  < 2e-16 ***
## work_yrs     -445.138    907.603  -0.490    0.624    
## sex2         5955.031   6747.840   0.883    0.378    
## frstlang2   -9695.438   9090.930  -1.066    0.287    
## satis         -45.340      7.928  -5.719 2.85e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48170 on 269 degrees of freedom
## Multiple R-squared:  0.1194, Adjusted R-squared:  0.1063 
## F-statistic: 9.117 on 4 and 269 DF,  p-value: 6.396e-07