This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Load the MBA starting-salaries survey from the working directory.
data.df <- read.csv("MBA Starting Salaries Data.csv")
# NOTE(review): attach() is kept only because later code may refer to bare
# column names; prefer data.df$col or with(data.df, ...) in new code.
attach(data.df)
# Show the entries of `sex` that are coded 1.
data.df$sex[data.df$sex == 1]
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [141] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# Show the entries of `sex` that are coded 2.
with(data.df, sex[sex == 2])
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
# Store the sex codes as a categorical variable.
data.df[["sex"]] <- factor(data.df[["sex"]])
# Show the entries of `frstlang` that are coded 1.
with(data.df, frstlang[frstlang == 1])
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [141] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [211] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# Show the entries of `frstlang` that are coded 2.
with(data.df, frstlang[frstlang == 2])
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
# Store the first-language codes as a categorical variable.
data.df[["frstlang"]] <- factor(data.df[["frstlang"]])
library(car)
#As per special conditions for "Salary".
#999: Answered Survey But Did Not Disclose Salary
#998: Did not Answer Survey
#0: Not Yet Placed
# > 1000: Answered Survey, Got Placed, Disclosed Salary (the value is the actual salary)
# Placed and Disclosed Salary
# Split the respondents into four groups according to the coding of `salary`.
# which() converts each test into row indices, so rows with a missing salary
# are dropped rather than returned as all-NA rows.

# Placed and disclosed salary
placed.df <- data.df[which(data.df$salary > 1000), ]
# Not placed
notPlaced.df <- data.df[which(data.df$salary == 0), ]
# Placed but did not disclose salary
notDisclosedSalary.df <- data.df[which(data.df$salary == 999), ]
# Skipped the survey
notAnsweredSurvey.df <- data.df[which(data.df$salary == 998), ]

# View() opens a GUI data viewer, so it is only called in an interactive
# session and skipped when the document is rendered non-interactively.
if (interactive()) {
  View(placed.df)
  View(notPlaced.df)
  View(notDisclosedSalary.df)
  View(notAnsweredSurvey.df)
}

# Total number of respondents.
nrow(data.df)
## [1] 274
# Row counts of the four salary-code groups; the printed sum can be compared
# with the total number of rows in data.df.
c1 <- nrow(placed.df)
c2 <- nrow(notPlaced.df)
c3 <- nrow(notDisclosedSalary.df)
c4 <- nrow(notAnsweredSurvey.df)
sum(c1, c2, c3, c4)
## [1] 274
library(corrplot)
# List the columns available in the placed group.
names(placed.df)
## [1] "age" "sex" "gmat_tot" "gmat_qpc" "gmat_vpc" "gmat_tpc"
## [7] "s_avg" "f_avg" "quarter" "work_yrs" "frstlang" "salary"
## [13] "satis"
# Numeric columns of the placed group.
dataColumns <- placed.df[c(
  "age", "work_yrs",
  "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
  "s_avg", "f_avg", "quarter", "satis"
)]
# Compute the correlation matrix once; N feeds the plot and res the table.
N <- cor(dataColumns)
res <- N
corrplot(N, method = "circle")
round(res, digits = 2)
## age work_yrs gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## age 1.00 0.88 -0.08 -0.17 0.02 -0.10 0.16 -0.22
## work_yrs 0.88 1.00 -0.12 -0.18 -0.03 -0.13 0.16 -0.22
## gmat_tot -0.08 -0.12 1.00 0.67 0.78 0.97 0.17 0.12
## gmat_qpc -0.17 -0.18 0.67 1.00 0.09 0.66 0.02 0.10
## gmat_vpc 0.02 -0.03 0.78 0.09 1.00 0.78 0.16 0.02
## gmat_tpc -0.10 -0.13 0.97 0.66 0.78 1.00 0.14 0.07
## s_avg 0.16 0.16 0.17 0.02 0.16 0.14 1.00 0.45
## f_avg -0.22 -0.22 0.12 0.10 0.02 0.07 0.45 1.00
## quarter -0.13 -0.13 -0.11 0.01 -0.13 -0.10 -0.84 -0.43
## satis 0.11 0.06 0.06 0.00 0.15 0.12 -0.14 -0.12
## quarter satis
## age -0.13 0.11
## work_yrs -0.13 0.06
## gmat_tot -0.11 0.06
## gmat_qpc 0.01 0.00
## gmat_vpc -0.13 0.15
## gmat_tpc -0.10 0.12
## s_avg -0.84 -0.14
## f_avg -0.43 -0.12
## quarter 1.00 0.23
## satis 0.23 1.00
# Correlation among the MBA performance measures of the placed group.
Performance <- placed.df[c("s_avg", "f_avg", "quarter")]
# Compute the matrix once; N feeds the plot and res the rounded table.
N <- cor(Performance)
res <- N
corrplot(N, method = "circle")
round(res, digits = 2)
## s_avg f_avg quarter
## s_avg 1.00 0.45 -0.84
## f_avg 0.45 1.00 -0.43
## quarter -0.84 -0.43 1.00
# High correlation between overall performance quartile & the "s_avg" & "f_avg" GPA
# Cross-tabulate sex (rows) against age (columns) for the placed group.
ageTable <- table(placed.df[["sex"]], placed.df[["age"]])
print(ageTable)
##
## 22 23 24 25 26 27 28 29 30 31 32 33 34 39 40
## 1 0 3 11 13 9 13 5 5 4 4 1 1 1 1 1
## 2 1 2 5 10 5 1 3 1 2 0 0 0 0 0 1
# Mean salary and work experience for each satisfaction value.
# NOTE(review): data.df still contains the 0/998/999 salary codes and the 998
# `satis` code, so these means mix real salaries with sentinel values.
aggregate(cbind(salary, work_yrs) ~ satis, data = data.df, FUN = mean)
## satis salary work_yrs
## 1 1 999.000 3.000000
## 2 2 999.000 2.000000
## 3 3 19799.200 4.200000
## 4 4 6293.412 2.941176
## 5 5 40476.311 4.243243
## 6 6 54383.536 4.185567
## 7 7 65718.152 3.727273
## 8 998 998.000 3.086957
# Salary distribution for each number of years of work experience.
# The boxes are vertical, so work experience is on the x axis and salary on
# the y axis; the axis labels are set accordingly.
# NOTE(review): data.df still contains the 0/998/999 salary codes.
boxplot(salary ~ work_yrs, data = data.df,
        main = "Effect of Work Experience on Salary",
        xlab = "Work Experience",
        ylab = "MBA's Starting Salaries",
        horizontal = FALSE)
# Mean salary and work experience for each age.
aggregate(cbind(salary, work_yrs) ~ age, data = data.df, FUN = mean)
## age salary work_yrs
## 1 22 42500.00 1.000000
## 2 23 57282.00 1.750000
## 3 24 49342.24 1.727273
## 4 25 43395.55 2.264151
## 5 26 35982.07 2.875000
## 6 27 31499.37 3.130435
## 7 28 39809.00 4.666667
## 8 29 28067.95 4.500000
## 9 30 55291.25 5.583333
## 10 31 40599.40 5.800000
## 11 32 13662.25 5.625000
## 12 33 118000.00 10.000000
## 13 34 26250.00 11.500000
## 14 35 0.00 9.333333
## 15 36 0.00 12.500000
## 16 37 0.00 9.000000
## 17 39 56000.00 10.500000
## 18 40 183000.00 15.000000
## 19 42 0.00 13.000000
## 20 43 0.00 19.000000
## 21 48 0.00 22.000000
# Salary distribution for each sex code.
# The boxes are vertical, so the sex codes are on the x axis and salary on the
# y axis; the axis labels are set accordingly.
# NOTE(review): data.df still contains the 0/998/999 salary codes.
boxplot(salary ~ sex, data = data.df,
        main = "MBA's Starting salary based on Gender",
        xlab = "Gender",
        ylab = "MBA's Starting Salaries",
        horizontal = FALSE)
# Mean salary, work experience and age for each sex code.
aggregate(cbind(salary, work_yrs, age) ~ sex, data = data.df, FUN = mean)
## sex salary work_yrs age
## 1 1 37013.62 3.893204 27.41748
## 2 2 45121.07 3.808824 27.17647
library(lattice)
# Salary histograms over all respondents. The two calls differ only in their
# title; both are kept so the rendered document still shows both figures.
# NOTE(review): data.df still contains the 0/998/999 salary codes.
histogram(~ salary, data = data.df,
          main = "Distribution of Starting Salary",
          xlab = "MBA's Starting Salary", col = "red")
histogram(~ salary, data = data.df,
          main = "Distribution of MBA's Starting Salary",
          xlab = "MBA's Starting Salary", col = "red")

# Stack the three groups that answered the survey and flag who was placed.
MBA.df <- rbind(placed.df, notDisclosedSalary.df, notPlaced.df)
# TRUE only for rows with a disclosed salary (salary > 1000).
MBA.df$GotPlaced <- factor(MBA.df$salary > 1000)
# View() opens a GUI data viewer, so it is only called interactively.
if (interactive()) {
  View(MBA.df)
}
str(MBA.df)
## 'data.frame': 228 obs. of 14 variables:
## $ age : int 22 27 25 25 27 28 24 25 25 25 ...
## $ sex : Factor w/ 2 levels "1","2": 2 2 2 2 1 2 1 2 2 1 ...
## $ gmat_tot : int 660 700 680 650 710 620 670 560 530 650 ...
## $ gmat_qpc : int 90 94 87 82 96 52 84 52 50 79 ...
## $ gmat_vpc : int 92 98 96 91 96 98 96 81 62 93 ...
## $ gmat_tpc : int 94 98 96 93 98 87 95 72 61 93 ...
## $ s_avg : num 3.5 3.3 3.5 3.4 3.3 3.4 3.3 3.3 3.6 3.3 ...
## $ f_avg : num 3.75 3.25 2.67 3.25 3.5 3.75 3.25 3.5 3.67 3.5 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs : int 1 2 2 3 2 5 0 1 3 1 ...
## $ frstlang : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ salary : int 85000 85000 86000 88000 92000 93000 95000 95000 95000 96000 ...
## $ satis : int 5 6 5 7 6 5 4 5 3 7 ...
## $ GotPlaced: Factor w/ 2 levels "FALSE","TRUE": 2 2 2 2 2 2 2 2 2 2 ...
#GotPlaced = factor(year)
#dummies = model.matrix(~year.f)
# Identifying DEPENDENT and INDEPENDENT Variables
# The DEPENDENT variable is "salary"
# Identifying INDEPENDENT Variables:
#1a. Variables related to GMAT are highly correlated: "gmat_tot" "gmat_qpc" "gmat_vpc" "gmat_tpc"
#1b. Variables related to MBA performance are highly correlated: "s_avg" "f_avg" "quarter"
#1c. Variables related to age and work experience are highly correlated: "age", "work_yrs"
#1d. Other variables: "sex" , "frstlang" ; "satis"
# 1e. GMAT
# The GMAT related columns are strongly correlated
# Correlation among the four GMAT columns of the placed group.
gmat <- placed.df[c("gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc")]
res <- cor(gmat)
round(res, digits = 2)
## gmat_tot gmat_qpc gmat_vpc gmat_tpc
## gmat_tot 1.00 0.67 0.78 0.97
## gmat_qpc 0.67 1.00 0.09 0.66
## gmat_vpc 0.78 0.09 1.00 0.78
## gmat_tpc 0.97 0.66 0.78 1.00
library(corrplot)
# Plot the correlation matrix of the GMAT columns.
M <- cor(gmat)
corrplot(M, method = "circle")
# GMAT verbal and quantitative percentiles are only weakly correlated.
# The columns are taken explicitly from data.df (all respondents) instead of
# being resolved through the attach()ed search path, so the data source is
# visible here. NOTE(review): the matrix above uses placed.df only.
with(data.df, cor(gmat_qpc, gmat_vpc))
## [1] 0.1521801
# We include gmat_qpc and gmat_vpc , but exclude "gmat_tot" and "gmat_tpc" in our Regression
# 1f. Performance
# Correlation among the MBA performance measures of the placed group.
mbaPerformance <- placed.df[c("s_avg", "f_avg", "quarter")]
# Compute the matrix once; N feeds the plot and res the rounded table.
N <- cor(mbaPerformance)
res <- N
corrplot(N, method = "circle")
round(res, digits = 2)
## s_avg f_avg quarter
## s_avg 1.00 0.45 -0.84
## f_avg 0.45 1.00 -0.43
## quarter -0.84 -0.43 1.00
# Highly Correlated: The overall performance quartile , the Spring (s_avg) & Fall (f_avg) GPA
# We include 's_avg' and 'f_avg' , but exclude 'quarter' in our regression.
# 1g. WORK EXPERIENCE AND AGE
# Correlation of age with work experience over all respondents in data.df.
# The columns are taken explicitly from data.df instead of being resolved
# through the attach()ed search path.
with(data.df, cor(age, work_yrs))
## [1] 0.8582981
# Highly correlated, hence we include 'work_yrs', but exclude 'age' from our regression
# In conclusion, the independent variables we will include in the regression are {work_yrs,s_avg,f_avg,gmat_qpc,gmat_vpc,sex,frstlang,satis}
# Salary plus the numeric regression candidates, for the placed group.
columns <- c(
  "salary", "work_yrs",
  "gmat_qpc", "gmat_vpc",
  "s_avg", "f_avg", "satis"
)
placedVariables <- placed.df[columns]
res <- cor(placedVariables)
round(res, digits = 2)
## salary work_yrs gmat_qpc gmat_vpc s_avg f_avg satis
## salary 1.00 0.45 0.01 -0.14 0.10 -0.11 -0.04
## work_yrs 0.45 1.00 -0.18 -0.03 0.16 -0.22 0.06
## gmat_qpc 0.01 -0.18 1.00 0.09 0.02 0.10 0.00
## gmat_vpc -0.14 -0.03 0.09 1.00 0.16 0.02 0.15
## s_avg 0.10 0.16 0.02 0.16 1.00 0.45 -0.14
## f_avg -0.11 -0.22 0.10 0.02 0.45 1.00 -0.12
## satis -0.04 0.06 0.00 0.15 -0.14 -0.12 1.00
library(corrplot)
# Plot the correlation matrix of salary and the regression candidates.
M <- cor(placed.df[, columns])
corrplot(M, method = "circle")

# Salary against work experience for the placed group.
# `horizontal` is a boxplot() option, not a scatterplot() one, so it is not
# passed here.
scatterplot(salary ~ work_yrs, data = placed.df,
            main = "Scatterplot of Salary with Work Experience",
            xlab = "Work Experience", ylab = "MBA's Starting Salaries")
# Horizontal boxes: salary runs along the x axis, work experience along y.
boxplot(salary ~ work_yrs, data = placed.df,
        main = "Distribution of Salary with Work Experience",
        ylab = "Work Experience", xlab = "MBA's Starting Salaries",
        horizontal = TRUE)

library(lattice)
histogram(~ salary, data = placed.df,
          main = "Frequency of Starting Salary",
          xlab = "Starting Salary", col = "red")

# Mean salary of the placed group for each number of years of experience.
salaryWorkEx <- aggregate(salary ~ work_yrs, data = placed.df, FUN = mean)
salaryWorkEx
## work_yrs salary
## 1 0 95000.00
## 2 1 103532.00
## 3 2 97673.68
## 4 3 101652.86
## 5 4 105454.55
## 6 5 103142.86
## 7 6 105928.57
## 8 7 98000.00
## 9 8 105025.00
## 10 10 118000.00
## 11 15 183000.00
## 12 16 108500.00
# Salary against total GMAT score for the placed group.
scatterplot(
  salary ~ gmat_tot,
  data = placed.df,
  xlab = "GMAT Total",
  ylab = "Salary",
  main = "Comparison of Salary with Total GMAT score",
  labels = row.names(placed.df)
)
# The same comparison, grouped by sex.
scatterplot(
  salary ~ gmat_tot | sex,
  data = placed.df,
  xlab = "GMAT Total",
  ylab = "Salary",
  main = "Comparison of Salary with Total GMAT score",
  labels = row.names(placed.df)
)
# Horizontal boxes: salary runs along the x axis, GMAT total along the y axis.
# NOTE(review): `labels` is not a documented boxplot() argument - confirm it
# has any effect here.
boxplot(
  salary ~ gmat_tot,
  data = placed.df,
  ylab = "GMAT Total",
  xlab = "Salary",
  main = "Comparison of Salary with Total GMAT score",
  horizontal = TRUE,
  labels = row.names(placed.df)
)
# List the columns available in the placed group.
names(placed.df)
## [1] "age" "sex" "gmat_tot" "gmat_qpc" "gmat_vpc" "gmat_tpc"
## [7] "s_avg" "f_avg" "quarter" "work_yrs" "frstlang" "salary"
## [13] "satis"
library(car)
# Pairwise scatterplots of salary, total GMAT score and the two GPA averages
# for the placed group. scatterplotMatrix() is the current car name for the
# deprecated scatterplot.matrix().
scatterplotMatrix(~ salary + gmat_tot + s_avg + f_avg, data = placed.df,
                  main = "Salary versus other variables")
# The same matrix, grouped by sex.
scatterplotMatrix(~ salary + gmat_tot + s_avg + f_avg | sex, data = placed.df,
                  main = "Salary versus other variables")
# Cross-tabulate sex (rows) against age (columns) for the placed group.
ageTable <- table(placed.df$sex, placed.df$age)
ageTable
##
## 22 23 24 25 26 27 28 29 30 31 32 33 34 39 40
## 1 0 3 11 13 9 13 5 5 4 4 1 1 1 1 1
## 2 1 2 5 10 5 1 3 1 2 0 0 0 0 0 1
# Mean salary, work experience and age for each sex code.
# NOTE(review): data.df still contains the 0/998/999 salary codes.
aggregate(cbind(salary, work_yrs, age) ~ sex, data = data.df, FUN = mean)
## sex salary work_yrs age
## 1 1 37013.62 3.893204 27.41748
## 2 2 45121.07 3.808824 27.17647
# Mean salary and work experience for each age.
aggregate(cbind(salary, work_yrs) ~ age, data = data.df, FUN = mean)
## age salary work_yrs
## 1 22 42500.00 1.000000
## 2 23 57282.00 1.750000
## 3 24 49342.24 1.727273
## 4 25 43395.55 2.264151
## 5 26 35982.07 2.875000
## 6 27 31499.37 3.130435
## 7 28 39809.00 4.666667
## 8 29 28067.95 4.500000
## 9 30 55291.25 5.583333
## 10 31 40599.40 5.800000
## 11 32 13662.25 5.625000
## 12 33 118000.00 10.000000
## 13 34 26250.00 11.500000
## 14 35 0.00 9.333333
## 15 36 0.00 12.500000
## 16 37 0.00 9.000000
## 17 39 56000.00 10.500000
## 18 40 183000.00 15.000000
## 19 42 0.00 13.000000
## 20 43 0.00 19.000000
## 21 48 0.00 22.000000
# Share of each sex code within every age column (each column sums to 1).
prop.table(ageTable, margin = 2)
##
## 22 23 24 25 26 27
## 1 0.00000000 0.60000000 0.68750000 0.56521739 0.64285714 0.92857143
## 2 1.00000000 0.40000000 0.31250000 0.43478261 0.35714286 0.07142857
##
## 28 29 30 31 32 33
## 1 0.62500000 0.83333333 0.66666667 1.00000000 1.00000000 1.00000000
## 2 0.37500000 0.16666667 0.33333333 0.00000000 0.00000000 0.00000000
##
## 34 39 40
## 1 1.00000000 1.00000000 0.50000000
## 2 0.00000000 0.00000000 0.50000000
# Linear model of salary on work experience, sex, first language and
# satisfaction, fitted on every row of data.df.
# NOTE(review): data.df still contains the 0/998/999 salary codes and the 998
# `satis` code, so the fit mixes real values with sentinel codes; consider
# refitting on placed.df and regenerating the summary output.
demo<-lm(salary~work_yrs+ sex+frstlang+satis, data=data.df)
summary(demo)
##
## Call:
## lm(formula = salary ~ work_yrs + sex + frstlang + satis, data = data.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -53048 -46140 -1073 47952 182479
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48210.336 5292.917 9.108 < 2e-16 ***
## work_yrs -445.138 907.603 -0.490 0.624
## sex2 5955.031 6747.840 0.883 0.378
## frstlang2 -9695.438 9090.930 -1.066 0.287
## satis -45.340 7.928 -5.719 2.85e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 48170 on 269 degrees of freedom
## Multiple R-squared: 0.1194, Adjusted R-squared: 0.1063
## F-statistic: 9.117 on 4 and 269 DF, p-value: 6.396e-07