This is an assignment given on Week 4, Day 1 and Day 2 of the Data Analytics Internship under Prof. Sameer Mathur, IIML.
Read the data set into R
MBA.df <- read.csv(paste("MBA Starting Salaries Data.csv", sep=""))
attach(MBA.df)
dim(MBA.df)
library(psych)
describe(MBA.df)[,c(1:5)]
str(MBA.df)
MBA.df$sex[MBA.df$sex == 1] <- 'Male'
MBA.df$sex[MBA.df$sex == 2] <- 'Female'
MBA.df$sex <- factor(MBA.df$sex)
MBA.df$frstlang[MBA.df$frstlang == 1] <- 'English'
MBA.df$frstlang[MBA.df$frstlang == 2] <- 'Other'
MBA.df$frstlang <- factor(MBA.df$frstlang)
str(MBA.df)
MBAs who got placed and who disclosed their salaries
placed.df <- MBA.df[which (MBA.df$salary > 1000) , ]
View(placed.df)MBAs who were not placed
notPlaced.df <- MBA.df[which(MBA.df$salary==0), ]
View(notPlaced.df)MBAs who were placed but did not disclose their salary
notDisclosedSalary.df <- MBA.df[which (MBA.df$salary == 999) , ]
View(notDisclosedSalary.df)
4.MBAs who did not answer the survey
notAnsweredSurvey.df <- MBA.df[which (MBA.df$salary == 998) , ]
View(notAnsweredSurvey.df)Let avgSalary = Average Salary of students who were placed and who disclosed their salary
avgSalary = mean(placed.df$salary)
avgSalary
Assume avgSalary to be the salary of Students who did not disclose their salary
(notDisclosedSalary.df)
notDisclosedSalary.df$salary = avgSalary
allPlaced.df: A dataframe containing all students who were placed.
allPlaced.df <- rbind(placed.df, notDisclosedSalary.df)
library(psych)
describe(allPlaced.df)[,c(1:5)]
View(allPlaced.df)
library(lattice)
histogram(~salary, data = placed.df,
main = "Distribution of Starting Salary", xlab="Starting Salary", col='grey' )
histogram(~salary, data = allPlaced.df,
main = "Distribution of Starting Salary", xlab="Starting Salary", col='grey' )
aggregate(cbind(salary, work_yrs, age) ~ sex,
data = placed.df, mean)
aggregate(cbind(salary, work_yrs, age) ~ sex,
data = allPlaced.df, mean)
Scatter plot of Salary with Work Experience
scatterplot(salary ~ work_yrs ,data=placed.df, main="Scatterplot of Salary with Work Experience", xlab="Work Experience", ylab="MBA's Starting Salaries", horizontal=TRUE)
Distribution of Salary with Work Experience
boxplot(salary ~ work_yrs ,data=placed.df, main="Distribution of Salary with Work Experience", ylab="Work Experience", xlab="MBA's Starting Salaries", horizontal=TRUE)
library(lattice)
histogram(~salary, data = placed.df,
main = "Frequency of Starting Salary", xlab="Starting Salary", col='grey' )
Average Salary by Work Experience
salaryWorkEx = aggregate(salary ~ work_yrs, data = placed.df, mean)
salaryWorkEx
scatterplot(salary ~ gmat_tot , data=placed.df,
xlab="GMAT Total", ylab="Salary",
main="Comparison of Salary with Total GMAT score",
labels=row.names(placed.df))
scatterplot(salary ~ gmat_tot | sex, data=placed.df,
xlab="GMAT Total", ylab="Salary",
main="Comparison of Salary with Total GMAT score",
labels=row.names(placed.df))
boxplot(salary ~ gmat_tot , data=placed.df,
ylab="GMAT Total", xlab="Salary",
main="Comparison of Salary with Total GMAT score",
horizontal=TRUE,
labels=row.names(placed.df))
colnames(placed.df)
library(car)
scatterplot.matrix(~salary+gmat_tot+s_avg+f_avg, data=placed.df,
main="Salary versus other variables")
scatterplot.matrix(~salary+gmat_tot+s_avg+f_avg |sex, data=placed.df,
main="Salary versus other variables")
Number of male and females in dataframe age-wise
ageTable <- table(placed.df$sex, placed.df$age)
ageTableEffect of Sex on the on Salary
aggregate(cbind(salary, work_yrs, age) ~ sex,
data = MBA.df, mean)Effect of Age on the on Salary
aggregate(cbind(salary, work_yrs) ~ age, data = MBA.df, mean)Effect of Satisfaction level on the on Salary
aggregate(cbind(salary, work_yrs) ~ satis , data = MBA.df, mean) Effect of MBA’s Starting salary based on Work Experience
boxplot(salary ~ work_yrs ,data=MBA.df, main="Effect of Work Experience on Salary", ylab="Work Experience", xlab="MBA's Starting Salaries", horizontal=FALSE)Effect of MBA’s Starting salary based on Gender
boxplot(salary ~ sex ,data=MBA.df, main="Effect of Gender on Salary", ylab="Work Experience", xlab="MBA's Starting Salaries", horizontal=FALSE)Distribution of MBA’s Starting Salary
library(lattice)
histogram(~salary, data = MBA.df,
main = "Distribution of MBA's Starting Salary", xlab="MBA's Starting Slariy", col='grey' ) knownMBA.df <- rbind(placed.df, notDisclosedSalary.df, notPlaced.df)
View(knownMBA.df)
Create a dummay variable called “GotPlaced” = 1 (got a job) or 0 (did not get a job)
knownMBA.df$GotPlaced = (knownMBA.df$salary >1000)
View(knownMBA.df)
knownMBA.df$GotPlaced <- factor(knownMBA.df$GotPlaced)
str(knownMBA.df)
Number of Placed and Not Placed candiadtes
allplaced <- table(knownMBA.df$GotPlaced == 'TRUE')
allplacedallPlaced / notPlaced versus Sex
placedbySex <- xtabs(~ knownMBA.df$GotPlaced + knownMBA.df$sex , data=knownMBA.df)
placedbySex
addmargins(placedbySex)
Percentage of Male / Female candidates who got Placed
prop.table(placedbySex, 2) allPlaced / notPlaced versus First Language
placedbyLanguage <- xtabs(~ knownMBA.df$GotPlaced + knownMBA.df$frstlang, data=knownMBA.df)
placedbyLanguage
addmargins(placedbyLanguage)
Percentage of First Language candidates who got Placed
prop.table(placedbyLanguage, 2)
H1: The percentage of Females placed is more than Males
Chi Square Test : percentage of female who got placed is higher than percentage of male who got placed
chisq.test(placedbySex)
H2: The percentage of people placed whose first language is English is higher than the percentage of people placed whose first language is not English
chisq.test(placedbyLanguage)
library(corrplot)
colnames(placed.df)
dataColumns <- placed.df[, c("age","work_yrs", "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc", "s_avg", "f_avg", "quarter", "satis")]
N <- cor(dataColumns)
corrplot(N, method="circle")
res <- cor(dataColumns)
round(res, 2)
The variables tracking performance during the MBA are heavily correlated
mbaPerformance <- placed.df[, c("s_avg", "f_avg", "quarter")]
N <- cor(mbaPerformance)
corrplot(N, method="circle")
res <- cor(mbaPerformance)
round(res, 2)
The overall performance quartile (quarter) is highly correlated with the Spring (s_avg) and Fall (f_avg) GPA. Thus we include ‘s_avg’ and ‘f_avg’ in our regression, but exclude ‘quarter’ from our regression.
The DEPENDENT variable is “salary” Identifying INDEPENDENT Variables: 1.Variables related to GMAT are highly correlated: “gmat_tot” “gmat_qpc” “gmat_vpc” “gmat_tpc” 2. Variables relatd to MBA performance are highly correlated: “s_avg” “f_avg” “quarter” 3. Variables related to age and work experience are highly correlated: “age”, “work_yrs” 4. Other variables: “sex” , “frstlang” ; “satis”
GMAT The GMAT related columns are heavily strongly correlated with each other gmat <- placed.df[, c("gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc")]
res <- cor(gmat)
round(res, 2)
library(corrplot)
M <- cor(gmat)
corrplot(M, method="circle")
However, GMAT verbal and quantitative scores are very weakly correlated cor(gmat_qpc,gmat_vpc) Therefore, in our regression we will include gmat_qpc and gmat_vpc , but exclude “gmat_tot” and “gmat_tpc”
MBA PERFORMANCE The variables tracking performance during the MBA are heavily correlated
mbaPerformance <- placed.df[, c("s_avg", "f_avg", "quarter")]
N <- cor(mbaPerformance)
corrplot(N, method="circle")
res <- cor(mbaPerformance)
round(res, 2)
The overall performance quartile (quarter) is highly correlated with the Spring (s_avg) and Fall (f_avg) GPA Thus We will include ‘s_avg’ and ‘f_avg’ in our regression, but exclude ‘quarter’ from our regression.
WORK EXPERIENCE AND AGE The variables ‘age’ and ‘work_years’ are highly correlated. The older the person, the greater the work experience.
cor(age,work_yrs)
Therefore we will include ‘work_years’ in our regression, but exclude ‘age’ from our regression
The indepedent variables we will include in the regression are {work_yrs,s_avg,f_avg,gmat_qpc,gmat_vpc,sex,frstlang,satis}
columns = c("salary", "work_yrs", "gmat_qpc", "gmat_vpc", "s_avg", "f_avg", "satis")
placedVariables <- placed.df[, columns]
res <- cor(placedVariables)
round(res, 2)
library(corrplot)
M <- cor(placed.df[, columns])
corrplot(M, method="circle")
library(car)
scatterplotMatrix(~salary + s_avg + f_avg + satis, data=placed.df,
main="Salary versus MBA Performance and MBA Satisfaction")
scatterplotMatrix(~salary + work_yrs + gmat_qpc + gmat_vpc, data=placed.df,
main="Salary versus Work Experience; GMAT Performance")
Formulating multivariate linear regression model to fit salary with respect to the model selection Independent Variables: {work_yrs,s_avg,f_avg,gmat_qpc,gmat_vpc,sex,frstlang,satis} Dependent Variable: Salary
Model1 <- salary ~
work_yrs + s_avg + f_avg + gmat_qpc + gmat_vpc + sex + frstlang + satis
fit1 <- lm(Model1, data = placed.df)
summary(fit1)
library(leaps)
leap <- regsubsets(Model1, data = placed.df, nbest=1)
summary(leap)
plot(leap, scale="adjr2")
Model2 <- salary ~
work_yrs +
# age +
# s_avg +
# f_avg +
# quarter +
# gmat_qpc +
# gmat_vpc +
# gmat_tot +
# gmat_tpc +
sex +
frstlang +
satis
fit2 <- lm(Model2, data = placed.df)
summary(fit2)
library(leaps)
leap <- regsubsets(Model2, data = placed.df, nbest=1)
summary(leap)
plot(leap, scale="adjr2")