This is an assignment given on Week 4, Day 1 and Day 2 of the Data Analytics Internship under Prof. Sameer Mathur, IIML.

TASK 2a - Reading and reviewing the dataset

Read the data set into R

MBA.df <- read.csv(paste("MBA Starting Salaries Data.csv", sep=""))
attach(MBA.df)
dim(MBA.df)
library(psych)
describe(MBA.df)[,c(1:5)]

Task 2b Who got how much salary

Verify the data types, sex and frstlang should be Factor variables

str(MBA.df)
MBA.df$sex[MBA.df$sex == 1] <- 'Male'
MBA.df$sex[MBA.df$sex == 2] <- 'Female'
MBA.df$sex <- factor(MBA.df$sex)
MBA.df$frstlang[MBA.df$frstlang == 1] <- 'English'
MBA.df$frstlang[MBA.df$frstlang == 2] <- 'Other'
MBA.df$frstlang <- factor(MBA.df$frstlang)
str(MBA.df) 
  1. MBAs who got placed and who disclosed their salaries

    placed.df <- MBA.df[which (MBA.df$salary > 1000)  , ]
    View(placed.df)
  2. MBAs who were not placed

    notPlaced.df <- MBA.df[which(MBA.df$salary==0), ]
    View(notPlaced.df)
  3. MBAs who were placed but did not disclose their salary

    notDisclosedSalary.df  <- MBA.df[which (MBA.df$salary == 999)  , ]
    View(notDisclosedSalary.df)

    4.MBAs who did not answer the survey

    notAnsweredSurvey.df  <- MBA.df[which (MBA.df$salary == 998)  , ]
    View(notAnsweredSurvey.df)

Let avgSalary = Average Salary of students who were placed and who disclosed their salary

avgSalary = mean(placed.df$salary)
avgSalary

Assume avgSalary to be the salary of Students who did not disclose their salary

(notDisclosedSalary.df) 
notDisclosedSalary.df$salary = avgSalary

allPlaced.df: A dataframe containing all students who were placed.

allPlaced.df <- rbind(placed.df, notDisclosedSalary.df)
library(psych)
describe(allPlaced.df)[,c(1:5)]
View(allPlaced.df)

Review the Distribution of Salary

library(lattice)
histogram(~salary, data = placed.df,
 main = "Distribution of Starting Salary", xlab="Starting Salary", col='grey' ) 
histogram(~salary, data = allPlaced.df,
 main = "Distribution of Starting Salary", xlab="Starting Salary", col='grey' ) 

Comparison of salary with the given variables

aggregate(cbind(salary, work_yrs, age) ~ sex, 
                   data = placed.df, mean)
aggregate(cbind(salary, work_yrs, age) ~ sex, 
                   data = allPlaced.df, mean)

1. Comparison of Salary with Work Experience

Scatter plot of Salary with Work Experience

scatterplot(salary ~ work_yrs ,data=placed.df, main="Scatterplot of Salary with Work Experience", xlab="Work Experience", ylab="MBA's Starting Salaries", horizontal=TRUE)

Distribution of Salary with Work Experience

boxplot(salary ~ work_yrs ,data=placed.df, main="Distribution of Salary with Work Experience", ylab="Work Experience", xlab="MBA's Starting Salaries", horizontal=TRUE)
library(lattice)
histogram(~salary, data = placed.df,
 main = "Frequency of Starting Salary", xlab="Starting Salary", col='grey' ) 

Average Salary by Work Experience

salaryWorkEx = aggregate(salary ~ work_yrs, data = placed.df, mean)
salaryWorkEx

2.Comparison of Salary with GMAT total score

scatterplot(salary ~ gmat_tot , data=placed.df, 
    xlab="GMAT Total", ylab="Salary", 
   main="Comparison of Salary with Total GMAT score", 
   labels=row.names(placed.df))

scatterplot(salary ~ gmat_tot | sex, data=placed.df, 
    xlab="GMAT Total", ylab="Salary", 
   main="Comparison of Salary with Total GMAT score", 
   labels=row.names(placed.df))
boxplot(salary ~ gmat_tot , data=placed.df, 
    ylab="GMAT Total", xlab="Salary", 
   main="Comparison of Salary with Total GMAT score", 
   horizontal=TRUE,
   labels=row.names(placed.df))
colnames(placed.df)
library(car)
scatterplot.matrix(~salary+gmat_tot+s_avg+f_avg, data=placed.df,
    main="Salary versus other variables")
scatterplot.matrix(~salary+gmat_tot+s_avg+f_avg |sex, data=placed.df,
    main="Salary versus other variables")

Various Factors affecting starting salary

  1. Number of male and females in dataframe age-wise

    ageTable <- table(placed.df$sex, placed.df$age)
    ageTable
  2. Effect of Sex on the on Salary

    aggregate(cbind(salary, work_yrs, age) ~ sex, 
                   data = MBA.df, mean)
  3. Effect of Age on the on Salary

    aggregate(cbind(salary, work_yrs) ~ age, data = MBA.df, mean)
  4. Effect of Satisfaction level on the on Salary

    aggregate(cbind(salary, work_yrs) ~ satis , data = MBA.df, mean) 
  5. Effect of MBA’s Starting salary based on Work Experience

    boxplot(salary ~ work_yrs ,data=MBA.df, main="Effect of Work Experience on Salary", ylab="Work Experience", xlab="MBA's Starting Salaries", horizontal=FALSE)
  6. Effect of MBA’s Starting salary based on Gender

    boxplot(salary ~ sex ,data=MBA.df, main="Effect of Gender on Salary", ylab="Work Experience", xlab="MBA's Starting Salaries", horizontal=FALSE)
  7. Distribution of MBA’s Starting Salary

    library(lattice)
    histogram(~salary, data = MBA.df,
     main = "Distribution of MBA's Starting Salary", xlab="MBA's Starting Slariy", col='grey' ) 

Merging 3 dataframes placed.df ; notDisclosed.df ; notPlaced = knownMBA.df

knownMBA.df <- rbind(placed.df, notDisclosedSalary.df, notPlaced.df)
View(knownMBA.df)

Create a dummay variable called “GotPlaced” = 1 (got a job) or 0 (did not get a job)

knownMBA.df$GotPlaced = (knownMBA.df$salary >1000)
View(knownMBA.df)

knownMBA.df$GotPlaced <- factor(knownMBA.df$GotPlaced)
str(knownMBA.df)

Contingency tables

  1. Number of Placed and Not Placed candiadtes

    allplaced <- table(knownMBA.df$GotPlaced == 'TRUE')
    allplaced
  2. allPlaced / notPlaced versus Sex

    placedbySex <- xtabs(~ knownMBA.df$GotPlaced + knownMBA.df$sex , data=knownMBA.df)
    placedbySex
    addmargins(placedbySex)

    Percentage of Male / Female candidates who got Placed

    prop.table(placedbySex, 2) 
  3. allPlaced / notPlaced versus First Language

    placedbyLanguage <- xtabs(~ knownMBA.df$GotPlaced + knownMBA.df$frstlang, data=knownMBA.df)
    placedbyLanguage
    addmargins(placedbyLanguage)

    Percentage of First Language candidates who got Placed

    prop.table(placedbyLanguage, 2) 

    Chi Square Test

    H1: The percentage of Females placed is more than Males

Chi Square Test : percentage of female who got placed is higher than percentage of male who got placed

chisq.test(placedbySex)

H2: The percentage of people placed whose first language is English is higher than the percentage of people placed whose first language is not English

chisq.test(placedbyLanguage)

MODEL SELECTION

library(corrplot)

colnames(placed.df)
dataColumns <- placed.df[, c("age","work_yrs", "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc", "s_avg", "f_avg", "quarter", "satis")]

N <- cor(dataColumns)
corrplot(N, method="circle")
res <- cor(dataColumns)
round(res, 2)

MBA PERFORMANCE

The variables tracking performance during the MBA are heavily correlated

mbaPerformance <- placed.df[, c("s_avg", "f_avg", "quarter")]

N <- cor(mbaPerformance)
corrplot(N, method="circle")
res <- cor(mbaPerformance)
round(res, 2)

The overall performance quartile (quarter) is highly correlated with the Spring (s_avg) and Fall (f_avg) GPA. Thus we include ‘s_avg’ and ‘f_avg’ in our regression, but exclude ‘quarter’ from our regression.

Identifying DEPENDENT and INDEPENDENT Variables

The DEPENDENT variable is “salary” Identifying INDEPENDENT Variables: 1.Variables related to GMAT are highly correlated: “gmat_tot” “gmat_qpc” “gmat_vpc” “gmat_tpc” 2. Variables relatd to MBA performance are highly correlated: “s_avg” “f_avg” “quarter” 3. Variables related to age and work experience are highly correlated: “age”, “work_yrs” 4. Other variables: “sex” , “frstlang” ; “satis”

  1. GMAT The GMAT related columns are heavily strongly correlated with each other gmat <- placed.df[, c("gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc")]

    res <- cor(gmat)
    round(res, 2)
    library(corrplot)
    M <- cor(gmat)
    corrplot(M, method="circle")

    However, GMAT verbal and quantitative scores are very weakly correlated cor(gmat_qpc,gmat_vpc) Therefore, in our regression we will include gmat_qpc and gmat_vpc , but exclude “gmat_tot” and “gmat_tpc”

  2. MBA PERFORMANCE The variables tracking performance during the MBA are heavily correlated

    mbaPerformance <- placed.df[, c("s_avg", "f_avg", "quarter")]
    N <- cor(mbaPerformance)
    corrplot(N, method="circle")
    res <- cor(mbaPerformance)
    round(res, 2)

    The overall performance quartile (quarter) is highly correlated with the Spring (s_avg) and Fall (f_avg) GPA Thus We will include ‘s_avg’ and ‘f_avg’ in our regression, but exclude ‘quarter’ from our regression.

  3. WORK EXPERIENCE AND AGE The variables ‘age’ and ‘work_years’ are highly correlated. The older the person, the greater the work experience.

    cor(age,work_yrs)

    Therefore we will include ‘work_years’ in our regression, but exclude ‘age’ from our regression

SUMMARY OF MODEL SELECTION

The indepedent variables we will include in the regression are {work_yrs,s_avg,f_avg,gmat_qpc,gmat_vpc,sex,frstlang,satis}

VARIANCE - COVARIANCE MATRIX

columns = c("salary", "work_yrs", "gmat_qpc", "gmat_vpc", "s_avg", "f_avg", "satis")
placedVariables <- placed.df[, columns]
res <- cor(placedVariables)
round(res, 2)

library(corrplot)
M <- cor(placed.df[, columns])
corrplot(M, method="circle")

SCATTER PLOTS

library(car)
scatterplotMatrix(~salary + s_avg + f_avg + satis, data=placed.df,
    main="Salary versus MBA Performance and MBA Satisfaction")
scatterplotMatrix(~salary + work_yrs + gmat_qpc + gmat_vpc, data=placed.df,
    main="Salary versus Work Experience; GMAT Performance")

REGRESSION

Formulating multivariate linear regression model to fit salary with respect to the model selection Independent Variables: {work_yrs,s_avg,f_avg,gmat_qpc,gmat_vpc,sex,frstlang,satis} Dependent Variable: Salary

Model1 <- salary ~ 
             work_yrs + s_avg + f_avg + gmat_qpc + gmat_vpc + sex + frstlang + satis 
fit1 <- lm(Model1, data = placed.df)
summary(fit1)
library(leaps)
leap <- regsubsets(Model1, data = placed.df, nbest=1)
summary(leap)
plot(leap, scale="adjr2")
Model2 <- salary ~ 
             work_yrs + 
             # age +
             # s_avg +
             # f_avg +
             # quarter +
             # gmat_qpc +
             # gmat_vpc +
             # gmat_tot +
             # gmat_tpc +
             sex +
             frstlang +
             satis 
fit2 <- lm(Model2, data = placed.df)
summary(fit2)
library(leaps)
leap <- regsubsets(Model2, data = placed.df, nbest=1)
summary(leap)
plot(leap, scale="adjr2")