This R Markdown file is submitted as the analysis of the MBA starting salaries data.
# Load the MBA starting-salaries data set into `model`.
# NOTE(review): setwd() with an absolute, user-specific path ties this script
# to one machine. It is kept because it changes the session's working
# directory, which later code may rely on; consider a project-relative path.
setwd("C:/Users/srinivas.s.n/Desktop/IIM internship/Internshipdata")

# The file name is a single literal, so no paste() call is needed.
model <- read.csv("MBA Starting Salaries Data.csv")

# View() opens a data viewer; only call it when a user is there to see it.
if (interactive()) {
  View(model)
}
Creating Summary Statistics
# Per-column minimum, quartiles, median, mean and maximum of the raw data.
summary(object = model)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
library(psych)
## Warning: package 'psych' was built under R version 3.4.3
# Extended descriptive statistics for every column (n, mean, sd, median,
# trimmed mean, mad, min, max, range, skew, kurtosis, se) from psych.
psych::describe(x = model)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
Visual Analysis of Variables
# Univariate histograms of selected variables.

# Satisfaction: keep only ratings up to 7. The summary of `satis` shows a
# maximum of 998, presumably a non-response code (TODO confirm against the
# data dictionary). The threshold is numeric on purpose: comparing against the
# string '7' makes R compare the values as text, which would wrongly keep
# values such as 10 or 100.
new <- model[which(model$satis <= 7), ]
hist(new$satis,
     xlab = "degree of satisfaction with MBA program",
     main = "degree of satisfaction",
     col = "RED",
     breaks = 5)

# Starting salary: drop 0 and the values 998 / 999, which are not plotted as
# real salaries. Numeric literals avoid the number-to-string coercion that the
# quoted values caused.
new <- model[which(model$salary != 998 &
                     model$salary != 999 &
                     model$salary != 0), ]
hist(new$salary,
     xlab = "starting salary",
     main = "first salary frequency distribution",
     col = "RED")

# First language, taken from the full data set.
hist(model$frstlang,
     xlab = "first language",
     main = "first language frequency distribution",
     col = "RED")

# Years of work experience, taken from the full data set.
hist(model$work_yrs,
     xlab = "work experience in years",
     main = "work expreience frequency distribution",
     col = "RED",
     breaks = 20)

# Fall MBA average, taken from the full data set.
hist(model$f_avg,
     xlab = "fall mba average",
     main = "fall mba average frequency distribution",
     col = "RED",
     breaks = 20)
Bivariate Analysis
library(car)
## Warning: package 'car' was built under R version 3.4.3
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Bivariate scatterplots of salary against single predictors, drawn with
# car::scatterplot on the full data set.

# Salary versus age.
scatterplot(
  salary ~ age,
  data = model,
  xlab = "age",
  ylab = "salary",
  main = "Scatter plot of salary vs age"
)

# Salary versus sex.
# NOTE(review): `spread` may be handled differently by newer car releases
# (moved under `smooth = list(...)`) -- confirm against the installed version.
scatterplot(
  salary ~ sex,
  data = model,
  xlab = "sex",
  ylab = "salary",
  main = "scatterplot of salary and sex",
  spread = FALSE
)

# Salary versus total GMAT score.
scatterplot(
  salary ~ gmat_tot,
  data = model,
  xlab = "gmat total score",
  ylab = "salary",
  main = "scatterplot of gmat total score and salary"
)
Breaking down dataset into placed and not placed
# Split the data set by placement outcome.
#
# The salary histogram earlier in this file excludes 0, 998 and 999 from the
# real salaries. A plain `salary > 0` filter would count the 998 / 999 rows as
# placed students earning 998 or 999, which distorts every analysis built on
# placed.df. The same exclusions are therefore applied here.
# NOTE(review): 998 / 999 are presumably survey non-response / non-disclosure
# codes -- confirm against the data dictionary.
# NOTE(review): the regression output printed further down in this document
# was generated with the old `salary > 0` filter and must be regenerated.
placed.df <- model[which(model$salary > 0 &
                           model$salary != 998 &
                           model$salary != 999), ]

# Students recorded with a salary of exactly 0.
notPlaced.df <- model[which(model$salary == 0), ]
Scatterplot matrix, corrgram and regression analysis for placed students
library(car)
# Pairwise scatterplots of salary, GMAT total, spring/fall averages and work
# experience for the placed subset, conditioned on sex.
scatterplotMatrix(
  ~ salary + gmat_tot + s_avg + f_avg + work_yrs | sex,
  data = placed.df,
  main = "ScatterPlotMatrix"
)
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.3
## corrplot 0.84 loaded
# Correlation matrix over every non-factor column of the placed subset,
# rendered by corrplot as pie glyphs.
cor <- cor(
  placed.df[!vapply(placed.df, is.factor, logical(1))]
)
corrplot(cor, method = "pie")

# Linear model of salary on work experience, spring and fall averages,
# GMAT total, sex and first language, fitted on the placed subset.
Model <- salary ~
  work_yrs + s_avg + f_avg + gmat_tot + sex + frstlang
fit <- lm(formula = Model, data = placed.df)
summary(fit)
##
## Call:
## lm(formula = Model, data = placed.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -90833 -45803 13770 42822 137571
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 88013.69 55273.87 1.592 0.1131
## work_yrs 3000.98 1554.94 1.930 0.0552 .
## s_avg 28236.10 11828.21 2.387 0.0180 *
## f_avg -4851.08 8820.00 -0.550 0.5830
## gmat_tot -151.24 69.71 -2.170 0.0314 *
## sex 11542.89 8786.26 1.314 0.1906
## frstlang -27657.44 11265.98 -2.455 0.0151 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 49850 on 177 degrees of freedom
## Multiple R-squared: 0.1284, Adjusted R-squared: 0.09882
## F-statistic: 4.344 on 6 and 177 DF, p-value: 0.000407
# Split the data set by placement outcome.
#
# The salary histogram earlier in this file excludes 0, 998 and 999 from the
# real salaries. A plain `salary > 0` filter would count the 998 / 999 rows as
# placed students earning 998 or 999, so the same exclusions are applied here.
# NOTE(review): 998 / 999 are presumably survey non-response / non-disclosure
# codes -- confirm against the data dictionary.
placed.df <- model[which(model$salary > 0 &
                           model$salary != 998 &
                           model$salary != 999), ]

# Students recorded with a salary of exactly 0.
notPlaced.df <- model[which(model$salary == 0), ]