# Load the MBA starting-salary survey into salary.df and print per-column
# summary statistics.
# NOTE(review): setwd() with an absolute, user-specific path makes the script
# non-portable. It is kept because changing the working directory is a
# script-wide side effect that other statements may rely on.
setwd("C:/Users/CJ With HP/Desktop/IIM Lucknow/Datasets")
# The file name is passed as a plain literal: wrapping a single string in
# paste(..., sep = "") returned the same string and only added noise.
salary.df <- read.csv("MBA Starting Salaries Data.csv")
summary(salary.df)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
# Build placed.df: the respondents who reported an actual salary figure.
# Rows whose salary equals one of the codes 0, 998 or 999 are excluded, and so
# are rows with a missing salary (the previous which()-based filter dropped
# NA rows as well, so this keeps the same set of rows).
# The salary column is referenced explicitly so the filter cannot be affected
# by whichever data frame or global variable currently owns the name `salary`.
# NOTE(review): both attach() calls are retained only because later statements
# refer to columns by bare name; prefer `data =` or `$` in new code.
attach(salary.df)
placed.df <- salary.df[
  !is.na(salary.df$salary) & !(salary.df$salary %in% c(0, 998, 999)),
]
attach(placed.df)
## The following objects are masked from salary.df:
##
## age, f_avg, frstlang, gmat_qpc, gmat_tot, gmat_tpc, gmat_vpc,
## quarter, s_avg, salary, satis, sex, work_yrs
# Count how many placed students received each distinct salary value.
# Naming the argument keeps "salary" as the header of the printed table.
mytable <- table(salary = placed.df$salary)
print(mytable)
## salary
## 64000 77000 78256 82000 85000 86000 88000 88500 90000 92000
## 1 1 1 1 4 2 1 1 3 3
## 93000 95000 96000 96500 97000 98000 99000 100000 100400 101000
## 3 7 4 1 2 10 1 9 1 2
## 101100 101600 102500 103000 104000 105000 106000 107000 107300 107500
## 1 1 1 1 2 11 3 1 1 1
## 108000 110000 112000 115000 118000 120000 126710 130000 145800 146000
## 2 1 3 5 1 4 1 1 1 1
## 162000 220000
## 1 1
# Count placed students per sex code.
# Naming the argument keeps "sex" as the header of the printed table.
mytable <- table(sex = placed.df$sex)
print(mytable)
## sex
## 1 2
## 72 31
# Cross-tabulate first language against sex for placed students.
# `data = placed.df` pins both columns to the placed subset instead of relying
# on whichever data frame is currently attached to the search path.
mytable <- xtabs(~ frstlang + sex, data = placed.df)
mytable
## sex
## frstlang 1 2
## 1 68 28
## 2 4 3
# Chi-squared test of independence on the frstlang x sex table held in mytable.
# NOTE(review): the frstlang = 2 row holds only 4 and 3 observations, which is
# presumably why R warns that the approximation may be incorrect;
# fisher.test(mytable) may suit counts this small better - confirm.
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: mytable
## X-squared = 0.11264, df = 1, p-value = 0.7372
library(lattice)
# Every call below names placed.df explicitly, so the results no longer depend
# on which data frame is attached at the time the line is run.

# Histogram of salary counts for placed students.
histogram(~ salary, data = placed.df, type = "count")

# Salary distribution split by sex code and by quartile ranking.
boxplot(salary ~ sex, data = placed.df, xlab = "sex", ylab = "Salary")
boxplot(salary ~ quarter, data = placed.df, xlab = "quartile", ylab = "Salary")

# Mean salary per sex code. The vector/list form is kept (rather than the
# formula form) so the result still has the columns Group.1 and x.
aggregate(placed.df$salary, by = list(placed.df$sex), FUN = mean)
## Group.1 x
## 1 1 104970.97
## 2 2 98524.39
# Welch two-sample t-test comparing mean salary between the two sex groups.
# The formula form treats `sex` as the grouping factor. Passing the two columns
# positionally, t.test(salary, sex), instead compared the salary values against
# the sex codes themselves as if they were a second sample, which is why the
# earlier run reported "mean of y" of about 1.3 and a meaningless p-value.
# NOTE(review): the recorded output that follows this call was produced by the
# old positional form and must be regenerated.
t.test(salary ~ sex, data = placed.df)
##
## Welch Two Sample t-test
##
## data: salary and sex
## t = 58.517, df = 102, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 99537.17 106521.71
## sample estimates:
## mean of x mean of y
## 1.030307e+05 1.300971e+00
library(car)
# Pairwise scatterplots of age, salary and work experience for placed students.
scatterplotMatrix(~ age + salary + work_yrs, data = placed.df)

library(corrgram)
# Correlogram of every column in placed.df: shaded cells below the diagonal,
# pie glyphs above it, variable names on the diagonal.
corrgram(
  placed.df,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt
)

# Correlation matrix of all columns except the sex and frstlang codes.
# Columns are selected by name, in the same order the positional selection
# c(1, 3:10, 12, 13) produced, so the result does not silently change if the
# column order of the CSV ever does.
cor(placed.df[, c(
  "age", "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
  "s_avg", "f_avg", "quarter", "work_yrs", "salary", "satis"
)])
## age gmat_tot gmat_qpc gmat_vpc gmat_tpc
## age 1.00000000 -0.07871678 -0.165039057 0.01799420 -0.09609156
## gmat_tot -0.07871678 1.00000000 0.666382266 0.78038546 0.96680810
## gmat_qpc -0.16503906 0.66638227 1.000000000 0.09466541 0.65865003
## gmat_vpc 0.01799420 0.78038546 0.094665411 1.00000000 0.78443167
## gmat_tpc -0.09609156 0.96680810 0.658650025 0.78443167 1.00000000
## s_avg 0.15654954 0.17198874 0.015471662 0.15865101 0.13938500
## f_avg -0.21699191 0.12246257 0.098418869 0.02290167 0.07051391
## quarter -0.12568145 -0.10578964 0.012648346 -0.12862079 -0.09955033
## work_yrs 0.88052470 -0.12280018 -0.182701263 -0.02812182 -0.13246963
## salary 0.49964284 -0.09067141 0.014141299 -0.13743230 -0.13201783
## satis 0.10832308 0.06474206 -0.003984632 0.14863481 0.11630842
## s_avg f_avg quarter work_yrs salary
## age 0.15654954 -0.21699191 -0.12568145 0.88052470 0.49964284
## gmat_tot 0.17198874 0.12246257 -0.10578964 -0.12280018 -0.09067141
## gmat_qpc 0.01547166 0.09841887 0.01264835 -0.18270126 0.01414130
## gmat_vpc 0.15865101 0.02290167 -0.12862079 -0.02812182 -0.13743230
## gmat_tpc 0.13938500 0.07051391 -0.09955033 -0.13246963 -0.13201783
## s_avg 1.00000000 0.44590413 -0.84038355 0.16328236 0.10173175
## f_avg 0.44590413 1.00000000 -0.43144819 -0.21633018 -0.10603897
## quarter -0.84038355 -0.43144819 1.00000000 -0.12896722 -0.12848526
## work_yrs 0.16328236 -0.21633018 -0.12896722 1.00000000 0.45466634
## salary 0.10173175 -0.10603897 -0.12848526 0.45466634 1.00000000
## satis -0.14356557 -0.11773304 0.22511985 0.06299926 -0.04005060
## satis
## age 0.108323083
## gmat_tot 0.064742057
## gmat_qpc -0.003984632
## gmat_vpc 0.148634805
## gmat_tpc 0.116308417
## s_avg -0.143565573
## f_avg -0.117733043
## quarter 0.225119851
## work_yrs 0.062999256
## salary -0.040050600
## satis 1.000000000
# Salary of placed students against work experience, age and the two grade
# averages. Each call names placed.df explicitly so the plots do not depend on
# which data frame is attached; axis labels are the same as before.
boxplot(salary ~ work_yrs, data = placed.df, xlab = "work-ex", ylab = "Salary")
boxplot(salary ~ age, data = placed.df, xlab = "age", ylab = "Salary")
plot(salary ~ s_avg, data = placed.df)
plot(salary ~ f_avg, data = placed.df)
# Some conclusions: 1. The mean salary of males is slightly higher than that of females. 2. A better quartile ranking corresponds to better salaries; however, there are some outliers as well. 3. Work experience and age are positively correlated with salary (r of about 0.45 and 0.50 respectively). In general, salaries grow slightly with increasing age and work experience.
# Linear model of salary on work experience, grade averages, GMAT components,
# age, first language and satisfaction, fitted on placed students only.
# A GMAT-only lm() used to be assigned to `fit` immediately before this call
# and was overwritten without ever being inspected; that dead fit is removed.
# NOTE(review): in the correlation matrix printed earlier, gmat_tot/gmat_tpc
# (r of about 0.97) and age/work_yrs (r of about 0.88) are strongly correlated,
# so individual coefficients may be unstable - consider dropping one predictor
# from each pair.
fit <- lm(
  salary ~ work_yrs + s_avg + f_avg + gmat_vpc + gmat_qpc + gmat_tot +
    gmat_tpc + age + frstlang + satis,
  data = placed.df
)
summary(fit)
##
## Call:
## lm(formula = salary ~ work_yrs + s_avg + f_avg + gmat_vpc + gmat_qpc +
## gmat_tot + gmat_tpc + age + frstlang + satis, data = placed.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30038 -8477 269 5338 67342
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 61749.15 51183.62 1.206 0.2307
## work_yrs 605.26 1126.95 0.537 0.5925
## s_avg 4406.72 5035.37 0.875 0.3838
## f_avg -1966.84 3817.01 -0.515 0.6076
## gmat_vpc 642.25 493.51 1.301 0.1964
## gmat_qpc 920.31 488.23 1.885 0.0626 .
## gmat_tot -43.66 171.20 -0.255 0.7993
## gmat_tpc -1394.91 712.26 -1.958 0.0532 .
## age 2014.72 1105.48 1.822 0.0716 .
## frstlang 6571.93 7192.65 0.914 0.3633
## satis -1443.04 2075.08 -0.695 0.4886
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15420 on 92 degrees of freedom
## Multiple R-squared: 0.3279, Adjusted R-squared: 0.2548
## F-statistic: 4.488 on 10 and 92 DF, p-value: 3.682e-05
# This model explains about 25.5% of the variability in salary (adjusted R-squared = 0.2548).
# Re-attach the full data set. As the messages that follow report, its columns
# now mask the identically named columns of placed.df, so from here on a bare
# column name such as `salary` refers to salary.df rather than placed.df.
# NOTE(review): stacking attach() calls makes bare names order-dependent;
# prefer explicit `salary.df$...` references.
attach(salary.df)
## The following objects are masked from placed.df:
##
## age, f_avg, frstlang, gmat_qpc, gmat_tot, gmat_tpc, gmat_vpc,
## quarter, s_avg, salary, satis, sex, work_yrs
## The following objects are masked from salary.df (pos = 7):
##
## age, f_avg, frstlang, gmat_qpc, gmat_tot, gmat_tpc, gmat_vpc,
## quarter, s_avg, salary, satis, sex, work_yrs
# Respondents recorded with a salary of exactly 0 form the comparison group.
# The column is referenced through salary.df explicitly so the filter does not
# depend on which data frame is attached when this line runs.
unplaced.df <- salary.df[which(salary.df$salary == 0), ]

# Side-by-side distributions for the two groups. Group names and a y-axis label
# are supplied so the boxes are not identified only by position.
boxplot(
  placed.df$age, unplaced.df$age, salary.df$age,
  names = c("placed", "unplaced", "all"), ylab = "age"
)
boxplot(
  placed.df$quarter, unplaced.df$quarter,
  names = c("placed", "unplaced"), ylab = "quarter"
)
boxplot(
  placed.df$s_avg, unplaced.df$s_avg,
  names = c("placed", "unplaced"), ylab = "s_avg"
)
boxplot(
  placed.df$f_avg, unplaced.df$f_avg,
  names = c("placed", "unplaced"), ylab = "f_avg"
)
# Thus age, quartile ranking, spring average and fall average are some of the parameters which explain the difference between the students who got placed and the ones who didn't.