1. Reading the dataset and Creating summary statistics

# Point R at the folder that holds the dataset.
# NOTE(review): this is a machine-specific absolute path; adjust it (or use a
# project-relative path) before running the script anywhere else.
setwd("C:/Users/CJ With HP/Desktop/IIM Lucknow/Datasets")
# Load the MBA starting-salaries data. The file name is passed directly:
# wrapping a single string in paste(..., sep = "") returned the same string.
salary.df <- read.csv("MBA Starting Salaries Data.csv")
# Min / quartiles / median / mean / max for every column.
summary(salary.df)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0
# Put salary.df's columns on the search path so that later statements can use
# bare column names such as `salary`.
# NOTE(review): attach() makes it easy to read from the wrong data frame once a
# second one is attached; salary.df$col or with(salary.df, ...) is safer.
attach(salary.df)

2. Creating a subset of placed students

# Placed students: keep every row whose salary is none of the special values
# 0, 998 or 999 (presumably non-salary codes -- confirm with the data
# dictionary). which() also drops rows where the comparison is NA.
placed.df <- salary.df[which(!(salary.df$salary == 0 |
                                 salary.df$salary == 999 |
                                 salary.df$salary == 998)), ]
# Expose the placed subset's columns by bare name for the statements that
# follow; this masks the identically named columns of salary.df.
attach(placed.df)
## The following objects are masked from salary.df:
## 
##     age, f_avg, frstlang, gmat_qpc, gmat_tot, gmat_tpc, gmat_vpc,
##     quarter, s_avg, salary, satis, sex, work_yrs

3. Plotting variables

# Frequency of each distinct starting salary among the placed students.
mytable <- table(salary = placed.df$salary)
mytable
## salary
##  64000  77000  78256  82000  85000  86000  88000  88500  90000  92000 
##      1      1      1      1      4      2      1      1      3      3 
##  93000  95000  96000  96500  97000  98000  99000 100000 100400 101000 
##      3      7      4      1      2     10      1      9      1      2 
## 101100 101600 102500 103000 104000 105000 106000 107000 107300 107500 
##      1      1      1      1      2     11      3      1      1      1 
## 108000 110000 112000 115000 118000 120000 126710 130000 145800 146000 
##      2      1      3      5      1      4      1      1      1      1 
## 162000 220000 
##      1      1
# Head-count of placed students per sex code.
mytable <- table(sex = placed.df$sex)
mytable
## sex
##  1  2 
## 72 31
# Cross-tabulate first language against sex for the placed students.
# The data frame is named explicitly so the table does not depend on which
# data frame happens to be attached last on the search path.
mytable <- xtabs(~ frstlang + sex, data = placed.df)
mytable
##         sex
## frstlang  1  2
##        1 68 28
##        2  4  3
# Test whether first language and sex are independent in the 2x2 table above.
# NOTE(review): two cells hold only 4 and 3 students, which is why R warns that
# the chi-squared approximation may be incorrect; fisher.test(mytable) would
# give an exact test for a table this sparse.
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mytable
## X-squared = 0.11264, df = 1, p-value = 0.7372
# lattice provides histogram().
library(lattice)

# Distribution of starting salaries among placed students (raw counts).
histogram(~ salary, data = placed.df, type = "count")

# Salary spread by sex code.
boxplot(salary ~ sex, data = placed.df, xlab = "sex", ylab = "Salary")

# Salary spread by the `quarter` column.
boxplot(salary ~ quarter, data = placed.df, xlab = "quartile", ylab = "Salary")

# Mean salary per sex code.
aggregate(x = placed.df$salary, by = list(placed.df$sex), FUN = mean)
##   Group.1         x
## 1       1 104970.97
## 2       2  98524.39
# Welch two-sample t-test of mean salary between the two sex groups.
# The formula interface is required here: t.test(salary, sex) treats the sex
# codes (1/2) as a second numeric sample and compares mean salary with the mean
# of those codes, which is meaningless.
# NOTE(review): the printed output below was produced by the earlier
# t.test(salary, sex) call and must be regenerated after this fix.
t.test(salary ~ sex, data = placed.df)
## 
##  Welch Two Sample t-test
## 
## data:  salary and sex
## t = 58.517, df = 102, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   99537.17 106521.71
## sample estimates:
##    mean of x    mean of y 
## 1.030307e+05 1.300971e+00
# car provides scatterplotMatrix().
library(car)
# Pairwise scatterplots of age, salary and work experience for placed students.
scatterplotMatrix(~ age + salary + work_yrs, data = placed.df)

# corrgram provides the correlogram and its panel functions.
library(corrgram)
# Correlogram of the placed subset: shaded cells below the diagonal, pies above
# it, variable names on the diagonal.
corrgram(
  placed.df,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt
)

# Correlation matrix of every column except the two categorical codes
# (sex and frstlang, i.e. columns 2 and 11).
cor(placed.df[, !(names(placed.df) %in% c("sex", "frstlang"))])
##                  age    gmat_tot     gmat_qpc    gmat_vpc    gmat_tpc
## age       1.00000000 -0.07871678 -0.165039057  0.01799420 -0.09609156
## gmat_tot -0.07871678  1.00000000  0.666382266  0.78038546  0.96680810
## gmat_qpc -0.16503906  0.66638227  1.000000000  0.09466541  0.65865003
## gmat_vpc  0.01799420  0.78038546  0.094665411  1.00000000  0.78443167
## gmat_tpc -0.09609156  0.96680810  0.658650025  0.78443167  1.00000000
## s_avg     0.15654954  0.17198874  0.015471662  0.15865101  0.13938500
## f_avg    -0.21699191  0.12246257  0.098418869  0.02290167  0.07051391
## quarter  -0.12568145 -0.10578964  0.012648346 -0.12862079 -0.09955033
## work_yrs  0.88052470 -0.12280018 -0.182701263 -0.02812182 -0.13246963
## salary    0.49964284 -0.09067141  0.014141299 -0.13743230 -0.13201783
## satis     0.10832308  0.06474206 -0.003984632  0.14863481  0.11630842
##                s_avg       f_avg     quarter    work_yrs      salary
## age       0.15654954 -0.21699191 -0.12568145  0.88052470  0.49964284
## gmat_tot  0.17198874  0.12246257 -0.10578964 -0.12280018 -0.09067141
## gmat_qpc  0.01547166  0.09841887  0.01264835 -0.18270126  0.01414130
## gmat_vpc  0.15865101  0.02290167 -0.12862079 -0.02812182 -0.13743230
## gmat_tpc  0.13938500  0.07051391 -0.09955033 -0.13246963 -0.13201783
## s_avg     1.00000000  0.44590413 -0.84038355  0.16328236  0.10173175
## f_avg     0.44590413  1.00000000 -0.43144819 -0.21633018 -0.10603897
## quarter  -0.84038355 -0.43144819  1.00000000 -0.12896722 -0.12848526
## work_yrs  0.16328236 -0.21633018 -0.12896722  1.00000000  0.45466634
## salary    0.10173175 -0.10603897 -0.12848526  0.45466634  1.00000000
## satis    -0.14356557 -0.11773304  0.22511985  0.06299926 -0.04005060
##                 satis
## age       0.108323083
## gmat_tot  0.064742057
## gmat_qpc -0.003984632
## gmat_vpc  0.148634805
## gmat_tpc  0.116308417
## s_avg    -0.143565573
## f_avg    -0.117733043
## quarter   0.225119851
## work_yrs  0.062999256
## salary   -0.040050600
## satis     1.000000000
# Salary spread for each value of work experience (years).
boxplot(salary ~ work_yrs, data = placed.df, xlab = "work-ex", ylab = "Salary")

# Salary spread for each age.
boxplot(salary ~ age, data = placed.df, xlab = "age", ylab = "Salary")

# Salary against the s_avg column.
plot(salary ~ s_avg, data = placed.df)

# Salary against the f_avg column.
plot(salary ~ f_avg, data = placed.df)

Some conclusions: 1. The mean salary of males is slightly higher than that of females. 2. A better quartile ranking corresponds to better salaries, although there are some outliers as well. 3. Work experience and age have a moderate positive correlation with salary (about 0.45 and 0.50 respectively). In general, salaries grow slightly with increasing age and work experience.

4. Constructing Regression models

# GMAT-only model. It is stored under its own name so that it is not silently
# overwritten by the larger model below before anyone can inspect it.
fit_gmat <- lm(salary ~ gmat_tot + gmat_tpc + gmat_vpc + gmat_qpc,
               data = placed.df)

# Full model: salary regressed on work experience, grade averages, GMAT
# scores, age, first language and satisfaction for placed students.
# NOTE(review): gmat_tot and gmat_tpc correlate at about 0.97 in the matrix
# above, so their individual coefficients are unstable.
fit <- lm(
  salary ~ work_yrs + s_avg + f_avg + gmat_vpc + gmat_qpc +
    gmat_tot + gmat_tpc + age + frstlang + satis,
  data = placed.df
)
summary(fit)
## 
## Call:
## lm(formula = salary ~ work_yrs + s_avg + f_avg + gmat_vpc + gmat_qpc + 
##     gmat_tot + gmat_tpc + age + frstlang + satis, data = placed.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -30038  -8477    269   5338  67342 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 61749.15   51183.62   1.206   0.2307  
## work_yrs      605.26    1126.95   0.537   0.5925  
## s_avg        4406.72    5035.37   0.875   0.3838  
## f_avg       -1966.84    3817.01  -0.515   0.6076  
## gmat_vpc      642.25     493.51   1.301   0.1964  
## gmat_qpc      920.31     488.23   1.885   0.0626 .
## gmat_tot      -43.66     171.20  -0.255   0.7993  
## gmat_tpc    -1394.91     712.26  -1.958   0.0532 .
## age          2014.72    1105.48   1.822   0.0716 .
## frstlang     6571.93    7192.65   0.914   0.3633  
## satis       -1443.04    2075.08  -0.695   0.4886  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15420 on 92 degrees of freedom
## Multiple R-squared:  0.3279, Adjusted R-squared:  0.2548 
## F-statistic: 4.488 on 10 and 92 DF,  p-value: 3.682e-05

This model explains about 25.48% (adjusted R-squared) of the variability in salary.

5. Creating a subset of unplaced students

# Re-attach the full data set so that bare column names refer to salary.df
# again instead of placed.df; the messages below list the masked objects.
# NOTE(review): salary.df is now on the search path twice; referring to
# salary.df$col directly would avoid this stacking.
attach(salary.df)
## The following objects are masked from placed.df:
## 
##     age, f_avg, frstlang, gmat_qpc, gmat_tot, gmat_tpc, gmat_vpc,
##     quarter, s_avg, salary, satis, sex, work_yrs
## The following objects are masked from salary.df (pos = 7):
## 
##     age, f_avg, frstlang, gmat_qpc, gmat_tot, gmat_tpc, gmat_vpc,
##     quarter, s_avg, salary, satis, sex, work_yrs
# Unplaced students: rows of the full data set whose salary is recorded as 0.
unplaced.df <- salary.df[which(salary.df$salary == 0), ]

6. Comparing the two groups

# Age: placed vs. unplaced vs. all students (boxes 1, 2, 3 in that order).
boxplot(placed.df[["age"]], unplaced.df[["age"]], salary.df[["age"]])

# `quarter` column: placed (box 1) vs. unplaced (box 2).
boxplot(placed.df[["quarter"]], unplaced.df[["quarter"]])

# s_avg column: placed (box 1) vs. unplaced (box 2).
boxplot(placed.df[["s_avg"]], unplaced.df[["s_avg"]])

# f_avg column: placed (box 1) vs. unplaced (box 2).
boxplot(placed.df[["f_avg"]], unplaced.df[["f_avg"]])

Thus age, quartile ranking, spring average (s_avg) and fall average (f_avg) are some of the parameters that explain the difference between the students who got placed and the ones who didn’t.