R Markdown

This R Markdown File is created in submission of the analyis of the MBA starting salaries

setwd("C:/Users/srinivas.s.n/Desktop/IIM internship/Internshipdata")
model <- read.csv(paste("MBA Starting Salaries Data.csv",sep=""))
View(model)

Creating Summary Statistics

summary(model)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0
library(psych)
## Warning: package 'psych' was built under R version 3.4.3
describe(model)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45

Visual Analysis of Variables

new <- model[which(model$satis<='7'),]
hist(new$satis,xlab="degree of satisfaction with MBA program",main="degree of satisfaction",col="RED",breaks=5)

new<-model[which(model$salary!='998' & model$salary!='999'& model$salary!='0'),]
hist(new$salary,xlab="starting salary",main="first salary frequency distribution",col="RED")

hist(model$frstlang,xlab="first language",main="first language frequency distribution",col="RED")

hist(model$work_yrs,xlab="work experience in years",main="work expreience frequency distribution", col="RED",breaks=20)

hist(model$f_avg,xlab="fall mba average", main="fall mba average frequency distribution", col="RED",breaks=20)

Bi Variate Analysis

library(car)
## Warning: package 'car' was built under R version 3.4.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
scatterplot(salary~age,     data=model,
            main="Scatter plot of salary vs age",
            xlab="age",
            ylab="salary")

scatterplot(salary~sex,data=model,
            spread=FALSE,
            main="scatterplot of salary and           sex",ylab="salary",xlab="sex")

scatterplot(salary~gmat_tot, data=model,
            main="scatterplot of gmat total score and salary",xlab="gmat total score",ylab="salary")

Breaking down dataset into placed and not placed

placed.df <- model[which (model$salary > 0)  , ]
notPlaced.df <- model[which(model$salary==0), ]

Scatterplot Matrix , corrgram and regression analysis for placed students

library(car)
scatterplotMatrix(~salary+gmat_tot+s_avg+f_avg+work_yrs|sex, data=placed.df, main="ScatterPlotMatrix")

library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.3
## corrplot 0.84 loaded
cor <- cor(placed.df[sapply(placed.df, function(x) !is.factor(x))])
corrplot(cor, method = "pie")

Model <- salary ~ work_yrs + s_avg + f_avg + gmat_tot + sex + frstlang
fit <- lm(Model, data = placed.df)
summary(fit)
## 
## Call:
## lm(formula = Model, data = placed.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -90833 -45803  13770  42822 137571 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  88013.69   55273.87   1.592   0.1131  
## work_yrs      3000.98    1554.94   1.930   0.0552 .
## s_avg        28236.10   11828.21   2.387   0.0180 *
## f_avg        -4851.08    8820.00  -0.550   0.5830  
## gmat_tot      -151.24      69.71  -2.170   0.0314 *
## sex          11542.89    8786.26   1.314   0.1906  
## frstlang    -27657.44   11265.98  -2.455   0.0151 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 49850 on 177 degrees of freedom
## Multiple R-squared:  0.1284, Adjusted R-squared:  0.09882 
## F-statistic: 4.344 on 6 and 177 DF,  p-value: 0.000407
placed.df <- model[which (model$salary > 0)  , ]
notPlaced.df <- model[which(model$salary==0), ]