1. Reading the data and generating the summary
# Load the MBA starting-salary survey (CSV in the working directory) and print
# per-column summary statistics (min, quartiles, mean, max).
# The file name is passed directly: wrapping a single literal in
# paste(..., sep = "") returned the same string and added nothing.
MBA_start <- read.csv("MBA Starting Salaries Data.csv")
summary(MBA_start)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0
library(psych)
# Extended descriptive statistics for every column of the survey
# (n, mean, sd, median, trimmed mean, mad, range, skew, kurtosis, se).
psych::describe(MBA_start)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45
  1. Dividing different data frames
# Split the survey into analysis subsets using the codes stored in `salary`:
#   salary == 0    -> not placed
#   salary == 998  -> did not answer the survey
#   salary == 999  -> answered but did not disclose the salary
#   salary  > 999  -> an actual starting salary
# which() is kept so that rows with a missing (NA) salary are dropped instead
# of being returned as all-NA rows.

# Data frame for students who were placed
Placed <- MBA_start[which(MBA_start$salary > 999), ]

# Data frame for students who were not placed
Not_Placed <- MBA_start[which(MBA_start$salary == 0), ]

# Data frame for students who did not disclose their starting salary
Salary_not_disclosed <- MBA_start[which(MBA_start$salary == 999), ]

# Data frame for students who did not answer the survey
Not_answered <- MBA_start[which(MBA_start$salary == 998), ]

# Data frame for students whose starting salary is known
# (either not placed, i.e. 0, or a real salary)
Salary_known <- MBA_start[which((MBA_start$salary == 0) | (MBA_start$salary > 999)), ]

# Data frame for students whose satisfaction is known: keeps ratings below 8
# (the summary of `satis` shows a maximum of 998, presumably a
# missing-response code -- TODO confirm against the data dictionary)
Satis_known <- MBA_start[which(MBA_start$satis < 8), ]

# View() opens a GUI data viewer, so it is only useful in an interactive
# session; batch runs (Rscript, knitting) skip it.
if (interactive()) {
  View(Placed)
  View(Not_Placed)
  View(Salary_not_disclosed)
  View(Not_answered)
  View(Salary_known)
  View(Satis_known)
}
  1. Creating Box Plot for Age
# Horizontal box-and-whisker plot of student age (all survey rows).
boxplot(
  MBA_start[["age"]],
  col = "blue",
  xlab = "Age",
  main = "Box Plot for Age",
  horizontal = TRUE
)

  1. Creating Box Plot for GMAT Total
# Horizontal box-and-whisker plot of the total GMAT score (all survey rows).
boxplot(
  MBA_start[["gmat_tot"]],
  col = "green",
  xlab = "GMAT Scores",
  main = "Box Plot for GMAT Total",
  horizontal = TRUE
)

  1. Creating Box Plot for GMAT percentile for Quants Section
# Horizontal box-and-whisker plot of the GMAT quantitative percentile.
boxplot(
  MBA_start[["gmat_qpc"]],
  col = "Yellow",
  xlab = "Percentile",
  main = "Box Plot for GMAT Percentile for Quants Section",
  horizontal = TRUE
)

  1. Creating Box Plot for GMAT percentile for Verbal Section
# Horizontal box-and-whisker plot of the GMAT verbal percentile.
boxplot(
  MBA_start[["gmat_vpc"]],
  col = "darkgreen",
  xlab = "Percentile",
  main = "Box Plot for GMAT Percentile for Verbal Section ",
  horizontal = TRUE
)

  1. Creating Box Plot for GMAT percentile for overall GMAT Exam
# Horizontal box-and-whisker plot of the overall GMAT percentile.
# The box is filled in black, which hides the default (black) median line, so
# the median is drawn in white to keep it visible inside the box.
boxplot(MBA_start$gmat_tpc,
        horizontal = TRUE,
        main = "Box Plot for GMAT Percentile (OVERALL) ",
        xlab = "Percentile",
        col = "black",
        medcol = "white"
       )

  1. Creating Box Plot for Spring MBA Average
# Horizontal box-and-whisker plot of the Spring MBA average.
boxplot(
  MBA_start[["s_avg"]],
  col = "purple",
  xlab = "Average",
  main = "Box Plot for Spring MBA Average ",
  horizontal = TRUE
)

  1. Creating Box Plot for Fall MBA Average
# Horizontal box-and-whisker plot of the Fall MBA average.
boxplot(
  MBA_start[["f_avg"]],
  col = "tan",
  xlab = "Average",
  main = "Box Plot for Fall MBA Average ",
  horizontal = TRUE
)

  1. Creating Box Plot for Work Experience years
# Horizontal box-and-whisker plot of years of work experience.
boxplot(
  MBA_start[["work_yrs"]],
  col = "magenta",
  xlab = "Years",
  main = "Box Plot for Years of Work Experience ",
  horizontal = TRUE
)

  1. Creating Box Plot for Starting Salary
# Horizontal box-and-whisker plot of starting salary, restricted to the rows
# in Salary_known (salary of 0 or a real reported salary).
boxplot(
  Salary_known[["salary"]],
  col = "lightgreen",
  xlab = "Salary ",
  main = "Box Plot for Starting Salary ",
  horizontal = TRUE
)

  1. Creating Bar Plot for Gender Distribution
# Count students per `sex` code and draw the counts as a bar chart.
# table() orders the codes ascending, so the labels map 1 -> "Male" and
# 2 -> "Female".
cnt_gender <- table(MBA_start$sex)
# The y-axis limit is derived from the counts (with 10% headroom). A fixed
# upper limit of 105 is lower than the tallest bar in this data set, so that
# bar ran past the end of the axis.
barplot(cnt_gender, width = 1, space = 1, main = "Gender Distribution", xlab = "Gender",
        col = c("blue", "red"), names.arg = c("Male", "Female"),
        ylim = c(0, max(cnt_gender) * 1.1), xlim = c(0, 10))

  1. Creating Bar Plot for First Language
# Count students per `frstlang` code and draw the counts as a bar chart.
# table() orders the codes ascending, so the labels map 1 -> "English" and
# 2 -> "Other".
first_lang <- table(MBA_start$frstlang)
# The y-axis limit is derived from the counts (with 10% headroom). A fixed
# upper limit of 105 is lower than the tallest bar in this data set, so that
# bar ran past the end of the axis.
barplot(first_lang, width = 1, space = 1, main = "First Language of Students ", xlab = "Language",
        col = c("yellow", "green"), names.arg = c("English", "Other"),
        ylim = c(0, max(first_lang) * 1.1), xlim = c(0, 10))

  1. Creating Histogram for Satisfaction Level of MBA Students
# Distribution of the 1-7 satisfaction rating among students who gave one.
# Breaks sit on the half-integers so that every rating gets its own bar.
# With cut points on the integers, hist() puts ratings 1 and 2 into the same
# first bin, because only the lowest bin is closed on both ends.
hist(Satis_known$satis, breaks = seq(0.5, 7.5, by = 1), col = "brown",
     xlab = "Satisfaction level,1=low 7=high", main = "Satisfaction  distribution")

  1. Creating Bar Plot Between Placed and Non-Placed Students
# Flag each student in Salary_known as placed (1) or not placed (0): a salary
# of exactly 0 means not placed, anything else means placed.
Salary_known$placed <- as.numeric(Salary_known$salary != 0)

# Tabulate the flag (0 first, then 1) and plot the two counts side by side.
cnt_placed <- table(Salary_known$placed)
barplot(
  cnt_placed,
  names.arg = c("Not Placed", "Placed"),
  col = c("green", "red"),
  main = "Placed vs Not-placed",
  xlab = "Status",
  xlim = c(0, 10),
  ylim = c(0, 105),
  width = 1,
  space = 1
)

  1. Creating Scatter Plot for showing the variation of salary with different other variables
library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Scatterplot matrix of salary against the main numeric predictors, with the
# points grouped by `sex`.
# Uses Salary_known rather than the raw survey: in MBA_start the `salary`
# column still contains the 998/999 response codes, which would be plotted as
# if they were salaries.
scatterplotMatrix(~ salary + gmat_tot + s_avg + age + f_avg + work_yrs | sex,
                  data = Salary_known,
                  main = "Variation of Salary with different variables")

  1. Corrgram Generation
library(corrgram)
# Correlogram of every column in Salary_known: shaded cells below the
# diagonal, pie glyphs above it, variable names on the diagonal, and the
# variables reordered (order = TRUE) rather than shown in column order.
corrgram(
  Salary_known,
  main = "Corrgram for MBA starting salary analysis",
  text.panel = panel.txt,
  upper.panel = panel.pie,
  lower.panel = panel.shade,
  order = TRUE
)

  1. Applying Pearson Chi square Test for Salary and Sex
# Pearson chi-squared test of independence between salary and sex.
# xtabs() builds one table row per distinct salary value, so the table is
# sparse; small expected cell counts are what make chisq.test() warn that the
# chi-squared approximation may be incorrect.
c1 <- xtabs (~ salary + sex, data=Salary_known)
chisq.test(c1)
## Warning in chisq.test(c1): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  c1
## X-squared = 55.494, df = 42, p-value = 0.07929
  1. Applying Pearson Chi square Test for Quartile Rank and Salary
# Pearson chi-squared test of independence between salary and quartile rank.
# As with the other salary tables, every distinct salary value is its own
# row, so the table is sparse and chisq.test() warns about the approximation.
c2 <- xtabs (~ salary + quarter, data=Salary_known)
chisq.test(c2)
## Warning in chisq.test(c2): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  c2
## X-squared = 132.64, df = 126, p-value = 0.3252
  1. Applying Pearson Chi square Test for Salary and First Language
# Pearson chi-squared test of independence between salary and first language.
# Every distinct salary value is its own row, so the table is sparse and
# chisq.test() warns about the approximation; read the p-value with caution.
c3 <- xtabs (~ salary + frstlang, data=Salary_known)
chisq.test(c3)
## Warning in chisq.test(c3): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  c3
## X-squared = 62.016, df = 42, p-value = 0.02384
  1. Applying Pearson Chi square Test for Placement Status and Sex
# Pearson chi-squared test of independence between placement status and sex.
# Relies on the `placed` column having been added to Salary_known earlier in
# the script. The table is 2 x 2, so chisq.test() applies Yates' continuity
# correction by default.
c4 <- xtabs(~ placed + sex, data=Salary_known)
chisq.test(c4)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  c4
## X-squared = 0.29208, df = 1, p-value = 0.5889
  1. Applying Pearson Chi square Test for Placement Status and Age
# Pearson chi-squared test of independence between placement status and age.
# Relies on the `placed` column having been added to Salary_known earlier in
# the script. Each distinct age is its own column, so some cells have small
# expected counts and chisq.test() warns about the approximation.
c5 <- xtabs(~ placed + age, data=Salary_known)
chisq.test(c5)
## Warning in chisq.test(c5): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  c5
## X-squared = 27.943, df = 20, p-value = 0.1108
  1. Applying Pearson Chi square Test for Placement Status and Work Experience
# Pearson chi-squared test of independence between placement status and
# years of work experience. Relies on the `placed` column having been added
# to Salary_known earlier in the script. Each distinct number of years is its
# own column, so some cells have small expected counts and chisq.test() warns
# about the approximation.
c6 <- xtabs(~ placed + work_yrs, data=Salary_known)
chisq.test(c6)
## Warning in chisq.test(c6): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  c6
## X-squared = 24.663, df = 17, p-value = 0.1025
  1. Applying Pearson Chi square Test for Placement Status and First Language
# Pearson chi-squared test of independence between placement status and
# first language. Relies on the `placed` column having been added to
# Salary_known earlier in the script. The table is 2 x 2, so chisq.test()
# applies Yates' continuity correction by default.
c7 <- xtabs(~ placed + frstlang, data=Salary_known)
chisq.test(c7)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  c7
## X-squared = 0.074127, df = 1, p-value = 0.7854
  1. Correlation Matrix
# Pairwise correlation matrix (cor() default method) over every column of
# Salary_known, including the derived `placed` flag. A top-level call is
# auto-printed, so no wrapping parentheses are needed.
cor(Salary_known)
##                   age          sex      gmat_tot     gmat_qpc     gmat_vpc
## age       1.000000000 -0.031876273 -1.256220e-01 -0.220590341 -0.006721674
## sex      -0.031876273  1.000000000 -4.351109e-02 -0.167904888  0.099184398
## gmat_tot -0.125622047 -0.043511095  1.000000e+00  0.743099719  0.752906719
## gmat_qpc -0.220590341 -0.167904888  7.430997e-01  1.000000000  0.175497777
## gmat_vpc -0.006721674  0.099184398  7.529067e-01  0.175497777  1.000000000
## gmat_tpc -0.131681932 -0.012849186  8.791496e-01  0.690581939  0.688039929
## s_avg     0.164342257  0.073368077  1.435675e-01  0.019038162  0.190665307
## f_avg    -0.034290725  0.042895288  1.010821e-01  0.130285115  0.033106093
## quarter  -0.076614994 -0.086616877 -8.407099e-02  0.008601267 -0.139400223
## work_yrs  0.871679595 -0.023832548 -1.736909e-01 -0.241384675 -0.041357878
## frstlang  0.097619028 -0.008488358 -9.557089e-02  0.094537575 -0.295162826
## salary   -0.130198680  0.018516965 -5.685962e-05  0.028391635  0.003389965
## satis    -0.073500580 -0.061738773  7.981946e-02 -0.020006117  0.195134711
## placed   -0.205697192  0.050470540  1.491495e-02  0.026982025  0.028880982
##             gmat_tpc       s_avg        f_avg      quarter     work_yrs
## age      -0.13168193  0.16434226 -0.034290725 -0.076614994  0.871679595
## sex      -0.01284919  0.07336808  0.042895288 -0.086616877 -0.023832548
## gmat_tot  0.87914961  0.14356746  0.101082103 -0.084070990 -0.173690863
## gmat_qpc  0.69058194  0.01903816  0.130285115  0.008601267 -0.241384675
## gmat_vpc  0.68803993  0.19066531  0.033106093 -0.139400223 -0.041357878
## gmat_tpc  1.00000000  0.18894788  0.109811857 -0.128533421 -0.166139876
## s_avg     0.18894788  1.00000000  0.520554250 -0.735421726  0.159136628
## f_avg     0.10981186  0.52055425  1.000000000 -0.382421186 -0.047951357
## quarter  -0.12853342 -0.73542173 -0.382421186  1.000000000 -0.126454286
## work_yrs -0.16613988  0.15913663 -0.047951357 -0.126454286  1.000000000
## frstlang -0.10789784 -0.12631935 -0.055830525  0.089504320 -0.002916547
## salary    0.06094464  0.09632412  0.008846655 -0.147257809 -0.053266846
## satis     0.13288434 -0.04639953 -0.114704819  0.067729421 -0.007722658
## placed    0.08264631  0.08063913  0.027460510 -0.127882161 -0.123303946
##              frstlang        salary        satis      placed
## age       0.097619028 -1.301987e-01 -0.073500580 -0.20569719
## sex      -0.008488358  1.851696e-02 -0.061738773  0.05047054
## gmat_tot -0.095570885 -5.685962e-05  0.079819458  0.01491495
## gmat_qpc  0.094537575  2.839164e-02 -0.020006117  0.02698202
## gmat_vpc -0.295162826  3.389965e-03  0.195134711  0.02888098
## gmat_tpc -0.107897839  6.094464e-02  0.132884339  0.08264631
## s_avg    -0.126319350  9.632412e-02 -0.046399534  0.08063913
## f_avg    -0.055830525  8.846655e-03 -0.114704819  0.02746051
## quarter   0.089504320 -1.472578e-01  0.067729421 -0.12788216
## work_yrs -0.002916547 -5.326685e-02 -0.007722658 -0.12330395
## frstlang  1.000000000  7.125825e-03 -0.135986251 -0.03899476
## salary    0.007125825  1.000000e+00  0.156439455  0.96951510
## satis    -0.135986251  1.564395e-01  1.000000000  0.16882557
## placed   -0.038994758  9.695151e-01  0.168825569  1.00000000
  1. Regression Analysis
# Ordinary least-squares regression of starting salary on student attributes.
# The model is fitted on Salary_known, which contains the salary == 0
# (not placed) rows as well as real salaries. gmat_tot, quarter and placed
# are not part of the formula.
reg <- lm(salary ~ age + sex + gmat_qpc + gmat_vpc +gmat_tpc + s_avg + f_avg + work_yrs + frstlang
 + satis, data = Salary_known)
# Print the residual summary, coefficient table, R-squared and overall F-test.
summary(reg)
## 
## Call:
## lm(formula = salary ~ age + sex + gmat_qpc + gmat_vpc + gmat_tpc + 
##     s_avg + f_avg + work_yrs + frstlang + satis, data = Salary_known)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -93924 -48912  20019  44376 179796 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  40696.2    70338.2   0.579   0.5636  
## age          -4376.4     1916.4  -2.284   0.0235 *
## sex           1749.0     8733.1   0.200   0.8415  
## gmat_qpc      -246.8      442.3  -0.558   0.5775  
## gmat_vpc      -381.8      406.9  -0.938   0.3494  
## gmat_tpc       571.8      652.3   0.876   0.3819  
## s_avg        22054.8    12502.0   1.764   0.0794 .
## f_avg        -5868.9     8794.0  -0.667   0.5054  
## work_yrs      3252.9     2171.5   1.498   0.1359  
## frstlang     13151.7    15529.4   0.847   0.3982  
## satis         9967.1     5137.4   1.940   0.0539 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 52430 on 182 degrees of freedom
## Multiple R-squared:  0.07776,    Adjusted R-squared:  0.02709 
## F-statistic: 1.535 on 10 and 182 DF,  p-value: 0.1301
  1. Summary

According to the performed Analysis on the MBA Starting Salary File, the following insights can be drawn:

  1. The salary of MBA students is most prominently impacted by satisfaction level, English as a first language, Spring MBA average (s_avg), and the age of the students.

2. The salary offered to students tends to be higher for those with a higher Spring MBA average (s_avg).

  1. The estimated age coefficient is negative, implying that starting salary decreases as age increases when the other variables are held fixed, whereas the work-experience coefficient is positive, so more experienced students received higher salaries.

  2. With a unit rise in satisfaction level of students, the salary rises by almost 10000 units.

  3. The first language of students (frstlang) has a positive estimated coefficient, but with p ≈ 0.40 it is not a statistically significant contributor to the starting salary of MBA students in this model.

  4. With a unit rise in work experience years, the salary rises by almost 3300 units.