TASK 1a

Reading the dataset

# Load the survey data from the working directory and inspect the column
# types. The file name is passed directly: wrapping a single string literal
# in paste(..., sep = "") returned the same string and added nothing.
MBASalaryData.df <- read.csv("MBA Starting Salaries Data.csv")
str(MBASalaryData.df)
## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : int  2 1 1 1 2 1 1 2 1 1 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: int  1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : int  7 6 6 7 5 6 5 6 4 998 ...
## Converting sex, quarter and first language columns to factors from integers

# Recode the three integer-coded categorical columns as factors in a single
# call, then re-inspect the structure to confirm the new column types.
MBASalaryData.df <- transform(
  MBASalaryData.df,
  sex = as.factor(sex),
  quarter = as.factor(quarter),
  frstlang = as.factor(frstlang)
)
str(MBASalaryData.df)
## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : Factor w/ 2 levels "1","2": 2 1 1 1 2 1 1 2 1 1 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : int  7 6 6 7 5 6 5 6 4 998 ...

Summary statistics

# Per-column overview of the full data set (quartiles and mean for numeric
# columns, level counts for the factor columns).
# NOTE(review): in the printed result satis has a maximum of 998 and salary a
# median of 999 - these look like placeholder codes rather than real values;
# confirm against the data dictionary before interpreting the means.
summary(MBASalaryData.df)
##       age        sex        gmat_tot        gmat_qpc        gmat_vpc    
##  Min.   :22.00   1:206   Min.   :450.0   Min.   :28.00   Min.   :16.00  
##  1st Qu.:25.00   2: 68   1st Qu.:580.0   1st Qu.:72.00   1st Qu.:71.00  
##  Median :27.00           Median :620.0   Median :83.00   Median :81.00  
##  Mean   :27.36           Mean   :619.5   Mean   :80.64   Mean   :78.32  
##  3rd Qu.:29.00           3rd Qu.:660.0   3rd Qu.:93.00   3rd Qu.:91.00  
##  Max.   :48.00           Max.   :790.0   Max.   :99.00   Max.   :99.00  
##     gmat_tpc        s_avg           f_avg       quarter    work_yrs     
##  Min.   : 0.0   Min.   :2.000   Min.   :0.000   1:69    Min.   : 0.000  
##  1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750   2:70    1st Qu.: 2.000  
##  Median :87.0   Median :3.000   Median :3.000   3:70    Median : 3.000  
##  Mean   :84.2   Mean   :3.025   Mean   :3.062   4:65    Mean   : 3.872  
##  3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250           3rd Qu.: 4.000  
##  Max.   :99.0   Max.   :4.000   Max.   :4.000           Max.   :22.000  
##  frstlang     salary           satis      
##  1:242    Min.   :     0   Min.   :  1.0  
##  2: 32    1st Qu.:     0   1st Qu.:  5.0  
##           Median :   999   Median :  6.0  
##           Mean   : 39026   Mean   :172.2  
##           3rd Qu.: 97000   3rd Qu.:  7.0  
##           Max.   :220000   Max.   :998.0
# Extended descriptive statistics (n, mean, sd, median, trimmed mean, mad,
# min, max, range, skew, kurtosis, se) for every column of the data set.
# NOTE(review): the rows printed with a trailing * (sex, quarter, frstlang)
# are the columns converted to factors earlier; their numeric statistics are
# presumably computed on the factor level codes - confirm in the psych docs.
library(psych)
describe(MBASalaryData.df)
##           vars   n     mean       sd median  trimmed     mad min    max
## age          1 274    27.36     3.71     27    26.76    2.97  22     48
## sex*         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot     3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc     4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc     5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc     6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg        7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg        8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter*     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs    10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang*   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary      12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis       13 274   172.18   371.61      6    91.50    1.48   1    998
##            range  skew kurtosis      se
## age           26  2.16     6.45    0.22
## sex*           1  1.16    -0.66    0.03
## gmat_tot     340 -0.01     0.06    3.48
## gmat_qpc      71 -0.92     0.30    0.90
## gmat_vpc      83 -1.04     0.74    1.02
## gmat_tpc      99 -2.28     9.02    0.85
## s_avg          2 -0.06    -0.38    0.02
## f_avg          4 -2.08    10.85    0.03
## quarter*       3  0.02    -1.35    0.07
## work_yrs      22  2.78     9.80    0.20
## frstlang*      1  2.37     3.65    0.02
## salary    220000  0.70    -1.05 3078.10
## satis        997  1.77     1.13   22.45

Forming subsets of the main dataset based on who got placed and who did not as per survey

# Respondents treated as placed, whether or not they disclosed a salary in
# the survey: every row whose salary field is 999 or more.
# NOTE(review): 999 looks like a "salary not disclosed" code rather than a
# real amount; rows carrying it stay in Job.df and will enter any salary
# statistic computed on this subset - confirm that this is intended.
Job.df <- subset(MBASalaryData.df, salary >= 999)
View(Job.df)

# Respondents treated as not placed: salary recorded as exactly 0.
noJob.df <- subset(MBASalaryData.df, salary == 0)
View(noJob.df)

Average and median Salary of students who were placed

# Mean salary over the Job.df subset.
# NOTE(review): Job.df is built with salary >= 999, so rows whose salary is
# the value 999 are averaged in as if they were salaries - confirm.
avg_job <- mean(Job.df$salary)
avg_job
## [1] 77153.12
# Median salary over the same subset.
median_job <- median(Job.df$salary)
median_job
## [1] 97000

Effect of Years of Work Experience on Starting Salary of those who got placed

# Scatterplot of starting salary against years of work experience for the
# Job.df subset. The former `horizontal = TRUE` argument was dropped: it is a
# boxplot option, not a graphical parameter, so on a scatterplot it only
# produced '"horizontal" is not a graphical parameter' warnings.
plot(
  salary ~ work_yrs,
  data = Job.df,
  main = "Visualization of Salary and Work Experience",
  xlab = "Years of Work Experience",
  ylab = "Starting Salary"
)

# Horizontal boxplots of starting salary, one box per distinct value of
# work_yrs in Job.df. With horizontal = TRUE the salary values run along the
# x-axis, which is why xlab names the salary and ylab the grouping variable.
# The six colours are recycled when there are more than six groups.
boxplot(
  salary ~ work_yrs,
  data = Job.df,
  horizontal = TRUE,
  col = c("red", "blue", "peachpuff", "yellow", "green", "pink"),
  main = "Plot of Salary with Work Experience",
  xlab = "Starting Salary",
  ylab = "Years of Work Experience"
)

Effect of GMAT score of students on starting Salary

# Scatterplot of starting salary against gmat_tpc for the Job.df subset.
# gmat_tpc is the overall GMAT percentile (its summary range is 0-99; the
# 450-790 scores live in gmat_tot), so the axis label and title now say
# "percentile" instead of "score".
plot(
  salary ~ gmat_tpc,
  data = Job.df,
  main = "Visualization of Salary with Overall GMAT Percentile",
  xlab = "GMAT Overall Percentile",
  ylab = "Starting Salary"
)

Construct a Corrgram based on all variables in the dataset.

# Correlogram of the Job.df subset: shaded cells below the diagonal, pie
# glyphs above it, and variable names on the diagonal, keeping the original
# column order (order = FALSE).
# Job.df is passed as-is; indexing it by all of its own column names
# returned the same data frame. The title typo "places" is corrected.
library(corrgram)
corrgram(
  Job.df,
  order = FALSE,
  main = "Corrgram of dataset variables of placed students",
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt
)

Effect of Gender on Starting Salaries

# Lattice histogram of salary in Job.df, drawn as one panel per level of sex.
library(lattice)
histogram(
  ~ salary | sex,
  data = Job.df
)

## 1=Male; 2=Female

Effect of native language on Starting Salaries

# Lattice histogram of salary in Job.df, drawn as one panel per level of
# frstlang.
library(lattice)
histogram(
  ~ salary | frstlang,
  data = Job.df
)

## first language (1=English; 2=other)

Effect of age of students on starting Salary

# Scatterplot of starting salary against age for the Job.df subset.
plot(
  salary ~ age,
  data = Job.df,
  main = "Visualization of Salary with Age",
  xlab = "Age",
  ylab = "Starting Salary"
)

Effect of whether or not students were satisfied with the MBA Program on starting Salary

# Scatterplot of starting salary against the satis rating for the Job.df
# subset.
# NOTE(review): satis takes several integer values in the data preview
# (4, 5, 6, 7, ...), so it appears to be a rating scale rather than a
# satisfied / not-satisfied flag - confirm and consider relabelling the axis.
plot(
  salary ~ satis,
  data = Job.df,
  main = "Visualization of Salary and satisfaction",
  xlab = "Satisfies/not satisfied",
  ylab = "Starting Salary"
)

Effect of Spring average performance on starting Salary

# Scatterplot of starting salary against s_avg (spring average) for the
# Job.df subset.
plot(
  salary ~ s_avg,
  data = Job.df,
  main = "Visualization of Salary and spring average performance",
  xlab = "Spring average performance",
  ylab = "Starting Salary"
)

Effect of fall average performance on starting Salary

# Scatterplot of starting salary against f_avg (fall average) for the Job.df
# subset.
plot(
  salary ~ f_avg,
  data = Job.df,
  main = "Visualization of Salary and fall average performance",
  xlab = "Fall average performance",
  ylab = "Starting Salary"
)

TASK 1b

Mean salary wrt Gender, work experience and age

# Mean salary per level of sex within Job.df.
aggregate(salary ~ sex, data = Job.df, FUN = mean)
##   sex   salary
## 1   1 74390.98
## 2   2 84979.19
## 1=Male; 2=Female
# Mean salary per distinct value of work_yrs within Job.df.
# NOTE(review): the printed mean for work_yrs = 9 is exactly 999.00, which
# suggests that group consists only of rows carrying the 999 code - confirm.
aggregate(salary ~ work_yrs, data = Job.df, FUN = mean)
##    work_yrs    salary
## 1         0  47999.50
## 2         1  83025.40
## 3         2  84490.77
## 4         3  73886.28
## 5         4  53226.77
## 6         5  80444.22
## 7         6  82610.89
## 8         7  33332.67
## 9         8 105025.00
## 10        9    999.00
## 11       10 118000.00
## 12       15 183000.00
## 13       16 108500.00
# Mean salary per distinct age within Job.df.
aggregate(salary ~ age, data = Job.df, FUN = mean)
##    age    salary
## 1   22  85000.00
## 2   23  91651.20
## 3   24  90349.89
## 4   25  78792.90
## 5   26  75173.95
## 6   27  68475.86
## 7   28  75636.09
## 8   29  56135.91
## 9   30  73610.78
## 10  31  67333.00
## 11  32  36432.67
## 12  33 118000.00
## 13  34 105000.00
## 14  39 112000.00
## 15  40 183000.00

Mean age and years of work experience of both Male and Female students

# Mean work_yrs and mean age per level of sex within Job.df; cbind() on the
# left-hand side aggregates both columns in one call.
aggregate(cbind(work_yrs, age) ~ sex, data = Job.df, FUN = mean)
##   sex work_yrs      age
## 1   1 3.803922 27.24510
## 2   2 3.277778 26.13889
## 1=Male; 2=Female

Chi-sq test for effect of Gender on Starting salary

# Cross-tabulate salary against sex for Job.df and run Pearson's chi-squared
# test on the resulting contingency table.
# NOTE(review): salary is numeric, so the table has one row per distinct
# salary value (df = 42 in the printed result); most cells will hold very
# few observations, so the chi-squared approximation may be unreliable.
sal_sex <- xtabs(~ salary + sex, data = Job.df)
chisq.test(sal_sex)
## 
##  Pearson's Chi-squared test
## 
## data:  sal_sex
## X-squared = 60.869, df = 42, p-value = 0.02987

Chi-sq test for effect of native language on Starting salary

# Cross-tabulate salary against frstlang for Job.df and run Pearson's
# chi-squared test on the resulting contingency table.
# NOTE(review): salary is numeric, so the table has one row per distinct
# salary value (df = 42 in the printed result); most cells will hold very
# few observations, so the chi-squared approximation may be unreliable.
sal_lang <- xtabs(~ salary + frstlang, data = Job.df)
chisq.test(sal_lang)
## 
##  Pearson's Chi-squared test
## 
## data:  sal_lang
## X-squared = 52.285, df = 42, p-value = 0.1328
## first language (1=English; 2=other)

First Regression Model

In this model we consider the maximum number of important variables. 1) Independent variables - work_yrs, sex, gmat_qpc, gmat_vpc, gmat_tpc, s_avg, f_avg, frstlang, satis 2) Dependent variable - salary

# First linear model: salary in Job.df regressed on work experience, sex,
# the three GMAT percentile columns, both term averages, first language and
# satisfaction. summary() prints the coefficient table and fit statistics.
fit_a <- lm(
  formula = salary ~ work_yrs + sex + gmat_qpc + gmat_vpc + gmat_tpc +
    s_avg + f_avg + frstlang + satis,
  data = Job.df
)
summary(fit_a)
## 
## Call:
## lm(formula = salary ~ work_yrs + sex + gmat_qpc + gmat_vpc + 
##     gmat_tpc + s_avg + f_avg + frstlang + satis, data = Job.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -102066  -21162    7023   24234  129791 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -94218.4    42592.5  -2.212  0.02873 *  
## work_yrs      1486.8     1341.7   1.108  0.26986    
## sex2          6582.5     7832.9   0.840  0.40227    
## gmat_qpc      -404.5      364.8  -1.109  0.26955    
## gmat_vpc      -713.1      326.3  -2.185  0.03067 *  
## gmat_tpc       594.5      486.8   1.221  0.22422    
## s_avg        32113.7    10752.2   2.987  0.00338 ** 
## f_avg        -1726.0     9014.2  -0.191  0.84845    
## frstlang2   -15675.1    12231.1  -1.282  0.20231    
## satis        20286.7     3133.3   6.475 1.85e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38810 on 128 degrees of freedom
## Multiple R-squared:  0.3669, Adjusted R-squared:  0.3223 
## F-statistic: 8.241 on 9 and 128 DF,  p-value: 1.321e-09

Second Regression Model

In this model we consider only those variables whose effect Daer especially wants to see, as described in the last paragraph of the case study. 1) Independent variables - age, sex, gmat_tpc, frstlang, satis 2) Dependent variable - salary

# Second, smaller linear model: salary in Job.df regressed on age, sex,
# overall GMAT percentile, first language and satisfaction. summary() prints
# the coefficient table and fit statistics.
fit_b <- lm(
  formula = salary ~ age + sex + gmat_tpc + frstlang + satis,
  data = Job.df
)
summary(fit_b)
## 
## Call:
## lm(formula = salary ~ age + sex + gmat_tpc + frstlang + satis, 
##     data = Job.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -112089  -21032    9688   24751  110605 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -64941.4    45251.6  -1.435    0.154    
## age           1618.5     1208.4   1.339    0.183    
## sex2         11308.8     7995.9   1.414    0.160    
## gmat_tpc      -213.2      270.3  -0.789    0.432    
## frstlang2   -16054.1    11596.5  -1.384    0.169    
## satis        20868.8     3219.8   6.481 1.66e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40490 on 132 degrees of freedom
## Multiple R-squared:  0.2893, Adjusted R-squared:  0.2623 
## F-statistic: 10.74 on 5 and 132 DF,  p-value: 1.113e-08

TASK 1c

Regression Analysis

The first model is the better one: it has a higher adjusted R-squared (0.3223 vs 0.2623) and more statistically significant variables.

The explanatory variable(s) whose beta-coefficients are statistically significant (p < 0.05) -

  1. gmat_vpc
  2. s_avg
  3. satis

Insights

  1. The p-value of the whole model is 1.321e-09, which is much less than 0.05; therefore, the model as a whole is statistically significant for predicting starting salary.

  2. The model passes the overall F-test (F-statistic = 8.241 on 9 and 128 degrees of freedom).

  3. According to the adjusted R-squared, the predictor variables together explain approximately 32.23% of the variance in salary. Since this is only about a third, the variables considered here leave most of the variation in salary unexplained, and it would be good to incorporate more factors.

  4. There is a strong positive relationship between student satisfaction and starting salary.

  5. It can be said that the students who were not very satisfied with the MBA Program did not get placed and students who were satisfied with the MBA Program got placements with reported starting salaries. Therefore, satisfaction with the MBA Program is a big factor which drives placement after MBA.

  6. Also, performance in the GMAT verbal section is a significant variable affecting the placement process. Therefore, one whose score in the GMAT verbal section is above a certain threshold is assumed to have good communication skills as well, and this becomes an important factor for getting a placement after the MBA.

  7. It is seen from the first graph that students with around 2 - 4 years of prior work experience are earning a higher starting salary than others. Therefore, prior work experience might play a role in getting a placement after the MBA.

TASK 1d - Logistic Regression

## check for missing values and look how many unique values there are for each variable using the sapply() function which applies the function passed as argument to each column of the dataframe.

# Count the NA entries in each column. vapply() with an integer(1) template
# guarantees a named integer vector whatever the input, whereas sapply()
# picks its return type from the data; the printed result is unchanged.
vapply(MBASalaryData.df, function(x) sum(is.na(x)), integer(1))
##      age      sex gmat_tot gmat_qpc gmat_vpc gmat_tpc    s_avg    f_avg 
##        0        0        0        0        0        0        0        0 
##  quarter work_yrs frstlang   salary    satis 
##        0        0        0        0        0
# Count the distinct values in each column. vapply() with an integer(1)
# template guarantees a named integer vector whatever the input, whereas
# sapply() picks its return type from the data; the printed result is
# unchanged.
vapply(MBASalaryData.df, function(x) length(unique(x)), integer(1))
##      age      sex gmat_tot gmat_qpc gmat_vpc gmat_tpc    s_avg    f_avg 
##       21        2       31       48       34       42       36       21 
##  quarter work_yrs frstlang   salary    satis 
##        4       18        2       45        8
## A visual take on the missing values might be helpful: the Amelia package has a special plotting function missmap() that will plot your dataset and highlight missing values.

# Plot a map of missing versus observed cells for the whole data set.
# NOTE(review): this presumably flags only true NA entries; numeric
# placeholder codes such as the 998 / 999 values seen in salary and satis
# would be shown as observed - confirm before concluding nothing is missing.
library(Amelia)
missmap(MBASalaryData.df, main = "Missing values vs observed")

## The graph clearly shows that there are no missing values in the dataset.

## We split the data into two chunks: training and testing set. The training set will be used to fit our model which we will be testing over the testing set.

# Positional split: the first 194 rows form the training set and rows
# 195 to 274 the test set (no shuffling is applied).
# NOTE(review): the glm() call that follows is fitted on the full
# MBASalaryData.df, so neither train nor test is used by it - confirm intent.
train <- MBASalaryData.df[seq_len(194), ]
test <- MBASalaryData.df[seq(195, 274), ]

## Now, we will fit the model. We specify the parameter family=binomial in the glm() function. Also, since the dependent variable has to be categorical, I have used the student's first language as the dependent variable.

# Logistic regression of frstlang on every other column of the data set.
# NOTE(review): the model is fitted on the full MBASalaryData.df rather than
# on the train subset created just above; salary and satis also enter with
# their raw values, including any 998 / 999 codes - confirm both are intended.
model <- glm(
  frstlang ~ .,
  family = binomial(link = "logit"),
  data = MBASalaryData.df
)

## By using function summary() we obtain the results of our model:

# Print the coefficient table, deviances and AIC of the fitted logistic model.
summary(model)
## 
## Call:
## glm(formula = frstlang ~ ., family = binomial(link = "logit"), 
##     data = MBASalaryData.df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9514  -0.3900  -0.2215  -0.1128   3.1951  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.508e+00  7.077e+00  -1.061 0.288690    
## age          4.865e-01  1.385e-01   3.514 0.000442 ***
## sex2         7.613e-01  5.815e-01   1.309 0.190482    
## gmat_tot     5.945e-03  1.999e-02   0.297 0.766157    
## gmat_qpc     1.712e-02  7.747e-02   0.221 0.825118    
## gmat_vpc    -1.231e-01  5.732e-02  -2.148 0.031725 *  
## gmat_tpc     7.036e-02  8.761e-02   0.803 0.421932    
## s_avg       -3.573e+00  1.926e+00  -1.855 0.063586 .  
## f_avg        9.554e-01  9.388e-01   1.018 0.308807    
## quarter2    -6.544e-01  9.648e-01  -0.678 0.497614    
## quarter3    -8.922e-01  1.292e+00  -0.690 0.489971    
## quarter4    -2.653e+00  1.716e+00  -1.546 0.121997    
## work_yrs    -4.621e-01  1.596e-01  -2.896 0.003780 ** 
## salary       4.061e-07  5.305e-06   0.077 0.938978    
## satis        1.131e-03  6.317e-04   1.791 0.073287 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 197.54  on 273  degrees of freedom
## Residual deviance: 126.81  on 259  degrees of freedom
## AIC: 156.81
## 
## Number of Fisher Scoring iterations: 7

Analysis of Logistic regression

  1. sex, gmat_tot, gmat_qpc, gmat_tpc, s_avg, f_avg, quarter, salary, satis are not statistically significant.
  2. age, work_yrs and gmat_vpc are statistically significant variables.
  3. age of a student is a highly significant variable.