Task 1A

# Load the MBA starting-salaries dataset into Data.df.
# The working directory is pointed at the project folder because the CSV is
# read by a bare (relative) file name.
setwd("C:/Users/Parul Verma/Desktop/Data Analytics Internship")
# paste() with a single string and sep = "" returns that string unchanged, so
# the file name is passed to read.csv() directly.
Data.df <- read.csv("MBA Starting Salaries Data.csv")

Describing the data using Summary command -

# Per-column summary (min, quartiles, mean, max) of Data.df.
# NOTE(review): the output shows salary values of 999 and a satis maximum of
# 998 -- these look like placeholder codes rather than measurements; confirm
# against the data dictionary before interpreting the means.
summary(Data.df)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0

Histograms and bar plots to visualise the distribution of each variable independently -

library(psych)

AGE DISTRIBUTION

# Histogram of the age column.
hist(
  Data.df$age,
  main = "Age  Distribution",
  xlab = "Age in years",
  col = "green"
)

GENDER DISTRIBUTION

# Recode the 1/2 values in sex as a factor labelled Male/Female, then plot it;
# plot() on a factor draws one bar per level.
Data.df$sex <- factor(
  Data.df$sex,
  levels = c(1, 2),
  labels = c("Male", "Female")
)
plot(Data.df$sex, main = "Gender distribution", col = "green")

GMAT SCORE DISTRIBUTION

# Histogram of the total GMAT score.
hist(
  Data.df$gmat_tot,
  main = "GMAT Score Distribution",
  xlab = "GMAT SCORES (total = 800)",
  col = "green"
)

WORK EXPERIENCE DISTRIBUTION

# Histogram of years of work experience.
hist(
  Data.df$work_yrs,
  main = "Work Experience Distribution",
  xlab = "Work Experience (in years)",
  col = "green"
)

FIRST LANGUAGE DISTRIBUTION

# Recode the 1/2 values in frstlang as a factor labelled English/Others, then
# plot it; plot() on a factor draws one bar per level.
Data.df$frstlang <- factor(
  Data.df$frstlang,
  levels = c(1, 2),
  labels = c("English", "Others")
)
plot(Data.df$frstlang, main = "First Language Distribution", col = "green")

SALARY DISTRIBUTION

# Histogram of the salary column over all rows of Data.df.
# NOTE(review): this includes the 0, 998 and 999 values that the later Job.df
# subset excludes -- confirm whether they should be plotted as salaries.
hist(
  Data.df$salary,
  main = "Salary Distribution",
  xlab = "Salary",
  col = "green"
)

SATISFACTION DISTRIBUTION

# Histogram of the satis column over all rows of Data.df.
# NOTE(review): summary() reports a maximum of 998 against a median of 6, so
# the axis is presumably stretched by a placeholder code -- confirm and filter
# if so.
hist(
  Data.df$satis,
  main = "Satisfaction Distribution",
  xlab = "Satisfaction",
  col = "green"
)

SCATTER PLOTS

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Salary plotted against four numeric columns, one car::scatterplot() call per
# column, all drawn from the full Data.df.
scatterplot(
  salary ~ age,
  data = Data.df,
  main = "Scatter plot of Salary vs Age",
  xlab = "Age",
  ylab = "Salary"
)

scatterplot(
  salary ~ gmat_tot,
  data = Data.df,
  main = "Scatter plot of Salary vs GMAT Total",
  xlab = "GMAT Total",
  ylab = "Salary"
)

scatterplot(
  salary ~ work_yrs,
  data = Data.df,
  main = "Scatter plot of Salary vs Work Experience",
  xlab = "Work Experience in  years",
  ylab = "Salary"
)

scatterplot(
  salary ~ satis,
  data = Data.df,
  main = "Scatter plot of Salary vs Satisfaction",
  xlab = "Satisfaction",
  ylab = "Salary"
)

CORRGRAM OF DATA

library(corrgram)
# Correlogram of Data.df: shaded cells in the lower panel, pie glyphs in the
# upper panel, and variable names drawn by the text panel.
corrgram(
  Data.df,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram"
)

VARIANCE-COVARIANCE MATRIX

# Variance-covariance matrix over every column of Data.df.
# NOTE(review): sex and frstlang were converted to factors earlier in this
# script, which is why this call warns about coercion and reports NA for those
# rows and columns.
var(Data.df)
## Warning in var(Data.df): NAs introduced by coercion
##                    age sex      gmat_tot      gmat_qpc     gmat_vpc
## age       1.376904e+01  NA -3.115879e+01 -1.192655e+01    -2.763643
## sex                 NA  NA            NA            NA           NA
## gmat_tot -3.115879e+01  NA  3.310688e+03  6.200233e+02   726.000642
## gmat_qpc -1.192655e+01  NA  6.200233e+02  2.210731e+02    38.148258
## gmat_vpc -2.763643e+00  NA  7.260006e+02  3.814826e+01   284.248122
## gmat_tpc -8.839978e+00  NA  6.839911e+02  1.357997e+02   157.493249
## s_avg     2.116874e-01  NA  2.480257e+00 -1.691233e-01     1.313570
## f_avg    -3.399348e-02  NA  3.154688e+00  5.753854e-01     0.672070
## quarter  -2.045935e-01  NA -5.891153e+00  6.001979e-01    -3.267667
## work_yrs  1.029494e+01  NA -3.391634e+01 -1.137186e+01    -3.618165
## frstlang            NA  NA            NA            NA           NA
## salary   -1.183042e+04  NA -1.611600e+05 -3.335823e+04 -5273.852384
## satis    -1.763499e+02  NA  1.765263e+03  3.348371e+02   392.356274
##              gmat_tpc        s_avg        f_avg       quarter     work_yrs
## age        -8.8399775    0.2116874  -0.03399348 -2.045935e-01   10.2949386
## sex                NA           NA           NA            NA           NA
## gmat_tot  683.9910698    2.4802572   3.15468838 -5.891153e+00  -33.9163391
## gmat_qpc  135.7996845   -0.1691233   0.57538542  6.001979e-01  -11.3718617
## gmat_vpc  157.4932488    1.3135702   0.67207000 -3.267667e+00   -3.6181653
## gmat_tpc  196.6057057    0.6271001   0.58698618 -1.292372e+00   -7.8575172
## s_avg       0.6271001    0.1452176   0.11016898 -3.223721e-01    0.1592639
## f_avg       0.5869862    0.1101690   0.27567237 -2.608088e-01   -0.0662870
## quarter    -1.2923719   -0.3223721  -0.26080880  1.232119e+00   -0.3086682
## work_yrs   -7.8575172    0.1592639  -0.06628700 -3.086682e-01   10.4488249
## frstlang           NA           NA           NA            NA           NA
## salary   3522.7500067 2831.6009858 787.65597177 -9.296214e+03 1486.1470415
## satis     484.2466779   -4.6288450   2.12532927 -5.227133e-03 -131.2408091
##          frstlang        salary         satis
## age            NA -1.183042e+04 -1.763499e+02
## sex            NA            NA            NA
## gmat_tot       NA -1.611600e+05  1.765263e+03
## gmat_qpc       NA -3.335823e+04  3.348371e+02
## gmat_vpc       NA -5.273852e+03  3.923563e+02
## gmat_tpc       NA  3.522750e+03  4.842467e+02
## s_avg          NA  2.831601e+03 -4.628845e+00
## f_avg          NA  7.876560e+02  2.125329e+00
## quarter        NA -9.296214e+03 -5.227133e-03
## work_yrs       NA  1.486147e+03 -1.312408e+02
## frstlang       NA            NA            NA
## salary         NA  2.596062e+09 -6.347115e+06
## satis          NA -6.347115e+06  1.380974e+05
 # Covariance matrix restricted to a fixed set of numeric columns. A and B are
 # the same selection, so cov(A, B) is that column set's covariance with itself.
 A <- B <- Data.df[, c(
   "age", "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
   "s_avg", "f_avg", "work_yrs", "salary"
 )]
 cov(A, B)
##                    age      gmat_tot      gmat_qpc     gmat_vpc
## age       1.376904e+01 -3.115879e+01 -1.192655e+01    -2.763643
## gmat_tot -3.115879e+01  3.310688e+03  6.200233e+02   726.000642
## gmat_qpc -1.192655e+01  6.200233e+02  2.210731e+02    38.148258
## gmat_vpc -2.763643e+00  7.260006e+02  3.814826e+01   284.248122
## gmat_tpc -8.839978e+00  6.839911e+02  1.357997e+02   157.493249
## s_avg     2.116874e-01  2.480257e+00 -1.691233e-01     1.313570
## f_avg    -3.399348e-02  3.154688e+00  5.753854e-01     0.672070
## work_yrs  1.029494e+01 -3.391634e+01 -1.137186e+01    -3.618165
## salary   -1.183042e+04 -1.611600e+05 -3.335823e+04 -5273.852384
##              gmat_tpc        s_avg        f_avg     work_yrs        salary
## age        -8.8399775    0.2116874  -0.03399348   10.2949386 -1.183042e+04
## gmat_tot  683.9910698    2.4802572   3.15468838  -33.9163391 -1.611600e+05
## gmat_qpc  135.7996845   -0.1691233   0.57538542  -11.3718617 -3.335823e+04
## gmat_vpc  157.4932488    1.3135702   0.67207000   -3.6181653 -5.273852e+03
## gmat_tpc  196.6057057    0.6271001   0.58698618   -7.8575172  3.522750e+03
## s_avg       0.6271001    0.1452176   0.11016898    0.1592639  2.831601e+03
## f_avg       0.5869862    0.1101690   0.27567237   -0.0662870  7.876560e+02
## work_yrs   -7.8575172    0.1592639  -0.06628700   10.4488249  1.486147e+03
## salary   3522.7500067 2831.6009858 787.65597177 1486.1470415  2.596062e+09

Task 1B : WHO GOT HOW MUCH SALARY?

Taking a subset of the dataset consisting of only those people who actually got a job -

# Rows treated as "got a job": salary is positive and is neither of the
# 998 / 999 codes.
Job.df <- subset(
  Data.df,
  subset = salary > 0 & !(salary %in% c(998, 999))
)

Using this subset of data: Think about the problem as y = f(x), where y = Starting Salary and x = various factors that it could depend upon, Examples: impact of {gender; first language; prior work experience; GMAT performance; MBA performance} etc in determining the Starting Salary.

Contingency Tables -

# Count of Job.df rows at each age.
table1 <- table(age = Job.df$age)
table1
## age
## 22 23 24 25 26 27 28 29 30 31 32 33 34 39 40 
##  1  5 16 23 14 14  8  6  6  4  1  1  1  1  2
# Count of Job.df rows for each sex level.
table2 <- table(sex = Job.df$sex)
table2
## sex
##   Male Female 
##     72     31
# Mean salary for each distinct gmat_tot value in Job.df.
table3 <- aggregate(
  salary ~ gmat_tot,
  data = Job.df,
  FUN = mean
)
table3
##    gmat_tot   salary
## 1       500 158250.0
## 2       520  78256.0
## 3       530  99500.0
## 4       540 104000.0
## 5       550 112236.7
## 6       560  94000.0
## 7       570 103857.1
## 8       580  99875.0
## 9       590  97000.0
## 10      600 107666.7
## 11      610  96200.0
## 12      620 104108.3
## 13      630 105812.5
## 14      640 110000.0
## 15      650 101285.7
## 16      660  92480.0
## 17      670 100642.9
## 18      680 102166.7
## 19      700 122333.3
## 20      710 101250.0
## 21      720  85000.0
# Cross-tabulation of exact salary value by first language. There is one row
# per distinct salary, so most cells hold 0 or 1.
table4 <- xtabs(
  ~ salary + frstlang,
  data = Job.df
)
table4
##         frstlang
## salary   English Others
##   64000        1      0
##   77000        1      0
##   78256        1      0
##   82000        1      0
##   85000        4      0
##   86000        2      0
##   88000        1      0
##   88500        1      0
##   90000        3      0
##   92000        3      0
##   93000        3      0
##   95000        7      0
##   96000        4      0
##   96500        1      0
##   97000        2      0
##   98000        8      2
##   99000        0      1
##   100000       9      0
##   100400       1      0
##   101000       2      0
##   101100       1      0
##   101600       1      0
##   102500       1      0
##   103000       1      0
##   104000       1      1
##   105000      11      0
##   106000       3      0
##   107000       1      0
##   107300       0      1
##   107500       1      0
##   108000       2      0
##   110000       1      0
##   112000       3      0
##   115000       5      0
##   118000       0      1
##   120000       4      0
##   126710       1      0
##   130000       1      0
##   145800       1      0
##   146000       1      0
##   162000       1      0
##   220000       0      1
# Mean salary for each observed (s_avg, f_avg) combination in Job.df.
table5 <- aggregate(
  salary ~ s_avg + f_avg,
  data = Job.df,
  FUN = mean
)
table5
##    s_avg f_avg    salary
## 1   4.00  0.00 146000.00
## 2   2.20  2.00 105000.00
## 3   2.40  2.00  85000.00
## 4   2.40  2.25  90000.00
## 5   2.30  2.50  98000.00
## 6   2.50  2.50  96000.00
## 7   2.60  2.50 107700.00
## 8   2.70  2.50  90000.00
## 9   3.50  2.67  86000.00
## 10  2.40  2.75  99500.00
## 11  2.50  2.75 220000.00
## 12  2.60  2.75 114155.00
## 13  2.70  2.75  90750.00
## 14  2.80  2.75 104833.33
## 15  2.90  2.75  91085.33
## 16  3.00  2.75  97250.00
## 17  2.91  2.83 105000.00
## 18  2.50  3.00  77000.00
## 19  2.60  3.00 100000.00
## 20  2.70  3.00  98000.00
## 21  2.80  3.00  99700.00
## 22  2.90  3.00 101400.00
## 23  3.00  3.00 105000.00
## 24  3.09  3.00 100000.00
## 25  3.10  3.00 112450.00
## 26  3.20  3.00 109000.00
## 27  3.30  3.00 105000.00
## 28  3.40  3.00 100000.00
## 29  3.50  3.00 113000.00
## 30  2.80  3.25  98000.00
## 31  2.90  3.25  93000.00
## 32  3.00  3.25 107500.00
## 33  3.20  3.25 105166.67
## 34  3.27  3.25  95000.00
## 35  3.30  3.25 101416.67
## 36  3.40  3.25  90000.00
## 37  3.50  3.25  97333.33
## 38  3.10  3.33  82000.00
## 39  2.90  3.50 107300.00
## 40  3.09  3.50 107000.00
## 41  3.10  3.50  96500.00
## 42  3.20  3.50  95000.00
## 43  3.30  3.50  95750.00
## 44  3.45  3.50 105000.00
## 45  3.50  3.50 111500.00
## 46  3.60  3.50 110500.00
## 47  3.80  3.50 105000.00
## 48  3.50  3.60  85000.00
## 49  3.70  3.60 106000.00
## 50  3.40  3.67 100000.00
## 51  3.60  3.67  95000.00
## 52  3.40  3.75  93000.00
## 53  3.50  3.75  85000.00
## 54  3.60  3.75 162000.00
## 55  3.70  4.00 115000.00
## 56  3.80  4.00 120000.00
# Mean salary for each distinct work_yrs value in Job.df. The result is a data
# frame of means, not a table of counts.
table6 <- aggregate(
  salary ~ work_yrs,
  data = Job.df,
  FUN = mean
)
table6
##    work_yrs    salary
## 1         0  95000.00
## 2         1 103532.00
## 3         2  97673.68
## 4         3 101652.86
## 5         4 105454.55
## 6         5 103142.86
## 7         6 105928.57
## 8         7  98000.00
## 9         8 105025.00
## 10       10 118000.00
## 11       15 183000.00
## 12       16 108500.00

T-TESTS - To examine the effect of gender and First language on the salary from the output given :

# Welch two-sample t-test of mean salary between the two sex groups in Job.df.
t.test(salary ~ sex, data=Job.df)
## 
##  Welch Two Sample t-test
## 
## data:  salary by sex
## t = 1.3628, df = 38.115, p-value = 0.1809
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3128.55 16021.72
## sample estimates:
##   mean in group Male mean in group Female 
##            104970.97             98524.39
# Welch two-sample t-test of mean salary between the two first-language groups
# in Job.df.
t.test(salary ~ frstlang, data=Job.df)
## 
##  Welch Two Sample t-test
## 
## data:  salary by frstlang
## t = -1.1202, df = 6.0863, p-value = 0.3049
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -59933.62  22202.25
## sample estimates:
## mean in group English  mean in group Others 
##              101748.6              120614.3

CHI-SQUARE TESTS - To examine the effect of first language and work experience on the salary from the p-value obtained -

# Pearson chi-squared test on the salary-by-first-language cross-tabulation.
# NOTE(review): table4 has one row per distinct salary, so nearly every cell is
# 0 or 1 -- hence the "approximation may be incorrect" warning. Consider binning
# salary or using a different test before relying on this p-value.
chisq.test(table4)
## Warning in chisq.test(table4): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table4
## X-squared = 69.847, df = 41, p-value = 0.003296
# NOTE(review): table6 is the aggregate() data frame of work_yrs values and
# mean salaries, not a table of counts. chisq.test() therefore treats those
# numbers as cell counts, and the resulting p-value is not a valid test of
# association between work experience and salary. Consider e.g. cor.test() on
# Job.df$work_yrs and Job.df$salary instead.
chisq.test(table6)
## 
##  Pearson's Chi-squared test
## 
## data:  table6
## X-squared = 33.445, df = 11, p-value = 0.0004455

Since p-value < 0.1 in both cases, we can say that there is a relationship between the variables we’ve taken.

REGRESSION MODELS -

MODEL 1

# Model 1: linear regression of salary on age, sex, GMAT total, work
# experience, s_avg, f_avg and first language, fitted on Job.df.
model1 <- lm(
  salary ~ age + sex + gmat_tot + work_yrs + s_avg + f_avg + frstlang,
  data = Job.df
)
summary(model1)
## 
## Call:
## lm(formula = salary ~ age + sex + gmat_tot + work_yrs + s_avg + 
##     f_avg + frstlang, data = Job.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31445  -8977  -2293   6114  80584 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    58980.07   32358.38   1.823   0.0715 .
## age             1551.84    1123.08   1.382   0.1703  
## sexFemale      -5000.72    3505.60  -1.426   0.1570  
## gmat_tot         -15.97      31.75  -0.503   0.6161  
## work_yrs         832.83    1146.63   0.726   0.4694  
## s_avg           3804.74    5020.77   0.758   0.4504  
## f_avg           -558.44    3828.62  -0.146   0.8843  
## frstlangOthers 10881.96    7130.15   1.526   0.1303  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15690 on 95 degrees of freedom
## Multiple R-squared:  0.2823, Adjusted R-squared:  0.2294 
## F-statistic: 5.337 on 7 and 95 DF,  p-value: 3.562e-05

MODEL 2

# Model 2: linear regression of salary on the four GMAT columns, s_avg, f_avg
# and first language, fitted on Job.df.
# NOTE(review): gmat_tot, gmat_qpc, gmat_vpc and gmat_tpc are all GMAT measures
# and are presumably strongly correlated -- check for multicollinearity.
model2 <- lm(
  salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc +
    s_avg + f_avg + frstlang,
  data = Job.df
)
summary(model2)
## 
## Call:
## lm(formula = salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc + 
##     s_avg + f_avg + frstlang, data = Job.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -42604  -7362   -369   6038  89453 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    108645.92   45824.46   2.371   0.0198 *
## gmat_tot          -13.65     182.44  -0.075   0.9405  
## gmat_qpc          897.73     528.78   1.698   0.0928 .
## gmat_vpc          727.96     530.24   1.373   0.1730  
## gmat_tpc        -1679.17     767.96  -2.187   0.0312 *
## s_avg           12713.89    5096.70   2.495   0.0143 *
## f_avg           -7948.11    3898.87  -2.039   0.0443 *
## frstlangOthers  17814.16    6863.28   2.596   0.0109 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16800 on 95 degrees of freedom
## Multiple R-squared:  0.1769, Adjusted R-squared:  0.1162 
## F-statistic: 2.917 on 7 and 95 DF,  p-value: 0.008286

MODEL 3

# Model 3: linear regression of salary on every other column of Job.df, each
# written out explicitly.
model3 <- lm(
  salary ~ age + sex + gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc +
    s_avg + f_avg + quarter + work_yrs + frstlang + satis,
  data = Job.df
)
summary(model3)
## 
## Call:
## lm(formula = salary ~ age + sex + gmat_tot + gmat_qpc + gmat_vpc + 
##     gmat_tpc + s_avg + f_avg + quarter + work_yrs + frstlang + 
##     satis, data = Job.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -26489  -7983   -373   5923  70602 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    82141.01   54281.71   1.513   0.1337  
## age             1750.65    1130.92   1.548   0.1251  
## sexFemale      -3584.07    3595.85  -0.997   0.3216  
## gmat_tot          16.19     178.85   0.090   0.9281  
## gmat_qpc         796.55     496.78   1.603   0.1123  
## gmat_vpc         546.31     501.97   1.088   0.2794  
## gmat_tpc       -1457.09     714.94  -2.038   0.0445 *
## s_avg           -931.53    8240.31  -0.113   0.9102  
## f_avg          -2222.82    3894.57  -0.571   0.5696  
## quarter        -2336.56    2721.89  -0.858   0.3929  
## work_yrs         749.66    1135.90   0.660   0.5110  
## frstlangOthers  7719.42    7373.27   1.047   0.2979  
## satis          -1086.54    2157.76  -0.504   0.6158  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15430 on 90 degrees of freedom
## Multiple R-squared:  0.3422, Adjusted R-squared:  0.2545 
## F-statistic: 3.902 on 12 and 90 DF,  p-value: 8.086e-05

MODEL 4

# Model 4: linear regression of salary on age, sex, GMAT total, work
# experience, first language and satis, fitted on Job.df.
model4 <- lm(
  salary ~ age + sex + gmat_tot + work_yrs + frstlang + satis,
  data = Job.df
)
summary(model4)
## 
## Call:
## lm(formula = salary ~ age + sex + gmat_tot + work_yrs + frstlang + 
##     satis, data = Job.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -26421  -9113  -1720   5518  77942 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    73754.350  31743.018   2.323   0.0223 *
## age             1753.296   1100.928   1.593   0.1145  
## sexFemale      -4993.576   3436.460  -1.453   0.1495  
## gmat_tot          -8.973     31.057  -0.289   0.7733  
## work_yrs         803.922   1134.132   0.709   0.4801  
## frstlangOthers 10165.631   6883.562   1.477   0.1430  
## satis          -2428.807   1993.480  -1.218   0.2261  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15540 on 96 degrees of freedom
## Multiple R-squared:  0.2884, Adjusted R-squared:  0.2439 
## F-statistic: 6.484 on 6 and 96 DF,  p-value: 9.105e-06

MODEL 3 seems to be the best model if we look at the multiple R-squared value. However, its adjusted R-squared value is noticeably lower than its multiple R-squared. R-squared will either stay the same or increase with the addition of more variables, even if they do not have any relationship with the output variable. Adjusted R-squared, in contrast, decreases when an added predictor improves the model by less than would be expected by chance, i.e. when we add variables that do not improve our existing model; it rewards only those predictors that genuinely contribute. In multivariate regression it is therefore often better to look at the adjusted R-squared value, which gives a fairer measure of goodness of fit.

Here, even in the adjusted R-squared value comparison, MODEL 3 is the best.

We can conclude that MBA recruiters look at a variety of factors before deciding upon the starting salary of their recruits.

TASK 1c: COMPARE THOSE WHO GOT A JOB WITH THOSE WHO DID NOT GET A JOB? IDENTIFY WHY?

Compare the subset of people who did not get a job with those who got a job. Here, we are not analysing what drives a higher salary. Instead, we are analysing the two groups: those who got a job and those who did not.

# Rows whose recorded salary is exactly 0 (treated in this report as "did not
# get a job").
noJob.df <- Data.df[which(Data.df$salary == 0), ]

Contingency Tables

# Count of noJob.df rows at each GMAT total score.
table1.1 <- table(gmat_tot = noJob.df$gmat_tot)
table1.1
## gmat_tot
## 450 480 510 530 540 550 560 570 580 590 600 610 620 630 640 650 660 670 
##   1   1   2   3   3   4   8   7   4   3   3   9   4   5   6   5   3   4 
## 680 700 710 720 730 740 750 760 
##   3   2   4   2   1   1   1   1
# Count of noJob.df rows for each first-language level.
table1.2 <- table(frstlang = noJob.df$frstlang)
table1.2
## frstlang
## English  Others 
##      82       8
# Count of noJob.df rows in each quarter value.
table1.3 <- table(quarter = noJob.df$quarter)
table1.3
## quarter
##  1  2  3  4 
## 18 27 23 22

CHI-SQUARE TESTS : To examine the effect of GMAT score, First language and quartile ranking.

# Chi-squared goodness-of-fit test on the one-way GMAT-score counts.
# NOTE(review): with a one-way table this tests the counts against equal
# probabilities within noJob.df only; it does not compare the no-job group with
# the group that got a job.
chisq.test(table1.1)
## Warning in chisq.test(table1.1): Chi-squared approximation may be incorrect
## 
##  Chi-squared test for given probabilities
## 
## data:  table1.1
## X-squared = 34.8, df = 25, p-value = 0.09188
# Chi-squared goodness-of-fit test on the one-way first-language counts.
# NOTE(review): this tests whether English/Others are equally frequent within
# noJob.df only; it does not compare against the group that got a job.
chisq.test(table1.2)
## 
##  Chi-squared test for given probabilities
## 
## data:  table1.2
## X-squared = 60.844, df = 1, p-value = 6.177e-15
# Chi-squared goodness-of-fit test on the one-way quarter counts.
# NOTE(review): this tests whether the four quarter values are equally frequent
# within noJob.df only; it does not compare against the group that got a job.
chisq.test(table1.3)
## 
##  Chi-squared test for given probabilities
## 
## data:  table1.3
## X-squared = 1.8222, df = 3, p-value = 0.6101

LOGISTIC REGRESSION

  1. Data Cleaning Process
# Re-read the raw CSV for the data-quality checks, treating empty strings as
# missing values.
# TRUE is spelled out because T is an ordinary variable that can be reassigned;
# na.strings takes the single string directly instead of c("").
training.data.raw <- read.csv(
  "MBA Starting Salaries Data.csv",
  header = TRUE,
  na.strings = ""
)

Checking for missing values :

# Number of missing values in each column. vapply() pins the result to one
# integer per column, whereas sapply()'s return type depends on its input.
vapply(
  training.data.raw,
  function(column) sum(is.na(column)),
  integer(1)
)
##      age      sex gmat_tot gmat_qpc gmat_vpc gmat_tpc    s_avg    f_avg 
##        0        0        0        0        0        0        0        0 
##  quarter work_yrs frstlang   salary    satis 
##        0        0        0        0        0

Since there are no missing values, we can use all the parameters further without having to account for them.

Checking for unique values :

# Number of distinct values in each column. vapply() pins the result to one
# integer per column, whereas sapply()'s return type depends on its input.
vapply(
  training.data.raw,
  function(column) length(unique(column)),
  integer(1)
)
##      age      sex gmat_tot gmat_qpc gmat_vpc gmat_tpc    s_avg    f_avg 
##       21        2       31       48       34       42       36       21 
##  quarter work_yrs frstlang   salary    satis 
##        4       18        2       45        8

Training and Testing the Data

# Sequential split: the first 260 rows form the training set and every
# remaining row forms the test set. The test range is derived from the data
# rather than a hard-coded last row (274), so it stays correct if the number of
# rows changes.
n_train <- 260
train <- head(Data.df, n_train)
test <- tail(Data.df, -n_train)

Modelling the Data : Relating it to the Titanic case, we need a categorical variable that is binary. In the MBA Starting Salaries dataset, first language is such a variable (sex is the only other column with exactly two values). Running a binomial logistic regression for it,

# Binomial logistic regression of frstlang on every other column of Data.df.
# frstlang is a two-level factor (English, Others); glm() treats the first
# level as failure, so the coefficients describe the log-odds of "Others".
# NOTE(review): the model is fitted on the full Data.df, so the train/test
# split created just above is never used -- confirm whether data = train was
# intended.
model <- glm(frstlang ~.,family=binomial(link='logit'),data=Data.df)
summary(model)
## 
## Call:
## glm(formula = frstlang ~ ., family = binomial(link = "logit"), 
##     data = Data.df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6145  -0.4053  -0.2344  -0.1219   3.0978  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.574e+00  6.905e+00  -0.662 0.507749    
## age          4.565e-01  1.379e-01   3.311 0.000929 ***
## sexFemale    7.118e-01  5.773e-01   1.233 0.217548    
## gmat_tot     4.741e-03  1.926e-02   0.246 0.805596    
## gmat_qpc     5.715e-03  7.441e-02   0.077 0.938772    
## gmat_vpc    -1.266e-01  5.546e-02  -2.283 0.022414 *  
## gmat_tpc     8.864e-02  8.407e-02   1.054 0.291715    
## s_avg       -3.829e+00  1.844e+00  -2.076 0.037851 *  
## f_avg        1.020e+00  9.147e-01   1.115 0.264847    
## quarter     -8.802e-01  5.240e-01  -1.680 0.093002 .  
## work_yrs    -4.408e-01  1.616e-01  -2.727 0.006383 ** 
## salary       3.349e-07  5.291e-06   0.063 0.949530    
## satis        1.161e-03  6.194e-04   1.874 0.060915 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 197.54  on 273  degrees of freedom
## Residual deviance: 129.01  on 261  degrees of freedom
## AIC: 155.01
## 
## Number of Fisher Scoring iterations: 7