Task 1A
setwd("C:/Users/Parul Verma/Desktop/Data Analytics Internship")
Data.df <- read.csv("MBA Starting Salaries Data.csv")
Describing the data using the summary() command -
summary(Data.df)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
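The summary shows that salary contains the codes 998 and 999 (excluded later as non-responses in Task 1B) and that satis also runs up to 998, which inflates its mean to 172.2. A minimal sketch that recodes these values as NA before summarising; treating 998/999 in satis the same way as in salary is an assumption based on its maximum value.
clean.df <- Data.df

# Assumption: 998/999 are non-response codes. The salary codes are confirmed
# by the subset used in Task 1B; applying the same rule to satis is an assumption.
clean.df$salary[clean.df$salary %in% c(998, 999)] <- NA
clean.df$satis[clean.df$satis %in% c(998, 999)] <- NA

# Means and quartiles now reflect genuine responses only.
summary(clean.df[, c("salary", "satis")])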
Histograms and bar plots to visualise the distribution of each variable independently -
library(psych)
AGE DISTRIBUTION
hist(Data.df$age, col="green",xlab="Age in years", main="Age Distribution")
GENDER DISTRIBUTION
Data.df$sex=factor(Data.df$sex, levels=c(1,2), labels=c("Male","Female"))
plot(Data.df$sex,col = "green",main = "Gender distribution")
GMAT SCORE DISTRIBUTION
hist(Data.df$gmat_tot,col="green",xlab="GMAT SCORES (total = 800)", main="GMAT Score Distribution")
WORK EXPERIENCE DISTRIBUTION
hist(Data.df$work_yrs, col="green",xlab="Work Experience (in years)", main="Work Experience Distribution")
FIRST LANGUAGE DISTRIBUTION
Data.df$frstlang=factor(Data.df$frstlang, levels=c(1,2), labels=c("English","Others"))
plot(Data.df$frstlang,col = "green",main = "First Language Distribution")
SALARY DISTRIBUTION
hist(Data.df$salary, col="green",xlab="Salary", main="Salary Distribution")
SATISFACTION DISTRIBUTION
hist(Data.df$satis, col="green",xlab="Satisfaction", main="Satisfaction Distribution")
SCATTER PLOT MATRIX
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
scatterplot(salary ~ age,data= Data.df,main="Scatter plot of Salary vs Age", xlab="Age", ylab="Salary")
scatterplot(salary ~ gmat_tot,data= Data.df,main="Scatter plot of Salary vs GMAT Total", xlab="GMAT Total", ylab="Salary")
scatterplot(salary ~ work_yrs,data= Data.df,main="Scatter plot of Salary vs Work Experience", xlab="Work Experience in years", ylab="Salary")
scatterplot(salary ~ satis,data= Data.df,main="Scatter plot of Salary vs Satisfaction", xlab="Satisfaction", ylab="Salary")
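The heading above promises a matrix, while the scatterplot() calls draw one panel at a time. A minimal sketch using car::scatterplotMatrix() (from the already-loaded car package) to draw all pairwise panels in one call; the choice of variables is illustrative.
# Pairwise scatter plots of salary against the main numeric predictors.
scatterplotMatrix(~ salary + age + gmat_tot + work_yrs + satis,
                  data = Data.df,
                  main = "Scatter Plot Matrix: Salary and Key Predictors")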
CORRGRAM OF DATA
library(corrgram)
corrgram(Data.df, order=TRUE, lower.panel=panel.shade, upper.panel=panel.pie, text.panel=panel.txt, main="Corrgram")
VARIANCE-COVARIANCE MATRIX
var(Data.df)
## Warning in var(Data.df): NAs introduced by coercion
## age sex gmat_tot gmat_qpc gmat_vpc
## age 1.376904e+01 NA -3.115879e+01 -1.192655e+01 -2.763643
## sex NA NA NA NA NA
## gmat_tot -3.115879e+01 NA 3.310688e+03 6.200233e+02 726.000642
## gmat_qpc -1.192655e+01 NA 6.200233e+02 2.210731e+02 38.148258
## gmat_vpc -2.763643e+00 NA 7.260006e+02 3.814826e+01 284.248122
## gmat_tpc -8.839978e+00 NA 6.839911e+02 1.357997e+02 157.493249
## s_avg 2.116874e-01 NA 2.480257e+00 -1.691233e-01 1.313570
## f_avg -3.399348e-02 NA 3.154688e+00 5.753854e-01 0.672070
## quarter -2.045935e-01 NA -5.891153e+00 6.001979e-01 -3.267667
## work_yrs 1.029494e+01 NA -3.391634e+01 -1.137186e+01 -3.618165
## frstlang NA NA NA NA NA
## salary -1.183042e+04 NA -1.611600e+05 -3.335823e+04 -5273.852384
## satis -1.763499e+02 NA 1.765263e+03 3.348371e+02 392.356274
## gmat_tpc s_avg f_avg quarter work_yrs
## age -8.8399775 0.2116874 -0.03399348 -2.045935e-01 10.2949386
## sex NA NA NA NA NA
## gmat_tot 683.9910698 2.4802572 3.15468838 -5.891153e+00 -33.9163391
## gmat_qpc 135.7996845 -0.1691233 0.57538542 6.001979e-01 -11.3718617
## gmat_vpc 157.4932488 1.3135702 0.67207000 -3.267667e+00 -3.6181653
## gmat_tpc 196.6057057 0.6271001 0.58698618 -1.292372e+00 -7.8575172
## s_avg 0.6271001 0.1452176 0.11016898 -3.223721e-01 0.1592639
## f_avg 0.5869862 0.1101690 0.27567237 -2.608088e-01 -0.0662870
## quarter -1.2923719 -0.3223721 -0.26080880 1.232119e+00 -0.3086682
## work_yrs -7.8575172 0.1592639 -0.06628700 -3.086682e-01 10.4488249
## frstlang NA NA NA NA NA
## salary 3522.7500067 2831.6009858 787.65597177 -9.296214e+03 1486.1470415
## satis 484.2466779 -4.6288450 2.12532927 -5.227133e-03 -131.2408091
## frstlang salary satis
## age NA -1.183042e+04 -1.763499e+02
## sex NA NA NA
## gmat_tot NA -1.611600e+05 1.765263e+03
## gmat_qpc NA -3.335823e+04 3.348371e+02
## gmat_vpc NA -5.273852e+03 3.923563e+02
## gmat_tpc NA 3.522750e+03 4.842467e+02
## s_avg NA 2.831601e+03 -4.628845e+00
## f_avg NA 7.876560e+02 2.125329e+00
## quarter NA -9.296214e+03 -5.227133e-03
## work_yrs NA 1.486147e+03 -1.312408e+02
## frstlang NA NA NA
## salary NA 2.596062e+09 -6.347115e+06
## satis NA -6.347115e+06 1.380974e+05
Restricting to the numeric columns avoids the NAs introduced by the two factor variables (sex and frstlang):
A <- Data.df[,c("age", "gmat_tot", "gmat_qpc", "gmat_vpc","gmat_tpc","s_avg","f_avg","work_yrs","salary")]
cov(A)
## age gmat_tot gmat_qpc gmat_vpc
## age 1.376904e+01 -3.115879e+01 -1.192655e+01 -2.763643
## gmat_tot -3.115879e+01 3.310688e+03 6.200233e+02 726.000642
## gmat_qpc -1.192655e+01 6.200233e+02 2.210731e+02 38.148258
## gmat_vpc -2.763643e+00 7.260006e+02 3.814826e+01 284.248122
## gmat_tpc -8.839978e+00 6.839911e+02 1.357997e+02 157.493249
## s_avg 2.116874e-01 2.480257e+00 -1.691233e-01 1.313570
## f_avg -3.399348e-02 3.154688e+00 5.753854e-01 0.672070
## work_yrs 1.029494e+01 -3.391634e+01 -1.137186e+01 -3.618165
## salary -1.183042e+04 -1.611600e+05 -3.335823e+04 -5273.852384
## gmat_tpc s_avg f_avg work_yrs salary
## age -8.8399775 0.2116874 -0.03399348 10.2949386 -1.183042e+04
## gmat_tot 683.9910698 2.4802572 3.15468838 -33.9163391 -1.611600e+05
## gmat_qpc 135.7996845 -0.1691233 0.57538542 -11.3718617 -3.335823e+04
## gmat_vpc 157.4932488 1.3135702 0.67207000 -3.6181653 -5.273852e+03
## gmat_tpc 196.6057057 0.6271001 0.58698618 -7.8575172 3.522750e+03
## s_avg 0.6271001 0.1452176 0.11016898 0.1592639 2.831601e+03
## f_avg 0.5869862 0.1101690 0.27567237 -0.0662870 7.876560e+02
## work_yrs -7.8575172 0.1592639 -0.06628700 10.4488249 1.486147e+03
## salary 3522.7500067 2831.6009858 787.65597177 1486.1470415 2.596062e+09
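Covariances are scale-dependent (the salary terms dwarf everything else), so a correlation matrix on the same numeric subset is easier to read. A minimal sketch using base R's cor() on the subset A defined above.
# Pearson correlations on the numeric subset, rounded for readability.
round(cor(A), 2)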
Task 1B : WHO GOT HOW MUCH SALARY?
Taking a subset of the dataset consisting of only those people who actually got a job -
Job.df <- subset(Data.df, salary>0 & salary!= 998 & salary!=999)
Using this subset, think of the problem as y = f(x), where y is the starting salary and x is the set of factors it could depend on: for example gender, first language, prior work experience, GMAT performance, and MBA performance.
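Before tabulating and modelling, a quick exploratory check of how many graduates report a usable salary and how it is distributed; a minimal sketch on the subset just defined.
# Number of graduates with a reported placement salary, and its distribution.
nrow(Job.df)
summary(Job.df$salary)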
Contingency tables and tables of group means -
table1 <- with(Job.df, table(age))
table1
## age
## 22 23 24 25 26 27 28 29 30 31 32 33 34 39 40
## 1 5 16 23 14 14 8 6 6 4 1 1 1 1 2
table2 <- with(Job.df, table(sex))
table2
## sex
## Male Female
## 72 31
table3 <- aggregate(salary ~ gmat_tot, data=Job.df, mean)
table3
## gmat_tot salary
## 1 500 158250.0
## 2 520 78256.0
## 3 530 99500.0
## 4 540 104000.0
## 5 550 112236.7
## 6 560 94000.0
## 7 570 103857.1
## 8 580 99875.0
## 9 590 97000.0
## 10 600 107666.7
## 11 610 96200.0
## 12 620 104108.3
## 13 630 105812.5
## 14 640 110000.0
## 15 650 101285.7
## 16 660 92480.0
## 17 670 100642.9
## 18 680 102166.7
## 19 700 122333.3
## 20 710 101250.0
## 21 720 85000.0
table4<-xtabs(~salary+frstlang,data=Job.df)
table4
## frstlang
## salary English Others
## 64000 1 0
## 77000 1 0
## 78256 1 0
## 82000 1 0
## 85000 4 0
## 86000 2 0
## 88000 1 0
## 88500 1 0
## 90000 3 0
## 92000 3 0
## 93000 3 0
## 95000 7 0
## 96000 4 0
## 96500 1 0
## 97000 2 0
## 98000 8 2
## 99000 0 1
## 100000 9 0
## 100400 1 0
## 101000 2 0
## 101100 1 0
## 101600 1 0
## 102500 1 0
## 103000 1 0
## 104000 1 1
## 105000 11 0
## 106000 3 0
## 107000 1 0
## 107300 0 1
## 107500 1 0
## 108000 2 0
## 110000 1 0
## 112000 3 0
## 115000 5 0
## 118000 0 1
## 120000 4 0
## 126710 1 0
## 130000 1 0
## 145800 1 0
## 146000 1 0
## 162000 1 0
## 220000 0 1
table5 <- aggregate(salary ~ s_avg + f_avg, data=Job.df, mean)
table5
## s_avg f_avg salary
## 1 4.00 0.00 146000.00
## 2 2.20 2.00 105000.00
## 3 2.40 2.00 85000.00
## 4 2.40 2.25 90000.00
## 5 2.30 2.50 98000.00
## 6 2.50 2.50 96000.00
## 7 2.60 2.50 107700.00
## 8 2.70 2.50 90000.00
## 9 3.50 2.67 86000.00
## 10 2.40 2.75 99500.00
## 11 2.50 2.75 220000.00
## 12 2.60 2.75 114155.00
## 13 2.70 2.75 90750.00
## 14 2.80 2.75 104833.33
## 15 2.90 2.75 91085.33
## 16 3.00 2.75 97250.00
## 17 2.91 2.83 105000.00
## 18 2.50 3.00 77000.00
## 19 2.60 3.00 100000.00
## 20 2.70 3.00 98000.00
## 21 2.80 3.00 99700.00
## 22 2.90 3.00 101400.00
## 23 3.00 3.00 105000.00
## 24 3.09 3.00 100000.00
## 25 3.10 3.00 112450.00
## 26 3.20 3.00 109000.00
## 27 3.30 3.00 105000.00
## 28 3.40 3.00 100000.00
## 29 3.50 3.00 113000.00
## 30 2.80 3.25 98000.00
## 31 2.90 3.25 93000.00
## 32 3.00 3.25 107500.00
## 33 3.20 3.25 105166.67
## 34 3.27 3.25 95000.00
## 35 3.30 3.25 101416.67
## 36 3.40 3.25 90000.00
## 37 3.50 3.25 97333.33
## 38 3.10 3.33 82000.00
## 39 2.90 3.50 107300.00
## 40 3.09 3.50 107000.00
## 41 3.10 3.50 96500.00
## 42 3.20 3.50 95000.00
## 43 3.30 3.50 95750.00
## 44 3.45 3.50 105000.00
## 45 3.50 3.50 111500.00
## 46 3.60 3.50 110500.00
## 47 3.80 3.50 105000.00
## 48 3.50 3.60 85000.00
## 49 3.70 3.60 106000.00
## 50 3.40 3.67 100000.00
## 51 3.60 3.67 95000.00
## 52 3.40 3.75 93000.00
## 53 3.50 3.75 85000.00
## 54 3.60 3.75 162000.00
## 55 3.70 4.00 115000.00
## 56 3.80 4.00 120000.00
table6 <- aggregate(salary ~ work_yrs, data=Job.df, mean)
table6
## work_yrs salary
## 1 0 95000.00
## 2 1 103532.00
## 3 2 97673.68
## 4 3 101652.86
## 5 4 105454.55
## 6 5 103142.86
## 7 6 105928.57
## 8 7 98000.00
## 9 8 105025.00
## 10 10 118000.00
## 11 15 183000.00
## 12 16 108500.00
T-TESTS - To examine the effect of gender and first language on salary from the output below:
t.test(salary ~ sex, data=Job.df)
##
## Welch Two Sample t-test
##
## data: salary by sex
## t = 1.3628, df = 38.115, p-value = 0.1809
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3128.55 16021.72
## sample estimates:
## mean in group Male mean in group Female
## 104970.97 98524.39
t.test(salary ~ frstlang, data=Job.df)
##
## Welch Two Sample t-test
##
## data: salary by frstlang
## t = -1.1202, df = 6.0863, p-value = 0.3049
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -59933.62 22202.25
## sample estimates:
## mean in group English mean in group Others
## 101748.6 120614.3
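The two t-tests compare group means; side-by-side boxplots make the same comparison visually and show the spread behind the differences. A minimal sketch using base graphics.
# Visual counterparts to the two t-tests above.
par(mfrow = c(1, 2))
boxplot(salary ~ sex, data = Job.df, col = "green",
        main = "Salary by Gender", ylab = "Salary")
boxplot(salary ~ frstlang, data = Job.df, col = "green",
        main = "Salary by First Language", ylab = "Salary")
par(mfrow = c(1, 1))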
CHI-SQUARE TESTS - To examine the effect of first language and work experience on salary from the p-values obtained -
chisq.test(table4)
## Warning in chisq.test(table4): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table4
## X-squared = 69.847, df = 41, p-value = 0.003296
chisq.test(table6)
##
## Pearson's Chi-squared test
##
## data: table6
## X-squared = 33.445, df = 11, p-value = 0.0004455
Since the p-value is well below 0.05 in both tests, there is evidence of an association between salary and each of these variables.
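Note that chisq.test() expects a table of counts: table4 is one, but table6 is a data frame of group means, so that second result should be read with caution. A hedged sketch of a count-based alternative that bands both variables first; the cut points are illustrative assumptions, and the approximation may still warn if cells are sparse.
# Assumption: illustrative cut points for banding work experience and salary.
table6b <- table(cut(Job.df$work_yrs, breaks = c(-1, 2, 5, 25),
                     labels = c("0-2 yrs", "3-5 yrs", "6+ yrs")),
                 cut(Job.df$salary, breaks = c(0, 95000, 110000, Inf),
                     labels = c("<95k", "95-110k", ">110k")))
table6b
chisq.test(table6b)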
REGRESSION MODELS -
MODEL 1
model1 <- lm(salary ~ age + sex + gmat_tot + work_yrs + s_avg + f_avg + frstlang, data = Job.df)
summary(model1)
##
## Call:
## lm(formula = salary ~ age + sex + gmat_tot + work_yrs + s_avg +
## f_avg + frstlang, data = Job.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31445 -8977 -2293 6114 80584
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58980.07 32358.38 1.823 0.0715 .
## age 1551.84 1123.08 1.382 0.1703
## sexFemale -5000.72 3505.60 -1.426 0.1570
## gmat_tot -15.97 31.75 -0.503 0.6161
## work_yrs 832.83 1146.63 0.726 0.4694
## s_avg 3804.74 5020.77 0.758 0.4504
## f_avg -558.44 3828.62 -0.146 0.8843
## frstlangOthers 10881.96 7130.15 1.526 0.1303
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15690 on 95 degrees of freedom
## Multiple R-squared: 0.2823, Adjusted R-squared: 0.2294
## F-statistic: 5.337 on 7 and 95 DF, p-value: 3.562e-05
MODEL 2
model2 <- lm(salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc + s_avg + f_avg + frstlang, data = Job.df)
summary(model2)
##
## Call:
## lm(formula = salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc +
## s_avg + f_avg + frstlang, data = Job.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -42604 -7362 -369 6038 89453
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 108645.92 45824.46 2.371 0.0198 *
## gmat_tot -13.65 182.44 -0.075 0.9405
## gmat_qpc 897.73 528.78 1.698 0.0928 .
## gmat_vpc 727.96 530.24 1.373 0.1730
## gmat_tpc -1679.17 767.96 -2.187 0.0312 *
## s_avg 12713.89 5096.70 2.495 0.0143 *
## f_avg -7948.11 3898.87 -2.039 0.0443 *
## frstlangOthers 17814.16 6863.28 2.596 0.0109 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16800 on 95 degrees of freedom
## Multiple R-squared: 0.1769, Adjusted R-squared: 0.1162
## F-statistic: 2.917 on 7 and 95 DF, p-value: 0.008286
MODEL 3
model3 <- lm(salary ~ age + sex + gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc + s_avg + f_avg + quarter + work_yrs + frstlang + satis, data = Job.df)
summary(model3)
##
## Call:
## lm(formula = salary ~ age + sex + gmat_tot + gmat_qpc + gmat_vpc +
## gmat_tpc + s_avg + f_avg + quarter + work_yrs + frstlang +
## satis, data = Job.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26489 -7983 -373 5923 70602
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82141.01 54281.71 1.513 0.1337
## age 1750.65 1130.92 1.548 0.1251
## sexFemale -3584.07 3595.85 -0.997 0.3216
## gmat_tot 16.19 178.85 0.090 0.9281
## gmat_qpc 796.55 496.78 1.603 0.1123
## gmat_vpc 546.31 501.97 1.088 0.2794
## gmat_tpc -1457.09 714.94 -2.038 0.0445 *
## s_avg -931.53 8240.31 -0.113 0.9102
## f_avg -2222.82 3894.57 -0.571 0.5696
## quarter -2336.56 2721.89 -0.858 0.3929
## work_yrs 749.66 1135.90 0.660 0.5110
## frstlangOthers 7719.42 7373.27 1.047 0.2979
## satis -1086.54 2157.76 -0.504 0.6158
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15430 on 90 degrees of freedom
## Multiple R-squared: 0.3422, Adjusted R-squared: 0.2545
## F-statistic: 3.902 on 12 and 90 DF, p-value: 8.086e-05
MODEL 4
model4 <- lm(salary ~ age + sex + gmat_tot + work_yrs + frstlang + satis, data = Job.df)
summary(model4)
##
## Call:
## lm(formula = salary ~ age + sex + gmat_tot + work_yrs + frstlang +
## satis, data = Job.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26421 -9113 -1720 5518 77942
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73754.350 31743.018 2.323 0.0223 *
## age 1753.296 1100.928 1.593 0.1145
## sexFemale -4993.576 3436.460 -1.453 0.1495
## gmat_tot -8.973 31.057 -0.289 0.7733
## work_yrs 803.922 1134.132 0.709 0.4801
## frstlangOthers 10165.631 6883.562 1.477 0.1430
## satis -2428.807 1993.480 -1.218 0.2261
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15540 on 96 degrees of freedom
## Multiple R-squared: 0.2884, Adjusted R-squared: 0.2439
## F-statistic: 6.484 on 6 and 96 DF, p-value: 9.105e-06
MODEL 3 has the highest multiple R-squared, but multiple R-squared never decreases when more predictors are added, even if they have no real relationship with the outcome. In multivariate regression it is therefore better to compare adjusted R-squared, which penalises predictors that improve the fit by less than would be expected by chance, so only genuinely useful additions raise it.
Even on adjusted R-squared, MODEL 3 (0.2545) edges out MODEL 4 (0.2439), MODEL 1 (0.2294) and MODEL 2 (0.1162), so it remains the best-fitting model of the four.
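The same comparison can be read off programmatically; a minimal sketch that collects adjusted R-squared from each fitted model and adds AIC as a second yardstick (AIC is not used elsewhere in this report).
# Compare the four fitted models on adjusted R-squared and AIC.
models <- list(model1, model2, model3, model4)
adj.r2 <- sapply(models, function(m) summary(m)$adj.r.squared)
aic    <- sapply(models, AIC)
data.frame(model  = paste0("model", 1:4),
           adj.r2 = round(adj.r2, 3),
           AIC    = round(aic, 1))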
We can conclude that MBA recruiters look at a variety of factors before deciding upon the starting salary of their recruits.
Task 1C: COMPARE THOSE WHO GOT A JOB WITH THOSE WHO DID NOT, AND IDENTIFY WHY
Here we take the remaining subset, the people who did not get a job, and compare them with those who did. The question is no longer what drives a higher salary but what separates the two groups.
noJob.df <- subset(Data.df, salary==0)
Contingency Tables
table1.1 <- with(noJob.df, table(gmat_tot))
table1.1
## gmat_tot
## 450 480 510 530 540 550 560 570 580 590 600 610 620 630 640 650 660 670
## 1 1 2 3 3 4 8 7 4 3 3 9 4 5 6 5 3 4
## 680 700 710 720 730 740 750 760
## 3 2 4 2 1 1 1 1
table1.2 <- with(noJob.df, table(frstlang))
table1.2
## frstlang
## English Others
## 82 8
table1.3 <- with(noJob.df, table(quarter))
table1.3
## quarter
## 1 2 3 4
## 18 27 23 22
CHI-SQUARE TESTS: To examine the distributions of GMAT score, first language and quartile ranking within the no-job group.
chisq.test(table1.1)
## Warning in chisq.test(table1.1): Chi-squared approximation may be incorrect
##
## Chi-squared test for given probabilities
##
## data: table1.1
## X-squared = 34.8, df = 25, p-value = 0.09188
chisq.test(table1.2)
##
## Chi-squared test for given probabilities
##
## data: table1.2
## X-squared = 60.844, df = 1, p-value = 6.177e-15
chisq.test(table1.3)
##
## Chi-squared test for given probabilities
##
## data: table1.3
## X-squared = 1.8222, df = 3, p-value = 0.6101
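To compare the two groups directly, an employment indicator can be attached to the data and tested; a minimal sketch, assuming (as in the subsets above) that salary == 0 marks "no job" and a positive salary other than the 998/999 codes marks "got a job".
# Flag each graduate as placed / not placed, using the same coding as the subsets above.
compare.df <- subset(Data.df, salary == 0 | (salary > 0 & salary != 998 & salary != 999))
compare.df$placed <- factor(compare.df$salary > 0, labels = c("No job", "Job"))

# Do the two groups differ on GMAT total and work experience?
t.test(gmat_tot ~ placed, data = compare.df)
t.test(work_yrs ~ placed, data = compare.df)

# Is first language associated with placement?
chisq.test(table(compare.df$frstlang, compare.df$placed))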
LOGISTIC REGRESSION
training.data.raw <- read.csv('MBA Starting Salaries Data.csv',header=T,na.strings=c(""))
Checking for missing values :
sapply(training.data.raw,function(x) sum(is.na(x)))
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## 0 0 0 0 0 0 0 0
## quarter work_yrs frstlang salary satis
## 0 0 0 0 0
Since there are no missing values, all the variables can be used without any imputation or case deletion.
Checking for unique values :
sapply(training.data.raw, function(x) length(unique(x)))
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## 21 2 31 48 34 42 36 21
## quarter work_yrs frstlang salary satis
## 4 18 2 45 8
Training and Testing the Data
train <- Data.df[1:260,]
test <- Data.df[261:274,]
Modelling the Data: As in the Titanic case, logistic regression needs a binary categorical response. In the MBA Starting Salaries dataset, first language (English vs. Others) is a natural choice, so we run a binomial logistic regression for it:
model <- glm(frstlang ~.,family=binomial(link='logit'),data=Data.df)
summary(model)
##
## Call:
## glm(formula = frstlang ~ ., family = binomial(link = "logit"),
## data = Data.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6145 -0.4053 -0.2344 -0.1219 3.0978
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.574e+00 6.905e+00 -0.662 0.507749
## age 4.565e-01 1.379e-01 3.311 0.000929 ***
## sexFemale 7.118e-01 5.773e-01 1.233 0.217548
## gmat_tot 4.741e-03 1.926e-02 0.246 0.805596
## gmat_qpc 5.715e-03 7.441e-02 0.077 0.938772
## gmat_vpc -1.266e-01 5.546e-02 -2.283 0.022414 *
## gmat_tpc 8.864e-02 8.407e-02 1.054 0.291715
## s_avg -3.829e+00 1.844e+00 -2.076 0.037851 *
## f_avg 1.020e+00 9.147e-01 1.115 0.264847
## quarter -8.802e-01 5.240e-01 -1.680 0.093002 .
## work_yrs -4.408e-01 1.616e-01 -2.727 0.006383 **
## salary 3.349e-07 5.291e-06 0.063 0.949530
## satis 1.161e-03 6.194e-04 1.874 0.060915 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 197.54 on 273 degrees of freedom
## Residual deviance: 129.01 on 261 degrees of freedom
## AIC: 155.01
##
## Number of Fisher Scoring iterations: 7
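To make use of the train/test split defined earlier, the model can be refit on train and assessed on test. A minimal sketch, assuming the same formula as above and a 0.5 probability cutoff; note that the summary above was produced by fitting on the full Data.df, so the refit coefficients will differ slightly, and with only 14 held-out rows (very few of them "Others") the accuracy estimate is noisy.
# Refit on the training rows only (assumption: same formula as above).
model.train <- glm(frstlang ~ ., family = binomial(link = "logit"), data = train)

# Predicted probability of "Others" for the held-out rows, then a 0.5 cutoff.
pred.prob  <- predict(model.train, newdata = test, type = "response")
pred.class <- ifelse(pred.prob > 0.5, "Others", "English")

# Simple hold-out accuracy on the 14 test rows.
mean(pred.class == test$frstlang)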