R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

2a

data <- read.csv(paste("MBAData.csv", sep=""))
summary(data)

##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0

library(psych)
describe(data)

##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45

hist(data$age, breaks=5,xlab="Age in years", main="Age  distribution",col="Green Yellow")

hist(data$work_yrs, breaks=5,col="Green Yellow",xlab="Work Experience in years", main="Work experience distribution")

hist(data$gmat_tot, breaks=5,col="Green Yellow",xlab="Score out of 800", main="Gmat Score distribution")

newdata1 <- data[ which(data$salary !="998" & data$salary !="999"), ]
hist(newdata1$salary, breaks=10,col="Green Yellow",xlab="starting salary", main="Salary  distribution")

newdata <- data[ which(data$satis<='7'), ]
hist(newdata$satis, breaks=5,col="Green Yellow",xlab="Degree of Satisfaction,1=low 7=high", main="Satisfaction  distribution")

Scatterplots

library(car)

## 
## Attaching package: 'car'

## The following object is masked from 'package:psych':
## 
##     logit

scatterplot(salary ~age,     data=newdata1,
            spread=FALSE, smoother.args=list(lty=2),
            main="Scatter plot of Salary vs Age",
            xlab="age",
            ylab="salary")

scatterplot(salary ~sex,     data=newdata1,
            spread=FALSE, smoother.args=list(lty=2),
            main="Scatter plot of Salary vs Sex",
            xlab="sex",
            ylab="salary")

scatterplot(salary ~gmat_tot,     data=newdata1,
            main="Scatter plot of salary vs Gmat total",
            xlab="Gmat score",
            ylab="salary")

scatterplot(salary ~work_yrs,     data=newdata1,
            main="Scatter plot of salary vs Work exp.",
            xlab="Work experience in years",
            ylab="salary")

scatterplot(salary ~satis,     data=newdata1,
            main="Scatter plot of salary vs satisfaction",
            xlab="Degree of satisfaction",
            ylab="salary")

Corrgram

library(corrgram)
library(Hmisc)

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

## 
## Attaching package: 'Hmisc'

## The following object is masked from 'package:psych':
## 
##     describe

## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units

corrgram(data, order=FALSE, lower.panel=panel.shade,
         upper.panel=panel.pie, text.panel=panel.txt,
         main="Corrgram of variables ")

#Correlation matrix
colsalary <- c("salary","work_yrs","age","sex","gmat_tot","s_avg","f_avg")
corMatrix <- rcorr(as.matrix(data[,colsalary]))
corMatrix

##          salary work_yrs   age   sex gmat_tot s_avg f_avg
## salary     1.00     0.01 -0.06  0.07    -0.05  0.15  0.03
## work_yrs   0.01     1.00  0.86 -0.01    -0.18  0.13 -0.04
## age       -0.06     0.86  1.00 -0.03    -0.15  0.15 -0.02
## sex        0.07    -0.01 -0.03  1.00    -0.05  0.13  0.09
## gmat_tot  -0.05    -0.18 -0.15 -0.05     1.00  0.11  0.10
## s_avg      0.15     0.13  0.15  0.13     0.11  1.00  0.55
## f_avg      0.03    -0.04 -0.02  0.09     0.10  0.55  1.00
## 
## n= 274 
## 
## 
## P
##          salary work_yrs age    sex    gmat_tot s_avg  f_avg 
## salary          0.8818   0.3020 0.2560 0.3647   0.0157 0.6275
## work_yrs 0.8818          0.0000 0.8523 0.0024   0.0324 0.5197
## age      0.3020 0.0000          0.6432 0.0156   0.0131 0.7737
## sex      0.2560 0.8523   0.6432        0.3789   0.0355 0.1301
## gmat_tot 0.3647 0.0024   0.0156 0.3789          0.0615 0.0845
## s_avg    0.0157 0.0324   0.0131 0.0355 0.0615          0.0000
## f_avg    0.6275 0.5197   0.7737 0.1301 0.0845   0.0000

2b

attach(data)
job<-subset(data, salary != 998  & salary != 999 & salary != 0)


#Chi-square tests


chisq.test(job$salary, job$frstlang)

## Warning in chisq.test(job$salary, job$frstlang): Chi-squared approximation
## may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  job$salary and job$frstlang
## X-squared = 69.847, df = 41, p-value = 0.003296

#The p value suggests that salary and frstland are related.

chisq.test(xtabs(~salary+sex, data = job))

## Warning in chisq.test(xtabs(~salary + sex, data = job)): Chi-squared
## approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  xtabs(~salary + sex, data = job)
## X-squared = 52.681, df = 41, p-value = 0.1045

#The p value suggests that salary and sex are related.

t-tests

t.test(job$salary,job$age)

## 
##  Welch Two Sample t-test
## 
## data:  job$salary and job$age
## t = 58.503, df = 102, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   99511.69 106496.23
## sample estimates:
##   mean of x   mean of y 
## 103030.7379     26.7767

#Null hypothesis:There is no relation between salary and age.
#Result:Null hypothesis rejected

t.test(salary, work_yrs)

## 
##  Welch Two Sample t-test
## 
## data:  salary and work_yrs
## t = 12.677, df = 273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  32961.99 45081.64
## sample estimates:
##    mean of x    mean of y 
## 39025.689781     3.872263

#Null hypothesis:There is no relation between salary and work_yrs
#Result:Null hypothesis rejected
t.test(salary, gmat_tot)

## 
##  Welch Two Sample t-test
## 
## data:  salary and gmat_tot
## t = 12.477, df = 273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  32346.41 44466.06
## sample estimates:
##  mean of x  mean of y 
## 39025.6898   619.4526

#Null hypothesis:There is no relation between salary and gmat score.
#Result:Null hypothesis rejected
t.test(salary, frstlang)

## 
##  Welch Two Sample t-test
## 
## data:  salary and frstlang
## t = 12.678, df = 273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  32964.75 45084.40
## sample estimates:
##    mean of x    mean of y 
## 39025.689781     1.116788

#Null hypothesis:There is no relation between salary and frstlang
#Result:Null hypothesis rejected
t.test(salary,s_avg)

## 
##  Welch Two Sample t-test
## 
## data:  salary and s_avg
## t = 12.678, df = 273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  32962.84 45082.49
## sample estimates:
##    mean of x    mean of y 
## 39025.689781     3.025401

#Null hypothesis:There is no relation between salary and s_avg
#Result:Null hypothesis rejected
t.test(salary, f_avg)

## 
##  Welch Two Sample t-test
## 
## data:  salary and f_avg
## t = 12.678, df = 273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  32962.81 45082.45
## sample estimates:
##    mean of x    mean of y 
## 39025.689781     3.061533

#Null hypothesis:There is no relation between salary and f_avg
#Result:Null hypothesis rejected

Model1

fit1<-lm(salary ~ age+quarter+gmat_tpc+gmat_qpc+gmat_vpc+frstlang, data=job)
summary(fit1)

## 
## Call:
## lm(formula = salary ~ age + quarter + gmat_tpc + gmat_qpc + gmat_vpc + 
##     frstlang, data = job)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -26852  -9178   -615   5382  68168 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  44716.7    19552.1   2.287   0.0244 *  
## age           2583.0      508.3   5.082 1.84e-06 ***
## quarter      -1744.3     1375.4  -1.268   0.2078    
## gmat_tpc     -1424.6      683.3  -2.085   0.0397 *  
## gmat_qpc       834.1      350.0   2.383   0.0191 *  
## gmat_vpc       535.2      351.3   1.523   0.1309    
## frstlang      4649.7     6617.1   0.703   0.4840    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15120 on 96 degrees of freedom
## Multiple R-squared:  0.3262, Adjusted R-squared:  0.284 
## F-statistic: 7.744 on 6 and 96 DF,  p-value: 8.366e-07

Model2

fit2<-lm(salary ~ work_yrs + s_avg + f_avg + frstlang,data=job)
summary(fit2)

## 
## Call:
## lm(formula = salary ~ work_yrs + s_avg + f_avg + frstlang, data = job)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -35587  -8376   -696   4354  79427 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  72285.5    16125.7   4.483    2e-05 ***
## work_yrs      2314.5      575.8   4.019 0.000115 ***
## s_avg         4172.4     4946.2   0.844 0.400969    
## f_avg        -1867.2     3818.6  -0.489 0.625950    
## frstlang     14137.1     6451.7   2.191 0.030807 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15840 on 98 degrees of freedom
## Multiple R-squared:  0.2451, Adjusted R-squared:  0.2143 
## F-statistic: 7.955 on 4 and 98 DF,  p-value: 1.35e-05

Comparison of models

summary(fit1)$adj.r.squared

## [1] 0.2840354

AIC(fit1)

## [1] 2283.544

summary(fit2)$adj.r.squared

## [1] 0.2143036

AIC(fit2)

## [1] 2291.241

#We can clearly see that model 1 is better than model 2 because:
#Adjusted R-square of model1>model 2
#AIC(model1)<AIC(model2)

2c

Students who did not get a job

job <- job[1:90,]
nojob<-subset(data, salary != 998  & salary != 999 & salary == 0)

chisq.test(job$age,nojob$age)

## Warning in chisq.test(job$age, nojob$age): Chi-squared approximation may be
## incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  job$age and nojob$age
## X-squared = 229.27, df = 252, p-value = 0.8449

#* Age is not significant factor to get a job


chisq.test(job$sex,nojob$sex)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  job$sex and nojob$sex
## X-squared = 0.11711, df = 1, p-value = 0.7322

#* Sex is not significant factor to get a job


chisq.test(job$gmat_tpc,nojob$gmat_tpc)

## Warning in chisq.test(job$gmat_tpc, nojob$gmat_tpc): Chi-squared
## approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  job$gmat_tpc and nojob$gmat_tpc
## X-squared = 776.54, df = 784, p-value = 0.5683

#* Gmat total score is not significant factor to get a job



chisq.test(job$frstlang,nojob$frstlang)

## Warning in chisq.test(job$frstlang, nojob$frstlang): Chi-squared
## approximation may be incorrect

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  job$frstlang and nojob$frstlang
## X-squared = 0.0080703, df = 1, p-value = 0.9284

#* First language is not significant factor to get a job


chisq.test(job$work_yrs,nojob$work_yrs)

## Warning in chisq.test(job$work_yrs, nojob$work_yrs): Chi-squared
## approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  job$work_yrs and nojob$work_yrs
## X-squared = 117.66, df = 176, p-value = 0.9998

#* Work experience is not significant factor to get a job

2d

Logistic Regression

library(rcompanion)
library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(ROCR)

## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:Hmisc':
## 
##     combine, src, summarize

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

nagelkerke(fit1)

## $Models
##                                                                                     
## Model: "lm, salary ~ age + quarter + gmat_tpc + gmat_qpc + gmat_vpc + frstlang, job"
## Null:  "lm, salary ~ 1, job"                                                        
## 
## $Pseudo.R.squared.for.model.vs.null
##                              Pseudo.R.squared
## McFadden                            -0.148494
## Cox and Snell (ML)                 -16.225900
## Nagelkerke (Cragg and Uhler)       -16.225900
## 
## $Likelihood.ratio.test
##  Df.diff LogLik.diff   Chisq p.value
##       -6      146.59 -293.18       1
## 
## $Number.of.observations
##           
## Model: 103
## Null:   90
## 
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
## 
## $Warnings
## [1] "WARNING: Fitted and null models have different numbers of observations"

Data set before cleaning process

library(Amelia)

## Loading required package: Rcpp

## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

MBAsal<-subset(data, salary != 998  & salary != 999)
missmap(MBAsal, main = "Missing value before Cleaning",horizontal=FALSE)

Creating job column

MBAsal$job <- ifelse(MBAsal$salary > 0,1, 0)

Removing salary column

MBAsal$salary <- NULL

Model fitting

We split the data into two chunks: training and testing set. The training set will be used to fit model which we will be testing over the testing set.

trainingSet  <- MBAsal[1:153,]
testSet  <- MBAsal[154:193,]

Anova Test

newmodel <- glm(formula = job ~ age+gmat_tpc+gmat_qpc+gmat_vpc+frstlang+quarter, family = binomial(link = "logit"), data = trainingSet)
anova(newmodel,test="Chisq")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: job
## 
## Terms added sequentially (first to last)
## 
## 
##          Df Deviance Resid. Df Resid. Dev Pr(>Chi)  
## NULL                       152     210.63           
## age       1   4.9352       151     205.69  0.02632 *
## gmat_tpc  1   0.2843       150     205.41  0.59387  
## gmat_qpc  1   1.0497       149     204.36  0.30558  
## gmat_vpc  1   2.5692       148     201.79  0.10896  
## frstlang  1   0.3416       147     201.45  0.55891  
## quarter   1   4.0764       146     197.37  0.04349 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

newmodel <- glm(formula = job ~ age+quarter, family = binomial(link = "logit"), data = trainingSet)

Age and quartile ranking are significant predictor to model.

Statistical Tests for Individual Predictors

Variable Importance

To assess the relative importance of individual predictors in the model we use this to portray significance of variable to predict getting job.
Age is the most significant predictor.

library(caret)
varImp(newmodel)

##          Overall
## age     2.378017
## quarter 2.064746

Area under ROC Curve

AUC value of 0.809 shows a model with good predictive ability.

library(ROCR)
p <- predict(newmodel,testSet,type='response')
pr <- prediction(p, testSet$job)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc

## [1] 0.8095238

ROC Curve

The ROC is a curve generated by plotting the true positive rate (TPR) against the false positive rate (FPR) at various threshold settings.

plot(prf)

KS Statistic

Value lies between 40 and 70 implies it is able to separate events and non-events.

logit_ks <- max(prf@y.values[[1]]-prf@x.values[[1]])*100
logit_ks

## [1] 59.89975

Lift Chart

The predicted model(Blue line) plotted in lift chart gives great predictive lift over base model(Red line).
The level where a naive effort could produce a 20% rate of positive prediction, the model charted in blue produce about a 5 times multiple over baseline.

lift.obj <- performance(pr, measure="lift", x.measure="rpp")
plot(lift.obj,main="Lift Chart",xlab="% Populations",ylab="Lift", col="blue")
abline(1,0,col="red")

Gain Chart

The gap between the green line (response after predictive model) and the red line (response before predictive model) indicates the gains that the company can have using the predictive modelt that predict ALF.

lift.obj <- performance(pr, "tpr", x.measure="rpp")
plot(lift.obj,main="Gain Chart",xlab="Rate of positive prediction",ylab="True positive rate", col="green")
abline(0,1,col="red")

Note:

Students who didn’t attend the survey and didnt respond are removed from the dataset i.e., Students who had salary marked as 998 and 999

MBASalaries

Akshay Jha

December 26, 2017