R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

2a

data <- read.csv(paste("MBAData.csv", sep=""))
summary(data)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0
library(psych)
describe(data)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45
hist(data$age, breaks=5,xlab="Age in years", main="Age  distribution",col="Green Yellow")

hist(data$work_yrs, breaks=5,col="Green Yellow",xlab="Work Experience in years", main="Work experience distribution")

hist(data$gmat_tot, breaks=5,col="Green Yellow",xlab="Score out of 800", main="Gmat Score distribution")

newdata1 <- data[ which(data$salary !="998" & data$salary !="999"), ]
hist(newdata1$salary, breaks=10,col="Green Yellow",xlab="starting salary", main="Salary  distribution")

newdata <- data[ which(data$satis<='7'), ]
hist(newdata$satis, breaks=5,col="Green Yellow",xlab="Degree of Satisfaction,1=low 7=high", main="Satisfaction  distribution")

Scatterplots

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
scatterplot(salary ~age,     data=newdata1,
            spread=FALSE, smoother.args=list(lty=2),
            main="Scatter plot of Salary vs Age",
            xlab="age",
            ylab="salary")

scatterplot(salary ~sex,     data=newdata1,
            spread=FALSE, smoother.args=list(lty=2),
            main="Scatter plot of Salary vs Sex",
            xlab="sex",
            ylab="salary")

scatterplot(salary ~gmat_tot,     data=newdata1,
            main="Scatter plot of salary vs Gmat total",
            xlab="Gmat score",
            ylab="salary")

scatterplot(salary ~work_yrs,     data=newdata1,
            main="Scatter plot of salary vs Work exp.",
            xlab="Work experience in years",
            ylab="salary")

scatterplot(salary ~satis,     data=newdata1,
            main="Scatter plot of salary vs satisfaction",
            xlab="Degree of satisfaction",
            ylab="salary")

Corrgram

library(corrgram)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
## 
##     describe
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
corrgram(data, order=FALSE, lower.panel=panel.shade,
         upper.panel=panel.pie, text.panel=panel.txt,
         main="Corrgram of variables ")

#Correlation matrix
colsalary <- c("salary","work_yrs","age","sex","gmat_tot","s_avg","f_avg")
corMatrix <- rcorr(as.matrix(data[,colsalary]))
corMatrix
##          salary work_yrs   age   sex gmat_tot s_avg f_avg
## salary     1.00     0.01 -0.06  0.07    -0.05  0.15  0.03
## work_yrs   0.01     1.00  0.86 -0.01    -0.18  0.13 -0.04
## age       -0.06     0.86  1.00 -0.03    -0.15  0.15 -0.02
## sex        0.07    -0.01 -0.03  1.00    -0.05  0.13  0.09
## gmat_tot  -0.05    -0.18 -0.15 -0.05     1.00  0.11  0.10
## s_avg      0.15     0.13  0.15  0.13     0.11  1.00  0.55
## f_avg      0.03    -0.04 -0.02  0.09     0.10  0.55  1.00
## 
## n= 274 
## 
## 
## P
##          salary work_yrs age    sex    gmat_tot s_avg  f_avg 
## salary          0.8818   0.3020 0.2560 0.3647   0.0157 0.6275
## work_yrs 0.8818          0.0000 0.8523 0.0024   0.0324 0.5197
## age      0.3020 0.0000          0.6432 0.0156   0.0131 0.7737
## sex      0.2560 0.8523   0.6432        0.3789   0.0355 0.1301
## gmat_tot 0.3647 0.0024   0.0156 0.3789          0.0615 0.0845
## s_avg    0.0157 0.0324   0.0131 0.0355 0.0615          0.0000
## f_avg    0.6275 0.5197   0.7737 0.1301 0.0845   0.0000

2b

attach(data)
job<-subset(data, salary != 998  & salary != 999 & salary != 0)


#Chi-square tests


chisq.test(job$salary, job$frstlang)
## Warning in chisq.test(job$salary, job$frstlang): Chi-squared approximation
## may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  job$salary and job$frstlang
## X-squared = 69.847, df = 41, p-value = 0.003296
#The p value suggests that salary and frstland are related.

chisq.test(xtabs(~salary+sex, data = job))
## Warning in chisq.test(xtabs(~salary + sex, data = job)): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  xtabs(~salary + sex, data = job)
## X-squared = 52.681, df = 41, p-value = 0.1045
#The p value suggests that salary and sex are related.

t-tests

t.test(job$salary,job$age)
## 
##  Welch Two Sample t-test
## 
## data:  job$salary and job$age
## t = 58.503, df = 102, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   99511.69 106496.23
## sample estimates:
##   mean of x   mean of y 
## 103030.7379     26.7767
#Null hypothesis:There is no relation between salary and age.
#Result:Null hypothesis rejected

t.test(salary, work_yrs)
## 
##  Welch Two Sample t-test
## 
## data:  salary and work_yrs
## t = 12.677, df = 273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  32961.99 45081.64
## sample estimates:
##    mean of x    mean of y 
## 39025.689781     3.872263
#Null hypothesis:There is no relation between salary and work_yrs
#Result:Null hypothesis rejected
t.test(salary, gmat_tot)
## 
##  Welch Two Sample t-test
## 
## data:  salary and gmat_tot
## t = 12.477, df = 273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  32346.41 44466.06
## sample estimates:
##  mean of x  mean of y 
## 39025.6898   619.4526
#Null hypothesis:There is no relation between salary and gmat score.
#Result:Null hypothesis rejected
t.test(salary, frstlang)
## 
##  Welch Two Sample t-test
## 
## data:  salary and frstlang
## t = 12.678, df = 273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  32964.75 45084.40
## sample estimates:
##    mean of x    mean of y 
## 39025.689781     1.116788
#Null hypothesis:There is no relation between salary and frstlang
#Result:Null hypothesis rejected
t.test(salary,s_avg)
## 
##  Welch Two Sample t-test
## 
## data:  salary and s_avg
## t = 12.678, df = 273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  32962.84 45082.49
## sample estimates:
##    mean of x    mean of y 
## 39025.689781     3.025401
#Null hypothesis:There is no relation between salary and s_avg
#Result:Null hypothesis rejected
t.test(salary, f_avg)
## 
##  Welch Two Sample t-test
## 
## data:  salary and f_avg
## t = 12.678, df = 273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  32962.81 45082.45
## sample estimates:
##    mean of x    mean of y 
## 39025.689781     3.061533
#Null hypothesis:There is no relation between salary and f_avg
#Result:Null hypothesis rejected

Model1

fit1<-lm(salary ~ age+quarter+gmat_tpc+gmat_qpc+gmat_vpc+frstlang, data=job)
summary(fit1)
## 
## Call:
## lm(formula = salary ~ age + quarter + gmat_tpc + gmat_qpc + gmat_vpc + 
##     frstlang, data = job)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -26852  -9178   -615   5382  68168 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  44716.7    19552.1   2.287   0.0244 *  
## age           2583.0      508.3   5.082 1.84e-06 ***
## quarter      -1744.3     1375.4  -1.268   0.2078    
## gmat_tpc     -1424.6      683.3  -2.085   0.0397 *  
## gmat_qpc       834.1      350.0   2.383   0.0191 *  
## gmat_vpc       535.2      351.3   1.523   0.1309    
## frstlang      4649.7     6617.1   0.703   0.4840    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15120 on 96 degrees of freedom
## Multiple R-squared:  0.3262, Adjusted R-squared:  0.284 
## F-statistic: 7.744 on 6 and 96 DF,  p-value: 8.366e-07

Model2

fit2<-lm(salary ~ work_yrs + s_avg + f_avg + frstlang,data=job)
summary(fit2)
## 
## Call:
## lm(formula = salary ~ work_yrs + s_avg + f_avg + frstlang, data = job)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -35587  -8376   -696   4354  79427 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  72285.5    16125.7   4.483    2e-05 ***
## work_yrs      2314.5      575.8   4.019 0.000115 ***
## s_avg         4172.4     4946.2   0.844 0.400969    
## f_avg        -1867.2     3818.6  -0.489 0.625950    
## frstlang     14137.1     6451.7   2.191 0.030807 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15840 on 98 degrees of freedom
## Multiple R-squared:  0.2451, Adjusted R-squared:  0.2143 
## F-statistic: 7.955 on 4 and 98 DF,  p-value: 1.35e-05

Comparison of models

summary(fit1)$adj.r.squared
## [1] 0.2840354
AIC(fit1)
## [1] 2283.544
summary(fit2)$adj.r.squared
## [1] 0.2143036
AIC(fit2)
## [1] 2291.241
#We can clearly see that model 1 is better than model 2 because:
#Adjusted R-square of model1>model 2
#AIC(model1)<AIC(model2)

2c

Students who did not get a job

job <- job[1:90,]
nojob<-subset(data, salary != 998  & salary != 999 & salary == 0)

chisq.test(job$age,nojob$age)
## Warning in chisq.test(job$age, nojob$age): Chi-squared approximation may be
## incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  job$age and nojob$age
## X-squared = 229.27, df = 252, p-value = 0.8449
#* Age is not significant factor to get a job


chisq.test(job$sex,nojob$sex)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  job$sex and nojob$sex
## X-squared = 0.11711, df = 1, p-value = 0.7322
#* Sex is not significant factor to get a job


chisq.test(job$gmat_tpc,nojob$gmat_tpc)
## Warning in chisq.test(job$gmat_tpc, nojob$gmat_tpc): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  job$gmat_tpc and nojob$gmat_tpc
## X-squared = 776.54, df = 784, p-value = 0.5683
#* Gmat total score is not significant factor to get a job



chisq.test(job$frstlang,nojob$frstlang)
## Warning in chisq.test(job$frstlang, nojob$frstlang): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  job$frstlang and nojob$frstlang
## X-squared = 0.0080703, df = 1, p-value = 0.9284
#* First language is not significant factor to get a job


chisq.test(job$work_yrs,nojob$work_yrs)
## Warning in chisq.test(job$work_yrs, nojob$work_yrs): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  job$work_yrs and nojob$work_yrs
## X-squared = 117.66, df = 176, p-value = 0.9998
#* Work experience is not significant factor to get a job

2d

Logistic Regression

library(rcompanion)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
## 
##     combine, src, summarize
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
nagelkerke(fit1)
## $Models
##                                                                                     
## Model: "lm, salary ~ age + quarter + gmat_tpc + gmat_qpc + gmat_vpc + frstlang, job"
## Null:  "lm, salary ~ 1, job"                                                        
## 
## $Pseudo.R.squared.for.model.vs.null
##                              Pseudo.R.squared
## McFadden                            -0.148494
## Cox and Snell (ML)                 -16.225900
## Nagelkerke (Cragg and Uhler)       -16.225900
## 
## $Likelihood.ratio.test
##  Df.diff LogLik.diff   Chisq p.value
##       -6      146.59 -293.18       1
## 
## $Number.of.observations
##           
## Model: 103
## Null:   90
## 
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
## 
## $Warnings
## [1] "WARNING: Fitted and null models have different numbers of observations"

Data set before cleaning process

library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
MBAsal<-subset(data, salary != 998  & salary != 999)
missmap(MBAsal, main = "Missing value before Cleaning",horizontal=FALSE)

Creating job column

MBAsal$job <- ifelse(MBAsal$salary > 0,1, 0)

Removing salary column

MBAsal$salary <- NULL

Model fitting

trainingSet  <- MBAsal[1:153,]
testSet  <- MBAsal[154:193,]

Anova Test

newmodel <- glm(formula = job ~ age+gmat_tpc+gmat_qpc+gmat_vpc+frstlang+quarter, family = binomial(link = "logit"), data = trainingSet)
anova(newmodel,test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: job
## 
## Terms added sequentially (first to last)
## 
## 
##          Df Deviance Resid. Df Resid. Dev Pr(>Chi)  
## NULL                       152     210.63           
## age       1   4.9352       151     205.69  0.02632 *
## gmat_tpc  1   0.2843       150     205.41  0.59387  
## gmat_qpc  1   1.0497       149     204.36  0.30558  
## gmat_vpc  1   2.5692       148     201.79  0.10896  
## frstlang  1   0.3416       147     201.45  0.55891  
## quarter   1   4.0764       146     197.37  0.04349 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
newmodel <- glm(formula = job ~ age+quarter, family = binomial(link = "logit"), data = trainingSet)

Statistical Tests for Individual Predictors

Variable Importance

library(caret)
varImp(newmodel)
##          Overall
## age     2.378017
## quarter 2.064746

Area under ROC Curve

AUC value of 0.809 shows a model with good predictive ability.

library(ROCR)
p <- predict(newmodel,testSet,type='response')
pr <- prediction(p, testSet$job)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.8095238

ROC Curve

plot(prf)

KS Statistic

logit_ks <- max(prf@y.values[[1]]-prf@x.values[[1]])*100
logit_ks
## [1] 59.89975

Lift Chart

lift.obj <- performance(pr, measure="lift", x.measure="rpp")
plot(lift.obj,main="Lift Chart",xlab="% Populations",ylab="Lift", col="blue")
abline(1,0,col="red")

Gain Chart

lift.obj <- performance(pr, "tpr", x.measure="rpp")
plot(lift.obj,main="Gain Chart",xlab="Rate of positive prediction",ylab="True positive rate", col="green")
abline(0,1,col="red")

Note:

Students who didn’t attend the survey and didnt respond are removed from the dataset i.e., Students who had salary marked as 998 and 999