This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
data <- read.csv(paste("MBAData.csv", sep=""))
summary(data)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
library(psych)
describe(data)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
hist(data$age, breaks=5,xlab="Age in years", main="Age distribution",col="Green Yellow")
hist(data$work_yrs, breaks=5,col="Green Yellow",xlab="Work Experience in years", main="Work experience distribution")
hist(data$gmat_tot, breaks=5,col="Green Yellow",xlab="Score out of 800", main="Gmat Score distribution")
newdata1 <- data[ which(data$salary !="998" & data$salary !="999"), ]
hist(newdata1$salary, breaks=10,col="Green Yellow",xlab="starting salary", main="Salary distribution")
newdata <- data[ which(data$satis<='7'), ]
hist(newdata$satis, breaks=5,col="Green Yellow",xlab="Degree of Satisfaction,1=low 7=high", main="Satisfaction distribution")
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
scatterplot(salary ~age, data=newdata1,
spread=FALSE, smoother.args=list(lty=2),
main="Scatter plot of Salary vs Age",
xlab="age",
ylab="salary")
scatterplot(salary ~sex, data=newdata1,
spread=FALSE, smoother.args=list(lty=2),
main="Scatter plot of Salary vs Sex",
xlab="sex",
ylab="salary")
scatterplot(salary ~gmat_tot, data=newdata1,
main="Scatter plot of salary vs Gmat total",
xlab="Gmat score",
ylab="salary")
scatterplot(salary ~work_yrs, data=newdata1,
main="Scatter plot of salary vs Work exp.",
xlab="Work experience in years",
ylab="salary")
scatterplot(salary ~satis, data=newdata1,
main="Scatter plot of salary vs satisfaction",
xlab="Degree of satisfaction",
ylab="salary")
library(corrgram)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
##
## describe
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
corrgram(data, order=FALSE, lower.panel=panel.shade,
upper.panel=panel.pie, text.panel=panel.txt,
main="Corrgram of variables ")
#Correlation matrix
colsalary <- c("salary","work_yrs","age","sex","gmat_tot","s_avg","f_avg")
corMatrix <- rcorr(as.matrix(data[,colsalary]))
corMatrix
## salary work_yrs age sex gmat_tot s_avg f_avg
## salary 1.00 0.01 -0.06 0.07 -0.05 0.15 0.03
## work_yrs 0.01 1.00 0.86 -0.01 -0.18 0.13 -0.04
## age -0.06 0.86 1.00 -0.03 -0.15 0.15 -0.02
## sex 0.07 -0.01 -0.03 1.00 -0.05 0.13 0.09
## gmat_tot -0.05 -0.18 -0.15 -0.05 1.00 0.11 0.10
## s_avg 0.15 0.13 0.15 0.13 0.11 1.00 0.55
## f_avg 0.03 -0.04 -0.02 0.09 0.10 0.55 1.00
##
## n= 274
##
##
## P
## salary work_yrs age sex gmat_tot s_avg f_avg
## salary 0.8818 0.3020 0.2560 0.3647 0.0157 0.6275
## work_yrs 0.8818 0.0000 0.8523 0.0024 0.0324 0.5197
## age 0.3020 0.0000 0.6432 0.0156 0.0131 0.7737
## sex 0.2560 0.8523 0.6432 0.3789 0.0355 0.1301
## gmat_tot 0.3647 0.0024 0.0156 0.3789 0.0615 0.0845
## s_avg 0.0157 0.0324 0.0131 0.0355 0.0615 0.0000
## f_avg 0.6275 0.5197 0.7737 0.1301 0.0845 0.0000
attach(data)
job<-subset(data, salary != 998 & salary != 999 & salary != 0)
#Chi-square tests
chisq.test(job$salary, job$frstlang)
## Warning in chisq.test(job$salary, job$frstlang): Chi-squared approximation
## may be incorrect
##
## Pearson's Chi-squared test
##
## data: job$salary and job$frstlang
## X-squared = 69.847, df = 41, p-value = 0.003296
#The p value suggests that salary and frstland are related.
chisq.test(xtabs(~salary+sex, data = job))
## Warning in chisq.test(xtabs(~salary + sex, data = job)): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: xtabs(~salary + sex, data = job)
## X-squared = 52.681, df = 41, p-value = 0.1045
#The p value suggests that salary and sex are related.
t.test(job$salary,job$age)
##
## Welch Two Sample t-test
##
## data: job$salary and job$age
## t = 58.503, df = 102, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 99511.69 106496.23
## sample estimates:
## mean of x mean of y
## 103030.7379 26.7767
#Null hypothesis:There is no relation between salary and age.
#Result:Null hypothesis rejected
t.test(salary, work_yrs)
##
## Welch Two Sample t-test
##
## data: salary and work_yrs
## t = 12.677, df = 273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 32961.99 45081.64
## sample estimates:
## mean of x mean of y
## 39025.689781 3.872263
#Null hypothesis:There is no relation between salary and work_yrs
#Result:Null hypothesis rejected
t.test(salary, gmat_tot)
##
## Welch Two Sample t-test
##
## data: salary and gmat_tot
## t = 12.477, df = 273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 32346.41 44466.06
## sample estimates:
## mean of x mean of y
## 39025.6898 619.4526
#Null hypothesis:There is no relation between salary and gmat score.
#Result:Null hypothesis rejected
t.test(salary, frstlang)
##
## Welch Two Sample t-test
##
## data: salary and frstlang
## t = 12.678, df = 273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 32964.75 45084.40
## sample estimates:
## mean of x mean of y
## 39025.689781 1.116788
#Null hypothesis:There is no relation between salary and frstlang
#Result:Null hypothesis rejected
t.test(salary,s_avg)
##
## Welch Two Sample t-test
##
## data: salary and s_avg
## t = 12.678, df = 273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 32962.84 45082.49
## sample estimates:
## mean of x mean of y
## 39025.689781 3.025401
#Null hypothesis:There is no relation between salary and s_avg
#Result:Null hypothesis rejected
t.test(salary, f_avg)
##
## Welch Two Sample t-test
##
## data: salary and f_avg
## t = 12.678, df = 273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 32962.81 45082.45
## sample estimates:
## mean of x mean of y
## 39025.689781 3.061533
#Null hypothesis:There is no relation between salary and f_avg
#Result:Null hypothesis rejected
fit1<-lm(salary ~ age+quarter+gmat_tpc+gmat_qpc+gmat_vpc+frstlang, data=job)
summary(fit1)
##
## Call:
## lm(formula = salary ~ age + quarter + gmat_tpc + gmat_qpc + gmat_vpc +
## frstlang, data = job)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26852 -9178 -615 5382 68168
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44716.7 19552.1 2.287 0.0244 *
## age 2583.0 508.3 5.082 1.84e-06 ***
## quarter -1744.3 1375.4 -1.268 0.2078
## gmat_tpc -1424.6 683.3 -2.085 0.0397 *
## gmat_qpc 834.1 350.0 2.383 0.0191 *
## gmat_vpc 535.2 351.3 1.523 0.1309
## frstlang 4649.7 6617.1 0.703 0.4840
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15120 on 96 degrees of freedom
## Multiple R-squared: 0.3262, Adjusted R-squared: 0.284
## F-statistic: 7.744 on 6 and 96 DF, p-value: 8.366e-07
fit2<-lm(salary ~ work_yrs + s_avg + f_avg + frstlang,data=job)
summary(fit2)
##
## Call:
## lm(formula = salary ~ work_yrs + s_avg + f_avg + frstlang, data = job)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35587 -8376 -696 4354 79427
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 72285.5 16125.7 4.483 2e-05 ***
## work_yrs 2314.5 575.8 4.019 0.000115 ***
## s_avg 4172.4 4946.2 0.844 0.400969
## f_avg -1867.2 3818.6 -0.489 0.625950
## frstlang 14137.1 6451.7 2.191 0.030807 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15840 on 98 degrees of freedom
## Multiple R-squared: 0.2451, Adjusted R-squared: 0.2143
## F-statistic: 7.955 on 4 and 98 DF, p-value: 1.35e-05
summary(fit1)$adj.r.squared
## [1] 0.2840354
AIC(fit1)
## [1] 2283.544
summary(fit2)$adj.r.squared
## [1] 0.2143036
AIC(fit2)
## [1] 2291.241
#We can clearly see that model 1 is better than model 2 because:
#Adjusted R-square of model1>model 2
#AIC(model1)<AIC(model2)
job <- job[1:90,]
nojob<-subset(data, salary != 998 & salary != 999 & salary == 0)
chisq.test(job$age,nojob$age)
## Warning in chisq.test(job$age, nojob$age): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: job$age and nojob$age
## X-squared = 229.27, df = 252, p-value = 0.8449
#* Age is not significant factor to get a job
chisq.test(job$sex,nojob$sex)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: job$sex and nojob$sex
## X-squared = 0.11711, df = 1, p-value = 0.7322
#* Sex is not significant factor to get a job
chisq.test(job$gmat_tpc,nojob$gmat_tpc)
## Warning in chisq.test(job$gmat_tpc, nojob$gmat_tpc): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: job$gmat_tpc and nojob$gmat_tpc
## X-squared = 776.54, df = 784, p-value = 0.5683
#* Gmat total score is not significant factor to get a job
chisq.test(job$frstlang,nojob$frstlang)
## Warning in chisq.test(job$frstlang, nojob$frstlang): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: job$frstlang and nojob$frstlang
## X-squared = 0.0080703, df = 1, p-value = 0.9284
#* First language is not significant factor to get a job
chisq.test(job$work_yrs,nojob$work_yrs)
## Warning in chisq.test(job$work_yrs, nojob$work_yrs): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: job$work_yrs and nojob$work_yrs
## X-squared = 117.66, df = 176, p-value = 0.9998
#* Work experience is not significant factor to get a job
library(rcompanion)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
##
## combine, src, summarize
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
nagelkerke(fit1)
## $Models
##
## Model: "lm, salary ~ age + quarter + gmat_tpc + gmat_qpc + gmat_vpc + frstlang, job"
## Null: "lm, salary ~ 1, job"
##
## $Pseudo.R.squared.for.model.vs.null
## Pseudo.R.squared
## McFadden -0.148494
## Cox and Snell (ML) -16.225900
## Nagelkerke (Cragg and Uhler) -16.225900
##
## $Likelihood.ratio.test
## Df.diff LogLik.diff Chisq p.value
## -6 146.59 -293.18 1
##
## $Number.of.observations
##
## Model: 103
## Null: 90
##
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
##
## $Warnings
## [1] "WARNING: Fitted and null models have different numbers of observations"
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
MBAsal<-subset(data, salary != 998 & salary != 999)
missmap(MBAsal, main = "Missing value before Cleaning",horizontal=FALSE)
MBAsal$job <- ifelse(MBAsal$salary > 0,1, 0)
MBAsal$salary <- NULL
trainingSet <- MBAsal[1:153,]
testSet <- MBAsal[154:193,]
newmodel <- glm(formula = job ~ age+gmat_tpc+gmat_qpc+gmat_vpc+frstlang+quarter, family = binomial(link = "logit"), data = trainingSet)
anova(newmodel,test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: job
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 152 210.63
## age 1 4.9352 151 205.69 0.02632 *
## gmat_tpc 1 0.2843 150 205.41 0.59387
## gmat_qpc 1 1.0497 149 204.36 0.30558
## gmat_vpc 1 2.5692 148 201.79 0.10896
## frstlang 1 0.3416 147 201.45 0.55891
## quarter 1 4.0764 146 197.37 0.04349 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
newmodel <- glm(formula = job ~ age+quarter, family = binomial(link = "logit"), data = trainingSet)
library(caret)
varImp(newmodel)
## Overall
## age 2.378017
## quarter 2.064746
library(ROCR)
p <- predict(newmodel,testSet,type='response')
pr <- prediction(p, testSet$job)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.8095238
plot(prf)
logit_ks <- max(prf@y.values[[1]]-prf@x.values[[1]])*100
logit_ks
## [1] 59.89975
lift.obj <- performance(pr, measure="lift", x.measure="rpp")
plot(lift.obj,main="Lift Chart",xlab="% Populations",ylab="Lift", col="blue")
abline(1,0,col="red")
lift.obj <- performance(pr, "tpr", x.measure="rpp")
plot(lift.obj,main="Gain Chart",xlab="Rate of positive prediction",ylab="True positive rate", col="green")
abline(0,1,col="red")