MBA Starting Salaries Analysis
Field Description age age - in years sex 1=Male; 2=Female gmat_tot total GMAT score gmat_qpc quantitative GMAT percentile gmat_vpc verbal GMAT percentile qmat_tpc overall GMAT percentile s_avg spring MBA average f_avg fall MBA average quarter quartile ranking (1st is top, 4th is bottom) work_yrs years of work experience frstlang first language (1=English; 2=other) salary starting salary satis degree of satisfaction with MBA program (1= low, 7 = high satisfaction)
Missing salary and data are coded as follows: 998 = did not answer the survey 999 = answered the survey but did not disclose salary data Size of data set: 274 records
Assumption In Outcome Variable of data, 1 is taken as the students who got the job while 0 means who did not get the job.
#Setting working directory
setwd("C:/Users/SarveshKumar/Desktop/R/R/SM/Day 20")
#Read the data using read.csv
dataset.df <- read.csv(paste("MBA Starting Salaries Data.csv", sep=""))
library(car)
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:car':
##
## logit
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(lattice)
library(vcd)
## Loading required package: grid
library(Hmisc)
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## combine, src, summarize
## The following object is masked from 'package:psych':
##
## describe
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
library(corrgram)
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
#library(gridExtra)
#View the data frame in R
head(dataset.df)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter work_yrs
## 1 23 2 620 77 87 87 3.4 3.00 1 2
## 2 24 1 610 90 71 87 3.5 4.00 1 2
## 3 24 1 670 99 78 95 3.3 3.25 1 2
## 4 24 1 570 56 81 75 3.3 2.67 1 1
## 5 24 2 710 93 98 98 3.6 3.75 1 2
## 6 24 1 640 82 89 91 3.9 3.75 1 2
## frstlang salary satis
## 1 1 0 7
## 2 1 0 6
## 3 1 0 6
## 4 1 0 7
## 5 1 999 5
## 6 1 0 6
tail(dataset.df)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 269 26 1 630 96 71 91 2.6 2.75 4
## 270 31 1 530 75 45 62 2.4 2.75 4
## 271 23 1 580 64 81 78 2.2 2.00 4
## 272 25 1 540 79 45 65 2.6 2.50 4
## 273 26 1 550 72 58 69 2.6 2.75 4
## 274 40 2 500 60 45 51 2.5 2.75 4
## work_yrs frstlang salary satis
## 269 3 1 101600 6
## 270 4 2 104000 6
## 271 2 1 105000 6
## 272 3 1 115000 5
## 273 3 1 126710 6
## 274 15 2 220000 6
some(dataset.df)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 25 29 1 580 56 87 78 3.64 3.33 1
## 37 25 2 680 87 96 96 3.50 2.67 1
## 53 30 1 600 60 91 83 3.30 3.25 1
## 66 30 1 600 77 81 84 3.50 3.25 1
## 120 24 1 560 52 81 72 3.20 3.25 2
## 148 25 1 600 89 62 83 2.70 3.25 3
## 151 25 1 710 99 91 98 2.90 3.25 3
## 218 25 1 700 99 87 98 2.00 2.00 4
## 232 27 1 670 89 91 95 3.60 3.25 4
## 257 23 1 660 81 98 95 2.50 3.00 4
## work_yrs frstlang salary satis
## 25 3 1 0 5
## 37 2 1 86000 5
## 53 5 1 105000 6
## 66 5 1 120000 6
## 120 2 1 96000 7
## 148 4 1 998 998
## 151 1 1 0 6
## 218 1 1 0 7
## 232 5 1 0 6
## 257 2 1 77000 6
glimpse(dataset.df)
## Observations: 274
## Variables: 13
## $ age <int> 23, 24, 24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 2...
## $ sex <int> 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 2, 1,...
## $ gmat_tot <int> 620, 610, 670, 570, 710, 640, 610, 650, 630, 680, 740...
## $ gmat_qpc <int> 77, 90, 99, 56, 93, 82, 89, 88, 79, 99, 99, 75, 95, 9...
## $ gmat_vpc <int> 87, 71, 78, 81, 98, 89, 74, 89, 91, 81, 98, 87, 95, 9...
## $ gmat_tpc <int> 87, 87, 95, 75, 98, 91, 87, 92, 89, 96, 99, 86, 98, 9...
## $ s_avg <dbl> 3.40, 3.50, 3.30, 3.30, 3.60, 3.90, 3.40, 3.30, 3.30,...
## $ f_avg <dbl> 3.00, 4.00, 3.25, 2.67, 3.75, 3.75, 3.50, 3.75, 3.25,...
## $ quarter <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ work_yrs <int> 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 4, 2, 4, 3,...
## $ frstlang <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2,...
## $ salary <int> 0, 0, 0, 0, 999, 0, 0, 0, 999, 998, 998, 998, 998, 99...
## $ satis <int> 7, 6, 6, 7, 5, 6, 5, 6, 4, 998, 998, 998, 998, 998, 9...
274 observations and 13 variables with Numeric data.
##Functions
detect_outliers <- function(inp, na.rm=TRUE)
{
i.qnt <- quantile(inp, probs=c(.25, .75), na.rm=na.rm)
i.max <- 2.5 * IQR(inp, na.rm=na.rm)
otp <- inp
otp[inp < (i.qnt[1] - i.max)] <- NA
otp[inp > (i.qnt[2] + i.max)] <- NA
inp[is.na(otp)]
}
detect_na <- function(inp)
{
sum(is.na(inp))
}
Graph_Boxplot <- function (input, na.rm = TRUE)
{
Plot <- ggplot(dataset.df2, aes(x="", y=input)) +
geom_boxplot(aes(fill=input), color="blue") +
labs(title="Outliers")
Plot
}
#Create summary statistics (e.g. mean, standard deviation, median, mode) for the important variables in the dataset.
lapply(dataset.df, FUN=describe)
## $age
## X[[i]]
## n missing distinct Info Mean Gmd .05 .10
## 274 0 21 0.982 27.36 3.614 24.00 24.00
## .25 .50 .75 .90 .95
## 25.00 27.00 29.00 31.00 34.35
##
## lowest : 22 23 24 25 26, highest: 39 40 42 43 48
##
## $sex
## X[[i]]
## n missing distinct Info Mean Gmd
## 274 0 2 0.56 1.248 0.3745
##
## Value 1 2
## Frequency 206 68
## Proportion 0.752 0.248
##
## $gmat_tot
## X[[i]]
## n missing distinct Info Mean Gmd .05 .10
## 274 0 31 0.997 619.5 64.79 530 553
## .25 .50 .75 .90 .95
## 580 620 660 697 710
##
## lowest : 450 460 480 500 510, highest: 730 740 750 760 790
##
## $gmat_qpc
## X[[i]]
## n missing distinct Info Mean Gmd .05 .10
## 274 0 48 0.998 80.64 16.38 52.0 57.6
## .25 .50 .75 .90 .95
## 72.0 83.0 93.0 97.0 99.0
##
## lowest : 28 35 39 43 46, highest: 95 96 97 98 99
##
## $gmat_vpc
## X[[i]]
## n missing distinct Info Mean Gmd .05 .10
## 274 0 34 0.997 78.32 18.37 45 54
## .25 .50 .75 .90 .95
## 71 81 91 97 98
##
## lowest : 16 22 30 33 37, highest: 95 96 97 98 99
##
## $gmat_tpc
## X[[i]]
## n missing distinct Info Mean Gmd .05 .10
## 274 0 42 0.998 84.2 13.97 62 69
## .25 .50 .75 .90 .95
## 78 87 94 98 99
##
## lowest : 0 34 37 44 45, highest: 95 96 97 98 99
##
## $s_avg
## X[[i]]
## n missing distinct Info Mean Gmd .05 .10
## 274 0 36 0.995 3.025 0.433 2.400 2.500
## .25 .50 .75 .90 .95
## 2.708 3.000 3.300 3.500 3.600
##
## lowest : 2.00 2.10 2.20 2.30 2.40, highest: 3.64 3.70 3.80 3.90 4.00
##
## $f_avg
## X[[i]]
## n missing distinct Info Mean Gmd .05 .10
## 274 0 21 0.971 3.062 0.5195 2.441 2.500
## .25 .50 .75 .90 .95
## 2.750 3.000 3.250 3.649 3.750
##
## lowest : 0.00 2.00 2.25 2.33 2.50, highest: 3.60 3.67 3.75 3.83 4.00
##
## $quarter
## X[[i]]
## n missing distinct Info Mean Gmd
## 274 0 4 0.937 2.478 1.243
##
## Value 1 2 3 4
## Frequency 69 70 70 65
## Proportion 0.252 0.255 0.255 0.237
##
## $work_yrs
## X[[i]]
## n missing distinct Info Mean Gmd .05 .10
## 274 0 18 0.96 3.872 2.888 1 2
## .25 .50 .75 .90 .95
## 2 3 4 7 10
##
## Value 0 1 2 3 4 5 6 7 8 9
## Frequency 3 24 82 56 43 21 12 9 7 2
## Proportion 0.011 0.088 0.299 0.204 0.157 0.077 0.044 0.033 0.026 0.007
##
## Value 10 11 12 13 15 16 18 22
## Frequency 2 2 2 1 2 3 1 2
## Proportion 0.007 0.007 0.007 0.004 0.007 0.011 0.004 0.007
##
## $frstlang
## X[[i]]
## n missing distinct Info Mean Gmd
## 274 0 2 0.309 1.117 0.2071
##
## Value 1 2
## Frequency 242 32
## Proportion 0.883 0.117
##
## $salary
## X[[i]]
## n missing distinct Info Mean Gmd .05 .10
## 274 0 45 0.958 39026 50725 0 0
## .25 .50 .75 .90 .95
## 0 999 97000 105700 115000
##
## lowest : 0 998 999 64000 77000, highest: 130000 145800 146000 162000 220000
##
## $satis
## X[[i]]
## n missing distinct Info Mean Gmd
## 274 0 8 0.929 172.2 279
##
## Value 0 2 4 6 8 998
## Frequency 1 1 96 97 33 46
## Proportion 0.004 0.004 0.350 0.354 0.120 0.168
glimpse function shows that foll. variables have some value that needs cleaning - $salary - 998 and 999 values $satis - 998 and 999 values
sapply(dataset.df, function(x) length(unique(x)))
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## 21 2 31 48 34 42 36 21
## quarter work_yrs frstlang salary satis
## 4 18 2 45 8
lapply(dataset.df, FUN=table)
## $age
##
## 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 39 40 42 43 48
## 2 8 33 53 40 46 21 22 12 10 8 1 4 3 2 1 2 2 1 2 1
##
## $sex
##
## 1 2
## 206 68
##
## $gmat_tot
##
## 450 460 480 500 510 520 530 540 550 560 570 580 590 600 610 620 630 640
## 2 1 1 3 2 1 5 5 8 21 18 15 9 20 18 20 22 12
## 650 660 670 680 690 700 710 720 730 740 750 760 790
## 16 14 17 12 4 5 10 4 2 4 1 1 1
##
## $gmat_qpc
##
## 28 35 39 43 46 48 49 50 52 53 55 56 57 59 60 61 64 65 66 67 68 69 71 72 73
## 1 1 1 3 1 3 2 1 6 1 1 5 2 1 7 2 5 1 3 2 6 1 3 14 1
## 74 75 77 78 79 81 82 83 84 85 87 88 89 90 91 92 93 94 95 96 97 98 99
## 3 10 5 3 18 4 14 9 8 3 14 4 20 3 8 2 10 8 11 8 12 3 20
##
## $gmat_vpc
##
## 16 22 30 33 37 41 45 46 50 54 58 62 63 67 70 71 74 75 78 81 82 84 85 87 89
## 1 2 1 1 1 4 7 2 6 5 13 9 4 9 1 21 12 1 13 26 1 16 3 22 15
## 90 91 92 93 95 96 97 98 99
## 3 10 5 9 14 8 4 20 5
##
## $gmat_tpc
##
## 0 34 37 44 45 51 52 54 55 58 61 62 65 68 69 71 72 73 75 76 77 78 79 80 81
## 2 1 1 1 1 1 2 1 1 1 1 4 5 1 6 3 15 4 14 1 1 11 2 2 9
## 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
## 13 5 1 16 17 2 18 4 11 6 15 8 15 15 7 12 18
##
## $s_avg
##
## 2 2.1 2.2 2.3 2.4 2.45 2.5 2.6 2.67 2.7 2.73 2.8 2.82 2.9 2.91
## 1 2 3 4 9 1 10 11 1 27 1 19 1 29 1
## 3 3.08 3.09 3.1 3.17 3.18 3.2 3.25 3.27 3.3 3.38 3.4 3.45 3.5 3.56
## 24 1 4 20 1 1 16 1 3 25 1 16 3 16 1
## 3.6 3.64 3.7 3.8 3.9 4
## 11 1 2 4 1 2
##
## $f_avg
##
## 0 2 2.25 2.33 2.5 2.67 2.75 2.8 2.83 3 3.17 3.2 3.25 3.33 3.4
## 3 5 5 1 21 3 38 2 1 68 2 2 58 3 2
## 3.5 3.6 3.67 3.75 3.83 4
## 29 3 4 12 1 11
##
## $quarter
##
## 1 2 3 4
## 69 70 70 65
##
## $work_yrs
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 15 16 18 22
## 3 24 82 56 43 21 12 9 7 2 2 2 2 1 2 3 1 2
##
## $frstlang
##
## 1 2
## 242 32
##
## $salary
##
## 0 998 999 64000 77000 78256 82000 85000 86000 88000
## 90 46 35 1 1 1 1 4 2 1
## 88500 90000 92000 93000 95000 96000 96500 97000 98000 99000
## 1 3 3 3 7 4 1 2 10 1
## 100000 100400 101000 101100 101600 102500 103000 104000 105000 106000
## 9 1 2 1 1 1 1 2 11 3
## 107000 107300 107500 108000 110000 112000 115000 118000 120000 126710
## 1 1 1 2 1 3 5 1 4 1
## 130000 145800 146000 162000 220000
## 1 1 1 1 1
##
## $satis
##
## 1 2 3 4 5 6 7 998
## 1 1 5 17 74 97 33 46
dataset.df1 <- subset(dataset.df, !(dataset.df$salary %in% c(998, 999)))
dataset.df2 <- subset(dataset.df1, !(dataset.df1$satis %in% c(998)))
Values 998 and 999 removed from variable salary and satis
#sapply(dataset.df, function(x) length(unique(x)))
table(dataset.df2$satis)
##
## 3 4 5 6 7
## 1 5 65 90 32
Uniques values in variable satis reduced to 5 (1 to 7 on scale)
#library(Amelia)
missmap(dataset.df2, main = "Missing values vs observed")
lapply(dataset.df2, FUN=detect_na)
## $age
## [1] 0
##
## $sex
## [1] 0
##
## $gmat_tot
## [1] 0
##
## $gmat_qpc
## [1] 0
##
## $gmat_vpc
## [1] 0
##
## $gmat_tpc
## [1] 0
##
## $s_avg
## [1] 0
##
## $f_avg
## [1] 0
##
## $quarter
## [1] 0
##
## $work_yrs
## [1] 0
##
## $frstlang
## [1] 0
##
## $salary
## [1] 0
##
## $satis
## [1] 0
No missing values in the dataset.
lapply(dataset.df2, FUN=detect_outliers)
## $age
## [1] 42 48 40 43 43 40
##
## $sex
## integer(0)
##
## $gmat_tot
## integer(0)
##
## $gmat_qpc
## integer(0)
##
## $gmat_vpc
## integer(0)
##
## $gmat_tpc
## [1] 0
##
## $s_avg
## numeric(0)
##
## $f_avg
## [1] 0 0
##
## $quarter
## integer(0)
##
## $work_yrs
## [1] 13 22 16 15 18 16 16 22 15
##
## $frstlang
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##
## $salary
## integer(0)
##
## $satis
## integer(0)
Outliers present in dataset but since the n size=274(reduced to 193) is less, we go ahead with outliers in dataset.
#Draw Box Plots / Bar Plots to visualize the distribution of each variable independently
lapply(dataset.df2, FUN=Graph_Boxplot)
## $age
##
## $sex
##
## $gmat_tot
##
## $gmat_qpc
##
## $gmat_vpc
##
## $gmat_tpc
##
## $s_avg
##
## $f_avg
##
## $quarter
##
## $work_yrs
##
## $frstlang
##
## $salary
##
## $satis
#library(lattice)
histogram(dataset.df2$sex)
histogram(dataset.df2$quarter)
histogram(dataset.df2$frstlang)
histogram(dataset.df2$satis)
table(dataset.df2$salary>0)
##
## FALSE TRUE
## 90 103
salaried_students <- data.frame(dataset.df2$salary[dataset.df2$salary>0])
summary(salaried_students)
## dataset.df2.salary.dataset.df2.salary...0.
## Min. : 64000
## 1st Qu.: 95000
## Median :100000
## Mean :103031
## 3rd Qu.:106000
## Max. :220000
boxplot(salaried_students,
main="Salaries received by placed students",
col=c("yellow"),
horizontal=TRUE,
xlab="salaries of placed students",
ylab="salary")
dataset.df2$salaried <- ifelse(dataset.df2$salary == 0, 0, 1)
#dataset.df2$salary <- NULL #remove salary column
some(dataset.df2)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 47 24 1 560 81 50 71 3.40 3.67 1
## 58 25 1 600 82 74 83 3.50 3.25 1
## 76 25 1 660 94 84 94 3.27 3.75 2
## 115 26 2 670 87 95 95 3.10 3.33 2
## 129 27 1 620 97 63 88 3.20 3.00 2
## 189 25 1 630 75 93 89 2.70 2.50 3
## 199 29 1 710 93 98 99 2.90 3.25 3
## 232 27 1 670 89 91 95 3.60 3.25 4
## 233 27 1 580 74 70 78 3.40 3.25 4
## 236 28 1 710 94 98 99 3.40 3.75 4
## work_yrs frstlang salary satis salaried
## 47 2 1 100000 6 1
## 58 3 1 108000 6 1
## 76 2 1 0 5 0
## 115 1 1 82000 7 1
## 129 3 1 103000 6 1
## 189 2 1 90000 5 1
## 199 7 1 98000 5 1
## 232 5 1 0 6 0
## 233 3 1 0 6 0
## 236 6 1 0 6 0
table(dataset.df2$salaried)
##
## 0 1
## 90 103
103 got job (with salary) param found.
mytable <- xtabs(~ salaried+sex+quarter, data=dataset.df2)
ftable(mytable)
## quarter 1 2 3 4
## salaried sex
## 0 1 11 21 16 19
## 2 7 6 7 3
## 1 1 23 19 17 13
## 2 12 6 7 6
#library(vcd)
mosaic(mytable, shade=TRUE, legend=TRUE, main=" Dist. of salaried, sex and quarter")
attach(dataset.df2)
#7. Pair Plots
# ==========
## Salary and Work Exp
bwplot(sex[dataset.df2$salary>0] ~ work_yrs[dataset.df2$salary>0], data=dataset.df2,
horizontal=TRUE, xlab="Sex, Work_Exp distribution")
plot(~salary[dataset.df2$salary>0] + work_yrs[dataset.df2$salary>0], main="Salary and Work Exp of salaried students")
abline(0,1)
#6e. Scatter plots
# ==========
pairs(dataset.df2)
scatterplotMatrix(formula = ~ sex+age+gmat_tot+gmat_qpc+ gmat_tpc+
gmat_vpc+s_avg+f_avg+ quarter+ work_yrs+frstlang+satis+salary,
cex=0.6, data=dataset.df2, diagonal="histogram")
corrgram(dataset.df2, order=TRUE,
main="MBA Salaries vs Other Params",
lower.panel=panel.pts, upper.panel=panel.pie,
diag.panel=panel.minmax, text.panel=panel.txt)
##Correlation
vctCorr = numeric(0)
for (i in names(dataset.df2)){
cor.result <- cor(as.numeric(dataset.df2$salaried), as.numeric(dataset.df2[,i]))
vctCorr <- c(vctCorr, cor.result)
}
dfrCorr <- vctCorr
names(dfrCorr) <- names(dataset.df2)
dfrCorr
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc
## -0.20569719 0.05047054 0.01491495 0.02698202 0.02888098 0.08264631
## s_avg f_avg quarter work_yrs frstlang salary
## 0.08063913 0.02746051 -0.12788216 -0.12330395 -0.03899476 0.96951510
## satis salaried
## 0.16882557 1.00000000
dfrGraph <- gather(dataset.df2, variable, value, -c(salary, salaried))
head(dfrGraph)
## salary salaried variable value
## 1 0 0 age 23
## 2 0 0 age 24
## 3 0 0 age 24
## 4 0 0 age 24
## 5 0 0 age 24
## 6 0 0 age 25
ggplot(dfrGraph) +
geom_jitter(aes(value,salaried, colour=variable)) +
geom_smooth(aes(value,salaried, colour=variable), method=lm, se=FALSE) +
facet_wrap(~variable, scales="free_x") +
labs(title="Relation Of salaried students With Other params")
##Observation salaried variable is correlated to other variables.
Find Best Multi Linear Model
Choose the best linear model by using step(). Choose a model by AIC in a Stepwise Algorithm In statistics, stepwise regression is a method of fitting regression models in which the choice of predictive variables is carried out by an automatic procedure. In each step, a variable is considered for addition to or subtraction from the set of explanatory variables based on some prespecified criterion. The Akaike information criterion (AIC) is a measure of the relative quality of statistical models for a given set of data. Given a collection of models for the data, AIC estimates the quality of each model, relative to each of the other models. Hence, AIC provides a means for model selection.
#?step()
stpModel=step(lm(data=dataset.df2, salaried~. -salary), trace=0, steps=100)
stpSummary <- summary(stpModel)
stpSummary
##
## Call:
## lm(formula = salaried ~ age + gmat_tot + gmat_tpc + quarter +
## satis, data = dataset.df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8780 -0.4781 0.2619 0.4163 0.8791
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.342013 0.600063 2.236 0.02650 *
## age -0.024381 0.008346 -2.921 0.00391 **
## gmat_tot -0.001943 0.001294 -1.501 0.13498
## gmat_tpc 0.007751 0.005478 1.415 0.15874
## quarter -0.065926 0.031995 -2.060 0.04074 *
## satis 0.099025 0.045597 2.172 0.03113 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4806 on 187 degrees of freedom
## Multiple R-squared: 0.1008, Adjusted R-squared: 0.07678
## F-statistic: 4.193 on 5 and 187 DF, p-value: 0.001234
#?step()
stpModel=step(glm(data=dataset.df2, salaried~. -salary, family=binomial), trace=0, steps=100)
stpSummary <- summary(stpModel)
stpSummary
##
## Call:
## glm(formula = salaried ~ age + gmat_tot + gmat_tpc + quarter +
## satis, family = binomial, data = dataset.df2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9491 -1.1229 0.7702 1.0287 2.1052
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.154571 3.053403 1.688 0.09138 .
## age -0.112199 0.040810 -2.749 0.00597 **
## gmat_tot -0.015176 0.009298 -1.632 0.10266
## gmat_tpc 0.067617 0.044113 1.533 0.12532
## quarter -0.296904 0.143264 -2.072 0.03823 *
## satis 0.426628 0.203977 2.092 0.03648 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 266.68 on 192 degrees of freedom
## Residual deviance: 244.99 on 187 degrees of freedom
## AIC: 256.99
##
## Number of Fisher Scoring iterations: 4
Best results given by salaried ~ age + quarter + satis and also gmat_tot + gmat_tpc
Make Final Multi Linear Model
x1 <- dataset.df2$age
x2 <- dataset.df2$quarter
x3 <- dataset.df2$satis
x4 <- dataset.df2$gmat_tpc
x5 <- dataset.df2$gmat_tot
y <- dataset.df2$salaried
mgmModel1 <- glm(y~x1+x2+x3+x4+x5)
No errors. Model successfully created.
Show Model
# print summary
summary(mgmModel1)
##
## Call:
## glm(formula = y ~ x1 + x2 + x3 + x4 + x5)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8780 -0.4781 0.2619 0.4163 0.8791
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.342013 0.600063 2.236 0.02650 *
## x1 -0.024381 0.008346 -2.921 0.00391 **
## x2 -0.065926 0.031995 -2.060 0.04074 *
## x3 0.099025 0.045597 2.172 0.03113 *
## x4 0.007751 0.005478 1.415 0.15874
## x5 -0.001943 0.001294 -1.501 0.13498
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2309553)
##
## Null deviance: 48.031 on 192 degrees of freedom
## Residual deviance: 43.189 on 187 degrees of freedom
## AIC: 272.77
##
## Number of Fisher Scoring iterations: 2
Make Final Multi Linear Model
x1 <- dataset.df2$age
x2 <- dataset.df2$quarter
x3 <- dataset.df2$satis
x4 <- dataset.df2$gmat_tpc
x5 <- dataset.df2$gmat_tot
y <- dataset.df2$salaried
mgmModel2 <- glm(y~x1+x2+x3+x4+x5, family=binomial(link="logit"))
No errors. Model successfully created.
Show Model
# print summary
summary(mgmModel2)
##
## Call:
## glm(formula = y ~ x1 + x2 + x3 + x4 + x5, family = binomial(link = "logit"))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9491 -1.1229 0.7702 1.0287 2.1052
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.154571 3.053403 1.688 0.09138 .
## x1 -0.112199 0.040810 -2.749 0.00597 **
## x2 -0.296904 0.143264 -2.072 0.03823 *
## x3 0.426628 0.203977 2.092 0.03648 *
## x4 0.067617 0.044113 1.533 0.12532
## x5 -0.015176 0.009298 -1.632 0.10266
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 266.68 on 192 degrees of freedom
## Residual deviance: 244.99 on 187 degrees of freedom
## AIC: 256.99
##
## Number of Fisher Scoring iterations: 4
Confusion Matrix Primary source of accuracy is the confusion matrix Its important to find out the foll: Accuracy, Precision, Recall
#library(caret)
prdVal <- predict(mgmModel2, type='response')
prdBln <- ifelse(prdVal > 0.50, 1, 0)
cnfmtrx <- table(prd=prdBln, act=dataset.df2$salaried)
confusionMatrix(cnfmtrx)
## Confusion Matrix and Statistics
##
## act
## prd 0 1
## 0 49 21
## 1 41 82
##
## Accuracy : 0.6788
## 95% CI : (0.6079, 0.744)
## No Information Rate : 0.5337
## P-Value [Acc > NIR] : 2.916e-05
##
## Kappa : 0.3454
## Mcnemar's Test P-Value : 0.01582
##
## Sensitivity : 0.5444
## Specificity : 0.7961
## Pos Pred Value : 0.7000
## Neg Pred Value : 0.6667
## Prevalence : 0.4663
## Detection Rate : 0.2539
## Detection Prevalence : 0.3627
## Balanced Accuracy : 0.6703
##
## 'Positive' Class : 0
##
At 95% Confidence Interval, p-value < 0.05. Accuracy of 67% is achieved.
Test Data
# find mpg of a person with weight 3.0
dfrTest <- data.frame(x1=c(20),x2=c(1),x3=c(6), x4=c(71), x5=c(570))
dfrTest
## x1 x2 x3 x4 x5
## 1 20 1 6 71 570
#names(dfrTest) <- c("x1","x2","x3")
#dfrTest
Test Data successfully created.
result <- predict(mgmModel2, dfrTest)
print(result)
## 1
## 1.324165
Predict
resVal <- predict(mgmModel2, dfrTest, type="response")
prdSur <- ifelse(resVal > 0.5, 1, 0)
prdSur <- as.factor(prdSur)
levels(prdSur) <- c("Didn't Get A Job", "Got A Job")
dfrTest <- mutate(dfrTest, Result=resVal, Prd_Outcome=prdSur)
some(dfrTest)
## x1 x2 x3 x4 x5 Result Prd_Outcome
## 1 20 1 6 71 570 0.7898739 Didn't Get A Job
Model created and data prediction is done on test data.