Problem Statement

MBA Starting Salaries Analysis

Attributes

Field Description age age - in years sex 1=Male; 2=Female gmat_tot total GMAT score gmat_qpc quantitative GMAT percentile gmat_vpc verbal GMAT percentile qmat_tpc overall GMAT percentile s_avg spring MBA average f_avg fall MBA average quarter quartile ranking (1st is top, 4th is bottom) work_yrs years of work experience frstlang first language (1=English; 2=other) salary starting salary satis degree of satisfaction with MBA program (1= low, 7 = high satisfaction)

Missing salary and data are coded as follows: 998 = did not answer the survey 999 = answered the survey but did not disclose salary data Size of data set: 274 records

Assumption In Outcome Variable of data, 1 is taken as the students who got the job while 0 means who did not get the job.

Approach - 1

Approach - 2

Approach - 3

#Setting working directory
setwd("C:/Users/SarveshKumar/Desktop/R/R/SM/Day 20")
#Read the data using read.csv
dataset.df <- read.csv(paste("MBA Starting Salaries Data.csv", sep=""))
library(car)
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:car':
## 
##     logit
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(lattice)
library(vcd)
## Loading required package: grid
library(Hmisc)
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     combine, src, summarize
## The following object is masked from 'package:psych':
## 
##     describe
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
library(corrgram)
library(caret)
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
#library(gridExtra) 
#View the data frame in R
head(dataset.df)
##   age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter work_yrs
## 1  23   2      620       77       87       87   3.4  3.00       1        2
## 2  24   1      610       90       71       87   3.5  4.00       1        2
## 3  24   1      670       99       78       95   3.3  3.25       1        2
## 4  24   1      570       56       81       75   3.3  2.67       1        1
## 5  24   2      710       93       98       98   3.6  3.75       1        2
## 6  24   1      640       82       89       91   3.9  3.75       1        2
##   frstlang salary satis
## 1        1      0     7
## 2        1      0     6
## 3        1      0     6
## 4        1      0     7
## 5        1    999     5
## 6        1      0     6
tail(dataset.df)
##     age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 269  26   1      630       96       71       91   2.6  2.75       4
## 270  31   1      530       75       45       62   2.4  2.75       4
## 271  23   1      580       64       81       78   2.2  2.00       4
## 272  25   1      540       79       45       65   2.6  2.50       4
## 273  26   1      550       72       58       69   2.6  2.75       4
## 274  40   2      500       60       45       51   2.5  2.75       4
##     work_yrs frstlang salary satis
## 269        3        1 101600     6
## 270        4        2 104000     6
## 271        2        1 105000     6
## 272        3        1 115000     5
## 273        3        1 126710     6
## 274       15        2 220000     6
some(dataset.df)
##     age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 25   29   1      580       56       87       78  3.64  3.33       1
## 37   25   2      680       87       96       96  3.50  2.67       1
## 53   30   1      600       60       91       83  3.30  3.25       1
## 66   30   1      600       77       81       84  3.50  3.25       1
## 120  24   1      560       52       81       72  3.20  3.25       2
## 148  25   1      600       89       62       83  2.70  3.25       3
## 151  25   1      710       99       91       98  2.90  3.25       3
## 218  25   1      700       99       87       98  2.00  2.00       4
## 232  27   1      670       89       91       95  3.60  3.25       4
## 257  23   1      660       81       98       95  2.50  3.00       4
##     work_yrs frstlang salary satis
## 25         3        1      0     5
## 37         2        1  86000     5
## 53         5        1 105000     6
## 66         5        1 120000     6
## 120        2        1  96000     7
## 148        4        1    998   998
## 151        1        1      0     6
## 218        1        1      0     7
## 232        5        1      0     6
## 257        2        1  77000     6
glimpse(dataset.df)
## Observations: 274
## Variables: 13
## $ age      <int> 23, 24, 24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 2...
## $ sex      <int> 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 2, 2, 1,...
## $ gmat_tot <int> 620, 610, 670, 570, 710, 640, 610, 650, 630, 680, 740...
## $ gmat_qpc <int> 77, 90, 99, 56, 93, 82, 89, 88, 79, 99, 99, 75, 95, 9...
## $ gmat_vpc <int> 87, 71, 78, 81, 98, 89, 74, 89, 91, 81, 98, 87, 95, 9...
## $ gmat_tpc <int> 87, 87, 95, 75, 98, 91, 87, 92, 89, 96, 99, 86, 98, 9...
## $ s_avg    <dbl> 3.40, 3.50, 3.30, 3.30, 3.60, 3.90, 3.40, 3.30, 3.30,...
## $ f_avg    <dbl> 3.00, 4.00, 3.25, 2.67, 3.75, 3.75, 3.50, 3.75, 3.25,...
## $ quarter  <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ work_yrs <int> 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 4, 2, 4, 3,...
## $ frstlang <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2,...
## $ salary   <int> 0, 0, 0, 0, 999, 0, 0, 0, 999, 998, 998, 998, 998, 99...
## $ satis    <int> 7, 6, 6, 7, 5, 6, 5, 6, 4, 998, 998, 998, 998, 998, 9...

Observation

274 observations and 13 variables with Numeric data.

##Functions

detect_outliers <- function(inp, na.rm=TRUE) 
{
  i.qnt <- quantile(inp, probs=c(.25, .75), na.rm=na.rm)
  i.max <- 2.5 * IQR(inp, na.rm=na.rm)
  otp <- inp
  otp[inp < (i.qnt[1] - i.max)] <- NA
  otp[inp > (i.qnt[2] + i.max)] <- NA
  inp[is.na(otp)]
}

detect_na <- function(inp) 
{
  sum(is.na(inp))
}

Graph_Boxplot <- function (input, na.rm = TRUE)
{
  Plot <- ggplot(dataset.df2, aes(x="", y=input)) +
    geom_boxplot(aes(fill=input), color="blue") +
    labs(title="Outliers")
  Plot
}
#Create summary statistics (e.g. mean, standard deviation, median, mode) for the       important variables in the dataset.
lapply(dataset.df, FUN=describe)
## $age
## X[[i]] 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      274        0       21    0.982    27.36    3.614    24.00    24.00 
##      .25      .50      .75      .90      .95 
##    25.00    27.00    29.00    31.00    34.35 
## 
## lowest : 22 23 24 25 26, highest: 39 40 42 43 48
## 
## $sex
## X[[i]] 
##        n  missing distinct     Info     Mean      Gmd 
##      274        0        2     0.56    1.248   0.3745 
##                       
## Value          1     2
## Frequency    206    68
## Proportion 0.752 0.248
## 
## $gmat_tot
## X[[i]] 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      274        0       31    0.997    619.5    64.79      530      553 
##      .25      .50      .75      .90      .95 
##      580      620      660      697      710 
## 
## lowest : 450 460 480 500 510, highest: 730 740 750 760 790
## 
## $gmat_qpc
## X[[i]] 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      274        0       48    0.998    80.64    16.38     52.0     57.6 
##      .25      .50      .75      .90      .95 
##     72.0     83.0     93.0     97.0     99.0 
## 
## lowest : 28 35 39 43 46, highest: 95 96 97 98 99
## 
## $gmat_vpc
## X[[i]] 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      274        0       34    0.997    78.32    18.37       45       54 
##      .25      .50      .75      .90      .95 
##       71       81       91       97       98 
## 
## lowest : 16 22 30 33 37, highest: 95 96 97 98 99
## 
## $gmat_tpc
## X[[i]] 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      274        0       42    0.998     84.2    13.97       62       69 
##      .25      .50      .75      .90      .95 
##       78       87       94       98       99 
## 
## lowest :  0 34 37 44 45, highest: 95 96 97 98 99
## 
## $s_avg
## X[[i]] 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      274        0       36    0.995    3.025    0.433    2.400    2.500 
##      .25      .50      .75      .90      .95 
##    2.708    3.000    3.300    3.500    3.600 
## 
## lowest : 2.00 2.10 2.20 2.30 2.40, highest: 3.64 3.70 3.80 3.90 4.00
## 
## $f_avg
## X[[i]] 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      274        0       21    0.971    3.062   0.5195    2.441    2.500 
##      .25      .50      .75      .90      .95 
##    2.750    3.000    3.250    3.649    3.750 
## 
## lowest : 0.00 2.00 2.25 2.33 2.50, highest: 3.60 3.67 3.75 3.83 4.00
## 
## $quarter
## X[[i]] 
##        n  missing distinct     Info     Mean      Gmd 
##      274        0        4    0.937    2.478    1.243 
##                                   
## Value          1     2     3     4
## Frequency     69    70    70    65
## Proportion 0.252 0.255 0.255 0.237
## 
## $work_yrs
## X[[i]] 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      274        0       18     0.96    3.872    2.888        1        2 
##      .25      .50      .75      .90      .95 
##        2        3        4        7       10 
##                                                                       
## Value          0     1     2     3     4     5     6     7     8     9
## Frequency      3    24    82    56    43    21    12     9     7     2
## Proportion 0.011 0.088 0.299 0.204 0.157 0.077 0.044 0.033 0.026 0.007
##                                                           
## Value         10    11    12    13    15    16    18    22
## Frequency      2     2     2     1     2     3     1     2
## Proportion 0.007 0.007 0.007 0.004 0.007 0.011 0.004 0.007
## 
## $frstlang
## X[[i]] 
##        n  missing distinct     Info     Mean      Gmd 
##      274        0        2    0.309    1.117   0.2071 
##                       
## Value          1     2
## Frequency    242    32
## Proportion 0.883 0.117
## 
## $salary
## X[[i]] 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      274        0       45    0.958    39026    50725        0        0 
##      .25      .50      .75      .90      .95 
##        0      999    97000   105700   115000 
## 
## lowest :      0    998    999  64000  77000, highest: 130000 145800 146000 162000 220000
## 
## $satis
## X[[i]] 
##        n  missing distinct     Info     Mean      Gmd 
##      274        0        8    0.929    172.2      279 
##                                               
## Value          0     2     4     6     8   998
## Frequency      1     1    96    97    33    46
## Proportion 0.004 0.004 0.350 0.354 0.120 0.168

Observation

glimpse function shows that foll. variables have some value that needs cleaning - $salary - 998 and 999 values $satis - 998 and 999 values

sapply(dataset.df, function(x) length(unique(x)))
##      age      sex gmat_tot gmat_qpc gmat_vpc gmat_tpc    s_avg    f_avg 
##       21        2       31       48       34       42       36       21 
##  quarter work_yrs frstlang   salary    satis 
##        4       18        2       45        8
lapply(dataset.df, FUN=table)
## $age
## 
## 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 39 40 42 43 48 
##  2  8 33 53 40 46 21 22 12 10  8  1  4  3  2  1  2  2  1  2  1 
## 
## $sex
## 
##   1   2 
## 206  68 
## 
## $gmat_tot
## 
## 450 460 480 500 510 520 530 540 550 560 570 580 590 600 610 620 630 640 
##   2   1   1   3   2   1   5   5   8  21  18  15   9  20  18  20  22  12 
## 650 660 670 680 690 700 710 720 730 740 750 760 790 
##  16  14  17  12   4   5  10   4   2   4   1   1   1 
## 
## $gmat_qpc
## 
## 28 35 39 43 46 48 49 50 52 53 55 56 57 59 60 61 64 65 66 67 68 69 71 72 73 
##  1  1  1  3  1  3  2  1  6  1  1  5  2  1  7  2  5  1  3  2  6  1  3 14  1 
## 74 75 77 78 79 81 82 83 84 85 87 88 89 90 91 92 93 94 95 96 97 98 99 
##  3 10  5  3 18  4 14  9  8  3 14  4 20  3  8  2 10  8 11  8 12  3 20 
## 
## $gmat_vpc
## 
## 16 22 30 33 37 41 45 46 50 54 58 62 63 67 70 71 74 75 78 81 82 84 85 87 89 
##  1  2  1  1  1  4  7  2  6  5 13  9  4  9  1 21 12  1 13 26  1 16  3 22 15 
## 90 91 92 93 95 96 97 98 99 
##  3 10  5  9 14  8  4 20  5 
## 
## $gmat_tpc
## 
##  0 34 37 44 45 51 52 54 55 58 61 62 65 68 69 71 72 73 75 76 77 78 79 80 81 
##  2  1  1  1  1  1  2  1  1  1  1  4  5  1  6  3 15  4 14  1  1 11  2  2  9 
## 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 
## 13  5  1 16 17  2 18  4 11  6 15  8 15 15  7 12 18 
## 
## $s_avg
## 
##    2  2.1  2.2  2.3  2.4 2.45  2.5  2.6 2.67  2.7 2.73  2.8 2.82  2.9 2.91 
##    1    2    3    4    9    1   10   11    1   27    1   19    1   29    1 
##    3 3.08 3.09  3.1 3.17 3.18  3.2 3.25 3.27  3.3 3.38  3.4 3.45  3.5 3.56 
##   24    1    4   20    1    1   16    1    3   25    1   16    3   16    1 
##  3.6 3.64  3.7  3.8  3.9    4 
##   11    1    2    4    1    2 
## 
## $f_avg
## 
##    0    2 2.25 2.33  2.5 2.67 2.75  2.8 2.83    3 3.17  3.2 3.25 3.33  3.4 
##    3    5    5    1   21    3   38    2    1   68    2    2   58    3    2 
##  3.5  3.6 3.67 3.75 3.83    4 
##   29    3    4   12    1   11 
## 
## $quarter
## 
##  1  2  3  4 
## 69 70 70 65 
## 
## $work_yrs
## 
##  0  1  2  3  4  5  6  7  8  9 10 11 12 13 15 16 18 22 
##  3 24 82 56 43 21 12  9  7  2  2  2  2  1  2  3  1  2 
## 
## $frstlang
## 
##   1   2 
## 242  32 
## 
## $salary
## 
##      0    998    999  64000  77000  78256  82000  85000  86000  88000 
##     90     46     35      1      1      1      1      4      2      1 
##  88500  90000  92000  93000  95000  96000  96500  97000  98000  99000 
##      1      3      3      3      7      4      1      2     10      1 
## 100000 100400 101000 101100 101600 102500 103000 104000 105000 106000 
##      9      1      2      1      1      1      1      2     11      3 
## 107000 107300 107500 108000 110000 112000 115000 118000 120000 126710 
##      1      1      1      2      1      3      5      1      4      1 
## 130000 145800 146000 162000 220000 
##      1      1      1      1      1 
## 
## $satis
## 
##   1   2   3   4   5   6   7 998 
##   1   1   5  17  74  97  33  46
dataset.df1 <- subset(dataset.df, !(dataset.df$salary %in% c(998, 999)))
dataset.df2 <- subset(dataset.df1, !(dataset.df1$satis %in% c(998)))

Observation

Values 998 and 999 removed from variable salary and satis

#sapply(dataset.df, function(x) length(unique(x)))
table(dataset.df2$satis)
## 
##  3  4  5  6  7 
##  1  5 65 90 32

Observation

Uniques values in variable satis reduced to 5 (1 to 7 on scale)

#library(Amelia)
missmap(dataset.df2, main = "Missing values vs observed")

lapply(dataset.df2, FUN=detect_na)
## $age
## [1] 0
## 
## $sex
## [1] 0
## 
## $gmat_tot
## [1] 0
## 
## $gmat_qpc
## [1] 0
## 
## $gmat_vpc
## [1] 0
## 
## $gmat_tpc
## [1] 0
## 
## $s_avg
## [1] 0
## 
## $f_avg
## [1] 0
## 
## $quarter
## [1] 0
## 
## $work_yrs
## [1] 0
## 
## $frstlang
## [1] 0
## 
## $salary
## [1] 0
## 
## $satis
## [1] 0

Observation

No missing values in the dataset.

lapply(dataset.df2, FUN=detect_outliers)
## $age
## [1] 42 48 40 43 43 40
## 
## $sex
## integer(0)
## 
## $gmat_tot
## integer(0)
## 
## $gmat_qpc
## integer(0)
## 
## $gmat_vpc
## integer(0)
## 
## $gmat_tpc
## [1] 0
## 
## $s_avg
## numeric(0)
## 
## $f_avg
## [1] 0 0
## 
## $quarter
## integer(0)
## 
## $work_yrs
## [1] 13 22 16 15 18 16 16 22 15
## 
## $frstlang
##  [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 
## $salary
## integer(0)
## 
## $satis
## integer(0)

Observation

Outliers present in dataset but since the n size=274(reduced to 193) is less, we go ahead with outliers in dataset.

#Draw Box Plots / Bar Plots to visualize the distribution of each variable independently
lapply(dataset.df2, FUN=Graph_Boxplot)
## $age

## 
## $sex

## 
## $gmat_tot

## 
## $gmat_qpc

## 
## $gmat_vpc

## 
## $gmat_tpc

## 
## $s_avg

## 
## $f_avg

## 
## $quarter

## 
## $work_yrs

## 
## $frstlang

## 
## $salary

## 
## $satis

#library(lattice)
histogram(dataset.df2$sex)

histogram(dataset.df2$quarter)

histogram(dataset.df2$frstlang)

histogram(dataset.df2$satis)

table(dataset.df2$salary>0)
## 
## FALSE  TRUE 
##    90   103
salaried_students <- data.frame(dataset.df2$salary[dataset.df2$salary>0])
summary(salaried_students)
##  dataset.df2.salary.dataset.df2.salary...0.
##  Min.   : 64000                            
##  1st Qu.: 95000                            
##  Median :100000                            
##  Mean   :103031                            
##  3rd Qu.:106000                            
##  Max.   :220000
boxplot(salaried_students, 
        main="Salaries received by placed students",
        col=c("yellow"),
        horizontal=TRUE,
        xlab="salaries of placed students",
        ylab="salary")

dataset.df2$salaried <- ifelse(dataset.df2$salary == 0, 0, 1)

#dataset.df2$salary <- NULL #remove salary column
some(dataset.df2)
##     age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 47   24   1      560       81       50       71  3.40  3.67       1
## 58   25   1      600       82       74       83  3.50  3.25       1
## 76   25   1      660       94       84       94  3.27  3.75       2
## 115  26   2      670       87       95       95  3.10  3.33       2
## 129  27   1      620       97       63       88  3.20  3.00       2
## 189  25   1      630       75       93       89  2.70  2.50       3
## 199  29   1      710       93       98       99  2.90  3.25       3
## 232  27   1      670       89       91       95  3.60  3.25       4
## 233  27   1      580       74       70       78  3.40  3.25       4
## 236  28   1      710       94       98       99  3.40  3.75       4
##     work_yrs frstlang salary satis salaried
## 47         2        1 100000     6        1
## 58         3        1 108000     6        1
## 76         2        1      0     5        0
## 115        1        1  82000     7        1
## 129        3        1 103000     6        1
## 189        2        1  90000     5        1
## 199        7        1  98000     5        1
## 232        5        1      0     6        0
## 233        3        1      0     6        0
## 236        6        1      0     6        0
table(dataset.df2$salaried)
## 
##   0   1 
##  90 103

Observation

103 got job (with salary) param found.

mytable <- xtabs(~ salaried+sex+quarter, data=dataset.df2)

ftable(mytable)
##              quarter  1  2  3  4
## salaried sex                    
## 0        1           11 21 16 19
##          2            7  6  7  3
## 1        1           23 19 17 13
##          2           12  6  7  6
#library(vcd)
mosaic(mytable, shade=TRUE, legend=TRUE, main=" Dist. of salaried, sex and quarter")

attach(dataset.df2)
#7. Pair Plots
# ==========
##  Salary and Work Exp
bwplot(sex[dataset.df2$salary>0] ~ work_yrs[dataset.df2$salary>0], data=dataset.df2, 
       horizontal=TRUE, xlab="Sex, Work_Exp distribution")

plot(~salary[dataset.df2$salary>0] + work_yrs[dataset.df2$salary>0], main="Salary and Work Exp of salaried students")
abline(0,1)

#6e. Scatter plots
# ==========
pairs(dataset.df2)

scatterplotMatrix(formula = ~ sex+age+gmat_tot+gmat_qpc+ gmat_tpc+
                  gmat_vpc+s_avg+f_avg+ quarter+ work_yrs+frstlang+satis+salary, 
                  cex=0.6, data=dataset.df2, diagonal="histogram")

corrgram(dataset.df2, order=TRUE,
         main="MBA Salaries vs Other Params",
         lower.panel=panel.pts, upper.panel=panel.pie,
         diag.panel=panel.minmax, text.panel=panel.txt)

##Correlation
vctCorr = numeric(0)
for (i in names(dataset.df2)){
  cor.result <- cor(as.numeric(dataset.df2$salaried), as.numeric(dataset.df2[,i]))
  vctCorr <- c(vctCorr, cor.result)
}
dfrCorr <- vctCorr
names(dfrCorr) <- names(dataset.df2)
dfrCorr
##         age         sex    gmat_tot    gmat_qpc    gmat_vpc    gmat_tpc 
## -0.20569719  0.05047054  0.01491495  0.02698202  0.02888098  0.08264631 
##       s_avg       f_avg     quarter    work_yrs    frstlang      salary 
##  0.08063913  0.02746051 -0.12788216 -0.12330395 -0.03899476  0.96951510 
##       satis    salaried 
##  0.16882557  1.00000000
dfrGraph <- gather(dataset.df2, variable, value, -c(salary, salaried))
head(dfrGraph)
##   salary salaried variable value
## 1      0        0      age    23
## 2      0        0      age    24
## 3      0        0      age    24
## 4      0        0      age    24
## 5      0        0      age    24
## 6      0        0      age    25
ggplot(dfrGraph) +
    geom_jitter(aes(value,salaried, colour=variable)) + 
    geom_smooth(aes(value,salaried, colour=variable), method=lm, se=FALSE) +
    facet_wrap(~variable, scales="free_x") +
    labs(title="Relation Of salaried students With Other params")

##Observation salaried variable is correlated to other variables.

Find Best Multi Linear Model
Choose the best linear model by using step(). Choose a model by AIC in a Stepwise Algorithm In statistics, stepwise regression is a method of fitting regression models in which the choice of predictive variables is carried out by an automatic procedure. In each step, a variable is considered for addition to or subtraction from the set of explanatory variables based on some prespecified criterion. The Akaike information criterion (AIC) is a measure of the relative quality of statistical models for a given set of data. Given a collection of models for the data, AIC estimates the quality of each model, relative to each of the other models. Hence, AIC provides a means for model selection.

#?step()
stpModel=step(lm(data=dataset.df2, salaried~. -salary), trace=0, steps=100)
stpSummary <- summary(stpModel)
stpSummary 
## 
## Call:
## lm(formula = salaried ~ age + gmat_tot + gmat_tpc + quarter + 
##     satis, data = dataset.df2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8780 -0.4781  0.2619  0.4163  0.8791 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  1.342013   0.600063   2.236  0.02650 * 
## age         -0.024381   0.008346  -2.921  0.00391 **
## gmat_tot    -0.001943   0.001294  -1.501  0.13498   
## gmat_tpc     0.007751   0.005478   1.415  0.15874   
## quarter     -0.065926   0.031995  -2.060  0.04074 * 
## satis        0.099025   0.045597   2.172  0.03113 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4806 on 187 degrees of freedom
## Multiple R-squared:  0.1008, Adjusted R-squared:  0.07678 
## F-statistic: 4.193 on 5 and 187 DF,  p-value: 0.001234
#?step()
stpModel=step(glm(data=dataset.df2, salaried~. -salary, family=binomial), trace=0, steps=100)
stpSummary <- summary(stpModel)
stpSummary 
## 
## Call:
## glm(formula = salaried ~ age + gmat_tot + gmat_tpc + quarter + 
##     satis, family = binomial, data = dataset.df2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9491  -1.1229   0.7702   1.0287   2.1052  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  5.154571   3.053403   1.688  0.09138 . 
## age         -0.112199   0.040810  -2.749  0.00597 **
## gmat_tot    -0.015176   0.009298  -1.632  0.10266   
## gmat_tpc     0.067617   0.044113   1.533  0.12532   
## quarter     -0.296904   0.143264  -2.072  0.03823 * 
## satis        0.426628   0.203977   2.092  0.03648 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 266.68  on 192  degrees of freedom
## Residual deviance: 244.99  on 187  degrees of freedom
## AIC: 256.99
## 
## Number of Fisher Scoring iterations: 4

Observation

Best results given by salaried ~ age + quarter + satis and also gmat_tot + gmat_tpc

Make Final Multi Linear Model

x1 <- dataset.df2$age
x2 <- dataset.df2$quarter
x3 <- dataset.df2$satis
x4 <- dataset.df2$gmat_tpc
x5 <- dataset.df2$gmat_tot
y  <- dataset.df2$salaried
mgmModel1 <- glm(y~x1+x2+x3+x4+x5)

Observation

No errors. Model successfully created.

Show Model

# print summary
summary(mgmModel1)
## 
## Call:
## glm(formula = y ~ x1 + x2 + x3 + x4 + x5)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8780  -0.4781   0.2619   0.4163   0.8791  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  1.342013   0.600063   2.236  0.02650 * 
## x1          -0.024381   0.008346  -2.921  0.00391 **
## x2          -0.065926   0.031995  -2.060  0.04074 * 
## x3           0.099025   0.045597   2.172  0.03113 * 
## x4           0.007751   0.005478   1.415  0.15874   
## x5          -0.001943   0.001294  -1.501  0.13498   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.2309553)
## 
##     Null deviance: 48.031  on 192  degrees of freedom
## Residual deviance: 43.189  on 187  degrees of freedom
## AIC: 272.77
## 
## Number of Fisher Scoring iterations: 2

Make Final Multi Linear Model

x1 <- dataset.df2$age
x2 <- dataset.df2$quarter
x3 <- dataset.df2$satis
x4 <- dataset.df2$gmat_tpc
x5 <- dataset.df2$gmat_tot
y  <- dataset.df2$salaried
mgmModel2 <- glm(y~x1+x2+x3+x4+x5, family=binomial(link="logit"))

Observation

No errors. Model successfully created.

Show Model

# print summary
summary(mgmModel2)
## 
## Call:
## glm(formula = y ~ x1 + x2 + x3 + x4 + x5, family = binomial(link = "logit"))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9491  -1.1229   0.7702   1.0287   2.1052  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  5.154571   3.053403   1.688  0.09138 . 
## x1          -0.112199   0.040810  -2.749  0.00597 **
## x2          -0.296904   0.143264  -2.072  0.03823 * 
## x3           0.426628   0.203977   2.092  0.03648 * 
## x4           0.067617   0.044113   1.533  0.12532   
## x5          -0.015176   0.009298  -1.632  0.10266   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 266.68  on 192  degrees of freedom
## Residual deviance: 244.99  on 187  degrees of freedom
## AIC: 256.99
## 
## Number of Fisher Scoring iterations: 4

Confusion Matrix Primary source of accuracy is the confusion matrix Its important to find out the foll: Accuracy, Precision, Recall

#library(caret)
prdVal <- predict(mgmModel2, type='response')
prdBln <- ifelse(prdVal > 0.50, 1, 0)
cnfmtrx <- table(prd=prdBln, act=dataset.df2$salaried)
confusionMatrix(cnfmtrx)
## Confusion Matrix and Statistics
## 
##    act
## prd  0  1
##   0 49 21
##   1 41 82
##                                          
##                Accuracy : 0.6788         
##                  95% CI : (0.6079, 0.744)
##     No Information Rate : 0.5337         
##     P-Value [Acc > NIR] : 2.916e-05      
##                                          
##                   Kappa : 0.3454         
##  Mcnemar's Test P-Value : 0.01582        
##                                          
##             Sensitivity : 0.5444         
##             Specificity : 0.7961         
##          Pos Pred Value : 0.7000         
##          Neg Pred Value : 0.6667         
##              Prevalence : 0.4663         
##          Detection Rate : 0.2539         
##    Detection Prevalence : 0.3627         
##       Balanced Accuracy : 0.6703         
##                                          
##        'Positive' Class : 0              
## 

Observation

At 95% Confidence Interval, p-value < 0.05. Accuracy of 67% is achieved.

Test Data

# find mpg of a person with weight 3.0
dfrTest <- data.frame(x1=c(20),x2=c(1),x3=c(6), x4=c(71), x5=c(570))
dfrTest 
##   x1 x2 x3 x4  x5
## 1 20  1  6 71 570
#names(dfrTest) <- c("x1","x2","x3")
#dfrTest 

Observation

Test Data successfully created.

result <-  predict(mgmModel2, dfrTest)
print(result)
##        1 
## 1.324165

Predict

resVal <- predict(mgmModel2, dfrTest, type="response")
prdSur <- ifelse(resVal > 0.5, 1, 0)
prdSur <- as.factor(prdSur)
levels(prdSur) <- c("Didn't Get A Job", "Got A Job")
dfrTest <- mutate(dfrTest, Result=resVal, Prd_Outcome=prdSur)
some(dfrTest)
##   x1 x2 x3 x4  x5    Result      Prd_Outcome
## 1 20  1  6 71 570 0.7898739 Didn't Get A Job

Observation

Model created and data prediction is done on test data.