Description

This is an analysis of the MBA Starting Salaries HBR case study. The data set we are dealing with here is a classification data set about MBA students: their starting salaries, GMAT scores, percentiles, age, sex, etc. Some of the students answered the survey, some did not reveal their salary amount, and some did not respond at all.

library(readr)
# Read the survey CSV into MBA_SS; readr guesses the column types.
MBA_SS <- read_csv(
  file = "C:/Users/Internship/MBA Starting Salaries Data.csv"
)
## Parsed with column specification:
## cols(
##   age = col_integer(),
##   sex = col_integer(),
##   gmat_tot = col_integer(),
##   gmat_qpc = col_integer(),
##   gmat_vpc = col_integer(),
##   gmat_tpc = col_integer(),
##   s_avg = col_double(),
##   f_avg = col_double(),
##   quarter = col_integer(),
##   work_yrs = col_integer(),
##   frstlang = col_integer(),
##   salary = col_integer(),
##   satis = col_integer()
## )
# Open the spreadsheet viewer only when a user is present; in a batch or
# headless run there is no viewer for View() to open.
if (interactive()) {
  View(MBA_SS)
}
# Min / quartiles / mean / max for every column.
summary(MBA_SS)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0
library(psych)
# Extended per-column statistics (n, mean, sd, median, skew, kurtosis, se).
psych::describe(MBA_SS)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45

Creating a subset of those who have given their salary details and who took part in the survey

# Rows whose salary is above 999.
# NOTE(review): salary values of 0 and 999 look like placeholder codes rather
# than real salaries (median 999, min 0 in the summary) - confirm against the
# case-study codebook.
placed <- MBA_SS[which(MBA_SS$salary > 999), ]
# The salary column alone, for the same condition.
PLACED <- MBA_SS$salary[MBA_SS$salary > 999]
# Rows whose salary is recorded as exactly 0.
notplaced <- MBA_SS[which(MBA_SS$salary == 0), ]

# Viewer windows are only useful (and only available) interactively.
if (interactive()) {
  View(placed)
  View(PLACED)
  View(notplaced)
}

Creating BoxPlots

# Boxplots of every column on a log(value + 1) scale so that columns with very
# different ranges fit on one chart. boxplot() draws one box per column along
# the x-axis and the values on the y-axis, so the axis labels follow that.
MBA_SS1 <- MBA_SS[, 1:13]
if (interactive()) {
  View(MBA_SS1)
}
MBA_SS2 <- log(MBA_SS1 + 1)
boxplot(
  MBA_SS2,
  xlab = "Parameters",
  ylab = "log(Value + 1)",
  main = "Boxplot Presentation of different Parameters"
)

# Same chart for the placed subset only.
placed1 <- placed[, 1:13]
if (interactive()) {
  View(placed1)
}
placed2 <- log(placed1 + 1)
boxplot(
  placed2,
  xlab = "Parameters",
  ylab = "log(Value + 1)",
  main = "Boxplot Presentation of different Parameters"
)

Creating A BoxPlot For GMAT Scores

# Total GMAT score as a single horizontal boxplot.
boxplot(
  MBA_SS$gmat_tot,
  horizontal = TRUE,
  xlab = "GMAT Score",
  main = "BoxPlot Presentation Of GMAT Score"
)

# Three panels side by side: gmat_qpc, gmat_vpc and gmat_tpc.
par(mfrow = c(1, 3))
boxplot(
  MBA_SS$gmat_qpc,
  main = "GMAT percentage quantitive",
  ylab = "Percentage %"
)
boxplot(
  MBA_SS$gmat_vpc,
  main = "GMAT percentage verbal",
  ylab = "Percentage %"
)
boxplot(
  MBA_SS$gmat_tpc,
  main = "GMAT percentage total",
  ylab = "Percentage %"
)

# Spring average: stand-alone boxplot on its own page.
par(mfrow = c(1, 1))
boxplot(MBA_SS$s_avg, main = "Spring MBA Average", ylab = "Average")

# Spring average: histogram, boxplot and one bar per student, side by side.
# Only the histogram's y-axis is a frequency; the boxplot and the bars show
# the s_avg values themselves, so they are labelled accordingly.
par(mfrow = c(1, 3))
hist(
  MBA_SS$s_avg,
  xlab = "s_avg", ylab = "Frequency", main = "spring MBA average",
  col = c("red", "blue", "green", "yellow")
)
boxplot(MBA_SS$s_avg, main = "spring MBA average", ylab = "s_avg")
barplot(
  MBA_SS$s_avg,
  main = "spring MBA average", xlab = "Student (row order)", ylab = "s_avg"
)

# Fall average: stand-alone boxplot on its own page.
par(mfrow = c(1, 1))
boxplot(MBA_SS$f_avg, main = "Fall MBA Average", ylab = "Average")

# Fall average: same three-panel layout as for spring.
par(mfrow = c(1, 3))
hist(
  MBA_SS$f_avg,
  xlab = "f_avg", ylab = "Frequency", main = "fall MBA average",
  col = c("red", "blue", "green", "yellow")
)
boxplot(MBA_SS$f_avg, main = "fall MBA average", ylab = "f_avg")
barplot(
  MBA_SS$f_avg,
  main = "fall MBA average", xlab = "Student (row order)", ylab = "f_avg"
)

# Working years: single horizontal boxplot on a full page.
par(mfrow = c(1, 1))
boxplot(
  MBA_SS$work_yrs,
  horizontal = TRUE,
  xlab = "Working Years",
  main = " Working experience in years"
)

# Count the rows with salary above 999 (TRUE) versus at or below it (FALSE).
table(MBA_SS[["salary"]] > 999)
## 
## FALSE  TRUE 
##   171   103
# Boxplot of the salary column only. Passing the whole `placed` data frame
# would draw one box per column under an axis labelled "Salary".
boxplot(
  placed$salary,
  horizontal = TRUE,
  xlab = "Salary",
  main = "Boxplot presentation of Starting salary "
)

# Age
# Head-count per age over the full data set.
count <- table(MBA_SS$age)
barplot(count, main = "Barplot for age", xlab = "Age in Years")

# Age among placed students: histogram, boxplot and head-count bars.
# The bars are drawn from table() so that the "Frequency" label is true; the
# boxplot's value axis is age, not a frequency.
par(mfrow = c(1, 3))
hist(
  placed$age,
  xlab = "Age", ylab = "Frequency", main = "AGE",
  col = c("red", "black", "pink", "yellow")
)
boxplot(placed$age, main = "AGE", ylab = "Age")
barplot(table(placed$age), main = "AGE", xlab = "Age", ylab = "Frequency")

# Sex
# Pie of the head-count per sex code, drawn on a single-panel page.
par(mfrow = c(1, 1))
count1 <- table(MBA_SS$sex)
pie(count1, col = c("yellow", "blue"), main = "Gender Split Up")

# Quartile Ranking
# Pie of the head-count per `quarter` value on a single-panel page.
par(mfrow = c(1, 1))
count2 <- table(MBA_SS$quarter)
pie(
  count2,
  col = c("red", "yellow", "grey", "black"),
  main = "Quartile Ranking"
)

# Placed students: histogram, boxplot and one bar per row, side by side.
par(mfrow = c(1, 3))
hist(
  placed$quarter,
  xlab = "quarter", ylab = "Frequency", main = "Quartile Ranking",
  col = c("pink", "blue", "black", "yellow")
)
boxplot(
  placed$quarter,
  main = "quarter", xlab = "quarter", ylab = "Quartile Ranking"
)
barplot(
  placed$quarter,
  main = "quarter", xlab = "quarter", ylab = "Quartile Ranking"
)

# First Language
# Pie of the head-count per first-language code on a single-panel page.
par(mfrow = c(1, 1))
count3 <- table(MBA_SS$frstlang)
pie(count3, col = c("red", "yellow"), main = "First Language")

# Degree of satisfaction
# Values of 998 are excluded before tabulating (998 is the column maximum in
# the summary and is far outside the other observed scores).
count4 <- table(MBA_SS$satis[MBA_SS$satis < 998])
par(mfrow = c(1, 1))
pie(
  count4,
  col = c("red", "yellow", "violet", "black", "blue", "green", "orange"),
  main = "Degree of Satisfaction with MBA Program "
)

# Placed students: histogram, boxplot and head-count bars.
# The bars are drawn from table() so that the "Frequency" label is true; the
# boxplot's value axis is the satisfaction score, not a frequency.
par(mfrow = c(1, 3))
hist(
  placed$satis,
  xlab = "satis", ylab = "Frequency", main = "Degree of satisfaction",
  col = c("red", "blue", "green", "yellow")
)
boxplot(placed$satis, main = "Degree of satisfaction", ylab = "satis")
barplot(
  table(placed$satis),
  main = "Degree of satisfaction", xlab = "satis", ylab = "Frequency"
)

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Salary (x-axis) against age, work experience and GMAT percentile for the
# full data set.
# NOTE(review): MBA_SS$salary still contains the values at or below 999 that
# the `placed` subset filters out - confirm these plots are meant to show them.
scatterplot(
  x = MBA_SS$salary, y = MBA_SS$age,
  main = "Salary of MBAs with Age",
  xlab = "Salary", ylab = "Age in years",
  cex = 1.1, pch = 19
)

scatterplot(
  x = MBA_SS$salary, y = MBA_SS$work_yrs,
  main = "Salary of MBAs with Work experience",
  xlab = "Salary", ylab = "Work experience in years",
  cex = 1.1, pch = 19
)

scatterplot(
  x = MBA_SS$salary, y = MBA_SS$gmat_tpc,
  main = "Salary of MBAs with GMAT Percentile",
  xlab = "Salary", ylab = "GMAT Percentile %",
  cex = 1.1, pch = 19
)

library(car)
# The same three scatterplots restricted to the placed subset.
scatterplot(
  x = placed$salary, y = placed$age,
  main = "Salary of MBAs with Age",
  xlab = "Salary", ylab = "Age in years",
  cex = 1.1, pch = 19
)

scatterplot(
  x = placed$salary, y = placed$work_yrs,
  main = "Salary of MBAs with Work experience",
  xlab = "Salary", ylab = "Work experience in years",
  cex = 1.1, pch = 19
)

scatterplot(
  x = placed$salary, y = placed$gmat_tpc,
  main = "Salary of MBAs with GMAT Percentile",
  xlab = "Salary", ylab = "GMAT Percentile %",
  cex = 1.1, pch = 19
)

## Creating JitterPlots

# Jittered salary plots against three discrete variables, first for the placed
# subset and then for the full data set. jitter() spreads overlapping points.
plot(
  jitter(placed$sex), jitter(placed$salary),
  main = "Salary of MBAs with Sex",
  xlab = "Gender Male(1) Female(2)", ylab = "Salary", cex = 1.1
)

plot(
  jitter(placed$frstlang), jitter(placed$salary),
  main = "Salary of MBAs with First Language",
  xlab = "First Language English(1) Other(2)", ylab = "Salary", cex = 1.1
)

plot(
  jitter(placed$satis), jitter(placed$salary),
  main = "Salary of MBAs with degree of satisfaction",
  xlab = "Degree of Satisfaction out of 7", ylab = "Salary", cex = 1.1
)

plot(
  jitter(MBA_SS$sex), jitter(MBA_SS$salary),
  main = "Salary of MBAs with Sex",
  xlab = "Gender Male(1) Female(2)", ylab = "Salary", cex = 1.1
)

plot(
  jitter(MBA_SS$frstlang), jitter(MBA_SS$salary),
  main = "Salary of MBAs with First Language",
  xlab = "First Language English(1) Other(2)", ylab = "Salary", cex = 1.1
)

# The full data set contains satis values of 998; they are dropped here, as in
# the satisfaction pie chart, so the x-axis really is a score "out of 7".
plot(
  jitter(MBA_SS$satis[MBA_SS$satis < 998]),
  jitter(MBA_SS$salary[MBA_SS$satis < 998]),
  main = "Salary of MBAs with degree of satisfaction",
  xlab = "Degree of Satisfaction out of 7", ylab = "Salary", cex = 1.1
)

##Correlation tests to find relationship between different parameters

Creating A Correlation matrix,Covariance matrix, Corrgram

# Pairwise correlations for every column of `placed`, with the sample size and
# p-values (entries above the diagonal are adjusted for multiple tests).
corr.test(placed, use = "complete")
## Call:corr.test(x = placed, use = "complete")
## Correlation matrix 
##            age   sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## age       1.00 -0.14    -0.08    -0.17     0.02    -0.10  0.16 -0.22
## sex      -0.14  1.00    -0.02    -0.15     0.05    -0.05  0.08  0.17
## gmat_tot -0.08 -0.02     1.00     0.67     0.78     0.97  0.17  0.12
## gmat_qpc -0.17 -0.15     0.67     1.00     0.09     0.66  0.02  0.10
## gmat_vpc  0.02  0.05     0.78     0.09     1.00     0.78  0.16  0.02
## gmat_tpc -0.10 -0.05     0.97     0.66     0.78     1.00  0.14  0.07
## s_avg     0.16  0.08     0.17     0.02     0.16     0.14  1.00  0.45
## f_avg    -0.22  0.17     0.12     0.10     0.02     0.07  0.45  1.00
## quarter  -0.13 -0.02    -0.11     0.01    -0.13    -0.10 -0.84 -0.43
## work_yrs  0.88 -0.09    -0.12    -0.18    -0.03    -0.13  0.16 -0.22
## frstlang  0.35  0.08    -0.13     0.01    -0.22    -0.16 -0.14 -0.05
## salary    0.50 -0.17    -0.09     0.01    -0.14    -0.13  0.10 -0.11
## satis     0.11 -0.09     0.06     0.00     0.15     0.12 -0.14 -0.12
##          quarter work_yrs frstlang salary satis
## age        -0.13     0.88     0.35   0.50  0.11
## sex        -0.02    -0.09     0.08  -0.17 -0.09
## gmat_tot   -0.11    -0.12    -0.13  -0.09  0.06
## gmat_qpc    0.01    -0.18     0.01   0.01  0.00
## gmat_vpc   -0.13    -0.03    -0.22  -0.14  0.15
## gmat_tpc   -0.10    -0.13    -0.16  -0.13  0.12
## s_avg      -0.84     0.16    -0.14   0.10 -0.14
## f_avg      -0.43    -0.22    -0.05  -0.11 -0.12
## quarter     1.00    -0.13     0.11  -0.13  0.23
## work_yrs   -0.13     1.00     0.20   0.45  0.06
## frstlang    0.11     0.20     1.00   0.27  0.09
## salary     -0.13     0.45     0.27   1.00 -0.04
## satis       0.23     0.06     0.09  -0.04  1.00
## Sample Size 
## [1] 103
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##           age  sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## age      0.00 1.00     1.00     1.00     1.00     1.00  1.00  1.00    1.00
## sex      0.15 0.00     1.00     1.00     1.00     1.00  1.00  1.00    1.00
## gmat_tot 0.43 0.84     0.00     0.00     0.00     0.00  1.00  1.00    1.00
## gmat_qpc 0.10 0.14     0.00     0.00     1.00     0.00  1.00  1.00    1.00
## gmat_vpc 0.86 0.59     0.00     0.34     0.00     0.00  1.00  1.00    1.00
## gmat_tpc 0.33 0.64     0.00     0.00     0.00     0.00  1.00  1.00    1.00
## s_avg    0.11 0.42     0.08     0.88     0.11     0.16  0.00  0.00    0.00
## f_avg    0.03 0.09     0.22     0.32     0.82     0.48  0.00  0.00    0.00
## quarter  0.21 0.83     0.29     0.90     0.20     0.32  0.00  0.00    0.00
## work_yrs 0.00 0.35     0.22     0.06     0.78     0.18  0.10  0.03    0.19
## frstlang 0.00 0.45     0.19     0.89     0.03     0.10  0.16  0.61    0.27
## salary   0.00 0.09     0.36     0.89     0.17     0.18  0.31  0.29    0.20
## satis    0.28 0.36     0.52     0.97     0.13     0.24  0.15  0.24    0.02
##          work_yrs frstlang salary satis
## age          0.00     0.02   0.00     1
## sex          1.00     1.00   1.00     1
## gmat_tot     1.00     1.00   1.00     1
## gmat_qpc     1.00     1.00   1.00     1
## gmat_vpc     1.00     1.00   1.00     1
## gmat_tpc     1.00     1.00   1.00     1
## s_avg        1.00     1.00   1.00     1
## f_avg        1.00     1.00   1.00     1
## quarter      1.00     1.00   1.00     1
## work_yrs     0.00     1.00   0.00     1
## frstlang     0.05     0.00   0.42     1
## salary       0.00     0.01   0.00     1
## satis        0.53     0.37   0.69     0
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option
# x: all thirteen survey columns of the placed subset.
x <- placed[c(
  "age", "sex", "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
  "s_avg", "f_avg", "quarter", "work_yrs", "frstlang", "salary", "satis"
)]
# y: the four columns each x column is correlated against.
y <- placed[c("salary", "gmat_tpc", "work_yrs", "satis")]
# 13 x 4 matrix of correlations between the columns of x and the columns of y.
cor(x = x, y = y)
##               salary    gmat_tpc    work_yrs        satis
## age       0.49964284 -0.09609156  0.88052470  0.108323083
## sex      -0.16628869 -0.04686981 -0.09233003 -0.091995338
## gmat_tot -0.09067141  0.96680810 -0.12280018  0.064742057
## gmat_qpc  0.01414130  0.65865003 -0.18270126 -0.003984632
## gmat_vpc -0.13743230  0.78443167 -0.02812182  0.148634805
## gmat_tpc -0.13201783  1.00000000 -0.13246963  0.116308417
## s_avg     0.10173175  0.13938500  0.16328236 -0.143565573
## f_avg    -0.10603897  0.07051391 -0.21633018 -0.117733043
## quarter  -0.12848526 -0.09955033 -0.12896722  0.225119851
## work_yrs  0.45466634 -0.13246963  1.00000000  0.062999256
## frstlang  0.26701953 -0.16437561  0.19627277  0.089834769
## salary    1.00000000 -0.13201783  0.45466634 -0.040050600
## satis    -0.04005060  0.11630842  0.06299926  1.000000000
# Covariances between each column of x and each column of y.
cov(x = x, y = y)
##                 salary      gmat_tpc      work_yrs         satis
## age       2.921052e+04 -3.460213e+00     8.6728536    0.27765087
## sex      -1.369577e+03 -2.377689e-01    -0.1281173   -0.03321911
## gmat_tot -8.212449e+04  5.393623e+02   -18.7388159    2.57091186
## gmat_qpc  3.382438e+03  9.703607e+01    -7.3624595   -0.04178565
## gmat_vpc -3.964803e+04  1.393882e+02    -1.3668380    1.87997335
## gmat_tpc -2.596339e+04  1.211342e+02    -4.3892062    1.00285551
## s_avg     6.880204e+02  5.806292e-01     0.1860480   -0.04256901
## f_avg    -9.241129e+02  3.785056e-01    -0.3176271   -0.04498382
## quarter  -2.571117e+03 -1.227013e+00    -0.4347992    0.19750619
## work_yrs  2.445820e+04 -4.389206e+00     9.0630116    0.14858176
## frstlang  1.206714e+03 -4.575481e-01     0.1494384    0.01779935
## salary    3.192940e+08 -2.596339e+04 24458.1995050 -560.65829050
## satis    -5.606583e+02  1.002856e+00     0.1485818    0.61374453
# Called with two arguments, var() yields the same matrix as cov(x, y).
var(x = x, y = y)
##                 salary      gmat_tpc      work_yrs         satis
## age       2.921052e+04 -3.460213e+00     8.6728536    0.27765087
## sex      -1.369577e+03 -2.377689e-01    -0.1281173   -0.03321911
## gmat_tot -8.212449e+04  5.393623e+02   -18.7388159    2.57091186
## gmat_qpc  3.382438e+03  9.703607e+01    -7.3624595   -0.04178565
## gmat_vpc -3.964803e+04  1.393882e+02    -1.3668380    1.87997335
## gmat_tpc -2.596339e+04  1.211342e+02    -4.3892062    1.00285551
## s_avg     6.880204e+02  5.806292e-01     0.1860480   -0.04256901
## f_avg    -9.241129e+02  3.785056e-01    -0.3176271   -0.04498382
## quarter  -2.571117e+03 -1.227013e+00    -0.4347992    0.19750619
## work_yrs  2.445820e+04 -4.389206e+00     9.0630116    0.14858176
## frstlang  1.206714e+03 -4.575481e-01     0.1494384    0.01779935
## salary    3.192940e+08 -2.596339e+04 24458.1995050 -560.65829050
## satis    -5.606583e+02  1.002856e+00     0.1485818    0.61374453
# Same corr.test() call as earlier in this section; it prints the same
# correlation matrix, sample size and p-values again.
corr.test(placed, use = "complete")
## Call:corr.test(x = placed, use = "complete")
## Correlation matrix 
##            age   sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## age       1.00 -0.14    -0.08    -0.17     0.02    -0.10  0.16 -0.22
## sex      -0.14  1.00    -0.02    -0.15     0.05    -0.05  0.08  0.17
## gmat_tot -0.08 -0.02     1.00     0.67     0.78     0.97  0.17  0.12
## gmat_qpc -0.17 -0.15     0.67     1.00     0.09     0.66  0.02  0.10
## gmat_vpc  0.02  0.05     0.78     0.09     1.00     0.78  0.16  0.02
## gmat_tpc -0.10 -0.05     0.97     0.66     0.78     1.00  0.14  0.07
## s_avg     0.16  0.08     0.17     0.02     0.16     0.14  1.00  0.45
## f_avg    -0.22  0.17     0.12     0.10     0.02     0.07  0.45  1.00
## quarter  -0.13 -0.02    -0.11     0.01    -0.13    -0.10 -0.84 -0.43
## work_yrs  0.88 -0.09    -0.12    -0.18    -0.03    -0.13  0.16 -0.22
## frstlang  0.35  0.08    -0.13     0.01    -0.22    -0.16 -0.14 -0.05
## salary    0.50 -0.17    -0.09     0.01    -0.14    -0.13  0.10 -0.11
## satis     0.11 -0.09     0.06     0.00     0.15     0.12 -0.14 -0.12
##          quarter work_yrs frstlang salary satis
## age        -0.13     0.88     0.35   0.50  0.11
## sex        -0.02    -0.09     0.08  -0.17 -0.09
## gmat_tot   -0.11    -0.12    -0.13  -0.09  0.06
## gmat_qpc    0.01    -0.18     0.01   0.01  0.00
## gmat_vpc   -0.13    -0.03    -0.22  -0.14  0.15
## gmat_tpc   -0.10    -0.13    -0.16  -0.13  0.12
## s_avg      -0.84     0.16    -0.14   0.10 -0.14
## f_avg      -0.43    -0.22    -0.05  -0.11 -0.12
## quarter     1.00    -0.13     0.11  -0.13  0.23
## work_yrs   -0.13     1.00     0.20   0.45  0.06
## frstlang    0.11     0.20     1.00   0.27  0.09
## salary     -0.13     0.45     0.27   1.00 -0.04
## satis       0.23     0.06     0.09  -0.04  1.00
## Sample Size 
## [1] 103
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##           age  sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## age      0.00 1.00     1.00     1.00     1.00     1.00  1.00  1.00    1.00
## sex      0.15 0.00     1.00     1.00     1.00     1.00  1.00  1.00    1.00
## gmat_tot 0.43 0.84     0.00     0.00     0.00     0.00  1.00  1.00    1.00
## gmat_qpc 0.10 0.14     0.00     0.00     1.00     0.00  1.00  1.00    1.00
## gmat_vpc 0.86 0.59     0.00     0.34     0.00     0.00  1.00  1.00    1.00
## gmat_tpc 0.33 0.64     0.00     0.00     0.00     0.00  1.00  1.00    1.00
## s_avg    0.11 0.42     0.08     0.88     0.11     0.16  0.00  0.00    0.00
## f_avg    0.03 0.09     0.22     0.32     0.82     0.48  0.00  0.00    0.00
## quarter  0.21 0.83     0.29     0.90     0.20     0.32  0.00  0.00    0.00
## work_yrs 0.00 0.35     0.22     0.06     0.78     0.18  0.10  0.03    0.19
## frstlang 0.00 0.45     0.19     0.89     0.03     0.10  0.16  0.61    0.27
## salary   0.00 0.09     0.36     0.89     0.17     0.18  0.31  0.29    0.20
## satis    0.28 0.36     0.52     0.97     0.13     0.24  0.15  0.24    0.02
##          work_yrs frstlang salary satis
## age          0.00     0.02   0.00     1
## sex          1.00     1.00   1.00     1
## gmat_tot     1.00     1.00   1.00     1
## gmat_qpc     1.00     1.00   1.00     1
## gmat_vpc     1.00     1.00   1.00     1
## gmat_tpc     1.00     1.00   1.00     1
## s_avg        1.00     1.00   1.00     1
## f_avg        1.00     1.00   1.00     1
## quarter      1.00     1.00   1.00     1
## work_yrs     0.00     1.00   0.00     1
## frstlang     0.05     0.00   0.42     1
## salary       0.00     0.01   0.00     1
## satis        0.53     0.37   0.69     0
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option
library(corrplot) 
## corrplot 0.84 loaded
# Ellipse-style correlation plot over the first 13 columns of `placed`.
corrplot(
  corr = cor(placed[, 1:13], use = "complete.obs"),
  method = "ellipse"
)

library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# Mixed correlation plot for `placed`: numbers below the diagonal, circles
# above, variable names on the diagonal.
corrplot.mixed(
  corr = cor(placed[, 1:13], use = "complete.obs"),
  lower = "number",
  upper = "circle",
  tl.pos = "d",
  diag = "n",
  bg = "white",
  addgrid.col = "grey",
  lower.col = NULL,
  upper.col = NULL
)

library(corrgram)
# Corrgram of `placed`: shaded cells below the diagonal, pies above.
corrgram(
  placed,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of MBA Salaries"
)

library(corrplot)
# The same three correlation views, this time for the full data set.
# NOTE(review): MBA_SS still holds the large code values visible in summary()
# (salary median 999, satis max 998); confirm whether they should be filtered
# out before correlating.
corrplot(
  corr = cor(MBA_SS[, 1:13], use = "complete.obs"),
  method = "ellipse"
)

library(gplots)
corrplot.mixed(
  corr = cor(MBA_SS[, 1:13], use = "complete.obs"),
  lower = "number",
  upper = "circle",
  tl.pos = "d",
  diag = "n",
  bg = "white",
  addgrid.col = "grey",
  lower.col = NULL,
  upper.col = NULL
)

library(corrgram)
corrgram(
  MBA_SS,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of MBA Salaries"
)

## Visualising using the ggvis package

library(ggvis)
# Salary (x-axis) against each variable for the placed subset, with points
# filled by satisfaction score. The variable list mirrors the MBA_SS series
# below: the sixth plot shows gmat_tot rather than repeating gmat_vpc.
placed %>% ggvis(~salary, ~frstlang, fill = ~satis) %>% layer_points()
placed %>% ggvis(~salary, ~s_avg, fill = ~satis) %>% layer_points()
placed %>% ggvis(~salary, ~f_avg, fill = ~satis) %>% layer_points()
placed %>% ggvis(~salary, ~gmat_tpc, fill = ~satis) %>% layer_points()
placed %>% ggvis(~salary, ~gmat_vpc, fill = ~satis) %>% layer_points()
placed %>% ggvis(~salary, ~gmat_tot, fill = ~satis) %>% layer_points()
placed %>% ggvis(~salary, ~work_yrs, fill = ~satis) %>% layer_points()
library(ggvis)
# Salary (x-axis) against each variable for the full data set, with points
# filled by the satis column.
MBA_SS %>%
  ggvis(~salary, ~frstlang, fill = ~satis) %>%
  layer_points()
MBA_SS %>%
  ggvis(~salary, ~s_avg, fill = ~satis) %>%
  layer_points()
MBA_SS %>%
  ggvis(~salary, ~f_avg, fill = ~satis) %>%
  layer_points()
MBA_SS %>%
  ggvis(~salary, ~gmat_tpc, fill = ~satis) %>%
  layer_points()
MBA_SS %>%
  ggvis(~salary, ~gmat_vpc, fill = ~satis) %>%
  layer_points()
MBA_SS %>%
  ggvis(~salary, ~gmat_tot, fill = ~satis) %>%
  layer_points()
MBA_SS %>%
  ggvis(~salary, ~work_yrs, fill = ~satis) %>%
  layer_points()

## Making contingency tables and performing chi-square tests

# Cross-tabulate sex (rows) by years of work experience (columns) for the
# placed subset, then print it with row and column totals.
mytable <- xtabs(formula = ~ sex + work_yrs, data = placed)
addmargins(mytable)
##      work_yrs
## sex     0   1   2   3   4   5   6   7   8  10  15  16 Sum
##   1     1   4  24  16  10   4   5   1   3   1   1   2  72
##   2     0   4  14   5   1   3   2   0   1   0   1   0  31
##   Sum   1   8  38  21  11   7   7   1   4   1   2   2 103
# Pearson chi-squared test of independence between sex and work_yrs.
# NOTE(review): R warns that the chi-squared approximation may be incorrect
# here (many cells hold very small counts); consider
# chisq.test(mytable, simulate.p.value = TRUE) or fisher.test() before relying
# on the p-value.
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable
## X-squared = 8.1579, df = 11, p-value = 0.6991

Inference

p-value = 0.6991 (> 0.05): we can't reject the null hypothesis that work_yrs and sex are independent. We can't predict the sex of an MBA student from his or her work experience.

# Cross-tabulate years of work experience (rows) by satisfaction score
# (columns) for the placed subset, then print it with totals.
mytable1 <- xtabs(formula = ~ work_yrs + satis, data = placed)
addmargins(mytable1)
##         satis
## work_yrs   3   4   5   6   7 Sum
##      0     0   1   0   0   0   1
##      1     0   0   5   1   2   8
##      2     0   0   8  19  11  38
##      3     1   0   6  12   2  21
##      4     0   0   3   5   3  11
##      5     0   0   3   3   1   7
##      6     0   0   2   5   0   7
##      7     0   0   1   0   0   1
##      8     0   0   0   3   1   4
##      10    0   0   0   0   1   1
##      15    0   0   0   2   0   2
##      16    0   0   1   0   1   2
##      Sum   1   1  29  50  22 103
# Pearson chi-squared test of independence between work_yrs and satis.
# NOTE(review): R warns that the chi-squared approximation may be incorrect
# here (most cells are 0 or 1); consider
# chisq.test(mytable1, simulate.p.value = TRUE) before relying on the p-value.
chisq.test(mytable1)
## Warning in chisq.test(mytable1): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable1
## X-squared = 131.13, df = 44, p-value = 1.35e-10

Inference

p-value = 1.35e-10 (< 0.05): we can reject the null hypothesis, so work_yrs and satis are not independent. A student's degree of satisfaction therefore carries some information about his or her work experience.

# Cross-tabulate first language (rows) by sex (columns) for the placed subset,
# then print it with totals. A one-sided formula takes a single leading `~`;
# the extra `~` before sex was a typo.
mytable2 <- xtabs(~ frstlang + sex, data = placed)
addmargins(mytable2)
##         sex
## frstlang   1   2 Sum
##      1    68  28  96
##      2     4   3   7
##      Sum  72  31 103
# Chi-squared test of independence between frstlang and sex (2 x 2 table, so
# R applies Yates' continuity correction).
# NOTE(review): R warns that the chi-squared approximation may be incorrect
# here (the frstlang == 2 row has only 7 students); fisher.test(mytable2) is
# the usual alternative for a small 2 x 2 table.
chisq.test(mytable2)
## Warning in chisq.test(mytable2): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mytable2
## X-squared = 0.11264, df = 1, p-value = 0.7372

Inference

Since p-value = 0.7372 (> 0.05), we can't reject the null hypothesis that sex and frstlang are independent. We can't predict a student's sex from his or her first language.

# Cross-tabulate years of work experience (rows) by first language (columns)
# for the placed subset, then print it with totals.
mytable3 <- xtabs(formula = ~ work_yrs + frstlang, data = placed)
addmargins(mytable3)
##         frstlang
## work_yrs   1   2 Sum
##      0     1   0   1
##      1     8   0   8
##      2    36   2  38
##      3    20   1  21
##      4    10   1  11
##      5     6   1   7
##      6     7   0   7
##      7     1   0   1
##      8     4   0   4
##      10    0   1   1
##      15    1   1   2
##      16    2   0   2
##      Sum  96   7 103
# Pearson chi-squared test of independence between work_yrs and frstlang.
# NOTE(review): R warns that the chi-squared approximation may be incorrect
# here (the frstlang == 2 column has only 7 students spread over 12 rows);
# consider chisq.test(mytable3, simulate.p.value = TRUE) before relying on the
# p-value.
chisq.test(mytable3)
## Warning in chisq.test(mytable3): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable3
## X-squared = 22.274, df = 11, p-value = 0.02233

Inference

Since p-value = 0.02233 (< 0.05), we can reject the null hypothesis, so work_yrs and frstlang are not independent. That is, a student's first language carries some information about his or her work experience.

## T-Tests For Hypothesis

# One-sided Welch t-test on the placed subset.
# Alternative hypothesis: the mean salary of group 1 (males) is greater than
# the mean salary of group 2 (females).
t.test(salary~sex,alternative="greater",data=placed)
## 
##  Welch Two Sample t-test
## 
## data:  salary by sex
## t = 1.3628, df = 38.115, p-value = 0.09047
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  -1527.96      Inf
## sample estimates:
## mean in group 1 mean in group 2 
##       104970.97        98524.39
# One-sided Welch t-test on the placed subset.
# Alternative hypothesis: the mean salary of group 1 (English as first
# language) is greater than the mean salary of group 2 (other first language).
t.test(salary~frstlang, alternative="greater", data = placed)
## 
##  Welch Two Sample t-test
## 
## data:  salary by frstlang
## t = -1.1202, df = 6.0863, p-value = 0.8476
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  -51508.45       Inf
## sample estimates:
## mean in group 1 mean in group 2 
##        101748.6        120614.3
# One-sided Welch t-test on the placed subset.
# Alternative hypothesis: the mean GMAT percentile of group 1 (males) is
# greater than the mean GMAT percentile of group 2 (females).
t.test(gmat_tpc~sex, alternative="greater", data = placed)
## 
##  Welch Two Sample t-test
## 
## data:  gmat_tpc by sex
## t = 0.43873, df = 48.83, p-value = 0.3314
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  -3.157889       Inf
## sample estimates:
## mean in group 1 mean in group 2 
##        84.86111        83.74194

Since p-value = 0.09047 (> 0.05), we can't reject the null hypothesis that the average salary of males is not greater than the average salary of females. So we can't say that the average salary of males is greater than that of females.

Since p-value = 0.8476 (> 0.05), we can't reject the null hypothesis that the average salary of students whose first language is English is not greater than the average salary of those who speak another language. So we can't say that English-speaking students earn more on average than students who speak another language.

Since p-value = 0.3314 (> 0.05), we can't reject the null hypothesis that the average GMAT percentile of males is not greater than the average GMAT percentile of females. So we can't say that the average GMAT percentile of males is higher than that of females.

## The Model (Submodels)

# mb_avg: mean of the spring and fall averages, stored as a new column.
placed$mb_avg <- with(placed, (s_avg + f_avg) / 2)
# Starting model: salary regressed on eleven predictors.
# NOTE(review): mb_avg is an exact linear combination of s_avg and f_avg, so
# one of the three coefficients cannot be estimated (lm reports it as NA).
model1 <- lm(
  salary ~ age + work_yrs + mb_avg + gmat_tot + sex + satis + gmat_vpc +
    gmat_qpc + gmat_tpc + s_avg + f_avg,
  data = placed
)
summary(model1)
## 
## Call:
## lm(formula = salary ~ age + work_yrs + mb_avg + gmat_tot + sex + 
##     satis + gmat_vpc + gmat_qpc + gmat_tpc + s_avg + f_avg, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -27974  -7315   -119   5041  72348 
## 
## Coefficients: (1 not defined because of singularities)
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 57744.917  50742.159   1.138   0.2581  
## age          2314.479   1013.453   2.284   0.0247 *
## work_yrs      407.556   1091.395   0.373   0.7097  
## mb_avg      -2606.422   7640.220  -0.341   0.7338  
## gmat_tot       -2.674    170.717  -0.016   0.9875  
## sex         -3128.430   3495.714  -0.895   0.3732  
## satis       -1362.352   2070.020  -0.658   0.5121  
## gmat_vpc      551.916    489.787   1.127   0.2627  
## gmat_qpc      836.303    491.191   1.703   0.0920 .
## gmat_tpc    -1434.011    712.038  -2.014   0.0469 *
## s_avg        4777.674   7525.889   0.635   0.5271  
## f_avg              NA         NA      NA       NA  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15430 on 92 degrees of freedom
## Multiple R-squared:  0.3276, Adjusted R-squared:  0.2546 
## F-statistic: 4.483 on 10 and 92 DF,  p-value: 3.734e-05
# Named vector of fitted coefficients, including any that lm left as NA.
coef(model1)
## (Intercept)         age    work_yrs      mb_avg    gmat_tot         sex 
## 57744.91736  2314.47880   407.55619 -2606.42188    -2.67356 -3128.42951 
##       satis    gmat_vpc    gmat_qpc    gmat_tpc       s_avg       f_avg 
## -1362.35189   551.91609   836.30336 -1434.01111  4777.67360          NA

Initially, a model for salary is built with as many variables as possible, removing a few based on our previous hypothesis testing. So the first model is created with the predictor variables age + work_yrs + mb_avg + gmat_tot + sex + satis + gmat_vpc + gmat_qpc + gmat_tpc + s_avg + f_avg

An adjusted R-squared of 0.2546 is noted. Then I removed variables one by one in subsequent steps and kept re-running the regression until the maximum adjusted R-squared was obtained, which is the best fit for the regression.

## Fitting Multiple Regression With y=f(x1,x2,x3,.x9)
## Multi-variable Regression Line

# (Re)compute the mean of the spring and fall averages.
placed$mb_avg <- with(placed, (s_avg + f_avg) / 2)
# Model A: nine predictors (gmat_tot and f_avg are not included).
modelA <- lm(
  salary ~ age + work_yrs + mb_avg + sex + satis + gmat_vpc + gmat_qpc +
    gmat_tpc + s_avg,
  data = placed
)
summary(modelA)
## 
## Call:
## lm(formula = salary ~ age + work_yrs + mb_avg + sex + satis + 
##     gmat_vpc + gmat_qpc + gmat_tpc + s_avg, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -27980  -7307    -92   5027  72329 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  57125.6    31624.0   1.806   0.0741 .
## age           2313.4     1005.5   2.301   0.0236 *
## work_yrs       407.9     1085.3   0.376   0.7079  
## mb_avg       -2625.6     7500.5  -0.350   0.7271  
## sex          -3135.1     3451.2  -0.908   0.3660  
## satis        -1356.3     2022.4  -0.671   0.5041  
## gmat_vpc       546.7      358.6   1.524   0.1308  
## gmat_qpc       831.1      358.6   2.318   0.0227 *
## gmat_tpc     -1435.8      699.3  -2.053   0.0429 *
## s_avg         4779.9     7484.0   0.639   0.5246  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15340 on 93 degrees of freedom
## Multiple R-squared:  0.3276, Adjusted R-squared:  0.2626 
## F-statistic: 5.036 on 9 and 93 DF,  p-value: 1.573e-05
# Model B: eight predictors (no mb_avg term).
modelB <- lm(
  salary ~ age + work_yrs + sex + satis + gmat_vpc + gmat_qpc + gmat_tpc +
    s_avg,
  data = placed
)
summary(modelB)
## 
## Call:
## lm(formula = salary ~ age + work_yrs + sex + satis + gmat_vpc + 
##     gmat_qpc + gmat_tpc + s_avg, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -27658  -7463   -293   4791  71831 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  55014.6    30898.5   1.780   0.0782 .
## age           2336.0      998.7   2.339   0.0215 *
## work_yrs       446.0     1074.8   0.415   0.6791  
## sex          -3281.6     3409.7  -0.962   0.3383  
## satis        -1357.9     2012.9  -0.675   0.5016  
## gmat_vpc       539.6      356.4   1.514   0.1334  
## gmat_qpc       818.4      355.1   2.305   0.0234 *
## gmat_tpc     -1415.7      693.7  -2.041   0.0441 *
## s_avg         2620.6     4218.4   0.621   0.5359  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15270 on 94 degrees of freedom
## Multiple R-squared:  0.3268, Adjusted R-squared:  0.2695 
## F-statistic: 5.703 on 8 and 94 DF,  p-value: 6.544e-06
# Model C: same predictors as model B, without work_yrs.
modelC <- lm(
  formula = salary ~ age + sex + satis + gmat_vpc + gmat_qpc + gmat_tpc +
    s_avg,
  data = placed
)
summary(modelC)
## 
## Call:
## lm(formula = salary ~ age + sex + satis + gmat_vpc + gmat_qpc + 
##     gmat_tpc + s_avg, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -27509  -7439   -100   4348  72056 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  47208.3    24404.0   1.934   0.0560 .  
## age           2697.3      486.8   5.541 2.68e-07 ***
## sex          -3200.4     3389.2  -0.944   0.3474    
## satis        -1394.6     2002.2  -0.697   0.4878    
## gmat_vpc       526.8      353.5   1.490   0.1394    
## gmat_qpc       806.5      352.4   2.289   0.0243 *  
## gmat_tpc     -1397.3      689.2  -2.027   0.0454 *  
## s_avg         2710.2     4194.4   0.646   0.5198    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15210 on 95 degrees of freedom
## Multiple R-squared:  0.3255, Adjusted R-squared:  0.2758 
## F-statistic:  6.55 on 7 and 95 DF,  p-value: 2.59e-06
# Model D: same predictors as model C, without s_avg.
modelD <- lm(
  formula = salary ~ age + sex + satis + gmat_vpc + gmat_qpc + gmat_tpc,
  data = placed
)
summary(modelD)
## 
## Call:
## lm(formula = salary ~ age + sex + satis + gmat_vpc + gmat_qpc + 
##     gmat_tpc, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -25949  -7801   -299   5037  70273 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  53662.6    22198.3   2.417   0.0175 *  
## age           2759.8      475.6   5.803 8.38e-08 ***
## sex          -3002.5     3365.1  -0.892   0.3745    
## satis        -1640.2     1959.8  -0.837   0.4047    
## gmat_vpc       516.1      352.0   1.466   0.1459    
## gmat_qpc       788.6      350.2   2.252   0.0266 *  
## gmat_tpc     -1353.4      683.8  -1.979   0.0507 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15160 on 96 degrees of freedom
## Multiple R-squared:  0.3226, Adjusted R-squared:  0.2802 
## F-statistic: 7.618 on 6 and 96 DF,  p-value: 1.057e-06
# Model E: same predictors as model D, without satis.
modelE <- lm(
  formula = salary ~ age + sex + gmat_vpc + gmat_qpc + gmat_tpc,
  data = placed
)
summary(modelE)
## 
## Call:
## lm(formula = salary ~ age + sex + gmat_vpc + gmat_qpc + gmat_tpc, 
##     data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -28067  -8243     10   5811  69769 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  45736.1    20045.4   2.282   0.0247 *  
## age           2723.1      472.9   5.759 9.95e-08 ***
## sex          -2753.2     3346.7  -0.823   0.4127    
## gmat_vpc       520.6      351.4   1.481   0.1418    
## gmat_qpc       806.5      349.0   2.311   0.0230 *  
## gmat_tpc     -1387.1      681.6  -2.035   0.0446 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15140 on 97 degrees of freedom
## Multiple R-squared:  0.3176, Adjusted R-squared:  0.2824 
## F-statistic:  9.03 on 5 and 97 DF,  p-value: 4.494e-07
# Model F: same predictors as model E, without sex.
modelF <- lm(
  formula = salary ~ age + gmat_vpc + gmat_qpc + gmat_tpc,
  data = placed
)
summary(modelF)
## 
## Call:
## lm(formula = salary ~ age + gmat_vpc + gmat_qpc + gmat_tpc, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -29373  -8011    280   5705  67116 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  39383.0    18467.6   2.133   0.0355 *  
## age           2791.1      464.8   6.005 3.25e-08 ***
## gmat_vpc       513.2      350.7   1.463   0.1467    
## gmat_qpc       822.3      347.9   2.363   0.0201 *  
## gmat_tpc     -1383.8      680.4  -2.034   0.0447 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15110 on 98 degrees of freedom
## Multiple R-squared:  0.3129, Adjusted R-squared:  0.2848 
## F-statistic: 11.16 on 4 and 98 DF,  p-value: 1.691e-07
# Model H: a different predictor set built around gmat_tot and mb_avg.
modelH <- lm(
  formula = salary ~ age + work_yrs + gmat_tot + sex + gmat_tpc + mb_avg,
  data = placed
)
summary(modelH)
## 
## Call:
## lm(formula = salary ~ age + work_yrs + gmat_tot + sex + gmat_tpc + 
##     mb_avg, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31851  -7752  -1585   6885  75610 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  16870.1    38025.6   0.444   0.6583  
## age           2160.3     1008.4   2.142   0.0347 *
## work_yrs       482.3     1091.0   0.442   0.6594  
## gmat_tot       180.1      122.9   1.465   0.1461  
## sex          -4703.9     3446.0  -1.365   0.1754  
## gmat_tpc      -948.7      562.8  -1.686   0.0951 .
## mb_avg         618.0     4352.6   0.142   0.8874  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15570 on 96 degrees of freedom
## Multiple R-squared:  0.2853, Adjusted R-squared:  0.2407 
## F-statistic: 6.389 on 6 and 96 DF,  p-value: 1.096e-05
# Model I: same predictors as model H, without gmat_tot.
modelI <- lm(
  formula = salary ~ age + work_yrs + sex + gmat_tpc + mb_avg,
  data = placed
)
summary(modelI)
## 
## Call:
## lm(formula = salary ~ age + work_yrs + sex + gmat_tpc + mb_avg, 
##     data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31606  -7795  -2434   5953  81096 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  52297.7    29522.8   1.771   0.0796 .
## age           2304.0     1009.5   2.282   0.0247 *
## work_yrs       374.3     1094.9   0.342   0.7332  
## sex          -4276.7     3453.9  -1.238   0.2186  
## gmat_tpc      -151.0      143.6  -1.052   0.2955  
## mb_avg        1937.7     4283.5   0.452   0.6520  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15660 on 97 degrees of freedom
## Multiple R-squared:  0.2694, Adjusted R-squared:  0.2317 
## F-statistic: 7.152 on 5 and 97 DF,  p-value: 9.811e-06
# Model J: same predictors as model H, without sex and gmat_tpc.
modelJ <- lm(
  formula = salary ~ age + work_yrs + gmat_tot + mb_avg,
  data = placed
)
summary(modelJ)
## 
## Call:
## lm(formula = salary ~ age + work_yrs + gmat_tot + mb_avg, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33021  -7914  -1891   4615  79250 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 44101.55   31514.13   1.399   0.1648  
## age          2457.29    1008.22   2.437   0.0166 *
## work_yrs      316.81    1100.09   0.288   0.7740  
## gmat_tot      -18.51      31.50  -0.588   0.5581  
## mb_avg       1089.07    4289.73   0.254   0.8001  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15750 on 98 degrees of freedom
## Multiple R-squared:  0.2534, Adjusted R-squared:  0.223 
## F-statistic: 8.317 on 4 and 98 DF,  p-value: 8.093e-06
# Model 2: salary explained by age, gmat_tot and mb_avg only.
model2 <- lm(
  formula = salary ~ age + gmat_tot + mb_avg,
  data = placed
)
summary(model2)
## 
## Call:
## lm(formula = salary ~ age + gmat_tot + mb_avg, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -32905  -8104  -1887   4804  79345 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 39016.57   25981.91   1.502    0.136    
## age          2712.82     476.56   5.692 1.28e-07 ***
## gmat_tot      -19.54      31.15  -0.627    0.532    
## mb_avg       1102.87    4269.55   0.258    0.797    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15680 on 99 degrees of freedom
## Multiple R-squared:  0.2528, Adjusted R-squared:  0.2302 
## F-statistic: 11.16 on 3 and 99 DF,  p-value: 2.248e-06
# Print the estimated coefficients of model 2 as a named vector.
coef(model2)
## (Intercept)         age    gmat_tot      mb_avg 
## 39016.56653  2712.81802   -19.53809  1102.87444
# Diagnostic plots for the fitted linear models (plot() applied to lm fits).
plot(modelA)

# NOTE(review): modelC is plotted three times in a row; possibly other models
# were intended for the repeated calls -- confirm.
plot(modelC)

plot(modelC)

plot(modelC)

plot(model2)

# Model 3: salary on work_yrs and gmat_tot, fitted WITHOUT an intercept
# (the `- 1` term). For a no-intercept fit, summary() computes R-squared
# against zero instead of against the mean response, so its R-squared is
# not comparable with that of the models that keep the intercept.
model3 <- lm(
  formula = salary ~ work_yrs + gmat_tot - 1,
  data = placed
)
summary(model3)
## 
## Call:
## lm(formula = salary ~ work_yrs + gmat_tot - 1, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -30428  -9691   -624   8110  97678 
## 
## Coefficients:
##          Estimate Std. Error t value Pr(>|t|)    
## work_yrs 3264.289    579.553   5.632 1.61e-07 ***
## gmat_tot  146.716      4.449  32.976  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17920 on 101 degrees of freedom
## Multiple R-squared:  0.9712, Adjusted R-squared:  0.9706 
## F-statistic:  1702 on 2 and 101 DF,  p-value: < 2.2e-16
# Diagnostic plots for model 3.
plot(x = model3)

# Estimated coefficients of model 3.
coef(model3)
##  work_yrs  gmat_tot 
## 3264.2887  146.7158
# Model 4: adds sex and an age-by-frstlang interaction (age * frstlang expands
# to both main effects plus age:frstlang). Like model 3 it is fitted without
# an intercept (`- 1`), so its R-squared is measured against zero and is not
# comparable with that of the models that keep the intercept.
model4 <- lm(
  formula = salary ~ work_yrs + age * frstlang + gmat_tot + sex - 1,
  data = placed
)
summary(model4)
## 
## Call:
## lm(formula = salary ~ work_yrs + age * frstlang + gmat_tot + 
##     sex - 1, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -28406  -9496   -820   6174  69521 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## work_yrs       -958.87    1091.08  -0.879 0.381667    
## age            2905.80     830.35   3.499 0.000706 ***
## frstlang     -15290.15   25367.28  -0.603 0.548081    
## gmat_tot         40.36      31.33   1.288 0.200705    
## sex           -2260.92    3407.03  -0.664 0.508518    
## age:frstlang    794.41     797.85   0.996 0.321878    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15790 on 97 degrees of freedom
## Multiple R-squared:  0.9785, Adjusted R-squared:  0.9772 
## F-statistic: 736.6 on 6 and 97 DF,  p-value: < 2.2e-16
# Diagnostic plots for model 4.
plot(x = model4)

# Estimated coefficients of model 4.
coef(model4)
##     work_yrs          age     frstlang     gmat_tot          sex 
##   -958.86681   2905.80137 -15290.15225     40.35853  -2260.92128 
## age:frstlang 
##    794.41173
# Model 5: salary explained by work_yrs and age (intercept included).
model5 <- lm(
  formula = salary ~ work_yrs + age,
  data = placed
)
summary(model5)
## 
## Call:
## lm(formula = salary ~ work_yrs + age, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31675  -8099  -2108   4411  80650 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  36967.5    23323.8   1.585   0.1161  
## work_yrs       388.8     1084.0   0.359   0.7206  
## age           2413.8      997.4   2.420   0.0173 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15620 on 100 degrees of freedom
## Multiple R-squared:  0.2506, Adjusted R-squared:  0.2356 
## F-statistic: 16.72 on 2 and 100 DF,  p-value: 5.438e-07
# Diagnostic plots for model 5.
plot(x = model5)

# Estimated coefficients of model 5.
coef(model5)
## (Intercept)    work_yrs         age 
##  36967.4546    388.8347   2413.7599
# Model 6: salary explained by work_yrs and sex (intercept included).
model6 <- lm(
  formula = salary ~ work_yrs + sex,
  data = placed
)
summary(model6)
## 
## Call:
## lm(formula = salary ~ work_yrs + sex, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31896  -8086  -2076   4789  90595 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  99676.9     5267.7  18.922  < 2e-16 ***
## work_yrs      2630.0      525.7   5.003 2.42e-06 ***
## sex          -4860.6     3433.4  -1.416     0.16    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15910 on 100 degrees of freedom
## Multiple R-squared:  0.2223, Adjusted R-squared:  0.2068 
## F-statistic: 14.29 on 2 and 100 DF,  p-value: 3.471e-06
# Diagnostic plots for model 6.
plot(x = model6)

# Estimated coefficients of model 6.
coef(model6)
## (Intercept)    work_yrs         sex 
##   99676.944    2629.973   -4860.589

Inferences

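Multiple Regression Model 3 -> This model has salary as the response (dependent) variable. -> Predictor variables are work_yrs and gmat_tot. -> The R^2 value is 0.9712, which looks very good; however, this model is fitted without an intercept (the -1 in the formula), and for such a model R measures R^2 against zero rather than against the mean salary, so the value is not comparable with the R^2 of the models that include an intercept (its residual standard error, 17920, is in fact higher than theirs). -> The model’s p-value (< 2.2e-16) is also lower than the statistical significance level of 0.05; this indicates that we can safely reject the null hypothesis that all the coefficients are zero (in other words, that the predictor variables have no explanatory relationship with the response variable).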

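Multiple regression model 4 -> This model has salary as the response (dependent) variable. -> Predictor variables are work_yrs, gmat_tot, age, frstlang and sex, plus the age:frstlang interaction. -> The R^2 value is 0.9785, which looks very good; however, this model is also fitted without an intercept (the -1 in the formula), so its R^2 is measured against zero rather than against the mean salary and is not comparable with the R^2 of the models that include an intercept (its residual standard error, 15790, is about the same as theirs). -> The model’s p-value (< 2.2e-16) is also lower than the statistical significance level of 0.05; this indicates that we can safely reject the null hypothesis that all the coefficients are zero (in other words, that the predictor variables have no explanatory relationship with the response variable).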

Multiple regression model 5 -> This model has salary as the response (dependent) variable. -> Predictor variables are work_yrs and age. -> The R^2 value is 0.2506, which is not a very good fit. -> The model’s p-value (5.438e-07) is also lower than the statistical significance level of 0.05; this indicates that we can safely reject the null hypothesis that all the coefficients are zero (in other words, that the predictor variables have no explanatory relationship with the response variable).

Multiple regression model 6 -> This model has salary as the response (dependent) variable. -> Predictor variables are work_yrs and sex. -> The R^2 value is 0.2223, which is not a very good fit (it is lower than the 0.2506 of model 5). -> The model’s p-value (3.471e-06) is also lower than the statistical significance level of 0.05; this indicates that we can safely reject the null hypothesis that all the coefficients are zero (in other words, that the predictor variables have no explanatory relationship with the response variable).

Logistic Regression Model

# Generating model for those who got placed:
# sex becomes a factor so it can serve as the categorical response of glm().
placed[["sex"]] <- factor(placed[["sex"]])
is.factor(placed[["sex"]])
## [1] TRUE
# Logistic regression of sex on every other column of `placed`.
# `.` also pulls in mb_avg, which is an exact linear combination of s_avg and
# f_avg, so the summary reports its coefficient as not defined (singularity).
fit1 <- glm(
  formula = sex ~ .,
  family = binomial(link = "logit"),
  data = placed
)
summary(fit1)
## 
## Call:
## glm(formula = sex ~ ., family = binomial(link = "logit"), data = placed)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4863  -0.7894  -0.5805   0.7626   2.3292  
## 
## Coefficients: (1 not defined because of singularities)
##               Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  0.1064455  8.2384884   0.013   0.9897  
## age         -0.3643742  0.1889782  -1.928   0.0538 .
## gmat_tot     0.0162521  0.0269925   0.602   0.5471  
## gmat_qpc    -0.0435054  0.0770321  -0.565   0.5722  
## gmat_vpc     0.0084836  0.0780797   0.109   0.9135  
## gmat_tpc    -0.0561304  0.1181993  -0.475   0.6349  
## s_avg        0.1751868  1.5508906   0.113   0.9101  
## f_avg        1.5943945  1.0429927   1.529   0.1263  
## quarter      0.2901630  0.4253040   0.682   0.4951  
## work_yrs     0.2410914  0.1783851   1.352   0.1765  
## frstlang     2.4111026  1.0665299   2.261   0.0238 *
## salary      -0.0000184  0.0000191  -0.963   0.3353  
## satis       -0.2638553  0.3332759  -0.792   0.4285  
## mb_avg              NA         NA      NA       NA  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 126.01  on 102  degrees of freedom
## Residual deviance: 107.49  on  90  degrees of freedom
## AIC: 133.49
## 
## Number of Fisher Scoring iterations: 5
# Analysis-of-deviance table for fit1: terms are added sequentially and each
# one is tested with a chi-squared test.
anova(object = fit1, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: sex
## 
## Terms added sequentially (first to last)
## 
## 
##          Df Deviance Resid. Df Resid. Dev Pr(>Chi)  
## NULL                       102     126.01           
## age       1   2.3856       101     123.62  0.12245  
## gmat_tot  1   0.0744       100     123.55  0.78507  
## gmat_qpc  1   4.1847        99     119.36  0.04079 *
## gmat_vpc  1   1.8543        98     117.51  0.17329  
## gmat_tpc  1   0.0823        97     117.43  0.77423  
## s_avg     1   0.4155        96     117.01  0.51919  
## f_avg     1   2.1057        95     114.90  0.14675  
## quarter   1   0.4742        94     114.43  0.49107  
## work_yrs  1   0.5956        93     113.83  0.44026  
## frstlang  1   4.6687        92     109.17  0.03072 *
## salary    1   1.0389        91     108.13  0.30808  
## satis     1   0.6359        90     107.49  0.42521  
## mb_avg    0   0.0000        90     107.49           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
 # Training-set accuracy of fit1.
 # For a binomial glm with a factor response, the first level is treated as
 # "failure" and the other level as "success", so the fitted probability is
 # that of the SECOND level of placed$sex. The predicted class therefore has
 # to be expressed as a level label: comparing a 0/1 code with the factor's
 # own labels (as was done before) does not measure accuracy.
 # predict() without newdata returns the fitted probabilities of the rows the
 # model was trained on; the former `data =` argument is not a predict()
 # argument and was silently ignored.
 sex_levels <- levels(placed$sex)
 fitted.results <- predict(fit1, type = "response")
 fitted.results <- ifelse(fitted.results > 0.5, sex_levels[2], sex_levels[1])
 misClasificError <- mean(fitted.results != as.character(placed$sex))
 print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.0485436893203883"
 # Generating model for those who did not get placed:
 # sex becomes a factor so it can serve as the categorical response of glm().
 notplaced[["sex"]] <- factor(notplaced[["sex"]])
 is.factor(notplaced[["sex"]])
## [1] TRUE
# Logistic regression of sex on every other column of `notplaced`.
# NOTE(review): the summary reports the salary coefficient as not defined
# (singularity) -- presumably salary does not vary in this subset; confirm.
fit2 <- glm(
  formula = sex ~ .,
  family = binomial(link = "logit"),
  data = notplaced
)
summary(fit2)
## 
## Call:
## glm(formula = sex ~ ., family = binomial(link = "logit"), data = notplaced)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5451  -0.7582  -0.4838   0.6019   2.1976  
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept) 13.39699    7.84772   1.707   0.0878 .
## age          0.05353    0.12071   0.443   0.6574  
## gmat_tot    -0.03439    0.02183  -1.576   0.1151  
## gmat_qpc     0.02944    0.06260   0.470   0.6381  
## gmat_vpc     0.10328    0.06711   1.539   0.1238  
## gmat_tpc     0.03205    0.06128   0.523   0.6010  
## s_avg       -0.47864    1.17187  -0.408   0.6830  
## f_avg       -0.58170    0.57645  -1.009   0.3129  
## quarter     -0.49321    0.36673  -1.345   0.1787  
## work_yrs    -0.08643    0.14181  -0.609   0.5422  
## frstlang    -0.31776    1.29059  -0.246   0.8055  
## salary            NA         NA      NA       NA  
## satis       -0.51118    0.40913  -1.249   0.2115  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 102.304  on 89  degrees of freedom
## Residual deviance:  86.742  on 78  degrees of freedom
## AIC: 110.74
## 
## Number of Fisher Scoring iterations: 5
 # Analysis-of-deviance table for fit2: terms are added sequentially and each
 # one is tested with a chi-squared test.
 anova(object = fit2, test = "Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: sex
## 
## Terms added sequentially (first to last)
## 
## 
##          Df Deviance Resid. Df Resid. Dev Pr(>Chi)  
## NULL                        89    102.304           
## age       1   0.4712        88    101.833  0.49244  
## gmat_tot  1   0.3130        87    101.520  0.57585  
## gmat_qpc  1   4.3705        86     97.150  0.03657 *
## gmat_vpc  1   5.0395        85     92.110  0.02478 *
## gmat_tpc  1   0.6560        84     91.454  0.41798  
## s_avg     1   0.0490        83     91.405  0.82487  
## f_avg     1   0.5497        82     90.855  0.45844  
## quarter   1   1.6354        81     89.220  0.20096  
## work_yrs  1   0.8609        80     88.359  0.35348  
## frstlang  1   0.0078        79     88.351  0.92960  
## salary    0   0.0000        79     88.351           
## satis     1   1.6093        78     86.742  0.20459  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Training-set accuracy of fit2.
# For a binomial glm with a factor response, the first level is treated as
# "failure" and the other level as "success", so the fitted probability is
# that of the SECOND level of notplaced$sex. The predicted class therefore has
# to be expressed as a level label: comparing a 0/1 code with the factor's
# own labels (as was done before) does not measure accuracy.
# predict() without newdata returns the fitted probabilities of the rows the
# model was trained on; the former `data =` argument is not a predict()
# argument and was silently ignored.
sex_levels <- levels(notplaced$sex)
fitted.results <- predict(fit2, type = "response")
fitted.results <- ifelse(fitted.results > 0.5, sex_levels[2], sex_levels[1])

misClasificError <- mean(fitted.results != as.character(notplaced$sex))
print(paste('Accuracy',1-misClasificError))
## [1] "Accuracy 0.0444444444444444"

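In logistic regression the dependent variable must be categorical, so we chose sex as the dependent variable. -> A model is fitted both for the MBA students who were placed and for those who were not placed. -> The accuracy printed for the two models came out very low, 4.85% and 4.44% respectively. Note, however, that these figures were obtained by comparing predictions coded 0/1 with sex labels coded 1/2, so they do not measure how often the model predicts the correct sex and should not be read as the true classification accuracy.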