Reading the dataset

# Load the MBA starting-salary data from the working directory, then open it
# in the data viewer for a quick visual check.
mbastart <- read.csv(file = "mbastart.csv")
View(mbastart)

summary

 library(psych)
# Per-column descriptive statistics (n, mean, sd, median, skew, ...).
psych::describe(mbastart)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45
# Column types and the first few values of each variable.
utils::str(mbastart)
## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : int  2 1 1 1 2 1 1 2 1 1 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: int  1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : int  7 6 6 7 5 6 5 6 4 998 ...

Visualizing each variable

age plot

    # Two panels side by side: box plot and histogram of student age.
    par(mfrow = c(1, 2))
    boxplot(mbastart$age, ylab = "Age", col = "lightblue")
    hist(mbastart$age,
         main = "Age Distribution",
         xlab = "Age in years",
         col = "peachpuff")

gender plot

    # Recode the numeric sex codes (1, 2) as a labelled factor, then plot it.
    mbastart$sex <- factor(mbastart$sex,
                           levels = c(1, 2),
                           labels = c("Male", "Female"))
    plot(mbastart$sex, main = "Gender distribution", col = "lightblue")

GMAT scores

    # Two panels side by side: box plot and histogram of the total GMAT score.
    par(mfrow = c(1, 2))
    boxplot(mbastart$gmat_tot, col = "lightblue", ylab = "GMAT total score")
    hist(mbastart$gmat_tot,
         breaks = 20,
         main = "Distribution of GMAT scores",
         xlab = "GMAT total score",
         col = "lightgreen")

Quantitative GMAT percentile

    # Two panels side by side: quantitative GMAT percentile.
    par(mfrow = c(1, 2))
    boxplot(mbastart$gmat_qpc,
            col = "lightblue",
            ylab = "quantitative GMAT percentile")
    hist(mbastart$gmat_qpc,
         main = "Distribution of quantitative GMAT percentile",
         xlab = "quantitative GMAT percentile",
         col = "peachpuff")

Verbal GMAT percentile

# Two panels side by side: verbal GMAT percentile.
par(mfrow = c(1, 2))
boxplot(mbastart$gmat_vpc,
        col = "lightblue",
        ylab = "verbal GMAT percentile")
hist(mbastart$gmat_vpc,
     main = "Distribution of verbal GMAT percentile",
     xlab = "verbal GMAT percentile",
     col = "lightgreen")

Total GMAT percentile

# Two panels side by side: overall GMAT percentile.
par(mfrow = c(1, 2))
boxplot(mbastart$gmat_tpc, col = "yellow", ylab = "Total GMAT percentile")
hist(mbastart$gmat_tpc,
     main = "Total GMAT percentile Distribution",
     xlab = "Total GMAT percentile",
     col = "lightblue")

spring MBA Average

# Two panels side by side: spring MBA average.
par(mfrow = c(1, 2))
boxplot(mbastart$s_avg, col = "yellow", ylab = "spring MBA average")
hist(mbastart$s_avg,
     main = "spring MBA average Distribution",
     xlab = "spring MBA average",
     col = "lightblue")

Fall MBA average

# Two panels side by side: fall MBA average.
par(mfrow = c(1, 2))
boxplot(mbastart$f_avg, col = "peachpuff", ylab = "fall MBA average")
hist(mbastart$f_avg,
     main = "fall MBA average Distribution",
     xlab = "fall MBA average",
     col = "lightblue")

Quartile ranking

# Two panels side by side: quartile ranking.
par(mfrow = c(1, 2))
boxplot(mbastart$quarter, col = "lightblue", ylab = "quartile ranking")
hist(mbastart$quarter,
     main = "quartile ranking Distribution",
     xlab = "quartile ranking",
     col = "lightgreen")

Work Experience

# Two panels side by side: years of work experience.
par(mfrow = c(1, 2))
boxplot(mbastart$work_yrs,
        col = "lightblue",
        ylab = "years of work experience")
hist(mbastart$work_yrs,
     breaks = 20,
     main = "Work Experience",
     xlab = "No. of years of work experience",
     col = "yellow")

First Language

    # Recode the numeric first-language codes (1, 2) as a labelled factor,
    # then plot it.
    mbastart$frstlang <- factor(mbastart$frstlang,
                                levels = c(1, 2),
                                labels = c("English", "Others"))
    plot(mbastart$frstlang, main = "Language Distribution", col = "lightblue")

Satisfaction levels

    # Keep only rows whose satisfaction rating is on the 1-7 scale; larger
    # values (998 appears in this data) are presumably placeholder codes -
    # confirm against the data dictionary.
    # The threshold is the number 7, not the string '7': with a character
    # operand R converts satis to text and compares the strings, which gives
    # the right rows here only because no value between 10 and 69 occurs.
    newdata <- mbastart[which(mbastart$satis <= 7), ]
    hist(newdata$satis,
         breaks = 5,
         col = "lightgreen",
         xlab = "Degree of Satisfaction (1=low,7=high)",
         main = "Satisfaction  distribution")

Starting Salary

 # Drop rows whose salary holds the 998/999 placeholder codes (presumably
 # "not reported" - confirm against the data dictionary). Rows with salary 0
 # are kept.
 # Compare against numbers rather than the strings "998"/"999" so that the
 # integer column is not converted to text for the comparison.
 newdata1 <- mbastart[which(mbastart$salary != 998 & mbastart$salary != 999), ]
 hist(newdata1$salary,
      breaks = 5,
      col = "pink",
      xlab = "starting salary",
      main = "Salary  distribution")

Understanding pairwise relation among the variables

Relation between salary and age

library(car)    
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Scatter plot of salary against age for the rows kept in newdata1.
scatterplot(salary ~ age,
            data = newdata1,
            main = "Scatter plot of salary vs age",
            xlab = "age",
            ylab = "salary",
            spread = FALSE,
            smoother.args = list(lty = 2))

Relation between salary and gender

# Salary against sex (a factor after the recode above) for rows in newdata1.
scatterplot(salary ~ sex,
            data = newdata1,
            main = "Scatter plot of salary vs sex",
            xlab = "sex",
            ylab = "salary",
            spread = FALSE,
            smoother.args = list(lty = 2))

Relation between salary and firstlanguage

# Salary against first language (a factor) for rows in newdata1.
scatterplot(salary ~ frstlang,
            data = newdata1,
            xlab = "first language",
            ylab = "salary",
            main = "Scatter plot of salary vs first language")

Relation between salary and GMAT total

# Scatter plot of salary against total GMAT score for rows in newdata1.
scatterplot(salary ~ gmat_tot,
            data = newdata1,
            main = "Scatter plot of salary vs Gmat total",
            xlab = "Gmat score",
            ylab = "salary",
            spread = FALSE,
            smoother.args = list(lty = 3))

correlogram

library(corrgram)
    # Correlogram of newdata1: shaded cells below the diagonal, pie glyphs
    # above it, variable names on the diagonal.
    corrgram(newdata1,
             order = TRUE,
             lower.panel = panel.shade,
             upper.panel = panel.pie,
             text.panel = panel.txt,
             main = "MBA starting salary analysis Correlogram")

Variance - Covariance Matrix

# Variance-covariance matrix of the numeric variables in newdata1.
# x and y hold the same columns, so cov(x, y) is the covariance of that set
# of variables with itself.
num_vars <- c("age", "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
              "s_avg", "f_avg", "work_yrs", "salary")
x <- newdata1[, num_vars]
y <- x
cov(x, y)
##                    age    gmat_tot     gmat_qpc     gmat_vpc      gmat_tpc
## age       1.778562e+01  -29.954933   -14.089729   -0.4564443    -7.5127645
## gmat_tot -2.995493e+01 3196.950561   636.350928  685.4644322   672.4651878
## gmat_qpc -1.408973e+01  636.350928   229.384067   42.7985481   141.4933074
## gmat_vpc -4.564443e-01  685.464432    42.798548  259.2695920   149.8747571
## gmat_tpc -7.512764e+00  672.465188   141.493307  149.8747571   183.0113882
## s_avg     2.626913e-01    3.076706     0.109287    1.1636153     0.9688199
## f_avg    -7.513817e-02    2.969557     1.025241    0.2769703     0.7718585
## work_yrs  1.355880e+01  -36.222204   -13.484078   -2.4562014    -8.2897776
## salary   -2.918528e+04 -170.881369 22855.717832 2901.3078044 43822.5291991
##                 s_avg        f_avg      work_yrs        salary
## age         0.2626913  -0.07513817  1.355880e+01 -2.918528e+04
## gmat_tot    3.0767055   2.96955689 -3.622220e+01 -1.708814e+02
## gmat_qpc    0.1092870   1.02524072 -1.348408e+01  2.285572e+04
## gmat_vpc    1.1636153   0.27697026 -2.456201e+00  2.901308e+03
## gmat_tpc    0.9688199   0.77185854 -8.289778e+00  4.382253e+04
## s_avg       0.1436561   0.10251263  2.224652e-01  1.940528e+03
## f_avg       0.1025126   0.26995964 -9.189254e-02  2.443157e+02
## work_yrs    0.2224652  -0.09189254  1.360379e+01 -1.044263e+04
## salary   1940.5276360 244.31568869 -1.044263e+04  2.825177e+09

Dataset containing those who got a job

 # Students who reported an actual starting salary: drop the 998/999
 # placeholder codes and the rows with salary 0.
 # Compare against numbers rather than the strings "998"/"999"/"0" so that
 # the integer column is not converted to text for the comparison.
 job <- mbastart[which(mbastart$salary != 998 &
                         mbastart$salary != 999 &
                         mbastart$salary != 0), ]
 head(job)
##    age    sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 35  22 Female      660       90       92       94   3.5  3.75       1
## 36  27 Female      700       94       98       98   3.3  3.25       1
## 37  25 Female      680       87       96       96   3.5  2.67       1
## 38  25 Female      650       82       91       93   3.4  3.25       1
## 39  27   Male      710       96       96       98   3.3  3.50       1
## 40  28 Female      620       52       98       87   3.4  3.75       1
##    work_yrs frstlang salary satis
## 35        1  English  85000     5
## 36        2  English  85000     6
## 37        2  English  86000     5
## 38        3  English  88000     7
## 39        2  English  92000     6
## 40        5  English  93000     5

Contingency tables showing the effect of various factors on the starting salary

    # Cross-tabulate sex against starting salary for students with a job.
    mytable <- xtabs(~ sex + salary, data = job)
    print(mytable)
##         salary
## sex      64000 77000 78256 82000 85000 86000 88000 88500 90000 92000 93000
##   Male       0     1     0     0     1     0     0     1     3     2     2
##   Female     1     0     1     1     3     2     1     0     0     1     1
##         salary
## sex      95000 96000 96500 97000 98000 99000 100000 100400 101000 101100
##   Male       4     3     1     2     6     0      4      1      0      1
##   Female     3     1     0     0     4     1      5      0      2      0
##         salary
## sex      101600 102500 103000 104000 105000 106000 107000 107300 107500
##   Male        1      1      1      2     11      2      1      1      1
##   Female      0      0      0      0      0      1      0      0      0
##         salary
## sex      108000 110000 112000 115000 118000 120000 126710 130000 145800
##   Male        2      0      3      5      1      3      1      1      1
##   Female      0      1      0      0      0      1      0      0      0
##         salary
## sex      146000 162000 220000
##   Male        1      1      0
##   Female      0      0      1

From this table we see that most higher starting salaries have been awarded to men.

 # Cross-tabulate starting salary against years of work experience.
 mytable1 <- xtabs(~ salary + work_yrs, data = job)
 print(mytable1)
##         work_yrs
## salary   0 1 2 3 4 5 6 7 8 10 15 16
##   64000  0 0 1 0 0 0 0 0 0  0  0  0
##   77000  0 0 1 0 0 0 0 0 0  0  0  0
##   78256  0 1 0 0 0 0 0 0 0  0  0  0
##   82000  0 1 0 0 0 0 0 0 0  0  0  0
##   85000  0 1 2 1 0 0 0 0 0  0  0  0
##   86000  0 0 1 1 0 0 0 0 0  0  0  0
##   88000  0 0 0 1 0 0 0 0 0  0  0  0
##   88500  0 0 0 1 0 0 0 0 0  0  0  0
##   90000  0 0 2 0 0 1 0 0 0  0  0  0
##   92000  0 0 3 0 0 0 0 0 0  0  0  0
##   93000  0 0 0 0 1 1 0 0 1  0  0  0
##   95000  1 1 2 2 0 1 0 0 0  0  0  0
##   96000  0 1 2 0 1 0 0 0 0  0  0  0
##   96500  0 0 1 0 0 0 0 0 0  0  0  0
##   97000  0 0 0 1 1 0 0 0 0  0  0  0
##   98000  0 0 7 1 1 0 0 1 0  0  0  0
##   99000  0 0 0 0 0 1 0 0 0  0  0  0
##   100000 0 0 6 1 1 0 1 0 0  0  0  0
##   100400 0 0 0 1 0 0 0 0 0  0  0  0
##   101000 0 0 2 0 0 0 0 0 0  0  0  0
##   101100 0 0 0 0 0 0 0 0 1  0  0  0
##   101600 0 0 0 1 0 0 0 0 0  0  0  0
##   102500 0 0 0 0 0 0 1 0 0  0  0  0
##   103000 0 0 0 1 0 0 0 0 0  0  0  0
##   104000 0 0 0 0 2 0 0 0 0  0  0  0
##   105000 0 0 4 4 0 1 1 0 0  0  0  1
##   106000 0 0 0 0 0 0 2 0 1  0  0  0
##   107000 0 0 1 0 0 0 0 0 0  0  0  0
##   107300 0 0 1 0 0 0 0 0 0  0  0  0
##   107500 0 0 0 1 0 0 0 0 0  0  0  0
##   108000 0 0 0 1 1 0 0 0 0  0  0  0
##   110000 0 0 0 0 0 0 1 0 0  0  0  0
##   112000 0 0 1 0 0 0 1 0 0  0  0  1
##   115000 0 2 0 1 2 0 0 0 0  0  0  0
##   118000 0 0 0 0 0 0 0 0 0  1  0  0
##   120000 0 0 0 1 0 2 0 0 1  0  0  0
##   126710 0 0 0 1 0 0 0 0 0  0  0  0
##   130000 0 0 0 0 1 0 0 0 0  0  0  0
##   145800 0 0 1 0 0 0 0 0 0  0  0  0
##   146000 0 0 0 0 0 0 0 0 0  0  1  0
##   162000 0 1 0 0 0 0 0 0 0  0  0  0
##   220000 0 0 0 0 0 0 0 0 0  0  1  0

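From the above table we see that most of the students who got a job had around 2 to 3 years of work experience, although a few students with 0 or 1 years of experience also received offers.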

 # Cross-tabulate starting salary against first language.
 mytable2 <- xtabs(~ salary + frstlang, data = job)
 print(mytable2)
##         frstlang
## salary   English Others
##   64000        1      0
##   77000        1      0
##   78256        1      0
##   82000        1      0
##   85000        4      0
##   86000        2      0
##   88000        1      0
##   88500        1      0
##   90000        3      0
##   92000        3      0
##   93000        3      0
##   95000        7      0
##   96000        4      0
##   96500        1      0
##   97000        2      0
##   98000        8      2
##   99000        0      1
##   100000       9      0
##   100400       1      0
##   101000       2      0
##   101100       1      0
##   101600       1      0
##   102500       1      0
##   103000       1      0
##   104000       1      1
##   105000      11      0
##   106000       3      0
##   107000       1      0
##   107300       0      1
##   107500       1      0
##   108000       2      0
##   110000       1      0
##   112000       3      0
##   115000       5      0
##   118000       0      1
##   120000       4      0
##   126710       1      0
##   130000       1      0
##   145800       1      0
##   146000       1      0
##   162000       1      0
##   220000       0      1

It is seen that most of the students who got jobs have English as their first language. The table by itself does not show that they also get higher salaries: the highest salary (220000) went to a student whose first language is not English.

# Cross-tabulate starting salary against total GMAT score.
mytable3 <- xtabs(~ salary + gmat_tot, data = job)
print(mytable3)
##         gmat_tot
## salary   500 520 530 540 550 560 570 580 590 600 610 620 630 640 650 660
##   64000    0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0
##   77000    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
##   78256    0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   82000    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   85000    0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   1
##   86000    0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
##   88000    0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
##   88500    0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##   90000    0   0   0   0   0   0   0   1   0   0   0   0   1   0   1   0
##   92000    0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   1
##   93000    0   0   0   1   0   0   0   0   0   0   1   1   0   0   0   0
##   95000    0   0   1   0   0   2   0   0   0   0   2   0   0   0   0   0
##   96000    0   0   0   0   0   1   0   0   1   1   0   0   0   0   1   0
##   96500    1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   97000    0   0   0   0   0   0   0   1   0   0   0   1   0   0   0   0
##   98000    0   0   0   0   0   1   3   1   1   0   1   0   0   0   0   0
##   99000    0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0
##   100000   0   0   0   0   0   2   0   1   0   1   1   0   1   0   2   0
##   100400   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
##   101000   0   0   0   0   0   0   0   0   0   1   0   1   0   0   0   0
##   101100   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
##   101600   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
##   102500   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   103000   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##   104000   0   0   1   0   0   1   0   0   0   0   0   0   0   0   0   0
##   105000   0   0   0   0   2   0   2   3   0   1   0   1   0   0   1   0
##   106000   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##   107000   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0
##   107300   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
##   107500   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
##   108000   0   0   0   0   0   0   1   0   0   1   0   0   0   0   0   0
##   110000   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0
##   112000   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0
##   115000   0   0   0   1   0   0   1   0   0   0   0   1   1   0   0   0
##   118000   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##   120000   0   0   0   0   0   0   0   0   0   2   0   0   0   0   0   0
##   126710   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0
##   130000   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
##   145800   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##   146000   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
##   162000   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   220000   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##         gmat_tot
## salary   670 680 700 710 720
##   64000    0   0   0   0   0
##   77000    0   0   0   0   0
##   78256    0   0   0   0   0
##   82000    1   0   0   0   0
##   85000    0   0   1   0   1
##   86000    0   1   0   0   0
##   88000    0   0   0   0   0
##   88500    0   0   0   0   0
##   90000    0   0   0   0   0
##   92000    0   0   0   1   0
##   93000    0   0   0   0   0
##   95000    2   0   0   0   0
##   96000    0   0   0   0   0
##   96500    0   0   0   0   0
##   97000    0   0   0   0   0
##   98000    1   1   0   1   0
##   99000    0   0   0   0   0
##   100000   0   0   0   1   0
##   100400   0   0   0   0   0
##   101000   0   0   0   0   0
##   101100   0   0   0   0   0
##   101600   0   0   0   0   0
##   102500   1   0   0   0   0
##   103000   0   0   0   0   0
##   104000   0   0   0   0   0
##   105000   0   1   0   0   0
##   106000   0   2   0   0   0
##   107000   0   0   0   0   0
##   107300   0   0   0   0   0
##   107500   0   0   0   0   0
##   108000   0   0   0   0   0
##   110000   0   0   0   0   0
##   112000   1   1   0   0   0
##   115000   0   0   0   1   0
##   118000   0   0   0   0   0
##   120000   1   0   1   0   0
##   126710   0   0   0   0   0
##   130000   0   0   0   0   0
##   145800   0   0   0   0   0
##   146000   0   0   0   0   0
##   162000   0   0   1   0   0
##   220000   0   0   0   0   0

Chisquare test

# Pearson chi-squared test of independence on the salary x work_yrs table.
# NOTE(review): salary is tabulated as individual values, so most cells hold
# 0 or 1 and R warns that the chi-squared approximation may be incorrect;
# consider binning salary before testing.
chisq.test(mytable1)
## Warning in chisq.test(mytable1): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable1
## X-squared = 535.23, df = 451, p-value = 0.003809

Since p<0.01 there is a relationship between work experience and salary

    # Pearson chi-squared test of independence on the salary x frstlang table.
    # NOTE(review): the table is very sparse (one row per distinct salary),
    # hence the approximation warning; consider binning salary before testing.
    chisq.test(mytable2)
## Warning in chisq.test(mytable2): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable2
## X-squared = 69.847, df = 41, p-value = 0.003296

Since p<0.01 we can say that there is a relationship between first language and salary

    # Pearson chi-squared test of independence on the salary x gmat_tot table.
    # NOTE(review): the table is very sparse (one row per distinct salary),
    # hence the approximation warning; consider binning salary before testing.
    chisq.test(mytable3)
## Warning in chisq.test(mytable3): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable3
## X-squared = 927.24, df = 820, p-value = 0.005279

Since p<0.01 we see there exists a relationship between Total GMAT score and starting salary.

Regression Model

Model 1:

    # Model 1: regress starting salary on the four GMAT measures, using the
    # students who reported a salary (job).
    # Variables are written bare so that lm() looks them up through
    # `data = job`; writing job$... inside the formula bypasses `data` and
    # makes predict(..., newdata = ) ignore the new data. The estimates are
    # unchanged; the printed coefficient labels no longer carry the "job$"
    # prefix.
    model1 <- lm(salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc,
                 data = job)
    summary(model1)
## 
## Call:
## lm(formula = job$salary ~ job$gmat_tot + job$gmat_qpc + job$gmat_vpc + 
##     job$gmat_tpc, data = job)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -40370  -8250  -2164   5253 100097 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  109539.54   48054.24   2.279   0.0248 *
## job$gmat_tot     55.01     181.71   0.303   0.7627  
## job$gmat_qpc    718.40     541.90   1.326   0.1880  
## job$gmat_vpc    546.10     543.85   1.004   0.3178  
## job$gmat_tpc  -1663.16     801.57  -2.075   0.0406 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17670 on 98 degrees of freedom
## Multiple R-squared:  0.06089,    Adjusted R-squared:  0.02256 
## F-statistic: 1.589 on 4 and 98 DF,  p-value: 0.1834

gmat_tpc is the only significant variable in this model. The multiple R-squared value indicates that the model accounts for about 6% of the variance in salary. The residual standard error (17670) can be thought of as the typical error in predicting salary using the various GMAT data available.

Model 2:

    # Model 2: regress starting salary on satisfaction, years of work
    # experience and first language, using the students who reported a
    # salary (job).
    # Variables are written bare so that lm() looks them up through
    # `data = job`; writing job$... inside the formula bypasses `data` and
    # makes predict(..., newdata = ) ignore the new data. The estimates are
    # unchanged; the printed coefficient labels no longer carry the "job$"
    # prefix.
    model2 <- lm(salary ~ satis + work_yrs + frstlang, data = job)
    summary(model2)
## 
## Call:
## lm(formula = job$salary ~ job$satis + job$work_yrs + job$frstlang, 
##     data = job)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31764  -9640   -604   4816  76193 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        104142.2    11899.4   8.752 5.73e-14 ***
## job$satis           -1913.1     2000.0  -0.957   0.3411    
## job$work_yrs         2506.8      528.6   4.742 7.11e-06 ***
## job$frstlangOthers  13541.5     6305.7   2.147   0.0342 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15740 on 99 degrees of freedom
## Multiple R-squared:  0.2466, Adjusted R-squared:  0.2237 
## F-statistic:  10.8 on 3 and 99 DF,  p-value: 3.354e-06

work_yrs and frstlang are significant variables in model 2. The multiple R-squared value indicates that the model accounts for 24.66% of the variance in salary. The residual standard error (15740) can be thought of as the typical error in predicting salary using work experience, job satisfaction and first language.

Model 3:

   # Model 3: regress starting salary on age and sex, using the students who
   # reported a salary (job).
   # Variables are written bare so that lm() looks them up through
   # `data = job`; writing job$... inside the formula bypasses `data` and
   # makes predict(..., newdata = ) ignore the new data. The estimates are
   # unchanged; the printed coefficient labels no longer carry the "job$"
   # prefix.
   model3 <- lm(salary ~ age + sex, data = job)
   summary(model3)
## 
## Call:
## lm(formula = job$salary ~ job$age + job$sex, data = job)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -29047  -9444  -1750   5428  84503 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    33116.2    12997.5   2.548   0.0124 *  
## job$age         2653.1      475.1   5.584 2.03e-07 ***
## job$sexFemale  -3743.6     3372.6  -1.110   0.2697    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15540 on 100 degrees of freedom
## Multiple R-squared:  0.2588, Adjusted R-squared:  0.244 
## F-statistic: 17.46 on 2 and 100 DF,  p-value: 3.144e-07

Age is a significant factor in model 3.

Dataset consisting of people with no job

    # Students recorded with a salary of 0. A value of 0 can never equal the
    # 998/999 placeholder codes, so testing for 0 alone selects the same rows.
    no_salary_rows <- which(mbastart$salary == 0)
    nojob <- mbastart[no_salary_rows, ]
    head(nojob)
##   age    sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 1  23 Female      620       77       87       87   3.4  3.00       1
## 2  24   Male      610       90       71       87   3.5  4.00       1
## 3  24   Male      670       99       78       95   3.3  3.25       1
## 4  24   Male      570       56       81       75   3.3  2.67       1
## 6  24   Male      640       82       89       91   3.9  3.75       1
## 7  25   Male      610       89       74       87   3.4  3.50       1
##   work_yrs frstlang salary satis
## 1        2  English      0     7
## 2        2  English      0     6
## 3        2  English      0     6
## 4        1  English      0     7
## 6        2  English      0     6
## 7        2  English      0     5
    # Open the salary-0 subset in the data viewer for inspection.
    View(nojob)

chisquare test

Whether satisfaction with the MBA course affects salary or not

  # Pearson chi-squared test of independence between years of work experience
  # and satisfaction rating, within the salary-0 subset (nojob).
  # NOTE(review): salary is 0 for every row of nojob and is not an argument
  # here, so this test cannot say anything about salary - confirm which
  # variables and which subset were intended.
  chisq.test(nojob$work_yrs,nojob$satis)
## Warning in chisq.test(nojob$work_yrs, nojob$satis): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  nojob$work_yrs and nojob$satis
## X-squared = 44.974, df = 48, p-value = 0.5976

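The p-value (0.5976) is larger than 0.05, so we fail to reject the null hypothesis. Note that this test compares work experience with satisfaction among the students without a job, so it gives no evidence that salary is affected by the students' satisfaction with the MBA course.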

GMAT Overall percentile affects the salary or not

# Computes the mean salary for each overall GMAT percentile value and passes
# the resulting two-column data frame (percentile, mean salary) to chisq.test.
# NOTE(review): chisq.test treats a two-dimensional input as a contingency
# table of counts, but these cells are percentile values and mean salaries,
# so the statistic is not a valid test of association. The means are also
# taken over all of mbastart, including the 998/999 placeholder salaries.
# Consider testing placement status against gmat_tpc on cleaned data instead.
chisq.test(aggregate(mbastart$salary, by=list(Overall_GMAT_Percentile =  mbastart$gmat_tpc), mean))
## Warning in chisq.test(aggregate(mbastart$salary, by =
## list(Overall_GMAT_Percentile = mbastart$gmat_tpc), : Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  aggregate(mbastart$salary, by = list(Overall_GMAT_Percentile = mbastart$gmat_tpc),     mean)
## X-squared = 173100, df = 41, p-value < 2.2e-16

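The p-value is small enough (<0.05), so we reject the null hypothesis. This suggests that the overall GMAT percentile is related to whether students got placed or not. Note, however, that the test was applied to a table of mean salaries rather than counts, so this conclusion should be treated with caution.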