Reading the data set

 setwd("C:/Users/Jaya/Downloads")
mydata.df <- read.csv(paste("MBA Starting Salaries Data.csv", sep=""))
View(mydata.df)

Calculating summary statistics :

and mean, standard deviation, median, mode

summary(mydata.df)

##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0

str(mydata.df)

## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : int  2 1 1 1 2 1 1 2 1 1 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: int  1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : int  7 6 6 7 5 6 5 6 4 998 ...

library(psych)
describe(mydata.df)[,c(1:5)]

##          vars   n     mean       sd median
## age         1 274    27.36     3.71     27
## sex         2 274     1.25     0.43      1
## gmat_tot    3 274   619.45    57.54    620
## gmat_qpc    4 274    80.64    14.87     83
## gmat_vpc    5 274    78.32    16.86     81
## gmat_tpc    6 274    84.20    14.02     87
## s_avg       7 274     3.03     0.38      3
## f_avg       8 274     3.06     0.53      3
## quarter     9 274     2.48     1.11      2
## work_yrs   10 274     3.87     3.23      3
## frstlang   11 274     1.12     0.32      1
## salary     12 274 39025.69 50951.56    999
## satis      13 274   172.18   371.61      6

Bar Plots

to visualize the distribution of each variable independently #Age plot

hist(mydata.df$age,col="blue", xlab="Age in years",main="Age Distribution")

#Demarcating the genders plot

    mydata.df$sex=factor(mydata.df$sex, levels=c(1,2), labels=c("Male","Female"))
plot(mydata.df$sex,col = "red",main = "Gender distribution")

#GMAT scores

    hist(mydata.df$gmat_tot, xlab="GMAT total score",main="Distribution of GMAT scores", breaks=20,col="orange")

#Work Experience

  hist(mydata.df$work_yrs,col="pink",xlab="No. of years of work experience",main = "Work Experience",breaks = 20)

#First Language

    mydata.df$frstlang = factor(mydata.df$frstlang, levels=c(1,2), labels=c("English","Others"))
    plot(mydata.df$frstlang,col="yellow",main = "Language Distribution")

#Satisfaction levels

    newdata <- mydata.df[ which(mydata.df$satis<='7'), ]
hist(newdata$satis,breaks =5,col="green",xlab="Degree of Satisfaction (1=low,7=high)", main="Satisfaction  distribution")

#Starting Salary

    newdata1 <- mydata.df[ which(mydata.df$salary !="998" & mydata.df$salary !="999"), ]
hist(newdata1$salary, breaks=5,col="purple",xlab="starting salary", main="Salary  distribution")

#Scatter Plots

library(car)

## 
## Attaching package: 'car'

## The following object is masked from 'package:psych':
## 
##     logit

scatterplot(salary ~age,     data=newdata1,
            spread=FALSE, smoother.args=list(lty=2),
            main="Scatter plot of salary vs age",
            xlab="age",
            ylab="salary")

scatterplot(salary ~sex,     data=newdata1,
            spread=FALSE, smoother.args=list(lty=2),
            main="Scatter plot of salary vs sex",
            xlab="sex",
            ylab="salary")

scatterplot(salary ~frstlang,     data=newdata1,
            main="Scatter plot of salary vs first language",
            xlab="first language",
            ylab="salary")

scatterplot(salary ~gmat_tot,     data=newdata1,
            spread=FALSE, smoother.args=list(lty=2),
            main="Scatter plot of salary vs Gmat total",
            xlab="Gmat score",
            ylab="salary")

Corrgram

    library(corrgram)
    corrgram(newdata1, order=TRUE, lower.panel=panel.shade,
    upper.panel=panel.pie, text.panel=panel.txt,
    main="MBA starting salary analysis Correlogram")

#Variance - Covariance Matrix

   x <- newdata1[,c("age", "gmat_tot", "gmat_qpc", "gmat_vpc","gmat_tpc","s_avg","f_avg","work_yrs","salary")]
   y <- newdata1[,c("age", "gmat_tot", "gmat_qpc", "gmat_vpc","gmat_tpc","s_avg","f_avg","work_yrs","salary")]
   cov(x,y)

##                    age    gmat_tot     gmat_qpc     gmat_vpc      gmat_tpc
## age       1.778562e+01  -29.954933   -14.089729   -0.4564443    -7.5127645
## gmat_tot -2.995493e+01 3196.950561   636.350928  685.4644322   672.4651878
## gmat_qpc -1.408973e+01  636.350928   229.384067   42.7985481   141.4933074
## gmat_vpc -4.564443e-01  685.464432    42.798548  259.2695920   149.8747571
## gmat_tpc -7.512764e+00  672.465188   141.493307  149.8747571   183.0113882
## s_avg     2.626913e-01    3.076706     0.109287    1.1636153     0.9688199
## f_avg    -7.513817e-02    2.969557     1.025241    0.2769703     0.7718585
## work_yrs  1.355880e+01  -36.222204   -13.484078   -2.4562014    -8.2897776
## salary   -2.918528e+04 -170.881369 22855.717832 2901.3078044 43822.5291991
##                 s_avg        f_avg      work_yrs        salary
## age         0.2626913  -0.07513817  1.355880e+01 -2.918528e+04
## gmat_tot    3.0767055   2.96955689 -3.622220e+01 -1.708814e+02
## gmat_qpc    0.1092870   1.02524072 -1.348408e+01  2.285572e+04
## gmat_vpc    1.1636153   0.27697026 -2.456201e+00  2.901308e+03
## gmat_tpc    0.9688199   0.77185854 -8.289778e+00  4.382253e+04
## s_avg       0.1436561   0.10251263  2.224652e-01  1.940528e+03
## f_avg       0.1025126   0.26995964 -9.189254e-02  2.443157e+02
## work_yrs    0.2224652  -0.09189254  1.360379e+01 -1.044263e+04
## salary   1940.5276360 244.31568869 -1.044263e+04  2.825177e+09

Dataset containing those who got a job only

    job.df <- mydata.df[ which(mydata.df$salary !="998" & mydata.df$salary !="999" & mydata.df$salary!="0"), ]
    head(job.df)

##    age    sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 35  22 Female      660       90       92       94   3.5  3.75       1
## 36  27 Female      700       94       98       98   3.3  3.25       1
## 37  25 Female      680       87       96       96   3.5  2.67       1
## 38  25 Female      650       82       91       93   3.4  3.25       1
## 39  27   Male      710       96       96       98   3.3  3.50       1
## 40  28 Female      620       52       98       87   3.4  3.75       1
##    work_yrs frstlang salary satis
## 35        1  English  85000     5
## 36        2  English  85000     6
## 37        2  English  86000     5
## 38        3  English  88000     7
## 39        2  English  92000     6
## 40        5  English  93000     5

From this table we see that most higher starting salaries have been awarded to men.

    mytable1 <-xtabs(~salary+work_yrs,data=job.df)
    mytable1

##         work_yrs
## salary   0 1 2 3 4 5 6 7 8 10 15 16
##   64000  0 0 1 0 0 0 0 0 0  0  0  0
##   77000  0 0 1 0 0 0 0 0 0  0  0  0
##   78256  0 1 0 0 0 0 0 0 0  0  0  0
##   82000  0 1 0 0 0 0 0 0 0  0  0  0
##   85000  0 1 2 1 0 0 0 0 0  0  0  0
##   86000  0 0 1 1 0 0 0 0 0  0  0  0
##   88000  0 0 0 1 0 0 0 0 0  0  0  0
##   88500  0 0 0 1 0 0 0 0 0  0  0  0
##   90000  0 0 2 0 0 1 0 0 0  0  0  0
##   92000  0 0 3 0 0 0 0 0 0  0  0  0
##   93000  0 0 0 0 1 1 0 0 1  0  0  0
##   95000  1 1 2 2 0 1 0 0 0  0  0  0
##   96000  0 1 2 0 1 0 0 0 0  0  0  0
##   96500  0 0 1 0 0 0 0 0 0  0  0  0
##   97000  0 0 0 1 1 0 0 0 0  0  0  0
##   98000  0 0 7 1 1 0 0 1 0  0  0  0
##   99000  0 0 0 0 0 1 0 0 0  0  0  0
##   100000 0 0 6 1 1 0 1 0 0  0  0  0
##   100400 0 0 0 1 0 0 0 0 0  0  0  0
##   101000 0 0 2 0 0 0 0 0 0  0  0  0
##   101100 0 0 0 0 0 0 0 0 1  0  0  0
##   101600 0 0 0 1 0 0 0 0 0  0  0  0
##   102500 0 0 0 0 0 0 1 0 0  0  0  0
##   103000 0 0 0 1 0 0 0 0 0  0  0  0
##   104000 0 0 0 0 2 0 0 0 0  0  0  0
##   105000 0 0 4 4 0 1 1 0 0  0  0  1
##   106000 0 0 0 0 0 0 2 0 1  0  0  0
##   107000 0 0 1 0 0 0 0 0 0  0  0  0
##   107300 0 0 1 0 0 0 0 0 0  0  0  0
##   107500 0 0 0 1 0 0 0 0 0  0  0  0
##   108000 0 0 0 1 1 0 0 0 0  0  0  0
##   110000 0 0 0 0 0 0 1 0 0  0  0  0
##   112000 0 0 1 0 0 0 1 0 0  0  0  1
##   115000 0 2 0 1 2 0 0 0 0  0  0  0
##   118000 0 0 0 0 0 0 0 0 0  1  0  0
##   120000 0 0 0 1 0 2 0 0 1  0  0  0
##   126710 0 0 0 1 0 0 0 0 0  0  0  0
##   130000 0 0 0 0 1 0 0 0 0  0  0  0
##   145800 0 0 1 0 0 0 0 0 0  0  0  0
##   146000 0 0 0 0 0 0 0 0 0  0  1  0
##   162000 0 1 0 0 0 0 0 0 0  0  0  0
##   220000 0 0 0 0 0 0 0 0 0  0  1  0

From the above table that a minimum of 2 years of work experience is necessary

    mytable2<-xtabs(~salary+frstlang,data=job.df)
    mytable2

##         frstlang
## salary   English Others
##   64000        1      0
##   77000        1      0
##   78256        1      0
##   82000        1      0
##   85000        4      0
##   86000        2      0
##   88000        1      0
##   88500        1      0
##   90000        3      0
##   92000        3      0
##   93000        3      0
##   95000        7      0
##   96000        4      0
##   96500        1      0
##   97000        2      0
##   98000        8      2
##   99000        0      1
##   100000       9      0
##   100400       1      0
##   101000       2      0
##   101100       1      0
##   101600       1      0
##   102500       1      0
##   103000       1      0
##   104000       1      1
##   105000      11      0
##   106000       3      0
##   107000       1      0
##   107300       0      1
##   107500       1      0
##   108000       2      0
##   110000       1      0
##   112000       3      0
##   115000       5      0
##   118000       0      1
##   120000       4      0
##   126710       1      0
##   130000       1      0
##   145800       1      0
##   146000       1      0
##   162000       1      0
##   220000       0      1

It is seen that students with English as first language are mostly preferred and get higher salaries and jobs compared to those whose first language is not English.

    mytable3<-xtabs(~salary+gmat_tot,data=job.df)
    mytable3

##         gmat_tot
## salary   500 520 530 540 550 560 570 580 590 600 610 620 630 640 650 660
##   64000    0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0
##   77000    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
##   78256    0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   82000    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   85000    0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   1
##   86000    0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
##   88000    0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
##   88500    0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##   90000    0   0   0   0   0   0   0   1   0   0   0   0   1   0   1   0
##   92000    0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   1
##   93000    0   0   0   1   0   0   0   0   0   0   1   1   0   0   0   0
##   95000    0   0   1   0   0   2   0   0   0   0   2   0   0   0   0   0
##   96000    0   0   0   0   0   1   0   0   1   1   0   0   0   0   1   0
##   96500    1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   97000    0   0   0   0   0   0   0   1   0   0   0   1   0   0   0   0
##   98000    0   0   0   0   0   1   3   1   1   0   1   0   0   0   0   0
##   99000    0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0
##   100000   0   0   0   0   0   2   0   1   0   1   1   0   1   0   2   0
##   100400   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
##   101000   0   0   0   0   0   0   0   0   0   1   0   1   0   0   0   0
##   101100   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
##   101600   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
##   102500   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   103000   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##   104000   0   0   1   0   0   1   0   0   0   0   0   0   0   0   0   0
##   105000   0   0   0   0   2   0   2   3   0   1   0   1   0   0   1   0
##   106000   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##   107000   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0
##   107300   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
##   107500   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
##   108000   0   0   0   0   0   0   1   0   0   1   0   0   0   0   0   0
##   110000   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0
##   112000   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0
##   115000   0   0   0   1   0   0   1   0   0   0   0   1   1   0   0   0
##   118000   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##   120000   0   0   0   0   0   0   0   0   0   2   0   0   0   0   0   0
##   126710   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0
##   130000   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
##   145800   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##   146000   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
##   162000   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   220000   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##         gmat_tot
## salary   670 680 700 710 720
##   64000    0   0   0   0   0
##   77000    0   0   0   0   0
##   78256    0   0   0   0   0
##   82000    1   0   0   0   0
##   85000    0   0   1   0   1
##   86000    0   1   0   0   0
##   88000    0   0   0   0   0
##   88500    0   0   0   0   0
##   90000    0   0   0   0   0
##   92000    0   0   0   1   0
##   93000    0   0   0   0   0
##   95000    2   0   0   0   0
##   96000    0   0   0   0   0
##   96500    0   0   0   0   0
##   97000    0   0   0   0   0
##   98000    1   1   0   1   0
##   99000    0   0   0   0   0
##   100000   0   0   0   1   0
##   100400   0   0   0   0   0
##   101000   0   0   0   0   0
##   101100   0   0   0   0   0
##   101600   0   0   0   0   0
##   102500   1   0   0   0   0
##   103000   0   0   0   0   0
##   104000   0   0   0   0   0
##   105000   0   1   0   0   0
##   106000   0   2   0   0   0
##   107000   0   0   0   0   0
##   107300   0   0   0   0   0
##   107500   0   0   0   0   0
##   108000   0   0   0   0   0
##   110000   0   0   0   0   0
##   112000   1   1   0   0   0
##   115000   0   0   0   1   0
##   118000   0   0   0   0   0
##   120000   1   0   1   0   0
##   126710   0   0   0   0   0
##   130000   0   0   0   0   0
##   145800   0   0   0   0   0
##   146000   0   0   0   0   0
##   162000   0   0   1   0   0
##   220000   0   0   0   0   0

Chi square test

    log.transformed.salary=log(job.df$salary)
t.test(log.transformed.salary~ job.df$sex, var.equal = TRUE)

## 
##  Two Sample t-test
## 
## data:  log.transformed.salary by job.df$sex
## t = 2.4552, df = 101, p-value = 0.01579
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.01470674 0.13847594
## sample estimates:
##   mean in group Male mean in group Female 
##             11.55390             11.47731

  chisq.test(mytable1)

## Warning in chisq.test(mytable1): Chi-squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  mytable1
## X-squared = 535.23, df = 451, p-value = 0.003809

Since p<0.01 there is a relationship betweeen work experience and salary

    chisq.test(mytable2)

## Warning in chisq.test(mytable2): Chi-squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  mytable2
## X-squared = 69.847, df = 41, p-value = 0.003296

Since p<0.01 we can say that there is a relationship between first language and salary

    chisq.test(mytable3)

## Warning in chisq.test(mytable3): Chi-squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  mytable3
## X-squared = 927.24, df = 820, p-value = 0.005279

Since p<0.01 we see there exists a relationship between Total GMAT score and starting salary.

Regression Model

Model 1

    fit <- lm(job.df$salary ~job.df$gmat_tot+job.df$gmat_qpc+job.df$gmat_vpc+job.df$gmat_tpc, data = job.df)
summary(fit)

## 
## Call:
## lm(formula = job.df$salary ~ job.df$gmat_tot + job.df$gmat_qpc + 
##     job.df$gmat_vpc + job.df$gmat_tpc, data = job.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -40370  -8250  -2164   5253 100097 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)  
## (Intercept)     109539.54   48054.24   2.279   0.0248 *
## job.df$gmat_tot     55.01     181.71   0.303   0.7627  
## job.df$gmat_qpc    718.40     541.90   1.326   0.1880  
## job.df$gmat_vpc    546.10     543.85   1.004   0.3178  
## job.df$gmat_tpc  -1663.16     801.57  -2.075   0.0406 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17670 on 98 degrees of freedom
## Multiple R-squared:  0.06089,    Adjusted R-squared:  0.02256 
## F-statistic: 1.589 on 4 and 98 DF,  p-value: 0.1834

Gmat_tpc is a significant variable in model 1 The multiple R squared value indicates that the model accounts for 6% of the variance in the variables The residual error (17670) can be thought of as the average error in predicting salary using the various gmat data available

Model 2

    fit1 <- lm(job.df$salary ~job.df$satis+job.df$work_yrs+job.df$frstlang, data = job.df)
summary(fit1)

## 
## Call:
## lm(formula = job.df$salary ~ job.df$satis + job.df$work_yrs + 
##     job.df$frstlang, data = job.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31764  -9640   -604   4816  76193 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           104142.2    11899.4   8.752 5.73e-14 ***
## job.df$satis           -1913.1     2000.0  -0.957   0.3411    
## job.df$work_yrs         2506.8      528.6   4.742 7.11e-06 ***
## job.df$frstlangOthers  13541.5     6305.7   2.147   0.0342 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15740 on 99 degrees of freedom
## Multiple R-squared:  0.2466, Adjusted R-squared:  0.2237 
## F-statistic:  10.8 on 3 and 99 DF,  p-value: 3.354e-06

work_yrs and frstlang are significant variables in model 2 The multiple R squared value indicates that the model accounts for 24.66% of the variance in the variables The residual error(15740) can be thought of as the average error in predicting salary using work experience, job satisfaction and first language.

Model 3

    fit2 <- lm(job.df$salary ~job.df$age+job.df$sex, data = job.df)
summary(fit2)

## 
## Call:
## lm(formula = job.df$salary ~ job.df$age + job.df$sex, data = job.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -29047  -9444  -1750   5428  84503 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       33116.2    12997.5   2.548   0.0124 *  
## job.df$age         2653.1      475.1   5.584 2.03e-07 ***
## job.df$sexFemale  -3743.6     3372.6  -1.110   0.2697    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15540 on 100 degrees of freedom
## Multiple R-squared:  0.2588, Adjusted R-squared:  0.244 
## F-statistic: 17.46 on 2 and 100 DF,  p-value: 3.144e-07

Age is a significant factor in model 3

We see that model 2 is better than model 1 and model 3, with a higher R-squared value.

Dataset consisting of people with no job

    nojob.df<- mydata.df[ which(mydata.df$salary !="998" & mydata.df$salary !="999" & mydata.df$salary==0), ]
    head(nojob.df)

##   age    sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 1  23 Female      620       77       87       87   3.4  3.00       1
## 2  24   Male      610       90       71       87   3.5  4.00       1
## 3  24   Male      670       99       78       95   3.3  3.25       1
## 4  24   Male      570       56       81       75   3.3  2.67       1
## 6  24   Male      640       82       89       91   3.9  3.75       1
## 7  25   Male      610       89       74       87   3.4  3.50       1
##   work_yrs frstlang salary satis
## 1        2  English      0     7
## 2        2  English      0     6
## 3        2  English      0     6
## 4        1  English      0     7
## 6        2  English      0     6
## 7        2  English      0     5

hist(nojob.df$gmat_tot,
     main = "GMAT performance of students with no job",
     xlab="GMAT score",
     breaks=10,
     col = "green")

Distributed between 550-650 for unplaced students while it is more scattered amongst those who do have a job.

    chisq.test(nojob.df$work_yrs,nojob.df$satis)

## Warning in chisq.test(nojob.df$work_yrs, nojob.df$satis): Chi-squared
## approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  nojob.df$work_yrs and nojob.df$satis
## X-squared = 44.974, df = 48, p-value = 0.5976

This shows that the null hypothesis is true and that unplaced students with work experience are satisfied with the MBA program.

CASE: MBA Starting Salaries

Sai Sandeep MN

23 January 2018