Summary statistics

# Inspect the column names and storage types of the raw data frame.
str(mbaStudent)
## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : int  2 1 1 1 2 1 1 2 1 1 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: int  1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : int  7 6 6 7 5 6 5 6 4 998 ...
# Treat the integer-coded columns sex, frstlang and satis as categorical
# factors, then re-inspect the structure.
# NOTE(review): satis contains the value 998, which looks like a
# missing-response code; it becomes a factor level here -- confirm intent.
categorical_cols <- c("sex", "frstlang", "satis")
mbaStudent[categorical_cols] <- lapply(mbaStudent[categorical_cols], factor)
str(mbaStudent)
## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : Factor w/ 2 levels "1","2": 2 1 1 1 2 1 1 2 1 1 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : Factor w/ 8 levels "1","2","3","4",..: 7 6 6 7 5 6 5 6 4 8 ...
# Per-column summary: quantiles and mean for numeric columns, level counts
# for the factor columns.
summary(mbaStudent)
##       age        sex        gmat_tot        gmat_qpc        gmat_vpc    
##  Min.   :22.00   1:206   Min.   :450.0   Min.   :28.00   Min.   :16.00  
##  1st Qu.:25.00   2: 68   1st Qu.:580.0   1st Qu.:72.00   1st Qu.:71.00  
##  Median :27.00           Median :620.0   Median :83.00   Median :81.00  
##  Mean   :27.36           Mean   :619.5   Mean   :80.64   Mean   :78.32  
##  3rd Qu.:29.00           3rd Qu.:660.0   3rd Qu.:93.00   3rd Qu.:91.00  
##  Max.   :48.00           Max.   :790.0   Max.   :99.00   Max.   :99.00  
##                                                                         
##     gmat_tpc        s_avg           f_avg          quarter     
##  Min.   : 0.0   Min.   :2.000   Min.   :0.000   Min.   :1.000  
##  1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750   1st Qu.:1.250  
##  Median :87.0   Median :3.000   Median :3.000   Median :2.000  
##  Mean   :84.2   Mean   :3.025   Mean   :3.062   Mean   :2.478  
##  3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250   3rd Qu.:3.000  
##  Max.   :99.0   Max.   :4.000   Max.   :4.000   Max.   :4.000  
##                                                                
##     work_yrs      frstlang     salary           satis   
##  Min.   : 0.000   1:242    Min.   :     0   6      :97  
##  1st Qu.: 2.000   2: 32    1st Qu.:     0   5      :74  
##  Median : 3.000            Median :   999   998    :46  
##  Mean   : 3.872            Mean   : 39026   7      :33  
##  3rd Qu.: 4.000            3rd Qu.: 97000   4      :17  
##  Max.   :22.000            Max.   :220000   3      : 5  
##                                             (Other): 2
library(psych)
# Descriptive statistics for every column except the sex and frstlang
# factors, selected by name (these are positions 1, 3:10, 12 and 13).
described_cols <- c(
  "age", "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
  "s_avg", "f_avg", "quarter", "work_yrs", "salary", "satis"
)
describe(mbaStudent[, described_cols])
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## gmat_tot    2 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    3 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    4 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    5 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       6 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       7 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     8 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs    9 274     3.87     3.23      3     3.29    1.48   0     22
## salary     10 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis*     11 274     5.97     1.28      6     6.00    1.48   1      8
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## salary   220000  0.70    -1.05 3078.10
## satis*        7 -0.17     0.28    0.08

Plots (Visualization)

# Horizontal boxplots of age and the GMAT measures, one figure each.
boxplot(mbaStudent$age, horizontal = TRUE, main = "Age", xlab = "years", col = "plum")

boxplot(mbaStudent$gmat_tot, horizontal = TRUE, main = "GMAT score", xlab = "score", col = "plum")

boxplot(mbaStudent$gmat_qpc, horizontal = TRUE, main = "GMAT Quantitative percentile", xlab = "percentile", col = "plum")

boxplot(mbaStudent$gmat_vpc, horizontal = TRUE, main = "GMAT Verbal percentile", xlab = "percentile", col = "plum")

boxplot(mbaStudent$gmat_tpc, horizontal = TRUE, main = "Total percentile", xlab = "percentile", col = "plum")

# Spring and fall MBA averages side by side in a 1 x 2 layout.
par(mfrow = c(1, 2))

boxplot(mbaStudent$s_avg, horizontal = FALSE, main = "Spring MBA average", ylab = "grade", col = "plum")

boxplot(mbaStudent$f_avg, horizontal = FALSE, main = "Fall MBA average", ylab = "grade", col = "plum")

# Back to one plot per figure; without this reset the GMAT histograms below
# would be drawn into the leftover two-panel layout.
par(mfrow = c(1, 1))

# Histograms of the GMAT percentiles, one figure each.
hist(mbaStudent$gmat_tpc, xlab = "total percentile", main = "GMAT total percentile", col = "dodgerblue3")

hist(mbaStudent$gmat_qpc, xlab = "percentile", main = "GMAT quantitative percentile", col = "dodgerblue3")

hist(mbaStudent$gmat_vpc, xlab = "percentile", main = "GMAT verbal percentile", col = "dodgerblue3")

# Spring and fall MBA average histograms side by side.
par(mfrow = c(1, 2))

hist(mbaStudent$s_avg, xlab = "Grade", main = "Spring MBA average", col = "dodgerblue3")

hist(mbaStudent$f_avg, xlab = "Grade", main = "Fall MBA average", col = "dodgerblue4")

# Restore the single-panel layout for everything that follows.
par(mfrow = c(1, 1))

Looking at the strength and direction of correlations between variables.

library(corrgram)
## Warning: replacing previous import by 'magrittr::%>%' when loading
## 'dendextend'
# Corrgram of the full data set: shaded cells in the lower triangle,
# pie glyphs in the upper triangle.
corrgram(
  mbaStudent,
  main = "Corrgram ",
  upper.panel = panel.pie,
  lower.panel = panel.shade
)

Subset of students who got a job

# Only students who disclosed their salary are taken into account.
# NOTE(review): salary values 0, 998 and 999 look like codes rather than real
# salaries (hence the > 1000 cut-off) -- confirm against the data dictionary.
gotJob <- mbaStudent[which(mbaStudent$salary > 1000), ]
# View() opens a data viewer, which is only useful in an interactive session;
# skip it when the script is rendered or run in batch mode.
if (interactive()) {
  View(gotJob)
}

Looking into the intercorrelations between pairs of variables

library(corrgram)
# Corrgram restricted to the students in gotJob: shaded cells in the lower
# triangle, pie glyphs in the upper triangle.
corrgram(
  gotJob,
  main = "Corrgram ",
  upper.panel = panel.pie,
  lower.panel = panel.shade
)

Fitting different models:

# Model1: linear regression of salary on first language, years of work
# experience and age, fitted on the gotJob subset.
Model1 <- lm(
  formula = salary ~ frstlang + work_yrs + age,
  data = gotJob
)

print(Model1)

\(salary =\beta_c + \beta_0 frstlang + \beta_1 workYrs + \beta_2 age + \epsilon\)

# Model2: Model1's predictors plus GMAT total score and the fall and spring
# MBA averages, fitted on the gotJob subset.
Model2 <- lm(
  formula = salary ~ frstlang + work_yrs + age + gmat_tot + f_avg + s_avg,
  data = gotJob
)

print(Model2)

\(salary = \beta_c + \beta_0 frstlang + \beta_1 workYrs + \beta_2 age + \beta_3 gmatTot + \beta_4 fallAvg + \beta_5 springAvg + \epsilon\)

# Model3: Model2's predictors plus sex and the three GMAT percentile
# columns, fitted on the gotJob subset.
Model3 <- lm(
  formula = salary ~ frstlang + work_yrs + age + gmat_tot + f_avg + s_avg +
    sex + gmat_qpc + gmat_vpc + gmat_tpc,
  data = gotJob
)

print(Model3)

\(salary = \beta_c + \beta_0 frstlang + \beta_1 workYrs + \beta_2 age + \beta_3 gmatTot + \beta_4 fallAvg + \beta_5 springAvg + \beta_6 sex + \beta_7 gmatQuant + \beta_8 gmatVerbal + \beta_9 gmatTotalPercentile + \epsilon\)

Contingency table:

# Counts of students in gotJob by sex (rows) and satisfaction level (columns).
# Every factor level of satis is listed, including levels with no
# observations in this subset.
xtabs(~gotJob$sex + gotJob$satis)
##           gotJob$satis
## gotJob$sex  1  2  3  4  5  6  7 998
##          1  0  0  0  1 17 40 14   0
##          2  0  0  1  0 12 10  8   0

Run a chi-square test. Hypothesis: Among students who got a job, males are more satisfied with their job than females. H0: Satisfaction and sex are independent.

# Cross-tabulate satisfaction by sex for the students in gotJob.
# Unused factor levels are dropped: satis levels with no observations in this
# subset would add all-zero rows, whose expected counts are 0 and turn the
# chi-squared statistic into NaN (and the p-value into NA).
myTable1 <- xtabs(~ satis + sex, data = gotJob, drop.unused.levels = TRUE)
# Some of the remaining cells are very small, so chisq.test() may still warn
# that the approximation is inaccurate; fisher.test(myTable1) is an exact
# alternative for sparse tables.
chisq.test(myTable1)
## Warning in chisq.test(myTable1): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  myTable1
## X-squared = NaN, df = 7, p-value = NA

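The test above returns X-squared = NaN and p-value = NA, because the satisfaction levels with no observations in this subset (1, 2 and 998) have expected counts of zero; this output therefore supports no conclusion about the null hypothesis. Note also that a p-value > 0.05 would mean we fail to reject the null hypothesis that satisfaction and sex are independent; it would not show that they are dependent.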

Run a t-test. We will see whether females are outperforming males in semester grades.

Hypothesis: Females have higher spring + fall average grades than males. H0: There is no significant difference between male and female average semester grades.

# Mean of the combined spring + fall average for each sex in gotJob.
aggregate(
  x = gotJob$s_avg + gotJob$f_avg,
  by = list(gotJob$sex),
  FUN = mean
)
##   Group.1        x
## 1       1 6.110556
## 2       2 6.352258
# Sum the fall and spring averages into a separate vector for the t-test, so
# that gotJob$s_avg is never overwritten and gotJob does not have to be
# rebuilt afterwards.
semester_total <- gotJob$s_avg + gotJob$f_avg

# Welch two-sample t-test of the combined average between the two sexes.
t.test(semester_total ~ gotJob$sex)
## 
##  Welch Two Sample t-test
## 
## data:  semester_total by gotJob$sex
## t = -1.74, df = 77.562, p-value = 0.08582
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.518269  0.034864
## sample estimates:
## mean in group 1 mean in group 2 
##        6.110556        6.352258

Result: We cannot say that females outperform males in semester grades. Since the p-value (0.086) is greater than 0.05, we fail to reject the null hypothesis that there is no significant difference between males’ and females’ semester grades.

# Mean of the combined spring + fall average for each sex in gotJob.
aggregate(
  x = gotJob$s_avg + gotJob$f_avg,
  by = list(gotJob$sex),
  FUN = mean
)
##   Group.1        x
## 1       1 6.110556
## 2       2 6.352258

Model Summary

# Coefficient table and fit statistics for Model1
# (salary ~ frstlang + work_yrs + age).
summary(Model1)
## 
## Call:
## lm(formula = salary ~ frstlang + work_yrs + age, data = gotJob)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31941  -9139  -1086   4793  75526 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  49039.7    25119.6   1.952   0.0537 .
## frstlang2     8546.9     6728.1   1.270   0.2069  
## work_yrs       747.2     1116.9   0.669   0.5050  
## age           1892.0     1075.9   1.759   0.0818 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15570 on 99 degrees of freedom
## Multiple R-squared:  0.2626, Adjusted R-squared:  0.2403 
## F-statistic: 11.75 on 3 and 99 DF,  p-value: 1.188e-06
# Coefficient table and fit statistics for Model2
# (Model1's predictors plus gmat_tot, f_avg and s_avg).
summary(Model2)
## 
## Call:
## lm(formula = salary ~ frstlang + work_yrs + age + gmat_tot + 
##     f_avg + s_avg, data = gotJob)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -34180  -8542  -1237   4941  76534 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 52885.09   32247.47   1.640   0.1043  
## frstlang2    9167.61    7065.92   1.297   0.1976  
## work_yrs      649.28    1145.52   0.567   0.5722  
## age          1844.76    1110.08   1.662   0.0998 .
## gmat_tot      -14.78      31.91  -0.463   0.6442  
## f_avg       -1023.94    3835.20  -0.267   0.7901  
## s_avg        3236.73    5031.86   0.643   0.5216  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15770 on 96 degrees of freedom
## Multiple R-squared:  0.2669, Adjusted R-squared:  0.2211 
## F-statistic: 5.825 on 6 and 96 DF,  p-value: 3.292e-05
# Coefficient table and fit statistics for Model3
# (Model2's predictors plus sex, gmat_qpc, gmat_vpc and gmat_tpc).
summary(Model3)
## 
## Call:
## lm(formula = salary ~ frstlang + work_yrs + age + gmat_tot + 
##     f_avg + s_avg + sex + gmat_qpc + gmat_vpc + gmat_tpc, data = gotJob)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -30627  -8168   -767   5445  70245 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 56927.8436 49104.8678   1.159   0.2493  
## frstlang2    7704.4426  7289.3746   1.057   0.2933  
## work_yrs      775.6177  1131.4164   0.686   0.4947  
## age          1702.9961  1124.8254   1.514   0.1335  
## gmat_tot       -0.2345   168.0014  -0.001   0.9989  
## f_avg       -1693.3454  3815.2794  -0.444   0.6582  
## s_avg        5117.8754  4987.7232   1.026   0.3075  
## sex2        -3781.6672  3551.3887  -1.065   0.2897  
## gmat_qpc      830.0941   488.3993   1.700   0.0926 .
## gmat_vpc      579.0159   488.6739   1.185   0.2391  
## gmat_tpc    -1465.3294   705.6349  -2.077   0.0406 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15370 on 92 degrees of freedom
## Multiple R-squared:  0.3326, Adjusted R-squared:   0.26 
## F-statistic: 4.585 on 10 and 92 DF,  p-value: 2.812e-05

We can compare all three models, since the dependent variable is the same in each of them. Model3 has the highest adjusted R-squared (0.26, i.e. 26%), so we choose Model3 over Model1 and Model2.

Subsetting students who have no job.

# Students whose recorded salary is exactly 0.
# NOTE(review): 0 is presumably the code for "no job" -- confirm against the
# data dictionary.
withoutJob <- mbaStudent[which(mbaStudent$salary == 0), ]
# View() opens a data viewer, which is only useful in an interactive session;
# skip it when the script is rendered or run in batch mode.
if (interactive()) {
  View(withoutJob)
}

Comparing the ‘gotJob’ and ‘withoutJob’ students

# Visualize each variable for students with a job versus without a job.

# Draw one pair of horizontal boxplots comparing a single column between the
# two subsets.
#   column      name of the column to compare (character)
#   title       plot title
#   axis_label  label of the value axis
#   with_job, without_job  data frames holding the two groups
compare_job_groups <- function(column, title, axis_label,
                               with_job = gotJob, without_job = withoutJob) {
  invisible(boxplot(
    with_job[[column]], without_job[[column]],
    col = c("skyblue", "pink"),
    horizontal = TRUE,
    main = title,
    xlab = axis_label,
    names = c("with job", "without job")
  ))
}

compare_job_groups("age", "Age", "Year ")

compare_job_groups("gmat_tot", "GMAT Total", "Score")

compare_job_groups("gmat_qpc", "GMAT Quantitative Percentile", "Percentile")

compare_job_groups("gmat_vpc", "GMAT Verbal Percentile", "Percentile")

compare_job_groups("s_avg", "MBA Spring average", "Grades")

compare_job_groups("f_avg", "MBA Fall average", "Grades")

compare_job_groups("work_yrs", "Working Years", "Year experience")

> RESULT: Students with good semester grades are more likely to get a job. English plays a vital role in the job market.