R Markdown

This Rmd document is an analysis of the MBA Starting Salaries.csv file, comprising box plots, bar charts, histograms and scatter plots. It also includes various tests performed on the dataset to verify the hypotheses and to evaluate whether the differences are significant.

#TASK 2-a:

# Load the MBA salary data set and print a per-column summary.
# The data directory is held in one variable and joined with file.path(), so
# the script no longer changes the session's working directory with setwd().
data_dir <- "F:/R-Internship/Course related files"
sal_mba.df <- read.csv(file.path(data_dir, "MBA Salary.csv"))
# View() invokes a data viewer window, so only call it in interactive sessions.
if (interactive()) {
  View(sal_mba.df)
}
summary(sal_mba.df)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0
# Descriptive statistics for every column of the data set (psych::describe).
library(psych)
describe(x = sal_mba.df)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45
# Histogram: distribution of the salary column over all rows of the data set.
# NOTE(review): at this point the column still contains the 0, 998 and 999
# values that are filtered out further down.
hist(
  x = sal_mba.df[["salary"]],
  breaks = 5,
  col = "light blue",
  main = "Salary of MBA students",
  xlab = "Salary earned",
  ylab = "Count"
)

# Build the subsets used by the plots below by excluding salary values 0, 998
# and 999. These look like placeholder codes rather than real salaries --
# TODO confirm against the data dictionary.
#   mba_disclosed  : salary not 0
#   mba_disclosed1 : salary not 0 or 998
#   mba_disclosed2 : salary not 0, 998 or 999
# %in% never returns NA, so a row with a missing salary is dropped instead of
# becoming an all-NA row as it would with `!=` subsetting.
mba_disclosed <- sal_mba.df[!(sal_mba.df$salary %in% 0), ]
mba_disclosed1 <- sal_mba.df[!(sal_mba.df$salary %in% c(0, 998)), ]
mba_disclosed2 <- sal_mba.df[!(sal_mba.df$salary %in% c(0, 998, 999)), ]

# Bar chart: mean disclosed salary for each value of `quarter`.
mba_mean <- aggregate(salary ~ quarter, data = mba_disclosed2, FUN = mean)
library(lattice)
# The title is kept on one line: a line break inside the quotes would embed a
# newline plus the source indentation in the rendered plot title.
barchart(
  quarter ~ salary,
  data = mba_mean,
  col = "blue",
  main = "Mean disclosed salary by quartile ranking",
  xlab = "Mean salary",
  ylab = "Quartile Ranking"
)

# Box-Plot 1: disclosed salary split by `sex`, drawn horizontally.
boxplot(
  salary ~ sex,
  data = mba_disclosed2,
  horizontal = TRUE,
  yaxt = "n", # default group axis suppressed; custom labels are added below
  main = "Disclosed Salaries of MBA students",
  xlab = "Salary",
  ylab = "Gender"
)
# Label the two boxes (positions 1 and 2) on the left-hand axis.
axis(side = 2, at = 1:2, labels = c("Male", "Female"))

# Box-Plot 2: disclosed salary split by `frstlang`, drawn horizontally.
boxplot(
  salary ~ frstlang,
  data = mba_disclosed2,
  horizontal = TRUE,
  yaxt = "n", # default group axis suppressed; custom labels are added below
  main = "Disclosed Salaries of MBA students",
  xlab = "Salary",
  ylab = "First Language"
)
# Label the two boxes (positions 1 and 2) on the left-hand axis.
axis(side = 2, at = 1:2, labels = c("English", "Other Language"))

# Box-Plot 3: disclosed salary split by `quarter`, drawn horizontally.
boxplot(
  salary ~ quarter,
  data = mba_disclosed2,
  horizontal = TRUE,
  yaxt = "n", # default group axis suppressed; custom labels are added below
  main = "Disclosed salary of MBA students",
  xlab = "Salary",
  ylab = "Quartile"
)
# Label the four boxes with their quartile number on the left-hand axis.
axis(side = 2, at = 1:4, labels = 1:4)

library(lattice)
# Scatter-Plot 1: disclosed salary (x) against GMAT total percentile (y).
plot(
  x = mba_disclosed2[["salary"]],
  y = mba_disclosed2[["gmat_tpc"]],
  main = "Disclosed salary of MBA students",
  xlab = "Salary",
  ylab = "GMAT Total Percentile"
)

#Scatter-Plot 2:
library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# car::scatterplot of disclosed salary (x) against the spring MBA average (y).
scatterplot(
  mba_disclosed2[["salary"]],
  mba_disclosed2[["s_avg"]],
  main = "Disclosed salary of MBA students",
  xlab = "Salary",
  ylab = "Spring MBA Average"
)

# Scatter-Plot 3: disclosed salary (x) against the fall MBA average (y).
library(car)
scatterplot(
  mba_disclosed2[["salary"]],
  mba_disclosed2[["f_avg"]],
  main = "Disclosed salary of students",
  xlab = "Salary",
  ylab = "Fall MBA Average"
)

# Scatter-plot matrix of GMAT total, spring/fall averages and salary.
library(car)
# Plot mba_disclosed2 (salary values 0, 998 and 999 removed), as the other
# salary plots do; mba_disclosed still contains the 998 and 999 rows.
# NOTE(review): `spread` and `smoother.args` follow the argument names of the
# car release this was written against -- confirm that the installed version
# still accepts them.
scatterplotMatrix(
  mba_disclosed2[, c("gmat_tot", "s_avg", "f_avg", "salary")],
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "MBA Starting Salary"
)

# Corrgram of all columns: shaded cells below the diagonal, pies above it.
library(corrgram)
# Use mba_disclosed2 (salary values 0, 998 and 999 removed), as the other
# salary plots do; mba_disclosed still contains the 998 and 999 rows, which
# would enter the salary correlations as if they were salaries.
corrgram(
  mba_disclosed2,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of MBA Starting Salaries"
)

# TASK 2-b:
# Rows whose salary value is not 0.
mba_disclosed <- sal_mba.df[sal_mba.df[["salary"]] != 0, ]

# 3-way contingency table of sex x quarter x first language, printed flat.
mba_t1 <- xtabs(
  formula = ~ sex + quarter + frstlang,
  data = mba_disclosed
)
ftable(mba_t1)
##             frstlang  1  2
## sex quarter               
## 1   1                30  3
##     2                32  2
##     3                31  7
##     4                29  5
## 2   1                18  0
##     2                 5  4
##     3                 8  1
##     4                 7  2
# Marginal counts over the first table dimension (sex).
margin.table(mba_t1, margin = 1)
## sex
##   1   2 
## 139  45
# Marginal counts over the second table dimension (quarter).
margin.table(mba_t1, margin = 2)
## quarter
##  1  2  3  4 
## 51 43 47 43
# Full 3-way table with a "Sum" margin appended to every dimension.
addmargins(A = mba_t1)
## , , frstlang = 1
## 
##      quarter
## sex     1   2   3   4 Sum
##   1    30  32  31  29 122
##   2    18   5   8   7  38
##   Sum  48  37  39  36 160
## 
## , , frstlang = 2
## 
##      quarter
## sex     1   2   3   4 Sum
##   1     3   2   7   5  17
##   2     0   4   1   2   7
##   Sum   3   6   8   7  24
## 
## , , frstlang = Sum
## 
##      quarter
## sex     1   2   3   4 Sum
##   1    33  34  38  34 139
##   2    18   9   9   9  45
##   Sum  51  43  47  43 184
# Chi-square test of independence: sex vs. first language.
mba_t2 <- xtabs(formula = ~ sex + frstlang, data = mba_disclosed)
chisq.test(x = mba_t2)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mba_t2
## X-squared = 0.10308, df = 1, p-value = 0.7482
# Chi-square test of independence: sex vs. quarter.
mba_t3 <- xtabs(formula = ~ sex + quarter, data = mba_disclosed)
chisq.test(x = mba_t3)
## 
##  Pearson's Chi-squared test
## 
## data:  mba_t3
## X-squared = 4.5377, df = 3, p-value = 0.209
# Chi-square test of independence: quarter vs. first language.
mba_t4 <- xtabs(formula = ~ quarter + frstlang, data = mba_disclosed)
chisq.test(x = mba_t4)
## 
##  Pearson's Chi-squared test
## 
## data:  mba_t4
## X-squared = 3.3899, df = 3, p-value = 0.3353
# Tests of salary against other variables:
# A two-sample t.test(x, y) compares the mean of x with the mean of y, which
# is not meaningful for a GMAT percentile versus a salary. Test the
# correlation between the two instead, on mba_disclosed2 so that the 998/999
# salary values are not treated as salaries.
cor.test(mba_disclosed2$gmat_tpc, mba_disclosed2$salary)
# NOTE: the output printed below was produced by the earlier
# t.test(gmat_tpc, salary) call and needs to be regenerated.
## 
##  Welch Two Sample t-test
## 
## data:  mba_disclosed$gmat_tpc and mba_disclosed$salary
## t = -14.99, df = 183, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -65667.09 -50391.33
## sample estimates:
##   mean of x   mean of y 
##    85.13043 58114.34239
# Age and salary are different quantities, so comparing their means with a
# two-sample t-test is not meaningful; test their correlation instead, on
# mba_disclosed2 so that the 998/999 salary values are excluded.
cor.test(mba_disclosed2$age, mba_disclosed2$salary)
# NOTE: the output printed below was produced by the earlier
# t.test(age, salary) call and needs to be regenerated.
## 
##  Welch Two Sample t-test
## 
## data:  mba_disclosed$age and mba_disclosed$salary
## t = -15.005, df = 183, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -65725.43 -50449.67
## sample estimates:
##   mean of x   mean of y 
##    26.79348 58114.34239
# Spring average and salary are different quantities, so comparing their means
# with a two-sample t-test is not meaningful; test their correlation instead,
# on mba_disclosed2 so that the 998/999 salary values are excluded.
cor.test(mba_disclosed2$s_avg, mba_disclosed2$salary)
# NOTE: the output printed below was produced by the earlier
# t.test(s_avg, salary) call and needs to be regenerated.
## 
##  Welch Two Sample t-test
## 
## data:  mba_disclosed$s_avg and mba_disclosed$salary
## t = -15.011, df = 183, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -65749.20 -50473.44
## sample estimates:
##    mean of x    mean of y 
##     3.022554 58114.342391
# Compare mean salary between the two `sex` groups. Passing the sex codes and
# the salaries as the two samples would instead compare the mean of the codes
# with the mean salary. mba_disclosed2 is used so that the 998/999 salary
# values are excluded.
t.test(salary ~ sex, data = mba_disclosed2)
# NOTE: the output printed below was produced by the earlier
# t.test(sex, salary) call and needs to be regenerated.
## 
##  Welch Two Sample t-test
## 
## data:  mba_disclosed$sex and mba_disclosed$salary
## t = -15.012, df = 183, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -65750.98 -50475.22
## sample estimates:
##    mean of x    mean of y 
##     1.244565 58114.342391
# Linear regression of salary on age, GMAT scores, spring average, quarter
# and first language.
# Fitted on mba_disclosed2: mba_disclosed still contains the 998/999 salary
# values, which would enter the response as if they were salaries.
mba_reg <- lm(
  salary ~ age + gmat_tot + gmat_vpc + s_avg + quarter + frstlang,
  data = mba_disclosed2
)
summary(mba_reg)
# NOTE: the output printed below was produced with data = mba_disclosed and
# needs to be regenerated.
## 
## Call:
## lm(formula = salary ~ age + gmat_tot + gmat_vpc + s_avg + quarter + 
##     frstlang, data = mba_disclosed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -94209 -47628  16695  39941 156113 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 102308.90   88904.42   1.151   0.2514  
## age           1963.11    1410.42   1.392   0.1657  
## gmat_tot      -161.84     108.72  -1.489   0.1384  
## gmat_vpc       -40.45     372.04  -0.109   0.9135  
## s_avg        16885.28   18004.24   0.938   0.3496  
## quarter      -4311.60    6035.29  -0.714   0.4759  
## frstlang    -29586.41   12685.82  -2.332   0.0208 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50350 on 177 degrees of freedom
## Multiple R-squared:  0.1109, Adjusted R-squared:  0.08075 
## F-statistic: 3.679 on 6 and 177 DF,  p-value: 0.001808
# Second regression: salary on sex, GMAT percentiles, fall average, work
# experience and satisfaction.
# Fitted on mba_disclosed2: mba_disclosed still contains the 998/999 salary
# values, which would enter the response as if they were salaries.
# NOTE(review): `satis` has a maximum of 998 in the full data set; check
# whether such values remain in mba_disclosed2 before interpreting its
# coefficient.
mba_reg1 <- lm(
  salary ~ sex + gmat_qpc + gmat_tpc + f_avg + work_yrs + satis,
  data = mba_disclosed2
)
summary(mba_reg1)
# NOTE: the output printed below was produced with data = mba_disclosed and
# needs to be regenerated.
## 
## Call:
## lm(formula = salary ~ sex + gmat_qpc + gmat_tpc + f_avg + work_yrs + 
##     satis, data = mba_disclosed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -92622  -6057   7848  24808 113513 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 56821.329  30441.807   1.867   0.0636 .  
## sex          5777.726   7206.841   0.802   0.4238    
## gmat_qpc     -481.644    279.833  -1.721   0.0870 .  
## gmat_tpc      292.677    291.330   1.005   0.3164    
## f_avg        6262.471   6021.104   1.040   0.2997    
## work_yrs     2354.024   1242.936   1.894   0.0599 .  
## satis         -75.005      7.012 -10.697   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40440 on 177 degrees of freedom
## Multiple R-squared:  0.4264, Adjusted R-squared:  0.407 
## F-statistic: 21.93 on 6 and 177 DF,  p-value: < 2.2e-16
# TASK 2-c:
# Rows whose salary value is 0.
mba_not_placed <- sal_mba.df[sal_mba.df$salary == 0, ]
# View() invokes a data viewer window, so only call it in interactive sessions.
if (interactive()) {
  View(mba_not_placed)
}

# 3-way contingency table of sex x quarter x first language, printed flat.
mba_np1 <- xtabs(~ sex + quarter + frstlang, data = mba_not_placed)
ftable(mba_np1)
##             frstlang  1  2
## sex quarter               
## 1   1                10  1
##     2                19  2
##     3                14  2
##     4                17  2
## 2   1                 7  0
##     2                 6  0
##     3                 6  1
##     4                 3  0
# Marginal counts over the first table dimension (sex).
margin.table(mba_np1, margin = 1)
## sex
##  1  2 
## 67 23
# Marginal counts over the second table dimension (quarter).
margin.table(mba_np1, margin = 2)
## quarter
##  1  2  3  4 
## 18 27 23 22
# Full 3-way table with a "Sum" margin appended to every dimension.
addmargins(A = mba_np1)
## , , frstlang = 1
## 
##      quarter
## sex    1  2  3  4 Sum
##   1   10 19 14 17  60
##   2    7  6  6  3  22
##   Sum 17 25 20 20  82
## 
## , , frstlang = 2
## 
##      quarter
## sex    1  2  3  4 Sum
##   1    1  2  2  2   7
##   2    0  0  1  0   1
##   Sum  1  2  3  2   8
## 
## , , frstlang = Sum
## 
##      quarter
## sex    1  2  3  4 Sum
##   1   11 21 16 19  67
##   2    7  6  7  3  23
##   Sum 18 27 23 22  90
# Chi-square test of independence: sex vs. quarter (salary == 0 rows).
# NOTE(review): R warns below that the chi-squared approximation may be
# incorrect for this table; fisher.test() is an alternative to consider.
mba_np2 <- xtabs(formula = ~ sex + quarter, data = mba_not_placed)
chisq.test(x = mba_np2)
## Warning in chisq.test(mba_np2): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mba_np2
## X-squared = 3.7704, df = 3, p-value = 0.2874
# Chi-square test of independence: sex vs. first language (salary == 0 rows).
# NOTE(review): R warns below that the chi-squared approximation may be
# incorrect for this table; fisher.test() is an alternative to consider.
mba_np3 <- xtabs(formula = ~ sex + frstlang, data = mba_not_placed)
chisq.test(x = mba_np3)
## Warning in chisq.test(mba_np3): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mba_np3
## X-squared = 0.21376, df = 1, p-value = 0.6438
# Chi-square test of independence: quarter vs. first language
# (salary == 0 rows).
# NOTE(review): R warns below that the chi-squared approximation may be
# incorrect for this table; fisher.test() is an alternative to consider.
mba_np4 <- xtabs(formula = ~ quarter + frstlang, data = mba_not_placed)
chisq.test(x = mba_np4)
## Warning in chisq.test(mba_np4): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mba_np4
## X-squared = 0.81142, df = 3, p-value = 0.8467