Reading the file

# Load the MBA starting-salaries survey data.
# NOTE(review): setwd() ties the script to one machine's directory layout; it
# is kept because the CSV below is read relative to this directory.
setwd("~/Desktop/5 SRM Kashish Mukheja/Downoad content")
# The file name is a single literal, so no paste() is needed to build it.
mba <- read.csv("MBA Starting Salaries Data.csv")
View(mba)      # open the raw data in the interactive viewer
colnames(mba)  # list the available columns
##  [1] "age"      "sex"      "gmat_tot" "gmat_qpc" "gmat_vpc" "gmat_tpc"
##  [7] "s_avg"    "f_avg"    "quarter"  "work_yrs" "frstlang" "salary"  
## [13] "satis"

Summary statistics

# Per-column descriptive statistics for the whole survey data.
# NOTE(review): this runs on the unfiltered data, so the 998/999 salary codes
# that are filtered out further down still enter the figures printed here.
library(psych)
print(describe(mba))
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45

Creating a data frame of those who answered the survey and gave information about their salary

# Drop the rows whose salary column holds the codes 998 or 999; every other
# row (including salary 0) is kept.
mbas <- subset(mba, salary != 998 & salary != 999)
View(mbas)

Creating a data frame of those who answered the survey, gave information about their salary, and got placed

# From the respondents who reported a salary, keep only those with a
# non-zero salary.
mbasp <- subset(mbas, salary != 0)
View(mbasp)

Histogram for each variable

# Count histograms (10 bins requested for each) of age, total GMAT score and
# years of work experience, drawn from the full survey data.
library(lattice)
histogram(~ age,
  data = mba,
  type = "count",
  nint = 10,
  xlab = "Age",
  main = "Age distribution"  # title typo ("distibution") corrected
)

histogram(~ gmat_tot,
  data = mba,
  type = "count",
  nint = 10,
  xlab = "gmat_tot",
  main = "Total Gmat scores"
)

histogram(~ work_yrs,
  data = mba,
  type = "count",
  nint = 10,
  xlab = "work_yrs",
  main = "Working years"
)

# Mean total GMAT score for rows with frstlang equal to 1, then for the rest.
with(mba, mean(gmat_tot[frstlang == 1]))
## [1] 622.2727
with(mba, mean(gmat_tot[frstlang != 1]))
## [1] 598.125

Scatterplot

library(car)    
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Pairwise scatterplots for the placed respondents (mbasp). Every call passes
# the same spread / smoother.args settings.
# NOTE(review): these two arguments follow the interface of the car version
# this was run with; confirm they are still accepted by the installed version.

# Salary against age.
scatterplot(
  salary ~ age,
  data = mbasp,
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Salary vs Age",
  xlab = "Age",
  ylab = "Salary"
)

# Salary against years of work experience.
scatterplot(
  salary ~ work_yrs,
  data = mbasp,
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Work Experience Vs Salary",
  xlab = "Work_yrs",
  ylab = "Salary"
)

# Salary against the satisfaction score.
scatterplot(
  salary ~ satis,
  data = mbasp,
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Satisfaction Vs Salary",
  xlab = "Satisfaction Score",
  ylab = "Salary"
)

# Years of work experience against age.
scatterplot(
  work_yrs ~ age,
  data = mbasp,
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Work Experience Vs Age",
  xlab = "Age",
  ylab = "Work Experience"
)

# Salary against quarter.
scatterplot(
  salary ~ quarter,
  data = mbasp,
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Salary Vs Quarter",
  xlab = "Quarter",
  ylab = "Salary"
)

Corrgram

library(corrgram)
library(ellipse)
## 
## Attaching package: 'ellipse'
## The following object is masked from 'package:car':
## 
##     ellipse
## The following object is masked from 'package:graphics':
## 
##     pairs
# Corrgram over mbas, i.e. every respondent left after the 998/999 salary
# codes were dropped. mbas still contains the salary == 0 rows as well as the
# non-zero ones, so the title now names both groups instead of claiming the
# plot covers only those who were not placed.
corrgram(
  mbas,
  order = FALSE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of those who gave salary information (placed and not placed)"
)

library(corrgram)
library(ellipse)
# Corrgram over mbasp (the non-zero-salary subset), variables kept in their
# original column order.
corrgram(
  mbasp,
  order = FALSE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of those who got placed and gave information"
)

T-test

Ho:-The mean starting salary of males (sex = 1) is not less than that of females (sex = 2)
H1:-The mean starting salary of males (sex = 1) is less than that of females (sex = 2); the test below is one-sided (alternative = "less")

# One-sided two-sample t-test on starting salary: sex == 1 group versus
# sex == 2 group, with the alternative that the first mean is the lower one.
t.test(
  mbasp$salary[mbasp$sex == 1],
  mbasp$salary[mbasp$sex == 2],
  alternative = "less"
)
## 
##  Welch Two Sample t-test
## 
## data:  mbasp$salary[mbasp$sex == 1] and mbasp$salary[mbasp$sex == 2]
## t = 1.3628, df = 38.115, p-value = 0.9095
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##      -Inf 14421.13
## sample estimates:
## mean of x mean of y 
## 104970.97  98524.39

Inference:-Since the p-value (0.9095) > 0.05, we fail to reject Ho; hence there is no evidence that the starting salaries of males are lower than those of females.

Ho:-There is no significant difference between the starting salaries of those whose first language is English and those whose first language is not English
H1:-There is a significant difference between the starting salaries of those whose first language is English and those whose first language is not English

# Two-sided two-sample t-test on starting salary: frstlang == 1 group versus
# frstlang == 2 group.
t.test(
  mbasp$salary[mbasp$frstlang == 1],
  mbasp$salary[mbasp$frstlang == 2]
)
## 
##  Welch Two Sample t-test
## 
## data:  mbasp$salary[mbasp$frstlang == 1] and mbasp$salary[mbasp$frstlang == 2]
## t = -1.1202, df = 6.0863, p-value = 0.3049
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -59933.62  22202.25
## sample estimates:
## mean of x mean of y 
##  101748.6  120614.3

Inference:-Since the p-value (0.3049) > 0.05, we fail to reject Ho; there is no significant difference between the starting salaries of those whose first language is English and those whose first language is not English.

Ho:-The mean starting salary of those aged 27 or younger is not less than that of those older than 27.
H1:-The mean starting salary of those aged 27 or younger is less than that of those older than 27 (the test below is one-sided, alternative = "less").

# One-sided two-sample t-test on starting salary: age <= 27 group versus
# age > 27 group, with the alternative that the first mean is the lower one.
t.test(
  mbasp$salary[mbasp$age <= 27],
  mbasp$salary[mbasp$age > 27],
  alternative = "less"
)
## 
##  Welch Two Sample t-test
## 
## data:  mbasp$salary[mbasp$age <= 27] and mbasp$salary[mbasp$age > 27]
## t = -2.2726, df = 38.266, p-value = 0.01437
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##       -Inf -2676.967
## sample estimates:
## mean of x mean of y 
##  100011.9  110376.7

Inference:-We reject the null hypothesis, since the p-value (0.01437) < 0.05. Hence the starting salaries of those aged 27 or younger are significantly lower than the starting salaries of those older than 27.

Regression Analysis

Model 1

# Model 1: salary regressed on the GMAT scores, the two averages, quarter,
# work experience and satisfaction, for the placed respondents.
# NOTE(review): the mbasp$ prefixes are redundant alongside data = mbasp, but
# they are kept so the printed Call and coefficient names stay as recorded.
fit1 <- lm(
  mbasp$salary ~ mbasp$gmat_tot + mbasp$gmat_qpc + mbasp$gmat_vpc +
    mbasp$gmat_tpc + mbasp$s_avg + mbasp$f_avg + mbasp$quarter +
    mbasp$work_yrs + mbasp$satis,
  data = mbasp
)
summary(fit1)
## 
## Call:
## lm(formula = mbasp$salary ~ mbasp$gmat_tot + mbasp$gmat_qpc + 
##     mbasp$gmat_vpc + mbasp$gmat_tpc + mbasp$s_avg + mbasp$f_avg + 
##     mbasp$quarter + mbasp$work_yrs + mbasp$satis, data = mbasp)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -35858  -7483     -6   4104  75509 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    117561.06   49384.33   2.381   0.0193 *  
## mbasp$gmat_tot     58.14     180.07   0.323   0.7475    
## mbasp$gmat_qpc    838.14     504.39   1.662   0.0999 .  
## mbasp$gmat_vpc    533.01     507.55   1.050   0.2964    
## mbasp$gmat_tpc  -1670.82     726.00  -2.301   0.0236 *  
## mbasp$s_avg     -3155.10    8243.61  -0.383   0.7028    
## mbasp$f_avg     -3237.85    3931.51  -0.824   0.4123    
## mbasp$quarter   -3146.53    2756.33  -1.142   0.2566    
## mbasp$work_yrs   2572.46     571.19   4.504 1.94e-05 ***
## mbasp$satis       -68.92    2170.36  -0.032   0.9747    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15780 on 93 degrees of freedom
## Multiple R-squared:  0.2887, Adjusted R-squared:  0.2198 
## F-statistic: 4.194 on 9 and 93 DF,  p-value: 0.0001432

Model 2

# Model 2: the Model 1 predictors plus age.
# NOTE(review): the mbasp$ prefixes are redundant alongside data = mbasp, but
# they are kept so the printed Call and coefficient names stay as recorded.
fit2 <- lm(
  mbasp$salary ~ mbasp$gmat_tot + mbasp$gmat_qpc + mbasp$gmat_vpc +
    mbasp$gmat_tpc + mbasp$s_avg + mbasp$f_avg + mbasp$quarter +
    mbasp$work_yrs + mbasp$satis + mbasp$age,
  data = mbasp
)
summary(fit2)
## 
## Call:
## lm(formula = mbasp$salary ~ mbasp$gmat_tot + mbasp$gmat_qpc + 
##     mbasp$gmat_vpc + mbasp$gmat_tpc + mbasp$s_avg + mbasp$f_avg + 
##     mbasp$quarter + mbasp$work_yrs + mbasp$satis + mbasp$age, 
##     data = mbasp)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -26196  -8241   -324   5297  70000 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    69019.43   52376.30   1.318   0.1909  
## mbasp$gmat_tot    29.52     176.18   0.168   0.8673  
## mbasp$gmat_qpc   813.29     492.44   1.652   0.1020  
## mbasp$gmat_vpc   489.93     495.74   0.988   0.3256  
## mbasp$gmat_tpc -1479.96     713.20  -2.075   0.0408 *
## mbasp$s_avg    -3124.32    8046.45  -0.388   0.6987  
## mbasp$f_avg    -2345.08    3855.93  -0.608   0.5446  
## mbasp$quarter  -2787.20    2694.67  -1.034   0.3037  
## mbasp$work_yrs   360.74    1087.30   0.332   0.7408  
## mbasp$satis     -719.58    2136.17  -0.337   0.7370  
## mbasp$age       2379.27    1004.19   2.369   0.0199 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15410 on 92 degrees of freedom
## Multiple R-squared:  0.3296, Adjusted R-squared:  0.2567 
## F-statistic: 4.523 on 10 and 92 DF,  p-value: 3.341e-05

Model 3

# Model 3: total GMAT score only (no GMAT percentile columns), the two
# averages, quarter, work experience, satisfaction and age.
# NOTE(review): the mbasp$ prefixes are redundant alongside data = mbasp, but
# they are kept so the printed Call and coefficient names stay as recorded.
fit3 <- lm(
  mbasp$salary ~ mbasp$gmat_tot + mbasp$s_avg + mbasp$f_avg +
    mbasp$quarter + mbasp$work_yrs + mbasp$satis + mbasp$age,
  data = mbasp
)
summary(fit3)
## 
## Call:
## lm(formula = mbasp$salary ~ mbasp$gmat_tot + mbasp$s_avg + mbasp$f_avg + 
##     mbasp$quarter + mbasp$work_yrs + mbasp$satis + mbasp$age, 
##     data = mbasp)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -25359  -8539  -1011   5144  80571 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    72607.27   41174.65   1.763   0.0810 .
## mbasp$gmat_tot   -14.81      32.12  -0.461   0.6458  
## mbasp$s_avg    -3736.71    8002.20  -0.467   0.6416  
## mbasp$f_avg     -779.13    3848.67  -0.202   0.8400  
## mbasp$quarter  -2054.06    2662.26  -0.772   0.4423  
## mbasp$work_yrs   242.38    1117.61   0.217   0.8288  
## mbasp$satis    -1702.09    2096.83  -0.812   0.4190  
## mbasp$age       2512.61    1025.40   2.450   0.0161 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15860 on 95 degrees of freedom
## Multiple R-squared:  0.266,  Adjusted R-squared:  0.2119 
## F-statistic: 4.918 on 7 and 95 DF,  p-value: 9.018e-05

Model 4

# Model 4: GMAT scores, the two averages, quarter, satisfaction, age and first
# language (no work_yrs term).
# Stored as fit4 rather than fit2 so that the Model 2 fit is not overwritten.
# NOTE(review): the mbasp$ prefixes are redundant alongside data = mbasp, but
# they are kept so the printed Call and coefficient names stay as recorded.
fit4 <- lm(
  mbasp$salary ~ mbasp$gmat_tot + mbasp$gmat_qpc + mbasp$gmat_vpc +
    mbasp$gmat_tpc + mbasp$s_avg + mbasp$f_avg + mbasp$quarter +
    mbasp$satis + mbasp$age + mbasp$frstlang,
  data = mbasp
)
summary(fit4)
## 
## Call:
## lm(formula = mbasp$salary ~ mbasp$gmat_tot + mbasp$gmat_qpc + 
##     mbasp$gmat_vpc + mbasp$gmat_tpc + mbasp$s_avg + mbasp$f_avg + 
##     mbasp$quarter + mbasp$satis + mbasp$age + mbasp$frstlang, 
##     data = mbasp)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -24137  -8244   -490   5313  68756 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    64622.144  49262.731   1.312   0.1929    
## mbasp$gmat_tot     8.337    177.818   0.047   0.9627    
## mbasp$gmat_qpc   827.849    491.659   1.684   0.0956 .  
## mbasp$gmat_vpc   530.807    498.305   1.065   0.2896    
## mbasp$gmat_tpc -1436.428    711.446  -2.019   0.0464 *  
## mbasp$s_avg    -1805.530   8145.604  -0.222   0.8251    
## mbasp$f_avg    -2741.535   3852.548  -0.712   0.4785    
## mbasp$quarter  -2647.810   2692.668  -0.983   0.3280    
## mbasp$satis     -925.938   2140.124  -0.433   0.6663    
## mbasp$age       2501.003    559.182   4.473  2.2e-05 ***
## mbasp$frstlang  5156.619   6934.452   0.744   0.4590    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15370 on 92 degrees of freedom
## Multiple R-squared:  0.3328, Adjusted R-squared:  0.2603 
## F-statistic: 4.589 on 10 and 92 DF,  p-value: 2.778e-05

Model 5

# Model 5: GMAT scores, the two averages, quarter, satisfaction, work
# experience and first language (no age term).
# Stored as fit5 rather than fit2 so that the Model 2 fit is not overwritten.
# NOTE(review): the mbasp$ prefixes are redundant alongside data = mbasp, but
# they are kept so the printed Call and coefficient names stay as recorded.
fit5 <- lm(
  mbasp$salary ~ mbasp$gmat_tot + mbasp$gmat_qpc + mbasp$gmat_vpc +
    mbasp$gmat_tpc + mbasp$s_avg + mbasp$f_avg + mbasp$quarter +
    mbasp$satis + mbasp$work_yrs + mbasp$frstlang,
  data = mbasp
)
summary(fit5)
## 
## Call:
## lm(formula = mbasp$salary ~ mbasp$gmat_tot + mbasp$gmat_qpc + 
##     mbasp$gmat_vpc + mbasp$gmat_tpc + mbasp$s_avg + mbasp$f_avg + 
##     mbasp$quarter + mbasp$satis + mbasp$work_yrs + mbasp$frstlang, 
##     data = mbasp)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31824  -7739   -135   4626  69927 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    113245.969  48915.660   2.315 0.022830 *  
## mbasp$gmat_tot      5.264    180.698   0.029 0.976821    
## mbasp$gmat_qpc    888.079    499.786   1.777 0.078887 .  
## mbasp$gmat_vpc    637.786    505.676   1.261 0.210406    
## mbasp$gmat_tpc  -1562.324    720.886  -2.167 0.032800 *  
## mbasp$s_avg      -892.329   8257.710  -0.108 0.914183    
## mbasp$f_avg     -3467.223   3891.435  -0.891 0.375260    
## mbasp$quarter   -2785.408   2734.537  -1.019 0.311063    
## mbasp$satis      -600.166   2168.554  -0.277 0.782586    
## mbasp$work_yrs   2353.141    578.903   4.065 0.000101 ***
## mbasp$frstlang  11549.611   6630.292   1.742 0.084860 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15610 on 92 degrees of freedom
## Multiple R-squared:  0.3114, Adjusted R-squared:  0.2365 
## F-statistic:  4.16 on 10 and 92 DF,  p-value: 9.275e-05

Conclusion from regression analysis:- Of the above models, Model 4 fits the data best: its adjusted R-squared is 0.2603 and its multiple R-squared is 0.3328. Age is the most statistically significant variable in that model.

COMPARING THOSE WHO GOT A JOB WITH THOSE WHO DID NOT GET A JOB

# Placement flag: 1 where salary is 0, 2 where salary is non-zero.
mbas$pnp <- ifelse(mbas$salary == 0, 1, 2)
View(mbas)

Chi-Squared Test

Ho:-Gender and placement are independent
H1:-Gender and placement are dependent

# Cross-tabulate sex against the placement flag, show the table with row and
# column totals, then test the two factors for independence.
mbaschi <- xtabs(~ sex + pnp, data = mbas)
addmargins(mbaschi)
##      pnp
## sex     1   2 Sum
##   1    67  72 139
##   2    23  31  54
##   Sum  90 103 193
View(mbaschi)
chisq.test(mbaschi)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mbaschi
## X-squared = 0.29208, df = 1, p-value = 0.5889

Inference:-Since the p-value (0.5889) > 0.05, we fail to reject the null hypothesis. Hence, there is no evidence that placement depends on gender.

Ho:-First language and placement are independent
H1:-First language and placement are dependent

# Cross-tabulate first language against the placement flag, show the table
# with row and column totals, then test the two factors for independence.
mbaschi1 <- xtabs(~ frstlang + pnp, data = mbas)
addmargins(mbaschi1)
##         pnp
## frstlang   1   2 Sum
##      1    82  96 178
##      2     8   7  15
##      Sum  90 103 193
View(mbaschi1)
chisq.test(mbaschi1)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mbaschi1
## X-squared = 0.074127, df = 1, p-value = 0.7854

Inference:-Since the p-value (0.7854) > 0.05, we fail to reject the null hypothesis. Hence, there is no evidence that placement depends on first language.

# cgpa: plain average of s_avg and f_avg.
mbas$cgpa <- with(mbas, (s_avg + f_avg) / 2)
# impr: change from s_avg to f_avg, expressed as a percentage of s_avg.
mbas$impr <- with(mbas, (f_avg - s_avg) / s_avg * 100)

Ho:-The direction of the percent change in cgpa (non-negative or negative) and placement are independent
H1:-The direction of the percent change in cgpa (non-negative or negative) and placement are dependent

# imprpos: 1 where the percent change is zero or positive, 0 where negative.
mbas$imprpos <- ifelse(mbas$impr >= 0, 1, 0)
# Cross-tabulate that indicator against the placement flag, show the table
# with totals, then test the two factors for independence.
mbaschi2 <- xtabs(~ imprpos + pnp, data = mbas)
addmargins(mbaschi2)
##        pnp
## imprpos   1   2 Sum
##     0    31  46  77
##     1    59  57 116
##     Sum  90 103 193
chisq.test(mbaschi2)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mbaschi2
## X-squared = 1.6861, df = 1, p-value = 0.1941

Inference:-Since the p-value (0.1941) > 0.05, we fail to reject the null hypothesis. Hence, there is no evidence that placement depends on the direction of the change in cgpa.

T-test

Ho:-There is no significant difference between the GMAT scores of those placed and those not placed
H1:-There is a significant difference between the GMAT scores of those placed and those not placed

# Two-sided two-sample t-test on total GMAT score: pnp == 1 group versus
# pnp == 2 group.
t.test(
  mbas$gmat_tot[mbas$pnp == 1],
  mbas$gmat_tot[mbas$pnp == 2]
)
## 
##  Welch Two Sample t-test
## 
## data:  mbas$gmat_tot[mbas$pnp == 1] and mbas$gmat_tot[mbas$pnp == 2]
## t = -0.20321, df = 170.77, p-value = 0.8392
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -18.06406  14.69189
## sample estimates:
## mean of x mean of y 
##  614.3333  616.0194

Inference:-Since the p-value (0.8392) > 0.05, we fail to reject Ho; there is no significant difference between the GMAT scores of those placed and those not placed.

Ho:-There is no significant difference between the cgpa of those placed and those not placed
H1:-There is a significant difference between the cgpa of those placed and those not placed

# Two-sided two-sample t-test on cgpa: pnp == 1 group versus pnp == 2 group.
t.test(
  mbas$cgpa[mbas$pnp == 1],
  mbas$cgpa[mbas$pnp == 2]
)
## 
##  Welch Two Sample t-test
## 
## data:  mbas$cgpa[mbas$pnp == 1] and mbas$cgpa[mbas$pnp == 2]
## t = -0.78225, df = 178.77, p-value = 0.4351
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.15787336  0.06823905
## sample estimates:
## mean of x mean of y 
##  3.046833  3.091650

Inference:-Since the p-value (0.4351) > 0.05, we fail to reject Ho; there is no significant difference between the cgpa of those placed and those not placed.