# DATASET ANALYSIS:

# Reading the dataset.
# NOTE(review): the working directory below is hard-coded to one machine;
# adjust it (or run the script from the project folder) before executing.

setwd("C:/Users/Dixit/iim_internship/week_4/project")
# The first row of the CSV holds the column names. `header` is spelled out in
# full instead of relying on partial matching of the abbreviated `head`.
MBASalarydata <- read.csv(
  file = "MBA_Starting_Salaries_Data.csv",
  header = TRUE,
  sep = ","
)

# Keep only the rows with a usable salary: drop the placeholder codes 998 and
# 999 (missing or unanswered) and the zero salaries (unplaced students).
MBAdataClean <- MBASalarydata[
  with(MBASalarydata, which(salary != 998 & salary != 999 & salary != 0)),
]
# Per-column summary of the raw MBA salaries dataset. The 998/999 placeholder
# codes and the zero salaries are still present in this data frame.
summary(MBASalarydata)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0
# Per-column summary of the cleaned subset, i.e. without the rows whose salary
# is 998, 999 (missing or unanswered) or 0 (unplaced students).
summary(MBAdataClean)
##       age             sex           gmat_tot      gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :500   Min.   :39.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580   1st Qu.:72.00  
##  Median :26.00   Median :1.000   Median :620   Median :82.00  
##  Mean   :26.78   Mean   :1.301   Mean   :616   Mean   :79.73  
##  3rd Qu.:28.00   3rd Qu.:2.000   3rd Qu.:655   3rd Qu.:89.00  
##  Max.   :40.00   Max.   :2.000   Max.   :720   Max.   :99.00  
##     gmat_vpc        gmat_tpc         s_avg           f_avg      
##  Min.   :30.00   Min.   :51.00   Min.   :2.200   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.00   1st Qu.:2.850   1st Qu.:2.915  
##  Median :81.00   Median :87.00   Median :3.100   Median :3.250  
##  Mean   :78.56   Mean   :84.52   Mean   :3.092   Mean   :3.091  
##  3rd Qu.:92.00   3rd Qu.:93.50   3rd Qu.:3.400   3rd Qu.:3.415  
##  Max.   :99.00   Max.   :99.00   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs        frstlang         salary      
##  Min.   :1.000   Min.   : 0.00   Min.   :1.000   Min.   : 64000  
##  1st Qu.:1.000   1st Qu.: 2.00   1st Qu.:1.000   1st Qu.: 95000  
##  Median :2.000   Median : 3.00   Median :1.000   Median :100000  
##  Mean   :2.262   Mean   : 3.68   Mean   :1.068   Mean   :103031  
##  3rd Qu.:3.000   3rd Qu.: 4.00   3rd Qu.:1.000   3rd Qu.:106000  
##  Max.   :4.000   Max.   :16.00   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :3.000  
##  1st Qu.:5.000  
##  Median :6.000  
##  Mean   :5.883  
##  3rd Qu.:6.000  
##  Max.   :7.000
# Effect of Gender on salary.

library(lattice)

# Box plot of salary for each value of `sex`.
boxplot(
  salary ~ sex,
  data = MBAdataClean,
  main = "Effect of gender on salary.",
  xlab = "Sex",
  ylab = "salary"
)

library(car)

# Scatterplot of salary against `sex`, drawn with solid points (pch = 19).
# NOTE(review): `spread` and `smoother.args` look like car 2.x-era arguments;
# confirm they are still honoured by the installed car version.
scatterplot(
  MBAdataClean$salary ~ MBAdataClean$sex,
  spread = FALSE,
  smoother.args = list(lty = 2),
  pch = 19,
  main = "Scatterplot of Salary vs. sex ",
  xlab = "sex",
  ylab = "salary"
)

# There is not much effect of gender on salary.
# Effect of Age on salary.

library(lattice)

# Box plot of salary for each distinct age.
boxplot(
  salary ~ age,
  data = MBAdataClean,
  main = "Effect of Age on salary.",
  xlab = "Age",
  ylab = "salary"
)

library(car)

# Scatterplot of salary against age, drawn with solid points (pch = 19).
# NOTE(review): `spread` and `smoother.args` look like car 2.x-era arguments;
# confirm they are still honoured by the installed car version.
scatterplot(
  MBAdataClean$salary ~ MBAdataClean$age,
  spread = FALSE,
  smoother.args = list(lty = 2),
  pch = 19,
  main = "Scatterplot of Salary vs. Age ",
  xlab = "Age",
  ylab = "salary"
)

# we can see a significant correlation between age and salary.
# Effect of first Language on salary.
library(lattice)

# Box plot of salary for each value of `frstlang`.
boxplot(
  salary ~ frstlang,
  data = MBAdataClean,
  main = "Effect of first Language on salary.",
  xlab = "First Language",
  ylab = "salary"
)

library(car)

# Scatterplot of salary against `frstlang`, drawn with solid points (pch = 19).
# NOTE(review): `spread` and `smoother.args` look like car 2.x-era arguments;
# confirm they are still honoured by the installed car version.
scatterplot(
  MBAdataClean$salary ~ MBAdataClean$frstlang,
  spread = FALSE,
  smoother.args = list(lty = 2),
  pch = 19,
  main = "Scatterplot of Salary vs. First language",
  xlab = "First Language",
  ylab = "salary"
)

# There is no significant correlation between the first language and the salary.
# Effect of work years on Salary
library(car)

# Scatterplot of salary against `work_yrs`, drawn with solid points (pch = 19).
# NOTE(review): `spread` and `smoother.args` look like car 2.x-era arguments;
# confirm they are still honoured by the installed car version.
scatterplot(
  MBAdataClean$salary ~ MBAdataClean$work_yrs,
  spread = FALSE,
  smoother.args = list(lty = 2),
  pch = 19,
  main = "Scatterplot of Salary vs. work years ",
  xlab = "Work years",
  ylab = "salary"
)

# There is a moderate positive correlation between the number of work years and the salary.
# Effect of satisfaction on salary.
library(lattice)

# Box plot of salary for each value of `satis`.
boxplot(
  salary ~ satis,
  data = MBAdataClean,
  main = "Effect of satisfaction on salary.",
  xlab = "Satisfaction",
  ylab = "salary"
)

library(car)

# Scatterplot of salary against `satis`, drawn with solid points (pch = 19).
# NOTE(review): `spread` and `smoother.args` look like car 2.x-era arguments;
# confirm they are still honoured by the installed car version.
scatterplot(
  MBAdataClean$salary ~ MBAdataClean$satis,
  spread = FALSE,
  smoother.args = list(lty = 2),
  pch = 19,
  main = "Scatterplot of Salary vs. Satisfaction",
  xlab = "Satisfaction",
  ylab = "salary"
)

# There is no significant correlation between satisfaction and salary.
# Effect of GMAT on Salary.
library(car)

# Pairwise scatterplot matrix of salary and the four gmat_* columns of the
# cleaned data.
scatterplotMatrix(
  formula = ~ salary + gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc,
  data = MBAdataClean,
  cex = 0.6,
  main = " Effect of GMAT on Salary."
)

# Draw a Corrgram

library("corrgram") 
## Warning: replacing previous import by 'magrittr::%>%' when loading
## 'dendextend'
# Corrgram of every column of the cleaned data. Variables keep their original
# order (order = FALSE); the lower panels use panel.shade, the upper panels
# use panel.pie and the text panels use panel.txt.
corrgram(
  MBAdataClean,
  order = FALSE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of MBA data variables"
)

# Variance-covariance matrix across all columns of the cleaned data; the bare
# name on the following line prints it.
covMBAdataclean <- cov(x = MBAdataClean)
covMBAdataclean
##                    age           sex      gmat_tot      gmat_qpc
## age         10.7045498 -2.164477e-01 -1.305445e+01   -7.22796497
## sex         -0.2164477  2.124500e-01 -4.568818e-01   -0.90757662
## gmat_tot   -13.0544451 -4.568818e-01  2.569294e+03  452.14258519
## gmat_qpc    -7.2279650 -9.075766e-01  4.521426e+02  179.18027794
## gmat_vpc     0.9505045  3.974872e-01  6.386360e+02   20.45849990
## gmat_tpc    -3.4602132 -2.377689e-01  5.393623e+02   97.03607462
## s_avg        0.1938587  1.409575e-02  3.299562e+00    0.07838473
## f_avg       -0.3462517  3.725395e-02  3.027432e+00    0.64252142
## quarter     -0.4604988 -1.104131e-02 -6.005140e+00    0.18960594
## work_yrs     8.6728536 -1.281173e-01 -1.873882e+01   -7.36245955
## frstlang     0.2898344  8.756901e-03 -1.687607e+00    0.04806777
## salary   29210.5193223 -1.369577e+03 -8.212449e+04 3382.43784504
## satis        0.2776509 -3.321911e-02  2.570912e+00   -0.04178565
##               gmat_vpc      gmat_tpc        s_avg         f_avg
## age       9.505045e-01 -3.460213e+00   0.19385875 -3.462517e-01
## sex       3.974872e-01 -2.377689e-01   0.01409575  3.725395e-02
## gmat_tot  6.386360e+02  5.393623e+02   3.29956215  3.027432e+00
## gmat_qpc  2.045850e+01  9.703607e+01   0.07838473  6.425214e-01
## gmat_vpc  2.606602e+02  1.393882e+02   0.96945936  1.803303e-01
## gmat_tpc  1.393882e+02  1.211342e+02   0.58062916  3.785056e-01
## s_avg     9.694594e-01  5.806292e-01   0.14325138  8.231046e-02
## f_avg     1.803303e-01  3.785056e-01   0.08231046  2.378638e-01
## quarter  -2.325528e+00 -1.227013e+00  -0.35620503 -2.356492e-01
## work_yrs -1.366838e+00 -4.389206e+00   0.18604797 -3.176271e-01
## frstlang -8.915858e-01 -4.575481e-01  -0.01319912 -6.243099e-03
## salary   -3.964803e+04 -2.596339e+04 688.02042071 -9.241129e+02
## satis     1.879973e+00  1.002856e+00  -0.04256901 -4.498382e-02
##                quarter      work_yrs      frstlang        salary
## age      -4.604988e-01     8.6728536  2.898344e-01  2.921052e+04
## sex      -1.104131e-02    -0.1281173  8.756901e-03 -1.369577e+03
## gmat_tot -6.005140e+00   -18.7388159 -1.687607e+00 -8.212449e+04
## gmat_qpc  1.896059e-01    -7.3624595  4.806777e-02  3.382438e+03
## gmat_vpc -2.325528e+00    -1.3668380 -8.915858e-01 -3.964803e+04
## gmat_tpc -1.227013e+00    -4.3892062 -4.575481e-01 -2.596339e+04
## s_avg    -3.562050e-01     0.1860480 -1.319912e-02  6.880204e+02
## f_avg    -2.356492e-01    -0.3176271 -6.243099e-03 -9.241129e+02
## quarter   1.254140e+00    -0.4347992  3.102989e-02 -2.571117e+03
## work_yrs -4.347992e-01     9.0630116  1.494384e-01  2.445820e+04
## frstlang  3.102989e-02     0.1494384  6.396345e-02  1.206714e+03
## salary   -2.571117e+03 24458.1995050  1.206714e+03  3.192940e+08
## satis     1.975062e-01     0.1485818  1.779935e-02 -5.606583e+02
##                  satis
## age         0.27765087
## sex        -0.03321911
## gmat_tot    2.57091186
## gmat_qpc   -0.04178565
## gmat_vpc    1.87997335
## gmat_tpc    1.00285551
## s_avg      -0.04256901
## f_avg      -0.04498382
## quarter     0.19750619
## work_yrs    0.14858176
## frstlang    0.01779935
## salary   -560.65829050
## satis       0.61374453
# Rescale the covariance matrix into the matching correlation matrix; the bare
# name on the following line prints it.
covCorrMBAdataclean <- cov2cor(V = covMBAdataclean)
covCorrMBAdataclean
##                  age         sex    gmat_tot     gmat_qpc    gmat_vpc
## age       1.00000000 -0.14352927 -0.07871678 -0.165039057  0.01799420
## sex      -0.14352927  1.00000000 -0.01955548 -0.147099027  0.05341428
## gmat_tot -0.07871678 -0.01955548  1.00000000  0.666382266  0.78038546
## gmat_qpc -0.16503906 -0.14709903  0.66638227  1.000000000  0.09466541
## gmat_vpc  0.01799420  0.05341428  0.78038546  0.094665411  1.00000000
## gmat_tpc -0.09609156 -0.04686981  0.96680810  0.658650025  0.78443167
## s_avg     0.15654954  0.08079985  0.17198874  0.015471662  0.15865101
## f_avg    -0.21699191  0.16572186  0.12246257  0.098418869  0.02290167
## quarter  -0.12568145 -0.02139041 -0.10578964  0.012648346 -0.12862079
## work_yrs  0.88052470 -0.09233003 -0.12280018 -0.182701263 -0.02812182
## frstlang  0.35026743  0.07512009 -0.13164323  0.014198516 -0.21835333
## salary    0.49964284 -0.16628869 -0.09067141  0.014141299 -0.13743230
## satis     0.10832308 -0.09199534  0.06474206 -0.003984632  0.14863481
##             gmat_tpc       s_avg       f_avg     quarter    work_yrs
## age      -0.09609156  0.15654954 -0.21699191 -0.12568145  0.88052470
## sex      -0.04686981  0.08079985  0.16572186 -0.02139041 -0.09233003
## gmat_tot  0.96680810  0.17198874  0.12246257 -0.10578964 -0.12280018
## gmat_qpc  0.65865003  0.01547166  0.09841887  0.01264835 -0.18270126
## gmat_vpc  0.78443167  0.15865101  0.02290167 -0.12862079 -0.02812182
## gmat_tpc  1.00000000  0.13938500  0.07051391 -0.09955033 -0.13246963
## s_avg     0.13938500  1.00000000  0.44590413 -0.84038355  0.16328236
## f_avg     0.07051391  0.44590413  1.00000000 -0.43144819 -0.21633018
## quarter  -0.09955033 -0.84038355 -0.43144819  1.00000000 -0.12896722
## work_yrs -0.13246963  0.16328236 -0.21633018 -0.12896722  1.00000000
## frstlang -0.16437561 -0.13788905 -0.05061394  0.10955726  0.19627277
## salary   -0.13201783  0.10173175 -0.10603897 -0.12848526  0.45466634
## satis     0.11630842 -0.14356557 -0.11773304  0.22511985  0.06299926
##             frstlang      salary        satis
## age       0.35026743  0.49964284  0.108323083
## sex       0.07512009 -0.16628869 -0.091995338
## gmat_tot -0.13164323 -0.09067141  0.064742057
## gmat_qpc  0.01419852  0.01414130 -0.003984632
## gmat_vpc -0.21835333 -0.13743230  0.148634805
## gmat_tpc -0.16437561 -0.13201783  0.116308417
## s_avg    -0.13788905  0.10173175 -0.143565573
## f_avg    -0.05061394 -0.10603897 -0.117733043
## quarter   0.10955726 -0.12848526  0.225119851
## work_yrs  0.19627277  0.45466634  0.062999256
## frstlang  1.00000000  0.26701953  0.089834769
## salary    0.26701953  1.00000000 -0.040050600
## satis     0.08983477 -0.04005060  1.000000000
# Null Hypothesis- "The salary does not depend on First Language."
# Two-sample t-test comparing the mean salary of the two first-language groups.

t.test(MBAdataClean$salary ~ MBAdataClean$frstlang)
## 
##  Welch Two Sample t-test
## 
## data:  MBAdataClean$salary by MBAdataClean$frstlang
## t = -1.1202, df = 6.0863, p-value = 0.3049
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -59933.62  22202.25
## sample estimates:
## mean in group 1 mean in group 2 
##        101748.6        120614.3
#  As p-value > 0.05, we fail to reject the null hypothesis: there is no statistically significant difference between the starting salaries of MBAs whose first language is English and those of the others, so the data give no evidence that salary depends on first language.
## TASK 2b: WHO GOT HOW MUCH SALARY?

# Null Hypothesis- "The salary does not depend on Gender."
# Two-sample t-test comparing the mean salary of the two gender groups.

t.test(MBAdataClean$salary ~ MBAdataClean$sex)
## 
##  Welch Two Sample t-test
## 
## data:  MBAdataClean$salary by MBAdataClean$sex
## t = 1.3628, df = 38.115, p-value = 0.1809
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3128.55 16021.72
## sample estimates:
## mean in group 1 mean in group 2 
##       104970.97        98524.39
# As p-value > 0.05, we fail to reject the null hypothesis: there is no statistically significant difference between the starting salaries of male and female MBAs, so the data give no evidence that salary depends on gender.
## Regression models that predict starting salary.

# Model 1: linear model of salary on age, the four gmat_* columns, quarter,
# s_avg, f_avg, satis and frstlang (sex and work_yrs are left out).
Model1 <- lm(
  salary ~ age + gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc + quarter + s_avg +
    f_avg + satis + frstlang,
  data = MBAdataClean
)
summary(Model1)
## 
## Call:
## lm(formula = salary ~ age + gmat_tot + gmat_qpc + gmat_vpc + 
##     gmat_tpc + quarter + s_avg + f_avg + satis + frstlang, data = MBAdataClean)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -24137  -8244   -490   5313  68756 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 64622.144  49262.731   1.312   0.1929    
## age          2501.003    559.182   4.473  2.2e-05 ***
## gmat_tot        8.337    177.818   0.047   0.9627    
## gmat_qpc      827.849    491.659   1.684   0.0956 .  
## gmat_vpc      530.807    498.305   1.065   0.2896    
## gmat_tpc    -1436.428    711.446  -2.019   0.0464 *  
## quarter     -2647.810   2692.668  -0.983   0.3280    
## s_avg       -1805.530   8145.604  -0.222   0.8251    
## f_avg       -2741.535   3852.548  -0.712   0.4785    
## satis        -925.938   2140.124  -0.433   0.6663    
## frstlang     5156.619   6934.452   0.744   0.4590    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15370 on 92 degrees of freedom
## Multiple R-squared:  0.3328, Adjusted R-squared:  0.2603 
## F-statistic: 4.589 on 10 and 92 DF,  p-value: 2.778e-05
# Model 2: a smaller linear model of salary on age, gmat_tot, quarter, satis
# and frstlang.
Model2 <- lm(
  salary ~ age + gmat_tot + quarter + satis + frstlang,
  data = MBAdataClean
)
summary(Model2)
## 
## Call:
## lm(formula = salary ~ age + gmat_tot + quarter + satis + frstlang, 
##     data = MBAdataClean)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -28366  -9128   -892   5055  76836 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 50088.40   25880.10   1.935   0.0559 .  
## age          2487.04     517.40   4.807  5.6e-06 ***
## gmat_tot      -14.65      31.11  -0.471   0.6386    
## quarter     -1119.24    1462.40  -0.765   0.4459    
## satis       -1856.87    2056.96  -0.903   0.3689    
## frstlang     8269.35    6644.88   1.244   0.2163    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15600 on 97 degrees of freedom
## Multiple R-squared:  0.2748, Adjusted R-squared:  0.2374 
## F-statistic: 7.352 on 5 and 97 DF,  p-value: 7.014e-06
# Model 3: linear model of salary on age, the four gmat_* columns, satis and
# frstlang.
Model3 <- lm(
  salary ~ age + gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc + satis + frstlang,
  data = MBAdataClean
)
summary(Model3)
## 
## Call:
## lm(formula = salary ~ age + gmat_tot + gmat_qpc + gmat_vpc + 
##     gmat_tpc + satis + frstlang, data = MBAdataClean)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -27442  -9074    -26   5449  65805 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 51877.83   47122.04   1.101   0.2737    
## age          2720.36     507.19   5.364 5.73e-07 ***
## gmat_tot      -27.96     162.79  -0.172   0.8640    
## gmat_qpc      841.99     471.63   1.785   0.0774 .  
## gmat_vpc      567.02     477.05   1.189   0.2376    
## gmat_tpc    -1309.36     699.44  -1.872   0.0643 .  
## satis       -1688.22    2036.28  -0.829   0.4091    
## frstlang     4176.03    6703.97   0.623   0.5348    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15270 on 95 degrees of freedom
## Multiple R-squared:  0.3198, Adjusted R-squared:  0.2697 
## F-statistic:  6.38 on 7 and 95 DF,  p-value: 3.716e-06
# Model 4: the Model 3 predictors (age, the four gmat_* columns, satis,
# frstlang) with quarter added.
Model4 <- lm(
  salary ~ age + gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc + satis + quarter +
    frstlang,
  data = MBAdataClean
)
summary(Model4)
## 
## Call:
## lm(formula = salary ~ age + gmat_tot + gmat_qpc + gmat_vpc + 
##     gmat_tpc + satis + quarter + frstlang, data = MBAdataClean)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -25674  -9624   -484   5419  68151 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 57913.60   47398.27   1.222   0.2248    
## age          2613.36     516.08   5.064 2.04e-06 ***
## gmat_tot      -36.54     162.81  -0.224   0.8229    
## gmat_qpc      889.73     473.18   1.880   0.0632 .  
## gmat_vpc      602.10     477.65   1.261   0.2106    
## gmat_tpc    -1370.02     700.94  -1.955   0.0536 .  
## satis       -1128.93    2097.77  -0.538   0.5917    
## quarter     -1571.32    1439.53  -1.092   0.2778    
## frstlang     5060.24    6746.03   0.750   0.4551    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15260 on 94 degrees of freedom
## Multiple R-squared:  0.3283, Adjusted R-squared:  0.2711 
## F-statistic: 5.743 on 8 and 94 DF,  p-value: 5.953e-06
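# Judging by the adjusted R-squared (0.2711, the highest of the four), the fourth model looks the most accurate; Model 1 has the highest multiple R-squared (0.3328) but needs more predictors to reach it.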
## Comparison of placed and unplaced students.

# Placed and unplaced students together: only the 998/999 placeholder salary
# codes are dropped. `placed` is 0 where the salary is 0 and 1 otherwise.
MBAdataFull <- MBASalarydata[
  with(MBASalarydata, which(salary != 998 & salary != 999)),
]
MBAdataFull$placed <- as.numeric(MBAdataFull$salary != 0)
# Null hypothesis (H0): first language and placement are independent.
# Contingency table of frstlang against placed, printed with row and column
# totals; it is the input of the chi-squared test that follows.
mbadataschi1 <- xtabs(formula = ~ frstlang + placed, data = MBAdataFull)
addmargins(mbadataschi1)
##         placed
## frstlang   0   1 Sum
##      1    82  96 178
##      2     8   7  15
##      Sum  90 103 193
# Pearson chi-squared test of independence on the frstlang x placed table.
chisq.test(mbadataschi1)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mbadataschi1
## X-squared = 0.074127, df = 1, p-value = 0.7854
# As p > 0.05, we fail to reject the null hypothesis: there is no evidence that first language and placement are related.
# Null hypothesis (H0): gender and placement are independent.
# Contingency table of sex against placed, printed with row and column totals;
# it is the input of the chi-squared test that follows.
mbadataschi <- xtabs(formula = ~ sex + placed, data = MBAdataFull)
addmargins(mbadataschi)
##      placed
## sex     0   1 Sum
##   1    67  72 139
##   2    23  31  54
##   Sum  90 103 193
# Pearson chi-squared test of independence on the sex x placed table.
chisq.test(mbadataschi)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mbadataschi
## X-squared = 0.29208, df = 1, p-value = 0.5889
# As p > 0.05, we fail to reject the null hypothesis: there is no evidence that gender and placement are related.