#title: "MBA_Starting_Salaries_Data"
#author: "Ayush Bose"
#date: "January 23, 2018"

#Reading the data set (the path is resolved against the current working directory)
mydata.df <- read.csv("MBA_Starting_Salaries_Data.csv")
#View() opens a GUI data viewer, so only call it when a user is at the console;
#batch runs (Rscript, knitting) skip it instead of failing
if (interactive()) {
  View(mydata.df)
}
#Calculating summary statistics
#NOTE(review): salary and satis appear to hold 998/999 placeholder codes (see
#the Max. of satis and the salary filter further down), so their means in this
#summary are not meaningful - confirm against the data codebook
summary(mydata.df)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0
#Inspecting the structure: one line per column with its type and first values
str(object = mydata.df)
## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : int  2 1 1 1 2 1 1 2 1 1 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: int  1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : int  7 6 6 7 5 6 5 6 4 998 ...
#Univariate distributions: for each variable draw a histogram, then a box plot
invisible(Map(
  function(column, title, axis_label) {
    values <- mydata.df[[column]]
    hist(values,
         main = title,
         xlab = axis_label,
         ylab = "Count",
         breaks = 10,
         col = "peachpuff")
    boxplot(values)
  },
  c("age", "gmat_tot", "work_yrs"),
  c("Visualization of Age",
    "Visualization of GmatTotal",
    "Visualization of Work Experience"),
  c("Age", "gmat_tot", "work_yrs")
))

#salary and satis store placeholder codes next to real measurements:
#  salary - 998/999, which the placement filter later in this script also discards
#  satis  - 998, far above its 3rd quartile of 7
#Left in, these codes distort the histogram bins and the box-plot whiskers, so
#they are dropped before plotting. Rows with salary == 0 are kept because the
#script later treats them as the "did not get a job" group.
#NOTE(review): the meaning of the codes is inferred from the data - confirm
#against the data codebook
salary_reported <- mydata.df$salary[!(mydata.df$salary %in% c(998, 999))]

hist(salary_reported,
     main="Visualization of salary",
     xlab="salary",
     ylab="Count",
     breaks=10,
     col="peachpuff")

boxplot(salary_reported)

satis_reported <- mydata.df$satis[!(mydata.df$satis %in% 998)]

hist(satis_reported,
     main="Visualization of Satisfaction",
     xlab="satis",
     ylab="Count",
     breaks=10,
     col="peachpuff")

boxplot(satis_reported)

#Scatter-plot matrix showing how the selected variables relate pair-wise
pairs(~ age + sex + gmat_tot + s_avg + f_avg + quarter +
        work_yrs + frstlang + salary + satis,
      data = mydata.df,
      cex = 0.6)
#Drawing a Corrgram; Creating a Variance-Covariance Matrix
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.4.3

#Corrgram of every column: shaded cells below the diagonal, pies above it,
#variable names on the diagonal, variables reordered by corrgram (order = TRUE)
corrgram(mydata.df,
         order = TRUE,
         text.panel = panel.txt,
         upper.panel = panel.pie,
         lower.panel = panel.shade,
         main = "Corrgram of Data")

#Variance-covariance matrix of all 13 columns
cov(x = mydata.df)
##                    age           sex      gmat_tot      gmat_qpc
## age       1.376904e+01 -4.513248e-02 -3.115879e+01 -1.192655e+01
## sex      -4.513248e-02  1.872677e-01 -1.328841e+00 -1.053769e+00
## gmat_tot -3.115879e+01 -1.328841e+00  3.310688e+03  6.200233e+02
## gmat_qpc -1.192655e+01 -1.053769e+00  6.200233e+02  2.210731e+02
## gmat_vpc -2.763643e+00  5.463758e-01  7.260006e+02  3.814826e+01
## gmat_tpc -8.839978e+00 -4.908960e-02  6.839911e+02  1.357997e+02
## s_avg     2.116874e-01  2.096227e-02  2.480257e+00 -1.691233e-01
## f_avg    -3.399348e-02  2.082698e-02  3.154688e+00  5.753854e-01
## quarter  -2.045935e-01 -6.414267e-02 -5.891153e+00  6.001979e-01
## work_yrs  1.029494e+01 -1.580172e-02 -3.391634e+01 -1.137186e+01
## frstlang  6.796610e-02  2.138980e-04 -2.499933e+00  6.646346e-01
## salary   -1.183042e+04  1.518264e+03 -1.611600e+05 -3.335823e+04
## satis    -1.763499e+02 -8.780808e+00  1.765263e+03  3.348371e+02
##               gmat_vpc     gmat_tpc         s_avg        f_avg
## age         -2.7636427   -8.8399775    0.21168739  -0.03399348
## sex          0.5463758   -0.0490896    0.02096227   0.02082698
## gmat_tot   726.0006417  683.9910698    2.48025721   3.15468838
## gmat_qpc    38.1482581  135.7996845   -0.16912329   0.57538542
## gmat_vpc   284.2481217  157.4932488    1.31357023   0.67207000
## gmat_tpc   157.4932488  196.6057057    0.62710008   0.58698618
## s_avg        1.3135702    0.6271001    0.14521760   0.11016898
## f_avg        0.6720700    0.5869862    0.11016898   0.27567237
## quarter     -3.2676666   -1.2923719   -0.32237213  -0.26080880
## work_yrs    -3.6181653   -7.8575172    0.15926392  -0.06628700
## frstlang    -2.1145691   -0.4663244   -0.01671372  -0.00626026
## salary   -5273.8523836 3522.7500067 2831.60098580 787.65597177
## satis      392.3562739  484.2466779   -4.62884495   2.12532927
##                quarter      work_yrs      frstlang        salary
## age      -2.045935e-01   10.29493864  6.796610e-02 -1.183042e+04
## sex      -6.414267e-02   -0.01580172  2.138980e-04  1.518264e+03
## gmat_tot -5.891153e+00  -33.91633914 -2.499933e+00 -1.611600e+05
## gmat_qpc  6.001979e-01  -11.37186171  6.646346e-01 -3.335823e+04
## gmat_vpc -3.267667e+00   -3.61816529 -2.114569e+00 -5.273852e+03
## gmat_tpc -1.292372e+00   -7.85751718 -4.663244e-01  3.522750e+03
## s_avg    -3.223721e-01    0.15926392 -1.671372e-02  2.831601e+03
## f_avg    -2.608088e-01   -0.06628700 -6.260260e-03  7.876560e+02
## quarter   1.232119e+00   -0.30866822  3.553381e-02 -9.296214e+03
## work_yrs -3.086682e-01   10.44882490 -2.898318e-02  1.486147e+03
## frstlang  3.553381e-02   -0.02898318  1.035266e-01 -1.419586e+03
## salary   -9.296214e+03 1486.14704152 -1.419586e+03  2.596062e+09
## satis    -5.227133e-03 -131.24080907  9.484532e+00 -6.347115e+06
##                  satis
## age      -1.763499e+02
## sex      -8.780808e+00
## gmat_tot  1.765263e+03
## gmat_qpc  3.348371e+02
## gmat_vpc  3.923563e+02
## gmat_tpc  4.842467e+02
## s_avg    -4.628845e+00
## f_avg     2.125329e+00
## quarter  -5.227133e-03
## work_yrs -1.312408e+02
## frstlang  9.484532e+00
## salary   -6.347115e+06
## satis     1.380974e+05
#var() applied to a whole data frame yields the same matrix as cov() above
var(x = mydata.df)
##                    age           sex      gmat_tot      gmat_qpc
## age       1.376904e+01 -4.513248e-02 -3.115879e+01 -1.192655e+01
## sex      -4.513248e-02  1.872677e-01 -1.328841e+00 -1.053769e+00
## gmat_tot -3.115879e+01 -1.328841e+00  3.310688e+03  6.200233e+02
## gmat_qpc -1.192655e+01 -1.053769e+00  6.200233e+02  2.210731e+02
## gmat_vpc -2.763643e+00  5.463758e-01  7.260006e+02  3.814826e+01
## gmat_tpc -8.839978e+00 -4.908960e-02  6.839911e+02  1.357997e+02
## s_avg     2.116874e-01  2.096227e-02  2.480257e+00 -1.691233e-01
## f_avg    -3.399348e-02  2.082698e-02  3.154688e+00  5.753854e-01
## quarter  -2.045935e-01 -6.414267e-02 -5.891153e+00  6.001979e-01
## work_yrs  1.029494e+01 -1.580172e-02 -3.391634e+01 -1.137186e+01
## frstlang  6.796610e-02  2.138980e-04 -2.499933e+00  6.646346e-01
## salary   -1.183042e+04  1.518264e+03 -1.611600e+05 -3.335823e+04
## satis    -1.763499e+02 -8.780808e+00  1.765263e+03  3.348371e+02
##               gmat_vpc     gmat_tpc         s_avg        f_avg
## age         -2.7636427   -8.8399775    0.21168739  -0.03399348
## sex          0.5463758   -0.0490896    0.02096227   0.02082698
## gmat_tot   726.0006417  683.9910698    2.48025721   3.15468838
## gmat_qpc    38.1482581  135.7996845   -0.16912329   0.57538542
## gmat_vpc   284.2481217  157.4932488    1.31357023   0.67207000
## gmat_tpc   157.4932488  196.6057057    0.62710008   0.58698618
## s_avg        1.3135702    0.6271001    0.14521760   0.11016898
## f_avg        0.6720700    0.5869862    0.11016898   0.27567237
## quarter     -3.2676666   -1.2923719   -0.32237213  -0.26080880
## work_yrs    -3.6181653   -7.8575172    0.15926392  -0.06628700
## frstlang    -2.1145691   -0.4663244   -0.01671372  -0.00626026
## salary   -5273.8523836 3522.7500067 2831.60098580 787.65597177
## satis      392.3562739  484.2466779   -4.62884495   2.12532927
##                quarter      work_yrs      frstlang        salary
## age      -2.045935e-01   10.29493864  6.796610e-02 -1.183042e+04
## sex      -6.414267e-02   -0.01580172  2.138980e-04  1.518264e+03
## gmat_tot -5.891153e+00  -33.91633914 -2.499933e+00 -1.611600e+05
## gmat_qpc  6.001979e-01  -11.37186171  6.646346e-01 -3.335823e+04
## gmat_vpc -3.267667e+00   -3.61816529 -2.114569e+00 -5.273852e+03
## gmat_tpc -1.292372e+00   -7.85751718 -4.663244e-01  3.522750e+03
## s_avg    -3.223721e-01    0.15926392 -1.671372e-02  2.831601e+03
## f_avg    -2.608088e-01   -0.06628700 -6.260260e-03  7.876560e+02
## quarter   1.232119e+00   -0.30866822  3.553381e-02 -9.296214e+03
## work_yrs -3.086682e-01   10.44882490 -2.898318e-02  1.486147e+03
## frstlang  3.553381e-02   -0.02898318  1.035266e-01 -1.419586e+03
## salary   -9.296214e+03 1486.14704152 -1.419586e+03  2.596062e+09
## satis    -5.227133e-03 -131.24080907  9.484532e+00 -6.347115e+06
##                  satis
## age      -1.763499e+02
## sex      -8.780808e+00
## gmat_tot  1.765263e+03
## gmat_qpc  3.348371e+02
## gmat_vpc  3.923563e+02
## gmat_tpc  4.842467e+02
## s_avg    -4.628845e+00
## f_avg     2.125329e+00
## quarter  -5.227133e-03
## work_yrs -1.312408e+02
## frstlang  9.484532e+00
## salary   -6.347115e+06
## satis     1.380974e+05
#Taking a subset of the dataset consisting of only those people who actually got a job:
#a positive salary that is not one of the 998/999 placeholder codes.
placed.df <- subset(mydata.df, salary > 0 & !(salary %in% c(998, 999)))
#View() needs a GUI data viewer; skip it in batch runs (Rscript, knitting)
if (interactive()) {
  View(placed.df)
}

#Further analysis on placed.df
#Consider y = f(x), where y = starting salary. Let us analyse the impact of x = {gender; first language; prior work experience; GMAT performance; MBA performance} on y.

#Contingency tables: head-count of placed students by sex, then by first language
mytable <- table(sex = placed.df$sex)
mytable
## sex
##  1  2 
## 72 31
mytable <- table(frstlang = placed.df$frstlang)
mytable
## frstlang
##  1  2 
## 96  7
#Mean starting salary within each sex group
aggregate(salary ~ sex, data = placed.df, FUN = mean)
##   sex    salary
## 1   1 104970.97
## 2   2  98524.39
#Mean starting salary within each first-language group
aggregate(salary ~ frstlang, data = placed.df, FUN = mean)
##   frstlang   salary
## 1        1 101748.6
## 2        2 120614.3
#Mean starting salary for every observed (s_avg, f_avg) combination
aggregate(salary ~ s_avg + f_avg, data = placed.df, FUN = mean)
##    s_avg f_avg    salary
## 1   4.00  0.00 146000.00
## 2   2.20  2.00 105000.00
## 3   2.40  2.00  85000.00
## 4   2.40  2.25  90000.00
## 5   2.30  2.50  98000.00
## 6   2.50  2.50  96000.00
## 7   2.60  2.50 107700.00
## 8   2.70  2.50  90000.00
## 9   3.50  2.67  86000.00
## 10  2.40  2.75  99500.00
## 11  2.50  2.75 220000.00
## 12  2.60  2.75 114155.00
## 13  2.70  2.75  90750.00
## 14  2.80  2.75 104833.33
## 15  2.90  2.75  91085.33
## 16  3.00  2.75  97250.00
## 17  2.91  2.83 105000.00
## 18  2.50  3.00  77000.00
## 19  2.60  3.00 100000.00
## 20  2.70  3.00  98000.00
## 21  2.80  3.00  99700.00
## 22  2.90  3.00 101400.00
## 23  3.00  3.00 105000.00
## 24  3.09  3.00 100000.00
## 25  3.10  3.00 112450.00
## 26  3.20  3.00 109000.00
## 27  3.30  3.00 105000.00
## 28  3.40  3.00 100000.00
## 29  3.50  3.00 113000.00
## 30  2.80  3.25  98000.00
## 31  2.90  3.25  93000.00
## 32  3.00  3.25 107500.00
## 33  3.20  3.25 105166.67
## 34  3.27  3.25  95000.00
## 35  3.30  3.25 101416.67
## 36  3.40  3.25  90000.00
## 37  3.50  3.25  97333.33
## 38  3.10  3.33  82000.00
## 39  2.90  3.50 107300.00
## 40  3.09  3.50 107000.00
## 41  3.10  3.50  96500.00
## 42  3.20  3.50  95000.00
## 43  3.30  3.50  95750.00
## 44  3.45  3.50 105000.00
## 45  3.50  3.50 111500.00
## 46  3.60  3.50 110500.00
## 47  3.80  3.50 105000.00
## 48  3.50  3.60  85000.00
## 49  3.70  3.60 106000.00
## 50  3.40  3.67 100000.00
## 51  3.60  3.67  95000.00
## 52  3.40  3.75  93000.00
## 53  3.50  3.75  85000.00
## 54  3.60  3.75 162000.00
## 55  3.70  4.00 115000.00
## 56  3.80  4.00 120000.00
#Mean starting salary at each distinct GMAT total score
aggregate(salary ~ gmat_tot, data = placed.df, FUN = mean)
##    gmat_tot   salary
## 1       500 158250.0
## 2       520  78256.0
## 3       530  99500.0
## 4       540 104000.0
## 5       550 112236.7
## 6       560  94000.0
## 7       570 103857.1
## 8       580  99875.0
## 9       590  97000.0
## 10      600 107666.7
## 11      610  96200.0
## 12      620 104108.3
## 13      630 105812.5
## 14      640 110000.0
## 15      650 101285.7
## 16      660  92480.0
## 17      670 100642.9
## 18      680 102166.7
## 19      700 122333.3
## 20      710 101250.0
## 21      720  85000.0
#Running t-tests
#Welch two-sample t-test: does mean starting salary differ between the two sex groups?
t.test(formula = salary ~ sex, data = placed.df)
## 
##  Welch Two Sample t-test
## 
## data:  salary by sex
## t = 1.3628, df = 38.115, p-value = 0.1809
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3128.55 16021.72
## sample estimates:
## mean in group 1 mean in group 2 
##       104970.97        98524.39
#Welch two-sample t-test: does mean starting salary differ by first language?
t.test(formula = salary ~ frstlang, data = placed.df)
## 
##  Welch Two Sample t-test
## 
## data:  salary by frstlang
## t = -1.1202, df = 6.0863, p-value = 0.3049
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -59933.62  22202.25
## sample estimates:
## mean in group 1 mean in group 2 
##        101748.6        120614.3
#Regression Model 1: starting salary regressed on work_yrs, gmat_tot, s_avg and f_avg
fit <- lm(
  formula = salary ~ work_yrs + gmat_tot + s_avg + f_avg,
  data = placed.df
)
summary(fit)
## 
## Call:
## lm(formula = salary ~ work_yrs + gmat_tot + s_avg + f_avg, data = placed.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -36351  -8173  -1170   3864  87090 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 98635.48   22643.60   4.356 3.26e-05 ***
## work_yrs     2579.88     577.97   4.464 2.16e-05 ***
## gmat_tot      -14.98      32.53  -0.460    0.646    
## s_avg        2422.16    5033.78   0.481    0.631    
## f_avg       -1087.60    3889.90  -0.280    0.780    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16200 on 98 degrees of freedom
## Multiple R-squared:  0.2098, Adjusted R-squared:  0.1776 
## F-statistic: 6.506 on 4 and 98 DF,  p-value: 0.0001098
#Regression Model 2: starting salary regressed on work_yrs, age, s_avg and frstlang
#(this assignment replaces the Model 1 object stored in fit)
fit <- lm(
  formula = salary ~ work_yrs + age + s_avg + frstlang,
  data = placed.df
)
summary(fit)
## 
## Call:
## lm(formula = salary ~ work_yrs + age + s_avg + frstlang, data = placed.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -32957  -9005  -1362   4613  76947 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  34470.6    26189.2   1.316   0.1912  
## work_yrs       746.2     1121.1   0.666   0.5072  
## age           1833.3     1085.7   1.689   0.0945 .
## s_avg         2207.1     4233.5   0.521   0.6033  
## frstlang      9270.9     6894.3   1.345   0.1818  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15630 on 98 degrees of freedom
## Multiple R-squared:  0.2647, Adjusted R-squared:  0.2347 
## F-statistic: 8.818 on 4 and 98 DF,  p-value: 4.008e-06
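#Model 2 is the better model: it has the higher adjusted R-squared (0.2347 vs 0.1776)
#and the lower residual standard error (15630 vs 16200), although none of its
#individual coefficients is significant at the 5% level.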
#Comparing the people who did not get a job (salary == 0) with those who did.
#Here we are not analysing what drives a higher salary; we are only analysing
#how the two groups differ.
notplaced.df <- subset(mydata.df, salary==0)
#View() needs a GUI data viewer; skip it in batch runs (Rscript, knitting)
if (interactive()) {
  View(notplaced.df)
}
#Placed vs not placed: Welch t-test on GMAT total between the two groups.
#This is the comparison described above; the formula test that follows only
#looks inside the not-placed group.
t.test(x = placed.df$gmat_tot, y = notplaced.df$gmat_tot)
#Within the not-placed group only: GMAT total by first language
t.test(gmat_tot ~ frstlang, data=notplaced.df)
## 
##  Welch Two Sample t-test
## 
## data:  gmat_tot by frstlang
## t = 0.51644, df = 7.9236, p-value = 0.6197
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -49.86769  78.58720
## sample estimates:
## mean in group 1 mean in group 2 
##        615.6098        601.2500