# Load the MBA starting-salaries survey into `mba.df`.
# The data directory is kept in one constant and joined to the file name with
# file.path(), so the script does not change the session's working directory
# as a side effect (setwd() silently alters how every later relative path in
# the same session is resolved).
data_dir <- "C:/Users/Dell/Downloads/Sameer Mathur"
mba.df <- read.csv(file.path(data_dir, "MBA Starting Salaries.csv"))
View(mba.df)

# Summarizing every variable of the dataset.

# Per-column summary (min, quartiles, mean, max) of the raw survey data.
# NOTE(review): the maxima of `salary` and `satis` look like placeholder
# codes (998/999) rather than real values - confirm before reading the means.
summary(object = mba.df)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0
# psych::describe() returns one row of descriptive statistics per column;
# only its first five columns are displayed.
library(psych)
mba_stats <- describe(mba.df)
mba_stats[, seq_len(5)]
##          vars   n     mean       sd median
## age         1 274    27.36     3.71     27
## sex         2 274     1.25     0.43      1
## gmat_tot    3 274   619.45    57.54    620
## gmat_qpc    4 274    80.64    14.87     83
## gmat_vpc    5 274    78.32    16.86     81
## gmat_tpc    6 274    84.20    14.02     87
## s_avg       7 274     3.03     0.38      3
## f_avg       8 274     3.06     0.53      3
## quarter     9 274     2.48     1.11      2
## work_yrs   10 274     3.87     3.23      3
## frstlang   11 274     1.12     0.32      1
## salary     12 274 39025.69 50951.56    999
## satis      13 274   172.18   371.61      6

# Drawing histograms to visualize the distribution of each variable independently.

# Histograms of individual variables, laid out two per row.
par(mfrow = c(1, 2))
hist(mba.df$age, xlab = "Age", col = "peachpuff1")
# `sex` is coded 1/2, so the numeric x axis is suppressed and replaced with
# category labels at those two positions.
hist(mba.df$sex, xlab = "Sex", col = "peachpuff1", xaxt = "n")
axis(side = 1, at = c(1, 2), labels = c("Males", "Females"))

hist(mba.df$gmat_tot, xlab = "GMAT total score", col = "peachpuff1", breaks = 10)
hist(mba.df$work_yrs, xlab = "Work Experience", col = "peachpuff1")

hist(mba.df$frstlang, xlab = "First language", col = "peachpuff1")
# The x-axis labels of the next two plots name the variable actually plotted
# (both used to read "Age", a copy-paste slip).
hist(mba.df$salary, xlab = "Salary", main = "Salaries", col = "peachpuff1", breaks = 10)

# `satis` is presumably a satisfaction score - confirm against the codebook.
hist(mba.df$satis, xlab = "Satisfaction", col = "peachpuff1")

# Drawing scatter plots to understand how pairs of variables are related.

# Pairwise scatter plots, two per row. Each entry is c(x column, y column).
# Axis labels are spelled out so they read exactly like the default labels
# produced when the columns are passed as `mba.df$<column>`.
par(mfrow = c(1, 2))
scatter_pairs <- list(
  c("gmat_tot", "work_yrs"),
  c("gmat_tot", "age"),
  c("gmat_tot", "s_avg"),
  c("gmat_tot", "f_avg"),
  c("salary", "gmat_tot"),
  c("salary", "age")
)
for (pair in scatter_pairs) {
  plot(
    mba.df[[pair[1]]],
    mba.df[[pair[2]]],
    xlab = paste0("mba.df$", pair[1]),
    ylab = paste0("mba.df$", pair[2])
  )
}

# Finding the covariance matrix, i.e. the pairwise covariances between all the variables (unlike correlations, these are not scaled to [-1, 1]).

# Pearson covariance matrix of every column, with cov()'s default settings
# written out explicitly.
cov(mba.df, use = "everything", method = "pearson")
##                    age           sex      gmat_tot      gmat_qpc
## age       1.376904e+01 -4.513248e-02 -3.115879e+01 -1.192655e+01
## sex      -4.513248e-02  1.872677e-01 -1.328841e+00 -1.053769e+00
## gmat_tot -3.115879e+01 -1.328841e+00  3.310688e+03  6.200233e+02
## gmat_qpc -1.192655e+01 -1.053769e+00  6.200233e+02  2.210731e+02
## gmat_vpc -2.763643e+00  5.463758e-01  7.260006e+02  3.814826e+01
## gmat_tpc -8.839978e+00 -4.908960e-02  6.839911e+02  1.357997e+02
## s_avg     2.116874e-01  2.096227e-02  2.480257e+00 -1.691233e-01
## f_avg    -3.399348e-02  2.082698e-02  3.154688e+00  5.753854e-01
## quarter  -2.045935e-01 -6.414267e-02 -5.891153e+00  6.001979e-01
## work_yrs  1.029494e+01 -1.580172e-02 -3.391634e+01 -1.137186e+01
## frstlang  6.796610e-02  2.138980e-04 -2.499933e+00  6.646346e-01
## salary   -1.183042e+04  1.518264e+03 -1.611600e+05 -3.335823e+04
## satis    -1.763499e+02 -8.780808e+00  1.765263e+03  3.348371e+02
##               gmat_vpc     gmat_tpc         s_avg        f_avg
## age         -2.7636427   -8.8399775    0.21168739  -0.03399348
## sex          0.5463758   -0.0490896    0.02096227   0.02082698
## gmat_tot   726.0006417  683.9910698    2.48025721   3.15468838
## gmat_qpc    38.1482581  135.7996845   -0.16912329   0.57538542
## gmat_vpc   284.2481217  157.4932488    1.31357023   0.67207000
## gmat_tpc   157.4932488  196.6057057    0.62710008   0.58698618
## s_avg        1.3135702    0.6271001    0.14521760   0.11016898
## f_avg        0.6720700    0.5869862    0.11016898   0.27567237
## quarter     -3.2676666   -1.2923719   -0.32237213  -0.26080880
## work_yrs    -3.6181653   -7.8575172    0.15926392  -0.06628700
## frstlang    -2.1145691   -0.4663244   -0.01671372  -0.00626026
## salary   -5273.8523836 3522.7500067 2831.60098580 787.65597177
## satis      392.3562739  484.2466779   -4.62884495   2.12532927
##                quarter      work_yrs      frstlang        salary
## age      -2.045935e-01   10.29493864  6.796610e-02 -1.183042e+04
## sex      -6.414267e-02   -0.01580172  2.138980e-04  1.518264e+03
## gmat_tot -5.891153e+00  -33.91633914 -2.499933e+00 -1.611600e+05
## gmat_qpc  6.001979e-01  -11.37186171  6.646346e-01 -3.335823e+04
## gmat_vpc -3.267667e+00   -3.61816529 -2.114569e+00 -5.273852e+03
## gmat_tpc -1.292372e+00   -7.85751718 -4.663244e-01  3.522750e+03
## s_avg    -3.223721e-01    0.15926392 -1.671372e-02  2.831601e+03
## f_avg    -2.608088e-01   -0.06628700 -6.260260e-03  7.876560e+02
## quarter   1.232119e+00   -0.30866822  3.553381e-02 -9.296214e+03
## work_yrs -3.086682e-01   10.44882490 -2.898318e-02  1.486147e+03
## frstlang  3.553381e-02   -0.02898318  1.035266e-01 -1.419586e+03
## salary   -9.296214e+03 1486.14704152 -1.419586e+03  2.596062e+09
## satis    -5.227133e-03 -131.24080907  9.484532e+00 -6.347115e+06
##                  satis
## age      -1.763499e+02
## sex      -8.780808e+00
## gmat_tot  1.765263e+03
## gmat_qpc  3.348371e+02
## gmat_vpc  3.923563e+02
## gmat_tpc  4.842467e+02
## s_avg    -4.628845e+00
## f_avg     2.125329e+00
## quarter  -5.227133e-03
## work_yrs -1.312408e+02
## frstlang  9.484532e+00
## salary   -6.347115e+06
## satis     1.380974e+05

# Recoding the ‘sex’ and ‘frstlang’ (first language) columns from numeric codes to labelled factors.

# Replace the numeric category codes with text labels, then store each column
# as a factor. Matching is done against the codes as they were before any
# label was written; factor() orders the resulting levels alphabetically.
sex_codes <- mba.df$sex
sex_labels <- replace(sex_codes, sex_codes == 1, "Male")
sex_labels <- replace(sex_labels, sex_codes == 2, "Female")
mba.df$sex <- factor(sex_labels)

lang_codes <- mba.df$frstlang
lang_labels <- replace(lang_codes, lang_codes == 1, 'English')
lang_labels <- replace(lang_labels, lang_codes == 2, 'Other')
mba.df$frstlang <- factor(lang_labels)

# Plotting a corrgram of all the variables in the data frame.

# Corrgram of the data frame: shaded cells below the diagonal, pies above it,
# variable names on the diagonal.
library(corrgram)
corrgram(
  mba.df,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of Relations between MBA salaries and various factors"
)

# Creating four separate data frames, plus a combined one for respondents whose placement status is known.

# Split the respondents by what the `salary` column encodes.

# Placed and disclosed a salary (anything above the placeholder codes).
placed.df <- subset(mba.df, salary > 1000)
View(placed.df)

# Not placed (salary recorded as 0).
notPlaced.df <- subset(mba.df, salary == 0)
View(notPlaced.df)

# Placed, but salary not disclosed (code 999).
notDisclosedSalary.df <- subset(mba.df, salary == 999)
View(notDisclosedSalary.df)

# Did not answer the survey (code 998).
notAnsweredSurvey.df <- subset(mba.df, salary == 998)
View(notAnsweredSurvey.df)

# Everyone except the survey non-respondents, stacked in the order
# placed, salary-not-disclosed, not placed.
knownMBA.df <- rbind(placed.df, notDisclosedSalary.df, notPlaced.df)
View(knownMBA.df)

# Creating a new column ‘GotPlaced’ with values TRUE and FALSE.

# Flag whether each respondent in knownMBA.df got placed.
# Rows with salary == 999 are, per this script's own grouping, graduates who
# were placed but did not disclose a salary, so they count as placed as well;
# testing only `salary > 1000` labelled them as not placed.
placed_flag <- knownMBA.df$salary > 1000 | knownMBA.df$salary == 999
knownMBA.df$GotPlaced <- placed_flag
View(knownMBA.df)

# Store the flag as a factor (levels "FALSE"/"TRUE") and inspect the result.
knownMBA.df$GotPlaced <- factor(knownMBA.df$GotPlaced)
str(knownMBA.df)
## 'data.frame':    228 obs. of  14 variables:
##  $ age      : int  22 27 25 25 27 28 24 25 25 25 ...
##  $ sex      : Factor w/ 2 levels "Female","Male": 1 1 1 1 2 1 2 1 1 2 ...
##  $ gmat_tot : int  660 700 680 650 710 620 670 560 530 650 ...
##  $ gmat_qpc : int  90 94 87 82 96 52 84 52 50 79 ...
##  $ gmat_vpc : int  92 98 96 91 96 98 96 81 62 93 ...
##  $ gmat_tpc : int  94 98 96 93 98 87 95 72 61 93 ...
##  $ s_avg    : num  3.5 3.3 3.5 3.4 3.3 3.4 3.3 3.3 3.6 3.3 ...
##  $ f_avg    : num  3.75 3.25 2.67 3.25 3.5 3.75 3.25 3.5 3.67 3.5 ...
##  $ quarter  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs : int  1 2 2 3 2 5 0 1 3 1 ...
##  $ frstlang : Factor w/ 2 levels "English","Other": 1 1 1 1 1 1 1 1 1 1 ...
##  $ salary   : int  85000 85000 86000 88000 92000 93000 95000 95000 95000 96000 ...
##  $ satis    : int  5 6 5 7 6 5 4 5 3 7 ...
##  $ GotPlaced: Factor w/ 2 levels "FALSE","TRUE": 2 2 2 2 2 2 2 2 2 2 ...
# Lattice histogram of the `salary` column for all rows of mba.df.
library(lattice)
histogram(
  ~ salary,
  data = mba.df,
  main = "Distribution of MBA's Starting Salary",
  xlab = "MBA's Starting Salary",
  col = "grey"
)

# Contingency table of salary (rows) by age (columns) for placed graduates.
age <- table(placed.df[["salary"]], placed.df[["age"]])
age
##         
##          22 23 24 25 26 27 28 29 30 31 32 33 34 39 40
##   64000   0  0  1  0  0  0  0  0  0  0  0  0  0  0  0
##   77000   0  1  0  0  0  0  0  0  0  0  0  0  0  0  0
##   78256   0  1  0  0  0  0  0  0  0  0  0  0  0  0  0
##   82000   0  0  0  0  1  0  0  0  0  0  0  0  0  0  0
##   85000   1  0  0  1  1  1  0  0  0  0  0  0  0  0  0
##   86000   0  0  0  1  1  0  0  0  0  0  0  0  0  0  0
##   88000   0  0  0  1  0  0  0  0  0  0  0  0  0  0  0
##   88500   0  0  0  0  0  1  0  0  0  0  0  0  0  0  0
##   90000   0  0  0  2  0  1  0  0  0  0  0  0  0  0  0
##   92000   0  0  0  2  0  1  0  0  0  0  0  0  0  0  0
##   93000   0  0  0  1  0  0  1  0  0  1  0  0  0  0  0
##   95000   0  0  1  5  0  0  0  1  0  0  0  0  0  0  0
##   96000   0  0  1  1  2  0  0  0  0  0  0  0  0  0  0
##   96500   0  0  1  0  0  0  0  0  0  0  0  0  0  0  0
##   97000   0  0  0  0  0  1  1  0  0  0  0  0  0  0  0
##   98000   0  1  3  2  1  1  1  1  0  0  0  0  0  0  0
##   99000   0  0  0  0  0  0  1  0  0  0  0  0  0  0  0
##   100000  0  1  4  1  1  1  0  0  0  1  0  0  0  0  0
##   100400  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0
##   101000  0  0  1  1  0  0  0  0  0  0  0  0  0  0  0
##   101100  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0
##   101600  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0
##   102500  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0
##   103000  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0
##   104000  0  0  0  0  0  0  1  0  0  1  0  0  0  0  0
##   105000  0  1  1  2  3  1  0  0  1  1  0  0  1  0  0
##   106000  0  0  0  0  0  0  0  1  2  0  0  0  0  0  0
##   107000  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0
##   107300  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0
##   107500  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0
##   108000  0  0  0  1  0  0  1  0  0  0  0  0  0  0  0
##   110000  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0
##   112000  0  0  1  0  0  0  0  1  0  0  0  0  0  1  0
##   115000  0  0  1  1  0  3  0  0  0  0  0  0  0  0  0
##   118000  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0
##   120000  0  0  0  0  0  1  1  0  2  0  0  0  0  0  0
##   126710  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0
##   130000  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0
##   145800  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0
##   146000  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1
##   162000  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0
##   220000  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1
# Counts of placed graduates for each salary (rows) and sex (columns).
mytable <- xtabs(formula = ~ salary + sex, data = placed.df)
mytable
##         sex
## salary   Female Male
##   64000       1    0
##   77000       0    1
##   78256       1    0
##   82000       1    0
##   85000       3    1
##   86000       2    0
##   88000       1    0
##   88500       0    1
##   90000       0    3
##   92000       1    2
##   93000       1    2
##   95000       3    4
##   96000       1    3
##   96500       0    1
##   97000       0    2
##   98000       4    6
##   99000       1    0
##   100000      5    4
##   100400      0    1
##   101000      2    0
##   101100      0    1
##   101600      0    1
##   102500      0    1
##   103000      0    1
##   104000      0    2
##   105000      0   11
##   106000      1    2
##   107000      0    1
##   107300      0    1
##   107500      0    1
##   108000      0    2
##   110000      1    0
##   112000      0    3
##   115000      0    5
##   118000      0    1
##   120000      1    3
##   126710      0    1
##   130000      0    1
##   145800      0    1
##   146000      0    1
##   162000      0    1
##   220000      1    0
# Total of the `salary` column per first language (a sum, not a count).
# NOTE(review): knownMBA.df still contains the 999 "not disclosed" code and
# zeros, so these totals are not purely real salaries - confirm intent.
language <- xtabs(formula = salary ~ frstlang, data = knownMBA.df)
language
## frstlang
## English   Other 
## 9793840  853291
# Total salary of placed graduates per number of years of work experience
# (a sum per group, not a count).
work <- xtabs(formula = salary ~ work_yrs, data = placed.df)
work
## work_yrs
##       0       1       2       3       4       5       6       7       8 
##   95000  828256 3711600 2134710 1160000  722000  741500   98000  420100 
##      10      15      16 
##  118000  366000  217000
# Sum of GMAT total scores of placed graduates at each distinct salary.
score <- xtabs(formula = gmat_tot ~ salary, data = placed.df)
score
## salary
##  64000  77000  78256  82000  85000  86000  88000  88500  90000  92000 
##    560    660    520    670   2700   1310    650    620   1860   1990 
##  93000  95000  96000  96500  97000  98000  99000 100000 100400 101000 
##   1770   4210   2400    500   1200   6110    580   5550    630   1220 
## 101100 101600 102500 103000 104000 105000 106000 107000 107300 107500 
##    660    630    670    620   1090   6530   1980    600    660    630 
## 108000 110000 112000 115000 118000 120000 126710 130000 145800 146000 
##   1170    640   1950   3070    620   2570    550    650    620    630 
## 162000 220000 
##    700    500
# Mean of the `salary` column per sex across knownMBA.df.
# NOTE(review): this includes zeros (not placed) and the 999 placeholder
# code, so it is not the mean salary of placed graduates - confirm intent.
aggregate(x = knownMBA.df$salary, by = list(knownMBA.df$sex), FUN = mean)
##   Group.1        x
## 1  Female 51851.71
## 2    Male 44898.70
# Mean GMAT total score per sex across knownMBA.df.
aggregate(x = knownMBA.df$gmat_tot, by = list(knownMBA.df$sex), FUN = mean)
##   Group.1        x
## 1  Female 612.8814
## 2    Male 618.8757
# Mean salary of placed graduates per number of years of work experience.
aggregate(x = placed.df$salary, by = list(placed.df$work_yrs), FUN = mean)
##    Group.1         x
## 1        0  95000.00
## 2        1 103532.00
## 3        2  97673.68
## 4        3 101652.86
## 5        4 105454.55
## 6        5 103142.86
## 7        6 105928.57
## 8        7  98000.00
## 9        8 105025.00
## 10      10 118000.00
## 11      15 183000.00
## 12      16 108500.00
# Mean salary of placed graduates per age.
aggregate(x = placed.df$salary, by = list(placed.df$age), FUN = mean)
##    Group.1         x
## 1       22  85000.00
## 2       23  91651.20
## 3       24 101518.75
## 4       25  99086.96
## 5       26 101665.00
## 6       27 102214.29
## 7       28 103625.00
## 8       29 102083.33
## 9       30 109916.67
## 10      31 100500.00
## 11      32 107300.00
## 12      33 118000.00
## 13      34 105000.00
## 14      39 112000.00
## 15      40 183000.00
# Mean GMAT total score per age across knownMBA.df.
aggregate(x = knownMBA.df$gmat_tot, by = list(knownMBA.df$age), FUN = mean)
##    Group.1        x
## 1       22 630.0000
## 2       23 622.5000
## 3       24 619.6774
## 4       25 628.6842
## 5       26 595.8621
## 6       27 643.4286
## 7       28 602.9412
## 8       29 620.0000
## 9       30 630.9091
## 10      31 580.0000
## 11      32 608.7500
## 12      33 620.0000
## 13      34 597.5000
## 14      35 580.0000
## 15      36 590.0000
## 16      37 560.0000
## 17      39 650.0000
## 18      40 565.0000
## 19      42 650.0000
## 20      43 555.0000
## 21      48 590.0000
# Pearson chi-squared test on the salary-by-age contingency table `age`.
# NOTE(review): that table consists mostly of 0/1 cells, which is why R warns
# that the chi-squared approximation may be incorrect - interpret with care.
chisq.test(age)
## Warning in chisq.test(age): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  age
## X-squared = 717.62, df = 574, p-value = 3.929e-05
# Pearson chi-squared test on the salary-by-sex contingency table `mytable`.
# NOTE(review): the table is sparse (one row per distinct salary), so R warns
# that the chi-squared approximation may be incorrect - interpret with care.
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable
## X-squared = 52.681, df = 41, p-value = 0.1045
# `language` is one-dimensional, so this runs a chi-squared test for given
# (by default equal) probabilities across the first-language groups.
# NOTE(review): `language` holds salary totals rather than counts; a
# chi-squared test assumes counts - confirm this test is meaningful here.
chisq.test(language)
## 
##  Chi-squared test for given probabilities
## 
## data:  language
## X-squared = 7507500, df = 1, p-value < 2.2e-16
# `work` is one-dimensional, so this runs a chi-squared test for given
# (by default equal) probabilities across the work-experience groups.
# NOTE(review): `work` holds salary totals rather than counts; a chi-squared
# test assumes counts - confirm this test is meaningful here.
chisq.test(work)
## 
##  Chi-squared test for given probabilities
## 
## data:  work
## X-squared = 14068000, df = 11, p-value < 2.2e-16
# Linear regression of salary on work experience, age, sex, first language
# and `satis`, fitted on placed graduates only.
Model <- reformulate(
  termlabels = c("work_yrs", "age", "sex", "frstlang", "satis"),
  response = "salary"
)
fit <- lm(formula = Model, data = placed.df)
summary(fit)
## 
## Call:
## lm(formula = Model, data = placed.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -25863  -9753   -834   5571  78637 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    64196.3    26797.8   2.396   0.0185 *
## work_yrs         850.8     1117.2   0.762   0.4482  
## age             1719.3     1089.4   1.578   0.1178  
## sexMale         4999.4     3420.1   1.462   0.1470  
## frstlangOther  10459.5     6775.8   1.544   0.1259  
## satis          -2471.2     1978.7  -1.249   0.2147  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15460 on 97 degrees of freedom
## Multiple R-squared:  0.2878, Adjusted R-squared:  0.2511 
## F-statistic: 7.839 on 5 and 97 DF,  p-value: 3.121e-06
# Linear regression of GMAT total score on age, sex, work experience and
# first language, fitted on knownMBA.df.
Model1 <- reformulate(
  termlabels = c("age", "sex", "work_yrs", "frstlang"),
  response = "gmat_tot"
)
fit1 <- lm(formula = Model1, data = knownMBA.df)
summary(fit1)
## 
## Call:
## lm(formula = Model1, data = knownMBA.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -168.958  -47.435   -1.049   38.001  168.977 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    579.817     43.916  13.203   <2e-16 ***
## age              2.066      1.847   1.119   0.2645    
## sexMale          6.207      8.463   0.733   0.4641    
## work_yrs        -5.195      2.107  -2.465   0.0144 *  
## frstlangOther  -29.556     12.286  -2.406   0.0170 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 55.9 on 223 degrees of freedom
## Multiple R-squared:  0.06419,    Adjusted R-squared:  0.0474 
## F-statistic: 3.824 on 4 and 223 DF,  p-value: 0.005