TASK 2(a)

Reading and describing the dataset

# Load the MBA starting-salaries survey from the working directory, open it in
# the data viewer, and print per-column summary statistics.
Salary.df <- read.csv(file = "MBA Starting Salaries Data.csv")
View(Salary.df)
summary(object = Salary.df)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0
library(psych)
describe(Salary.df)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45

Summary statistics of important variables

Data Types:

str(Salary.df)
## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : int  2 1 1 1 2 1 1 2 1 1 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: int  1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : int  7 6 6 7 5 6 5 6 4 998 ...

Mean of important variables:

mean(Salary.df$age)
## [1] 27.35766
mean(Salary.df$s_avg)
## [1] 3.025401
mean(Salary.df$f_avg)
## [1] 3.061533
mean(Salary.df$work_yrs)
## [1] 3.872263
mean(Salary.df$salary)
## [1] 39025.69
mean(Salary.df$satis)
## [1] 172.1788

Median of important variables:

median(Salary.df$age)
## [1] 27
median(Salary.df$s_avg)
## [1] 3
median(Salary.df$f_avg)
## [1] 3
median(Salary.df$work_yrs)
## [1] 3
median(Salary.df$salary)
## [1] 999
median(Salary.df$satis)
## [1] 6

Standard Deviation of important variables:

sd(Salary.df$age)
## [1] 3.710666
sd(Salary.df$s_avg)
## [1] 0.3810743
sd(Salary.df$f_avg)
## [1] 0.5250451
sd(Salary.df$work_yrs)
## [1] 3.232464
sd(Salary.df$salary)
## [1] 50951.56
sd(Salary.df$satis)
## [1] 371.6146

Visualising the dataset using boxplot

# Horizontal box plots, one per variable, drawn in the order listed here.
# Wrapped in local() so the helper names do not leak into the workspace.
local({
  plot_titles <- c(
    age = "Visualisation of age",
    gmat_tot = "Visualisation of GMAT Total Score",
    work_yrs = "Visualisation of number of work years",
    salary = "Visualisation of salary"
  )
  for (column in names(plot_titles)) {
    boxplot(Salary.df[[column]], horizontal = TRUE,
            main = plot_titles[[column]])
  }
})

Drawing Scatter Plot to indicate correlation

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Scatterplot matrix of the main variables with histograms on the diagonal.
# scatterplotMatrix() is the supported replacement for the deprecated
# scatterplot.matrix() and takes the same arguments.
# NOTE(review): car >= 3.0 expects `diagonal = list(method = "histogram")`;
# confirm against the installed car version.
scatterplotMatrix(
  ~ age + sex + gmat_tot + s_avg + f_avg + quarter + work_yrs + frstlang +
    salary + satis,
  data = Salary.df,
  cex = 0.6,
  diagonal = "histogram"
)

Drawing Corrgram

library(corrgram)
# Correlogram of every column: shaded cells below the diagonal, pie glyphs
# above it, with corrgram reordering the variables. TRUE is spelled out because
# `T` is an ordinary variable that can be reassigned.
corrgram(
  Salary.df,
  order = TRUE,
  text.panel = panel.txt,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  main = "Corrgram of all variables"
)

Creating a variance-covariance matrix

cov(Salary.df)
##                    age           sex      gmat_tot      gmat_qpc
## age       1.376904e+01 -4.513248e-02 -3.115879e+01 -1.192655e+01
## sex      -4.513248e-02  1.872677e-01 -1.328841e+00 -1.053769e+00
## gmat_tot -3.115879e+01 -1.328841e+00  3.310688e+03  6.200233e+02
## gmat_qpc -1.192655e+01 -1.053769e+00  6.200233e+02  2.210731e+02
## gmat_vpc -2.763643e+00  5.463758e-01  7.260006e+02  3.814826e+01
## gmat_tpc -8.839978e+00 -4.908960e-02  6.839911e+02  1.357997e+02
## s_avg     2.116874e-01  2.096227e-02  2.480257e+00 -1.691233e-01
## f_avg    -3.399348e-02  2.082698e-02  3.154688e+00  5.753854e-01
## quarter  -2.045935e-01 -6.414267e-02 -5.891153e+00  6.001979e-01
## work_yrs  1.029494e+01 -1.580172e-02 -3.391634e+01 -1.137186e+01
## frstlang  6.796610e-02  2.138980e-04 -2.499933e+00  6.646346e-01
## salary   -1.183042e+04  1.518264e+03 -1.611600e+05 -3.335823e+04
## satis    -1.763499e+02 -8.780808e+00  1.765263e+03  3.348371e+02
##               gmat_vpc     gmat_tpc         s_avg        f_avg
## age         -2.7636427   -8.8399775    0.21168739  -0.03399348
## sex          0.5463758   -0.0490896    0.02096227   0.02082698
## gmat_tot   726.0006417  683.9910698    2.48025721   3.15468838
## gmat_qpc    38.1482581  135.7996845   -0.16912329   0.57538542
## gmat_vpc   284.2481217  157.4932488    1.31357023   0.67207000
## gmat_tpc   157.4932488  196.6057057    0.62710008   0.58698618
## s_avg        1.3135702    0.6271001    0.14521760   0.11016898
## f_avg        0.6720700    0.5869862    0.11016898   0.27567237
## quarter     -3.2676666   -1.2923719   -0.32237213  -0.26080880
## work_yrs    -3.6181653   -7.8575172    0.15926392  -0.06628700
## frstlang    -2.1145691   -0.4663244   -0.01671372  -0.00626026
## salary   -5273.8523836 3522.7500067 2831.60098580 787.65597177
## satis      392.3562739  484.2466779   -4.62884495   2.12532927
##                quarter      work_yrs      frstlang        salary
## age      -2.045935e-01   10.29493864  6.796610e-02 -1.183042e+04
## sex      -6.414267e-02   -0.01580172  2.138980e-04  1.518264e+03
## gmat_tot -5.891153e+00  -33.91633914 -2.499933e+00 -1.611600e+05
## gmat_qpc  6.001979e-01  -11.37186171  6.646346e-01 -3.335823e+04
## gmat_vpc -3.267667e+00   -3.61816529 -2.114569e+00 -5.273852e+03
## gmat_tpc -1.292372e+00   -7.85751718 -4.663244e-01  3.522750e+03
## s_avg    -3.223721e-01    0.15926392 -1.671372e-02  2.831601e+03
## f_avg    -2.608088e-01   -0.06628700 -6.260260e-03  7.876560e+02
## quarter   1.232119e+00   -0.30866822  3.553381e-02 -9.296214e+03
## work_yrs -3.086682e-01   10.44882490 -2.898318e-02  1.486147e+03
## frstlang  3.553381e-02   -0.02898318  1.035266e-01 -1.419586e+03
## salary   -9.296214e+03 1486.14704152 -1.419586e+03  2.596062e+09
## satis    -5.227133e-03 -131.24080907  9.484532e+00 -6.347115e+06
##                  satis
## age      -1.763499e+02
## sex      -8.780808e+00
## gmat_tot  1.765263e+03
## gmat_qpc  3.348371e+02
## gmat_vpc  3.923563e+02
## gmat_tpc  4.842467e+02
## s_avg    -4.628845e+00
## f_avg     2.125329e+00
## quarter  -5.227133e-03
## work_yrs -1.312408e+02
## frstlang  9.484532e+00
## salary   -6.347115e+06
## satis     1.380974e+05
var(Salary.df)
##                    age           sex      gmat_tot      gmat_qpc
## age       1.376904e+01 -4.513248e-02 -3.115879e+01 -1.192655e+01
## sex      -4.513248e-02  1.872677e-01 -1.328841e+00 -1.053769e+00
## gmat_tot -3.115879e+01 -1.328841e+00  3.310688e+03  6.200233e+02
## gmat_qpc -1.192655e+01 -1.053769e+00  6.200233e+02  2.210731e+02
## gmat_vpc -2.763643e+00  5.463758e-01  7.260006e+02  3.814826e+01
## gmat_tpc -8.839978e+00 -4.908960e-02  6.839911e+02  1.357997e+02
## s_avg     2.116874e-01  2.096227e-02  2.480257e+00 -1.691233e-01
## f_avg    -3.399348e-02  2.082698e-02  3.154688e+00  5.753854e-01
## quarter  -2.045935e-01 -6.414267e-02 -5.891153e+00  6.001979e-01
## work_yrs  1.029494e+01 -1.580172e-02 -3.391634e+01 -1.137186e+01
## frstlang  6.796610e-02  2.138980e-04 -2.499933e+00  6.646346e-01
## salary   -1.183042e+04  1.518264e+03 -1.611600e+05 -3.335823e+04
## satis    -1.763499e+02 -8.780808e+00  1.765263e+03  3.348371e+02
##               gmat_vpc     gmat_tpc         s_avg        f_avg
## age         -2.7636427   -8.8399775    0.21168739  -0.03399348
## sex          0.5463758   -0.0490896    0.02096227   0.02082698
## gmat_tot   726.0006417  683.9910698    2.48025721   3.15468838
## gmat_qpc    38.1482581  135.7996845   -0.16912329   0.57538542
## gmat_vpc   284.2481217  157.4932488    1.31357023   0.67207000
## gmat_tpc   157.4932488  196.6057057    0.62710008   0.58698618
## s_avg        1.3135702    0.6271001    0.14521760   0.11016898
## f_avg        0.6720700    0.5869862    0.11016898   0.27567237
## quarter     -3.2676666   -1.2923719   -0.32237213  -0.26080880
## work_yrs    -3.6181653   -7.8575172    0.15926392  -0.06628700
## frstlang    -2.1145691   -0.4663244   -0.01671372  -0.00626026
## salary   -5273.8523836 3522.7500067 2831.60098580 787.65597177
## satis      392.3562739  484.2466779   -4.62884495   2.12532927
##                quarter      work_yrs      frstlang        salary
## age      -2.045935e-01   10.29493864  6.796610e-02 -1.183042e+04
## sex      -6.414267e-02   -0.01580172  2.138980e-04  1.518264e+03
## gmat_tot -5.891153e+00  -33.91633914 -2.499933e+00 -1.611600e+05
## gmat_qpc  6.001979e-01  -11.37186171  6.646346e-01 -3.335823e+04
## gmat_vpc -3.267667e+00   -3.61816529 -2.114569e+00 -5.273852e+03
## gmat_tpc -1.292372e+00   -7.85751718 -4.663244e-01  3.522750e+03
## s_avg    -3.223721e-01    0.15926392 -1.671372e-02  2.831601e+03
## f_avg    -2.608088e-01   -0.06628700 -6.260260e-03  7.876560e+02
## quarter   1.232119e+00   -0.30866822  3.553381e-02 -9.296214e+03
## work_yrs -3.086682e-01   10.44882490 -2.898318e-02  1.486147e+03
## frstlang  3.553381e-02   -0.02898318  1.035266e-01 -1.419586e+03
## salary   -9.296214e+03 1486.14704152 -1.419586e+03  2.596062e+09
## satis    -5.227133e-03 -131.24080907  9.484532e+00 -6.347115e+06
##                  satis
## age      -1.763499e+02
## sex      -8.780808e+00
## gmat_tot  1.765263e+03
## gmat_qpc  3.348371e+02
## gmat_vpc  3.923563e+02
## gmat_tpc  4.842467e+02
## s_avg    -4.628845e+00
## f_avg     2.125329e+00
## quarter  -5.227133e-03
## work_yrs -1.312408e+02
## frstlang  9.484532e+00
## salary   -6.347115e+06
## satis     1.380974e+05

TASK 2(b)

Forming the subset of those who got the job

# Keep only graduates with a real, disclosed starting salary.
# salary == 0 means no job; 998 and 999 are survey codes (no response / salary
# not disclosed), not dollar amounts -- the salary tabulation recorded below
# lists them as the only non-zero values under 64000. Leaving them in treats
# ~$1000 placeholders as salaries and distorts every table and model built
# from `placed`.
# NOTE(review): the outputs recorded below were produced with the 998/999 rows
# still included; re-run to refresh them.
placed <- Salary.df[which(Salary.df$salary != 0 &
                            !(Salary.df$salary %in% c(998, 999))), ]

Creating the contingency table

mytable1 <- xtabs(~salary+sex,data = placed)
mytable1
##         sex
## salary    1  2
##   998    37  9
##   999    30  5
##   64000   0  1
##   77000   1  0
##   78256   0  1
##   82000   0  1
##   85000   1  3
##   86000   0  2
##   88000   0  1
##   88500   1  0
##   90000   3  0
##   92000   2  1
##   93000   2  1
##   95000   4  3
##   96000   3  1
##   96500   1  0
##   97000   2  0
##   98000   6  4
##   99000   0  1
##   100000  4  5
##   100400  1  0
##   101000  0  2
##   101100  1  0
##   101600  1  0
##   102500  1  0
##   103000  1  0
##   104000  2  0
##   105000 11  0
##   106000  2  1
##   107000  1  0
##   107300  1  0
##   107500  1  0
##   108000  2  0
##   110000  0  1
##   112000  3  0
##   115000  5  0
##   118000  1  0
##   120000  3  1
##   126710  1  0
##   130000  1  0
##   145800  1  0
##   146000  1  0
##   162000  1  0
##   220000  0  1
mytable2 <- xtabs(~salary+age,data = placed)
mytable2
##         age
## salary   22 23 24 25 26 27 28 29 30 31 32 33 34 39 40
##   998     0  0  2 15 11 11  4  0  1  2  0  0  0  0  0
##   999     0  0  2  6  5  7  3  5  3  2  2  0  0  0  0
##   64000   0  0  1  0  0  0  0  0  0  0  0  0  0  0  0
##   77000   0  1  0  0  0  0  0  0  0  0  0  0  0  0  0
##   78256   0  1  0  0  0  0  0  0  0  0  0  0  0  0  0
##   82000   0  0  0  0  1  0  0  0  0  0  0  0  0  0  0
##   85000   1  0  0  1  1  1  0  0  0  0  0  0  0  0  0
##   86000   0  0  0  1  1  0  0  0  0  0  0  0  0  0  0
##   88000   0  0  0  1  0  0  0  0  0  0  0  0  0  0  0
##   88500   0  0  0  0  0  1  0  0  0  0  0  0  0  0  0
##   90000   0  0  0  2  0  1  0  0  0  0  0  0  0  0  0
##   92000   0  0  0  2  0  1  0  0  0  0  0  0  0  0  0
##   93000   0  0  0  1  0  0  1  0  0  1  0  0  0  0  0
##   95000   0  0  1  5  0  0  0  1  0  0  0  0  0  0  0
##   96000   0  0  1  1  2  0  0  0  0  0  0  0  0  0  0
##   96500   0  0  1  0  0  0  0  0  0  0  0  0  0  0  0
##   97000   0  0  0  0  0  1  1  0  0  0  0  0  0  0  0
##   98000   0  1  3  2  1  1  1  1  0  0  0  0  0  0  0
##   99000   0  0  0  0  0  0  1  0  0  0  0  0  0  0  0
##   100000  0  1  4  1  1  1  0  0  0  1  0  0  0  0  0
##   100400  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0
##   101000  0  0  1  1  0  0  0  0  0  0  0  0  0  0  0
##   101100  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0
##   101600  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0
##   102500  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0
##   103000  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0
##   104000  0  0  0  0  0  0  1  0  0  1  0  0  0  0  0
##   105000  0  1  1  2  3  1  0  0  1  1  0  0  1  0  0
##   106000  0  0  0  0  0  0  0  1  2  0  0  0  0  0  0
##   107000  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0
##   107300  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0
##   107500  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0
##   108000  0  0  0  1  0  0  1  0  0  0  0  0  0  0  0
##   110000  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0
##   112000  0  0  1  0  0  0  0  1  0  0  0  0  0  1  0
##   115000  0  0  1  1  0  3  0  0  0  0  0  0  0  0  0
##   118000  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0
##   120000  0  0  0  0  0  1  1  0  2  0  0  0  0  0  0
##   126710  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0
##   130000  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0
##   145800  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0
##   146000  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1
##   162000  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0
##   220000  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1
mytable3 <- xtabs(~gmat_tot+sex,data = placed)
mytable3
##         sex
## gmat_tot  1  2
##      450  0  1
##      460  1  0
##      500  2  1
##      520  0  1
##      530  1  1
##      540  2  0
##      550  4  0
##      560  9  4
##      570  9  2
##      580  8  3
##      590  5  1
##      600 14  3
##      610  7  2
##      620 12  4
##      630 14  3
##      640  4  2
##      650  7  4
##      660  8  3
##      670  8  5
##      680  7  2
##      690  4  0
##      700  2  1
##      710  5  1
##      720  1  1
##      730  1  0
##      740  3  0
##      790  1  0
mytable4 <- xtabs(~work_yrs+sex,data = placed)
mytable4
##         sex
## work_yrs  1  2
##       0   2  0
##       1   8  4
##       2  41 19
##       3  34  8
##       4  28  6
##       5   6  3
##       6   7  3
##       7   4  0
##       8   4  1
##       9   1  0
##       10  1  0
##       15  1  1
##       16  2  0
mytable5 <- xtabs(~salary+frstlang,data = placed)
mytable5
##         frstlang
## salary    1  2
##   998    38  8
##   999    26  9
##   64000   1  0
##   77000   1  0
##   78256   1  0
##   82000   1  0
##   85000   4  0
##   86000   2  0
##   88000   1  0
##   88500   1  0
##   90000   3  0
##   92000   3  0
##   93000   3  0
##   95000   7  0
##   96000   4  0
##   96500   1  0
##   97000   2  0
##   98000   8  2
##   99000   0  1
##   100000  9  0
##   100400  1  0
##   101000  2  0
##   101100  1  0
##   101600  1  0
##   102500  1  0
##   103000  1  0
##   104000  1  1
##   105000 11  0
##   106000  3  0
##   107000  1  0
##   107300  0  1
##   107500  1  0
##   108000  2  0
##   110000  1  0
##   112000  3  0
##   115000  5  0
##   118000  0  1
##   120000  4  0
##   126710  1  0
##   130000  1  0
##   145800  1  0
##   146000  1  0
##   162000  1  0
##   220000  0  1

Running chi-square test

chisq.test(mytable1)
## Warning in chisq.test(mytable1): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable1
## X-squared = 64.319, df = 43, p-value = 0.0192
chisq.test(mytable2)
## Warning in chisq.test(mytable2): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable2
## X-squared = 948.87, df = 602, p-value < 2.2e-16
chisq.test(mytable3)
## Warning in chisq.test(mytable3): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable3
## X-squared = 17.414, df = 26, p-value = 0.896
chisq.test(mytable4)
## Warning in chisq.test(mytable4): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable4
## X-squared = 8.2662, df = 12, p-value = 0.764
chisq.test(mytable5)
## Warning in chisq.test(mytable5): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable5
## X-squared = 48.273, df = 43, p-value = 0.2682

Running the t-test

# Passing a contingency table to t.test() only tests whether the mean cell
# count differs from zero, so each test below compares the underlying
# measurements between groups instead (Welch two-sample t-tests by default).

# Starting salary by sex.
t.test(salary ~ sex, data = placed)
# Starting salary against age: age is not a two-level grouping, so test the
# correlation between the two variables.
cor.test(~ salary + age, data = placed)
# GMAT total score by sex.
t.test(gmat_tot ~ sex, data = placed)
# Years of work experience by sex.
t.test(work_yrs ~ sex, data = placed)
# Starting salary by first language.
t.test(salary ~ frstlang, data = placed)

Regression model

reg <- lm(salary~age+sex+work_yrs,data = placed)
summary(reg)
## 
## Call:
## lm(formula = salary ~ age + sex + work_yrs, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -82407 -52377  22515  45293 117966 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    74699      62860   1.188   0.2363  
## age            -2005       2548  -0.787   0.4324  
## sex            13777       8961   1.537   0.1259  
## work_yrs        5673       2813   2.017   0.0452 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 51660 on 180 degrees of freedom
## Multiple R-squared:  0.04794,    Adjusted R-squared:  0.03207 
## F-statistic: 3.021 on 3 and 180 DF,  p-value: 0.03112
# Second model: salary explained by GMAT total, sex, first language and
# quarter. Each predictor is listed once; a repeated term in an R formula is
# silently dropped, so the fitted coefficients are unchanged.
reg <- lm(salary ~ gmat_tot + sex + frstlang + quarter, data = placed)
summary(reg)
## 
## Call:
## lm(formula = salary ~ gmat_tot + sex + gmat_tot + frstlang + 
##     quarter, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -90378 -48397  15960  40122 170419 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 206446.17   50007.22   4.128  5.6e-05 ***
## gmat_tot      -174.55      69.37  -2.516   0.0127 *  
## sex          10419.93    8725.34   1.194   0.2340    
## frstlang    -27618.09   11267.00  -2.451   0.0152 *  
## quarter      -8798.76    3360.58  -2.618   0.0096 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50340 on 179 degrees of freedom
## Multiple R-squared:  0.1012, Adjusted R-squared:  0.08115 
## F-statistic: 5.041 on 4 and 179 DF,  p-value: 0.0007144
reg <- lm(salary~age+work_yrs,data = placed)
summary(reg)
## 
## Call:
## lm(formula = salary ~ age + work_yrs, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -87007 -54400  27484  44513 126316 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   104828      59952   1.749   0.0821 .
## age            -2534       2534  -1.000   0.3186  
## work_yrs        6014       2815   2.137   0.0340 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 51860 on 181 degrees of freedom
## Multiple R-squared:  0.03543,    Adjusted R-squared:  0.02478 
## F-statistic: 3.325 on 2 and 181 DF,  p-value: 0.0382

Comparing the three models above, the second one fits best: it has the highest adjusted R-squared (0.081, versus 0.032 and 0.025).

TASK 2(c)

Comparing the subset of those who did not get a job with the subset of those who did

# Rows whose recorded salary is exactly 0, then opened in the data viewer.
notplaced <- subset(Salary.df, salary == 0)
View(notplaced)

Contingency table

mytable3 <- xtabs(~gmat_tot+sex,data = notplaced)
mytable3
##         sex
## gmat_tot 1 2
##      450 1 0
##      480 1 0
##      510 2 0
##      530 2 1
##      540 2 1
##      550 3 1
##      560 3 5
##      570 5 2
##      580 4 0
##      590 3 0
##      600 3 0
##      610 7 2
##      620 2 2
##      630 4 1
##      640 3 3
##      650 3 2
##      660 3 0
##      670 4 0
##      680 2 1
##      700 1 1
##      710 3 1
##      720 2 0
##      730 1 0
##      740 1 0
##      750 1 0
##      760 1 0
mytable4 <- xtabs(~work_yrs+sex,data = notplaced)
mytable4
##         sex
## work_yrs  1  2
##       0   1  0
##       1  12  0
##       2  16  6
##       3   9  5
##       4   8  1
##       5   7  5
##       6   2  0
##       7   3  2
##       8   2  0
##       9   0  1
##       10  0  1
##       11  1  1
##       12  2  0
##       13  0  1
##       16  1  0
##       18  1  0
##       22  2  0

The contingency tables above can be compared with the corresponding tables for the placed data frame.

Running chi-squared test

chisq.test(mytable3)
## Warning in chisq.test(mytable3): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable3
## X-squared = 19.78, df = 25, p-value = 0.7583
chisq.test(mytable4)
## Warning in chisq.test(mytable4): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable4
## X-squared = 21.229, df = 16, p-value = 0.1699

Running a logistic regression

# Re-read the survey file for the logistic regression, treating empty fields
# as missing, then count the missing values in each column.
# The file is resolved against the current working directory (where the first
# read.csv() call already found it) instead of a machine-specific setwd().
Model.df <- read.csv("MBA Starting Salaries Data.csv", header = TRUE,
                     na.strings = "")
# vapply() pins the result to one integer per column, unlike sapply().
vapply(Model.df, function(x) sum(is.na(x)), integer(1))
##      age      sex gmat_tot gmat_qpc gmat_vpc gmat_tpc    s_avg    f_avg 
##        0        0        0        0        0        0        0        0 
##  quarter work_yrs frstlang   salary    satis 
##        0        0        0        0        0
library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2018 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(Model.df, main = "Missing values vs observed")

# Keep the first 12 columns (everything before `satis`).
data <- subset(Model.df, select = 1:12)

# Recode salary into a placement indicator: 1 = reported a salary, 0 = no job.
# 998/999 are survey codes (no response / salary not disclosed), so placement
# is unknown for those rows; they become NA and glm() drops them.
data$salary <- ifelse(data$salary %in% c(998, 999), NA,
                      as.integer(data$salary > 0))

# 80/20 train/test split sized from the data itself. Fixed row ranges such as
# 1:800 and 801:889 run past the end of this 274-row file and produce all-NA
# rows. Rows are sampled because the file looks ordered (the head is all
# quarter 1 with ascending age); the seed keeps the split reproducible.
# NOTE(review): the summary output recorded below predates this split and the
# 998/999 handling; re-run to refresh it.
set.seed(123)
n_obs <- nrow(data)
train_rows <- sample.int(n_obs, size = floor(0.8 * n_obs))
train <- data[train_rows, ]
test <- data[setdiff(seq_len(n_obs), train_rows), ]

# Logistic regression of placement on the GMAT quantitative percentile.
model <- glm(salary ~ gmat_qpc, family = binomial(link = "logit"),
             data = train)
summary(model)
## 
## Call:
## glm(formula = salary ~ gmat_qpc, family = binomial(link = "logit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5893  -1.4189   0.8395   0.8976   1.0766  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.205952   0.693627  -0.297    0.767
## gmat_qpc     0.011480   0.008535   1.345    0.179
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 346.93  on 273  degrees of freedom
## Residual deviance: 345.13  on 272  degrees of freedom
##   (526 observations deleted due to missingness)
## AIC: 349.13
## 
## Number of Fisher Scoring iterations: 4