Task 2b:

data.df <- read.csv(paste("MBA Starting Salaries Data.csv", sep=""))
library(psych)
View(data.df)

Comments: 274 observations of 14 variables

Summary Statistics:

summary(data.df)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0
str(data.df)
## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : int  2 1 1 1 2 1 1 2 1 1 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: int  1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : int  7 6 6 7 5 6 5 6 4 998 ...

Visualization of distribution of variables

hist(data.df$age, 
     main="Distribution of Age",
     xlab="Age",
     ylab="Count",
     breaks=10,          
     col="pink")  

boxplot(data.df$age)

Visualization of Sex

hist(data.df$gmat_tot, 
     main="Distribution of Gmat Total",
     xlab="gmat_tot",
     ylab="Count",
     breaks=10,             
     col="yellow")

boxplot(data.df$gmat_tot)

Visualization of Work Experience

hist(data.df$work_yrs, 
     main="Distribution of Work Experience",
     xlab="Work Experience (Years)",
     ylab="Count",
     breaks=10,             
     col="turquoise")  

boxplot(data.df$work_yrs)

Visualization of Salary

hist(data.df$salary, 
     main="Distribution of salary",
     xlab="salary",
     ylab="Count",
     breaks=10,            
     col="green3") 

boxplot(data.df$salary)

Visulaization of Satisfaction

hist(data.df$satis, 
     main="Distribution of Satisfaction",
     xlab="Satisfaction",
     ylab="Count",
     breaks=10,            
     col="peachpuff")  

boxplot(data.df$satis)

Pair-wise correlation of variables

pairs(formula = ~ age + sex + gmat_tot + s_avg + f_avg + quarter + work_yrs + frstlang + salary + satis, cex=0.5, data=data.df)

Corrgram, Variance-Covariance Matrix

library(corrgram)
corrgram(data.df, order=TRUE, lower.panel=panel.shade,
         upper.panel=panel.pie, text.panel=panel.txt,
         main="Corrgram of Dataset")

cov(data.df)
##                    age           sex      gmat_tot      gmat_qpc
## age       1.376904e+01 -4.513248e-02 -3.115879e+01 -1.192655e+01
## sex      -4.513248e-02  1.872677e-01 -1.328841e+00 -1.053769e+00
## gmat_tot -3.115879e+01 -1.328841e+00  3.310688e+03  6.200233e+02
## gmat_qpc -1.192655e+01 -1.053769e+00  6.200233e+02  2.210731e+02
## gmat_vpc -2.763643e+00  5.463758e-01  7.260006e+02  3.814826e+01
## gmat_tpc -8.839978e+00 -4.908960e-02  6.839911e+02  1.357997e+02
## s_avg     2.116874e-01  2.096227e-02  2.480257e+00 -1.691233e-01
## f_avg    -3.399348e-02  2.082698e-02  3.154688e+00  5.753854e-01
## quarter  -2.045935e-01 -6.414267e-02 -5.891153e+00  6.001979e-01
## work_yrs  1.029494e+01 -1.580172e-02 -3.391634e+01 -1.137186e+01
## frstlang  6.796610e-02  2.138980e-04 -2.499933e+00  6.646346e-01
## salary   -1.183042e+04  1.518264e+03 -1.611600e+05 -3.335823e+04
## satis    -1.763499e+02 -8.780808e+00  1.765263e+03  3.348371e+02
##               gmat_vpc     gmat_tpc         s_avg        f_avg
## age         -2.7636427   -8.8399775    0.21168739  -0.03399348
## sex          0.5463758   -0.0490896    0.02096227   0.02082698
## gmat_tot   726.0006417  683.9910698    2.48025721   3.15468838
## gmat_qpc    38.1482581  135.7996845   -0.16912329   0.57538542
## gmat_vpc   284.2481217  157.4932488    1.31357023   0.67207000
## gmat_tpc   157.4932488  196.6057057    0.62710008   0.58698618
## s_avg        1.3135702    0.6271001    0.14521760   0.11016898
## f_avg        0.6720700    0.5869862    0.11016898   0.27567237
## quarter     -3.2676666   -1.2923719   -0.32237213  -0.26080880
## work_yrs    -3.6181653   -7.8575172    0.15926392  -0.06628700
## frstlang    -2.1145691   -0.4663244   -0.01671372  -0.00626026
## salary   -5273.8523836 3522.7500067 2831.60098580 787.65597177
## satis      392.3562739  484.2466779   -4.62884495   2.12532927
##                quarter      work_yrs      frstlang        salary
## age      -2.045935e-01   10.29493864  6.796610e-02 -1.183042e+04
## sex      -6.414267e-02   -0.01580172  2.138980e-04  1.518264e+03
## gmat_tot -5.891153e+00  -33.91633914 -2.499933e+00 -1.611600e+05
## gmat_qpc  6.001979e-01  -11.37186171  6.646346e-01 -3.335823e+04
## gmat_vpc -3.267667e+00   -3.61816529 -2.114569e+00 -5.273852e+03
## gmat_tpc -1.292372e+00   -7.85751718 -4.663244e-01  3.522750e+03
## s_avg    -3.223721e-01    0.15926392 -1.671372e-02  2.831601e+03
## f_avg    -2.608088e-01   -0.06628700 -6.260260e-03  7.876560e+02
## quarter   1.232119e+00   -0.30866822  3.553381e-02 -9.296214e+03
## work_yrs -3.086682e-01   10.44882490 -2.898318e-02  1.486147e+03
## frstlang  3.553381e-02   -0.02898318  1.035266e-01 -1.419586e+03
## salary   -9.296214e+03 1486.14704152 -1.419586e+03  2.596062e+09
## satis    -5.227133e-03 -131.24080907  9.484532e+00 -6.347115e+06
##                  satis
## age      -1.763499e+02
## sex      -8.780808e+00
## gmat_tot  1.765263e+03
## gmat_qpc  3.348371e+02
## gmat_vpc  3.923563e+02
## gmat_tpc  4.842467e+02
## s_avg    -4.628845e+00
## f_avg     2.125329e+00
## quarter  -5.227133e-03
## work_yrs -1.312408e+02
## frstlang  9.484532e+00
## salary   -6.347115e+06
## satis     1.380974e+05
var(data.df)
##                    age           sex      gmat_tot      gmat_qpc
## age       1.376904e+01 -4.513248e-02 -3.115879e+01 -1.192655e+01
## sex      -4.513248e-02  1.872677e-01 -1.328841e+00 -1.053769e+00
## gmat_tot -3.115879e+01 -1.328841e+00  3.310688e+03  6.200233e+02
## gmat_qpc -1.192655e+01 -1.053769e+00  6.200233e+02  2.210731e+02
## gmat_vpc -2.763643e+00  5.463758e-01  7.260006e+02  3.814826e+01
## gmat_tpc -8.839978e+00 -4.908960e-02  6.839911e+02  1.357997e+02
## s_avg     2.116874e-01  2.096227e-02  2.480257e+00 -1.691233e-01
## f_avg    -3.399348e-02  2.082698e-02  3.154688e+00  5.753854e-01
## quarter  -2.045935e-01 -6.414267e-02 -5.891153e+00  6.001979e-01
## work_yrs  1.029494e+01 -1.580172e-02 -3.391634e+01 -1.137186e+01
## frstlang  6.796610e-02  2.138980e-04 -2.499933e+00  6.646346e-01
## salary   -1.183042e+04  1.518264e+03 -1.611600e+05 -3.335823e+04
## satis    -1.763499e+02 -8.780808e+00  1.765263e+03  3.348371e+02
##               gmat_vpc     gmat_tpc         s_avg        f_avg
## age         -2.7636427   -8.8399775    0.21168739  -0.03399348
## sex          0.5463758   -0.0490896    0.02096227   0.02082698
## gmat_tot   726.0006417  683.9910698    2.48025721   3.15468838
## gmat_qpc    38.1482581  135.7996845   -0.16912329   0.57538542
## gmat_vpc   284.2481217  157.4932488    1.31357023   0.67207000
## gmat_tpc   157.4932488  196.6057057    0.62710008   0.58698618
## s_avg        1.3135702    0.6271001    0.14521760   0.11016898
## f_avg        0.6720700    0.5869862    0.11016898   0.27567237
## quarter     -3.2676666   -1.2923719   -0.32237213  -0.26080880
## work_yrs    -3.6181653   -7.8575172    0.15926392  -0.06628700
## frstlang    -2.1145691   -0.4663244   -0.01671372  -0.00626026
## salary   -5273.8523836 3522.7500067 2831.60098580 787.65597177
## satis      392.3562739  484.2466779   -4.62884495   2.12532927
##                quarter      work_yrs      frstlang        salary
## age      -2.045935e-01   10.29493864  6.796610e-02 -1.183042e+04
## sex      -6.414267e-02   -0.01580172  2.138980e-04  1.518264e+03
## gmat_tot -5.891153e+00  -33.91633914 -2.499933e+00 -1.611600e+05
## gmat_qpc  6.001979e-01  -11.37186171  6.646346e-01 -3.335823e+04
## gmat_vpc -3.267667e+00   -3.61816529 -2.114569e+00 -5.273852e+03
## gmat_tpc -1.292372e+00   -7.85751718 -4.663244e-01  3.522750e+03
## s_avg    -3.223721e-01    0.15926392 -1.671372e-02  2.831601e+03
## f_avg    -2.608088e-01   -0.06628700 -6.260260e-03  7.876560e+02
## quarter   1.232119e+00   -0.30866822  3.553381e-02 -9.296214e+03
## work_yrs -3.086682e-01   10.44882490 -2.898318e-02  1.486147e+03
## frstlang  3.553381e-02   -0.02898318  1.035266e-01 -1.419586e+03
## salary   -9.296214e+03 1486.14704152 -1.419586e+03  2.596062e+09
## satis    -5.227133e-03 -131.24080907  9.484532e+00 -6.347115e+06
##                  satis
## age      -1.763499e+02
## sex      -8.780808e+00
## gmat_tot  1.765263e+03
## gmat_qpc  3.348371e+02
## gmat_vpc  3.923563e+02
## gmat_tpc  4.842467e+02
## s_avg    -4.628845e+00
## f_avg     2.125329e+00
## quarter  -5.227133e-03
## work_yrs -1.312408e+02
## frstlang  9.484532e+00
## salary   -6.347115e+06
## satis     1.380974e+05

Task 1b: Subset of dataset consisting of only those people who actually got a job

placed.df <- subset(data.df, salary>0 & salary!= 998 & salary!=999)
View(placed.df)

Contingency Tables

mytable <- with(placed.df, table(sex))
mytable
## sex
##  1  2 
## 72 31
mytable <- with(placed.df, table(frstlang))
mytable
## frstlang
##  1  2 
## 96  7
aggregate(salary ~ sex, data=placed.df, mean)
##   sex    salary
## 1   1 104970.97
## 2   2  98524.39
aggregate(salary ~ frstlang, data=placed.df, mean)
##   frstlang   salary
## 1        1 101748.6
## 2        2 120614.3

T-Tests

t.test(salary ~ sex, data=placed.df)
## 
##  Welch Two Sample t-test
## 
## data:  salary by sex
## t = 1.3628, df = 38.115, p-value = 0.1809
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3128.55 16021.72
## sample estimates:
## mean in group 1 mean in group 2 
##       104970.97        98524.39

Comments: p-value=0.1809(>0.05)

t.test(salary ~ frstlang, data=placed.df)
## 
##  Welch Two Sample t-test
## 
## data:  salary by frstlang
## t = -1.1202, df = 6.0863, p-value = 0.3049
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -59933.62  22202.25
## sample estimates:
## mean in group 1 mean in group 2 
##        101748.6        120614.3

Regression Model1:

fit <- lm(salary ~ work_yrs + gmat_tot + s_avg + f_avg, data = placed.df)
summary(fit)
## 
## Call:
## lm(formula = salary ~ work_yrs + gmat_tot + s_avg + f_avg, data = placed.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -36351  -8173  -1170   3864  87090 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 98635.48   22643.60   4.356 3.26e-05 ***
## work_yrs     2579.88     577.97   4.464 2.16e-05 ***
## gmat_tot      -14.98      32.53  -0.460    0.646    
## s_avg        2422.16    5033.78   0.481    0.631    
## f_avg       -1087.60    3889.90  -0.280    0.780    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16200 on 98 degrees of freedom
## Multiple R-squared:  0.2098, Adjusted R-squared:  0.1776 
## F-statistic: 6.506 on 4 and 98 DF,  p-value: 0.0001098

Regression Model2:

fit <- lm(salary ~ work_yrs + age + s_avg + frstlang , data = placed.df)
summary(fit)
## 
## Call:
## lm(formula = salary ~ work_yrs + age + s_avg + frstlang, data = placed.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -32957  -9005  -1362   4613  76947 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  34470.6    26189.2   1.316   0.1912  
## work_yrs       746.2     1121.1   0.666   0.5072  
## age           1833.3     1085.7   1.689   0.0945 .
## s_avg         2207.1     4233.5   0.521   0.6033  
## frstlang      9270.9     6894.3   1.345   0.1818  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15630 on 98 degrees of freedom
## Multiple R-squared:  0.2647, Adjusted R-squared:  0.2347 
## F-statistic: 8.818 on 4 and 98 DF,  p-value: 4.008e-06

Comments: Model 2 is a better model

Task 1b: Comparing the remaining subset of those people who did not get a job and compare them with those people who got a job

notplaced.df <- subset(data.df, salary==0)
View(notplaced.df)

T-Test:

t.test(gmat_tot ~ frstlang, data=notplaced.df)
## 
##  Welch Two Sample t-test
## 
## data:  gmat_tot by frstlang
## t = 0.51644, df = 7.9236, p-value = 0.6197
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -49.86769  78.58720
## sample estimates:
## mean in group 1 mean in group 2 
##        615.6098        601.2500

Comments: p-value=0.6197