Reading the data set into R

# Load the survey data; the CSV is expected in the current working directory.
# The data frame is deliberately not attached: every later step reaches the
# columns through `mba$...` or a `data =` argument, so nothing needs the
# columns on the search path.
mba <- read.csv("MBA Starting Salaries Data.csv")
View(mba)
# Number of rows and columns, then the type and first values of each column.
dim(mba)
## [1] 274  13
str(mba)
## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : int  2 1 1 1 2 1 1 2 1 1 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: int  1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : int  7 6 6 7 5 6 5 6 4 998 ...

Create summary statistics (e.g. mean, standard deviation, median, mode) for the important variables in the dataset.

# Min / quartiles / median / mean / max for every column of mba.
# NOTE(review): salary and satis contain the values 998 / 999 (visible in the
# str() output and in Max. of satis), so the Mean reported for those two
# columns includes them; confirm whether they are non-response codes.
summary(mba)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0
# psych::describe() prints, per column: n, mean, sd, median, trimmed mean, mad,
# min, max, range, skew, kurtosis and standard error.
library(psych)
describe(mba)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45
# Split the respondents on the salary column. The values 0, 998 and 999 are
# treated as special codes (see the object names); everything else is kept as
# a reported salary in `salarygiven`.
notplaced <- subset(mba, salary == 0)
placed <- subset(mba, salary != 0)
notanswered <- subset(placed, salary == 998)
notdisclosed <- subset(placed, salary == 999)
# Two-step filter: drop the 999 rows first, then the 998 rows.
a <- subset(placed, salary != 999)
salarygiven <- subset(a, salary != 998)
View(salarygiven)
  1. To Draw Box Plots / Bar Plots to visualize the distribution of each variable independently
# Salary plots are drawn from `salarygiven` so that only reported salaries are
# shown; `placed` still holds the 998 / 999 codes, and `mba` also holds the 0s.
# boxplot() is called on a vector here, so no `data =` argument is passed
# (only the formula method of boxplot() has one).
boxplot(salarygiven$salary, xlab = "Salary", main = "Boxplot of Salary", horizontal = TRUE, col = "blue")

boxplot(placed$age, xlab = "Age", main = "Boxplot of Age", horizontal = TRUE, col = "yellow")

library(lattice)
# The formula names the column only, so `data =` decides which rows are plotted.
histogram(~ salary, data = salarygiven, main = "Distribution of Salary", xlab = "Salary", col = "grey")

To draw scatter plots to understand how the variables are correlated pair-wise

Salary vs work experience

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Salary against work experience, one point style per sex. Only rows with a
# reported salary (`salarygiven`) are plotted; `placed` would also draw the
# 998 / 999 salary codes as if they were salaries.
scatterplot(salary ~ work_yrs | sex, data = salarygiven, main = "Scatterplot of Salary with Work Experience", xlab = "Work Experience", ylab = "Starting Salaries")

Salary vs Degree of satisfaction

# Salary against satisfaction for rows with a reported salary. Rows with
# satis == 998 are dropped as well: 998 appears in satis although its third
# quartile is 7, so it is presumably a non-response code (TODO confirm).
scatterplot(salary ~ satis, data = subset(salarygiven, satis != 998), main = "Scatterplot of Salary with Degree of satisfaction", xlab = "Degree of satisfaction", ylab = "Starting Salaries")

Salary vs GMAT total score

# Salary against GMAT total score, restricted to rows with a reported salary
# so the 998 / 999 salary codes kept in `placed` are not plotted as salaries.
scatterplot(salary ~ gmat_tot, data = salarygiven, main = "Scatterplot of Salary with GMAT Total Score", xlab = "GMAT Total Score", ylab = "Starting Salaries")

To draw a Corrgram and to Create a Variance-Covariance Matrix

library(corrgram)
# Correlations are computed on rows with a reported salary; with `placed` the
# 998 / 999 salary codes would enter the correlations as numeric values.
corrgram(salarygiven, order = FALSE, lower.panel = panel.shade, upper.panel = panel.pie, text.panel = panel.txt, main = "Corrgram of Placed Variables")

# scatterplotMatrix() is the current name of car's deprecated
# scatterplot.matrix().
scatterplotMatrix(~ salary + gmat_tot + s_avg + f_avg + work_yrs, data = salarygiven,
    main = "Salary versus other variables")

Draw contingency tables, as appropriate

# Rows of `placed` without the 998 salary code.
gotplaced <- subset(placed, salary != 998)
View(gotplaced)
# One-way counts by sex.
table(gotplaced$sex)
## 
##   1   2 
## 102  36
# One-way counts by first language.
table(gotplaced$frstlang)
## 
##   1   2 
## 122  16
# Two-way counts: sex (rows) by first language (columns).
x <- table(gotplaced$sex, gotplaced$frstlang)
x
##    
##      1  2
##   1 91 11
##   2 31  5
# Same table as percentages of the grand total.
100 * prop.table(x)
##    
##             1         2
##   1 65.942029  7.971014
##   2 22.463768  3.623188
# One-way counts by years of work experience.
table(gotplaced$work_yrs)
## 
##  0  1  2  3  4  5  6  7  8  9 10 15 16 
##  2 10 44 29 22  9  9  3  4  1  1  2  2
## 
##  0  1  2  3  4  5  6  7  8  9 10 15 16 
##  2 10 44 29 22  9  9  3  4  1  1  2  2

To run chi-square tests, as appropriate

# Placement indicator derived from salary:
#   1  = a salary above 1000 was reported (placed),
#   0  = salary == 0 (the rows collected in `notplaced`),
#   NA = salary codes 998 / 999 (`notanswered` / `notdisclosed`): unknown.
# Both 0 and 1 must be assigned; setting only the 1s leaves every other row NA
# and collapses the table below to a single row.
mba$isplaced <- NA_integer_
mba$isplaced[mba$salary == 0] <- 0L
mba$isplaced[mba$salary > 1000] <- 1L
View(mba)
# 2 x 2 table of placement by sex; rows with NA placement are not counted.
s <- xtabs(~ isplaced + sex, data = mba)
s
# Column percentages: share of placed / not placed within each sex.
prop.table(s, 2) * 100

Hypothesis: The percentage of females who got placed is higher than the percentage of males who got placed.

Chi-square test

# On a 2 x 2 table chisq.test() runs Pearson's test of independence; on a
# one-row table it would instead run a goodness-of-fit test "for given
# probabilities", which does not compare placement rates between the sexes.
chisq.test(s)
# Printed output is not reproduced here; re-run the script to regenerate it.

To run t-tests, as appropriate

Null Hypothesis 1: There is no significant difference between the salaries of females who are placed and the salaries of males who are placed.

# Two-sample t-test of salary between the two sex groups, using only rows with
# a reported salary (`salarygiven`). The recorded output shows R ran the Welch
# (unequal-variance) version.
t.test(salarygiven$salary~salarygiven$sex)
## 
##  Welch Two Sample t-test
## 
## data:  salarygiven$salary by salarygiven$sex
## t = 1.3628, df = 38.115, p-value = 0.1809
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3128.55 16021.72
## sample estimates:
## mean in group 1 mean in group 2 
##       104970.97        98524.39

Since the p-value (0.1809) is greater than 0.05, we fail to reject the null hypothesis: there is no statistically significant difference between the salaries of females who are placed and the salaries of males who are placed.

Null Hypothesis 2: There is no significant difference between the salaries of placed students whose first language is English and the salaries of those whose first language is not English.

# Two-sample t-test of salary between the two first-language groups, using only
# rows with a reported salary (`salarygiven`). The recorded output shows R ran
# the Welch (unequal-variance) version.
t.test(salarygiven$salary~salarygiven$frstlang)
## 
##  Welch Two Sample t-test
## 
## data:  salarygiven$salary by salarygiven$frstlang
## t = -1.1202, df = 6.0863, p-value = 0.3049
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -59933.62  22202.25
## sample estimates:
## mean in group 1 mean in group 2 
##        101748.6        120614.3

Since the p-value (0.3049) is greater than 0.05, we fail to reject the null hypothesis: there is no statistically significant difference between the salaries of placed students whose first language is English and the salaries of those whose first language is not English.

Linear regression. Independent variables: {work_yrs, s_avg, f_avg, gmat_qpc, gmat_vpc, gmat_tpc, sex, frstlang, satis}. Dependent variable: salary.

# Linear model of salary on nine predictors, fitted by lm().
# NOTE(review): the model is fitted on `gotplaced`, which only removes the
# salary == 998 rows; rows with salary == 999 (the ones collected in
# `notdisclosed`) are still present and enter the fit as a salary of 999.
# The recorded output uses 138 observations (128 residual df + 10
# coefficients), the same count as the gotplaced tables. Confirm whether
# `salarygiven` was intended.
# NOTE(review): every term is written as gotplaced$..., so the variables are
# taken from that object directly and `data =` plays no part in the lookup.
f1 <- gotplaced$salary ~ gotplaced$work_yrs + gotplaced$s_avg + gotplaced$f_avg + gotplaced$gmat_qpc + gotplaced$gmat_vpc + gotplaced$gmat_tpc + gotplaced$sex + gotplaced$frstlang + gotplaced$satis 
lm1 <- lm(f1, data = gotplaced)
summary(lm1)
## 
## Call:
## lm(formula = f1, data = gotplaced)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -102066  -21162    7023   24234  129791 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -85125.9    47151.0  -1.805  0.07336 .  
## gotplaced$work_yrs   1486.8     1341.7   1.108  0.26986    
## gotplaced$s_avg     32113.7    10752.2   2.987  0.00338 ** 
## gotplaced$f_avg     -1726.0     9014.2  -0.191  0.84845    
## gotplaced$gmat_qpc   -404.5      364.8  -1.109  0.26955    
## gotplaced$gmat_vpc   -713.1      326.3  -2.185  0.03067 *  
## gotplaced$gmat_tpc    594.5      486.8   1.221  0.22422    
## gotplaced$sex        6582.5     7832.9   0.840  0.40227    
## gotplaced$frstlang -15675.1    12231.1  -1.282  0.20231    
## gotplaced$satis     20286.7     3133.3   6.475 1.85e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38810 on 128 degrees of freedom
## Multiple R-squared:  0.3669, Adjusted R-squared:  0.3223 
## F-statistic: 8.241 on 9 and 128 DF,  p-value: 1.321e-09

If the p-value of an independent variable is less than 0.05, then salary depends significantly on that variable. So salary depends significantly on s_avg, gmat_vpc and satis.

# Reduced linear model: same response, with f_avg, gmat_qpc and gmat_tpc left
# out of the predictor list.
# NOTE(review): fitted on `gotplaced`, which still contains the salary == 999
# rows (138 observations in the recorded output); confirm whether
# `salarygiven` was intended.
f2 <- gotplaced$salary ~ gotplaced$work_yrs + gotplaced$s_avg + gotplaced$gmat_vpc + gotplaced$sex + gotplaced$frstlang + gotplaced$satis 
lm2 <- lm(f2, data = gotplaced)
summary(lm2)
## 
## Call:
## lm(formula = f2, data = gotplaced)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -100997  -21894    7832   24235  123531 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -94161.3    40734.6  -2.312  0.02236 *  
## gotplaced$work_yrs   1668.6     1233.8   1.352  0.17858    
## gotplaced$s_avg     29977.9     9057.6   3.310  0.00121 ** 
## gotplaced$gmat_vpc   -429.5      217.6  -1.973  0.05055 .  
## gotplaced$sex        7555.2     7636.5   0.989  0.32431    
## gotplaced$frstlang -14968.2    11886.1  -1.259  0.21016    
## gotplaced$satis     20801.8     3086.6   6.739 4.58e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38620 on 131 degrees of freedom
## Multiple R-squared:  0.3585, Adjusted R-squared:  0.3291 
## F-statistic:  12.2 on 6 and 131 DF,  p-value: 7.177e-11
# Print the estimated coefficient vector of lm2.
lm2$coefficients
##        (Intercept) gotplaced$work_yrs    gotplaced$s_avg 
##        -94161.2871          1668.5710         29977.9326 
## gotplaced$gmat_vpc      gotplaced$sex gotplaced$frstlang 
##          -429.5049          7555.2158        -14968.2102 
##    gotplaced$satis 
##         20801.8085

Logistic Regression

# Logistic regression of placement on the other survey variables.
# The response is built here from salary: 1 = salary above 1000 (placed),
# 0 = salary == 0 (not placed). Rows with the 998 / 999 salary codes are left
# out because their placement status is not known. A response that is only
# ever 1 (or NA) has no variation and cannot be modelled.
placement <- subset(mba, salary == 0 | salary > 1000)
placement$isplaced <- as.integer(placement$salary > 1000)
# satis == 998 is presumably a non-response code (TODO confirm); as NA those
# rows are dropped by glm() instead of entering as a satisfaction of 998.
placement$satis[placement$satis == 998] <- NA
# salary is removed from the predictors: isplaced is computed from it, so it
# would separate the two classes perfectly and keep the fit from converging.
lm3 <- glm(isplaced ~ . - salary, family = binomial(link = "logit"), data = placement)
summary(lm3)
lm3$coefficients
# Printed output is not reproduced here; re-run the script to regenerate it.