Review the Data

Read and Summarize the Data

We do this by using the read.csv command

# Read the data
# The file name is a fixed string, so it is handed to read.csv() directly.
# The path is relative to the current working directory.
MBA.df <- read.csv("MBA Starting Salaries Data.csv")

Next, we summarize the data.

# Summarize the data
# MBA.df is deliberately not attach()ed: columns are always reached through
# `data =` or `MBA.df$...`, and an attached copy would not follow later
# modifications of MBA.df.
library(psych)
# Descriptive statistics (n, mean, sd, median, ...) for every column.
describe(MBA.df)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45
# Data Types
# Show the storage type and the first few values of every column.
str(MBA.df)
## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : int  2 1 1 1 2 1 1 2 1 1 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: int  1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : int  7 6 6 7 5 6 5 6 4 998 ...
# Minimum, quartiles, median, mean and maximum of every column.
summary(MBA.df)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0
# Mean and standard deviation of age.
mean(MBA.df[["age"]])
## [1] 27.35766
sd(MBA.df[["age"]])
## [1] 3.710666
# Mean and standard deviation of salary.
# NOTE(review): salary holds the values 0, 998 and 999 for some rows (visible
# in the str() output); they are averaged in here like real salaries --
# confirm that is intended.
mean(MBA.df[["salary"]])
## [1] 39025.69
sd(MBA.df[["salary"]])
## [1] 50951.56
# Mean and standard deviation of work_yrs.
mean(MBA.df[["work_yrs"]])
## [1] 3.872263
sd(MBA.df[["work_yrs"]])
## [1] 3.232464
# Mean salary for each distinct value of work_yrs.
aggregate(salary ~ work_yrs, data = MBA.df, FUN = mean)
##    work_yrs    salary
## 1         0  31999.67
## 2         1  34677.08
## 3         2  45531.24
## 4         3  38494.21
## 5         4  27510.81
## 6         5  34476.10
## 7         6  62041.33
## 8         7  11221.78
## 9         8  60156.86
## 10        9    499.50
## 11       10  59000.00
## 12       11      0.00
## 13       12      0.00
## 14       13      0.00
## 15       15 183000.00
## 16       16  72333.33
## 17       18      0.00
## 18       22      0.00
# Box plot of starting salary for each value of work experience.
boxplot(
  salary ~ work_yrs,
  data = MBA.df,
  main = "Effect of Work Experience on Salary",
  xlab = "Work Experience",
  ylab = "Starting Salary"
)

##Distribution of MBA’s Starting Salary

library(lattice)
# Histogram of the salary column.
histogram(
  ~ salary,
  data = MBA.df,
  main = "Distribution of MBA's Starting Salary",
  xlab = "Starting Salary"
)

# Horizontal box-and-whisker plot of salary for each gender.
# par(mfrow = ...) only configures base graphics and lattice is already loaded
# for the histogram, so neither call is repeated here.
# sex is stored as the codes 1/2; it is shown as a labelled factor
# (1 = Male, 2 = Female, the same coding used by the recoding step before the
# regression) so the axis reads as categories rather than numbers.
bwplot(
  factor(sex, levels = c(1, 2), labels = c("Male", "Female")) ~ salary,
  data = MBA.df,
  horizontal = TRUE,
  xlab = "Salary",
  ylab = "Gender",
  main = "Comparison of Salaries of Males and Females"
)

## => 3a.  Draw scatter plots of Age vs. Salary and Gender vs. Salary

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# The response (salary) is on the left of each formula so that it is drawn on
# the y-axis, matching ylab="Salary"; the predictor is drawn on the x-axis,
# matching xlab.
# NOTE(review): `spread` and `smoother.args` belong to older car releases --
# confirm they are still accepted by the installed version.
scatterplot(salary ~ age, data=MBA.df,
            spread=FALSE, smoother.args=list(lty=2), pch=19,
            main="Scatterplot of Age vs. Salary",
            xlab="Age",
            ylab="Salary")

scatterplot(salary ~ sex, data=MBA.df,
            spread=FALSE, smoother.args=list(lty=2), pch=19,
            main="Scatterplot of Gender vs. Salary",
            xlab="Gender",
            ylab="Salary")

# Relationship between Salary and other variables

# => 4. Construct a Correlation Matrix for all variables in the dataset
# options(digits=2) is a session-wide setting: it also shortens the numbers
# printed by every later statement, not only this matrix.
options(digits=2)
# Kendall rank correlations between every pair of columns in MBA.df.
cor(MBA.df, use="complete.obs", method="kendall") 
##              age     sex gmat_tot gmat_qpc gmat_vpc gmat_tpc   s_avg
## age       1.0000 -0.0649  -0.0893   -0.143   0.0043  -0.0921  0.0746
## sex      -0.0649  1.0000  -0.0315   -0.116   0.0632  -0.0292  0.1050
## gmat_tot -0.0893 -0.0315   1.0000    0.570   0.6028   0.9760  0.0757
## gmat_qpc -0.1429 -0.1160   0.5705    1.000   0.1358   0.5662 -0.0336
## gmat_vpc  0.0043  0.0632   0.6028    0.136   1.0000   0.5948  0.1461
## gmat_tpc -0.0921 -0.0292   0.9760    0.566   0.5948   1.0000  0.0698
## s_avg     0.0746  0.1050   0.0757   -0.034   0.1461   0.0698  1.0000
## f_avg     0.0235  0.1064   0.0980    0.019   0.1189   0.0921  0.5907
## quarter   0.0046 -0.1220  -0.0656    0.041  -0.1370  -0.0572 -0.7192
## work_yrs  0.6550  0.0039  -0.1466   -0.190  -0.0477  -0.1440  0.0621
## frstlang  0.0827  0.0015  -0.1049    0.126  -0.2535  -0.1007 -0.1214
## salary   -0.0670 -0.0070   0.0015   -0.018   0.0065   0.0019  0.0547
## satis    -0.1218 -0.0307   0.0797   -0.025   0.1085   0.0808 -0.0033
##            f_avg quarter work_yrs frstlang  salary   satis
## age       0.0235  0.0046   0.6550   0.0827 -0.0670 -0.1218
## sex       0.1064 -0.1220   0.0039   0.0015 -0.0070 -0.0307
## gmat_tot  0.0980 -0.0656  -0.1466  -0.1049  0.0015  0.0797
## gmat_qpc  0.0192  0.0408  -0.1901   0.1255 -0.0183 -0.0249
## gmat_vpc  0.1189 -0.1370  -0.0477  -0.2535  0.0065  0.1085
## gmat_tpc  0.0921 -0.0572  -0.1440  -0.1007  0.0019  0.0808
## s_avg     0.5907 -0.7192   0.0621  -0.1214  0.0547 -0.0033
## f_avg     1.0000 -0.4800  -0.0038  -0.0700  0.0237 -0.0010
## quarter  -0.4800  1.0000  -0.0222   0.0912 -0.1027 -0.0069
## work_yrs -0.0038 -0.0222   1.0000  -0.0012 -0.0091 -0.0502
## frstlang -0.0700  0.0912  -0.0012   1.0000 -0.0100 -0.0911
## salary    0.0237 -0.1027  -0.0091  -0.0100  1.0000 -0.0020
## satis    -0.0010 -0.0069  -0.0502  -0.0911 -0.0020  1.0000

Construct a Corrgram for all variables in the dataset

library(corrgram)
# Corrgram of every column in MBA.df, keeping the columns in data-frame order
# (order=FALSE). The lower triangle is drawn with panel.shade, the upper
# triangle with panel.pie, the diagonal with panel.minmax and the variable
# names with panel.txt.
corrgram(MBA.df, order=FALSE, 
         lower.panel=panel.shade,
         upper.panel=panel.pie, 
         diag.panel=panel.minmax,
         text.panel=panel.txt,
         main="Corrgram of MBA starting salaries intercorrelations")

library(car)
# Scatter plot matrix restricted to age, sex, work experience and salary.
scatterplotMatrix(MBA.df[,c("age","sex","work_yrs","salary")], 
                  spread=FALSE, smoother.args=list(lty=2),
                  main="Scatter Plot Matrix")

# Scatter plot matrix of a wider set of variables, with no extra display
# arguments.
scatterplotMatrix(~age+sex+gmat_tot+s_avg+f_avg+work_yrs+salary+satis, data=MBA.df)

##Take a subset of the dataset consisting of only those people who actually got a job.

# Split the respondents by what the salary column records.
# which() is kept so that rows with a missing salary would be dropped rather
# than turned into all-NA rows.
placed.df <- MBA.df[which(MBA.df$salary > 1000), ]              # an actual salary figure
notPlaced.df <- MBA.df[which(MBA.df$salary == 0), ]             # salary recorded as 0
notDisclosedSalary.df <- MBA.df[which(MBA.df$salary == 999), ]  # salary recorded as 999

# Working data set: the three groups stacked in this order. Rows whose salary
# matches none of the three conditions (e.g. 998) are left out.
clean.df <- rbind(placed.df, notDisclosedSalary.df, notPlaced.df)

# GotPlaced is a factor built from the logical test salary > 1000, so it is
# "TRUE" only for the rows that came from placed.df.
clean.df$GotPlaced <- factor(clean.df$salary > 1000)

# Number of rows per placement status.
t1 <- table(clean.df$GotPlaced == "TRUE")
t1
## 
## FALSE  TRUE 
##   125   103

Think about the problem as y = f(x), where y = Starting Salary and x = various factors that it could depend upon

Effect of Gender on Salary

 # Mean salary, work experience and age for each value of sex.
 aggregate(cbind(salary, work_yrs, age) ~ sex, data = MBA.df, mean)
##   sex salary work_yrs age
## 1   1  37014      3.9  27
## 2   2  45121      3.8  27
# Box plot of starting salary for each gender. boxplot() draws the grouping
# variable (sex) along the x-axis and salary along the y-axis, so the axis
# labels are assigned accordingly.
boxplot(salary ~ sex, data = MBA.df,
        main = "Effect of Gender on Salary",
        xlab = "Gender",
        ylab = "Starting Salary")

# Cross-tabulate placement status (rows) against sex (columns).
t2 <- xtabs(~ GotPlaced + sex , data= clean.df) 
t2
##          sex
## GotPlaced  1  2
##     FALSE 97 28
##     TRUE  72 31

Effect of Age on Salary

# Mean salary and work experience for each distinct age.
aggregate(cbind(salary, work_yrs) ~ age, data = MBA.df, mean)
##    age salary work_yrs
## 1   22  42500      1.0
## 2   23  57282      1.8
## 3   24  49342      1.7
## 4   25  43396      2.3
## 5   26  35982      2.9
## 6   27  31499      3.1
## 7   28  39809      4.7
## 8   29  28068      4.5
## 9   30  55291      5.6
## 10  31  40599      5.8
## 11  32  13662      5.6
## 12  33 118000     10.0
## 13  34  26250     11.5
## 14  35      0      9.3
## 15  36      0     12.5
## 16  37      0      9.0
## 17  39  56000     10.5
## 18  40 183000     15.0
## 19  42      0     13.0
## 20  43      0     19.0
## 21  48      0     22.0
# Cross-tabulate placement status (rows) against age (one column per age).
t3 <- xtabs(~ GotPlaced + age , data= clean.df) 
t3
##          age
## GotPlaced 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 39 40 42 43 48
##     FALSE  1  3 15 15 15 21  9 16  5  4  7  0  3  3  2  1  1  0  1  2  1
##     TRUE   1  5 16 23 14 14  8  6  6  4  1  1  1  0  0  0  1  2  0  0  0
# Column proportions: share of each placement status within every age.
prop.table(t3, 2)
##          age
## GotPlaced   22   23   24   25   26   27   28   29   30   31   32   33   34
##     FALSE 0.50 0.38 0.48 0.39 0.52 0.60 0.53 0.73 0.45 0.50 0.88 0.00 0.75
##     TRUE  0.50 0.62 0.52 0.61 0.48 0.40 0.47 0.27 0.55 0.50 0.12 1.00 0.25
##          age
## GotPlaced   35   36   37   39   40   42   43   48
##     FALSE 1.00 1.00 1.00 0.50 0.00 1.00 1.00 1.00
##     TRUE  0.00 0.00 0.00 0.50 1.00 0.00 0.00 0.00

Effect of First Language on Salary

# Mean salary and work experience for each value of frstlang.
aggregate(cbind(salary, work_yrs) ~ frstlang , data = MBA.df, mean)
##   frstlang salary work_yrs
## 1        1  40627      3.9
## 2        2  26915      3.6
# Cross-tabulate placement status (rows) against first language (columns).
t4 <- xtabs(~ GotPlaced + frstlang, data=clean.df )
t4
##          frstlang
## GotPlaced   1   2
##     FALSE 108  17
##     TRUE   96   7
# Column proportions: share of each placement status within every language group.
prop.table(t4, 2)
##          frstlang
## GotPlaced    1    2
##     FALSE 0.53 0.71
##     TRUE  0.47 0.29

Chi-Sq. tests

H1: The percentage of females who got placed is higher than the percentage of males who got placed

# Chi-squared test on the placement-by-sex table t2.
# NOTE(review): the hypothesis above is directional, whereas this test only
# asks whether placement and sex are associated -- confirm the intended test.
chisq.test(t2)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  t2
## X-squared = 1, df = 1, p-value = 0.2

H2: The percentage of people placed whose first language is English is higher than the percentage of people placed whose first language is not English

# Chi-squared test on the placement-by-first-language table t4.
chisq.test(t4)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  t4
## X-squared = 2, df = 1, p-value = 0.1

H3: The percentage of people placed whose age is less than 30 is higher than the percentage of people placed whose age is greater than 30

# Chi-squared test on the placement-by-age table t3.
# NOTE(review): t3 has one column per distinct age, many with very small
# counts, while the hypothesis compares people under 30 with people over 30.
# Consider testing a two-group age split instead; how age 30 itself should be
# grouped needs confirming.
chisq.test(t3)
## 
##  Pearson's Chi-squared test
## 
## data:  t3
## X-squared = 20, df = 20, p-value = 0.2

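The Chi-Square tests above relate placement status (GotPlaced) to sex, first language and age. None of them is significant at the 5% level (p-values of about 0.2, 0.1 and 0.2 respectively), so these tests give no evidence that placement depends on any of the three factors. GMAT percentiles, work experience, average GPA for the Spring and Fall semesters, quartile ranking and satisfaction with the degree were not tested here, and the tests concern whether a student was placed rather than the salary amount. This is in contrast with the differences suggested by the tables and plots that we observed earlier.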

Preparing for regression analysis

# Recode the integer codes of sex (1 = Male, 2 = Female) and frstlang
# (1 = English, 2 = Other) as labelled factors so that lm() treats them as
# categorical predictors.
# The recoding is applied to placed.df as well as MBA.df: placed.df was subset
# from MBA.df before this point and is the data set the regression is fit on,
# so recoding MBA.df alone would leave the model with numeric codes.
# Levels are ordered alphabetically (Female, Male; English, Other), the order
# factor() gives for character labels.
recode_sex <- function(x) {
  # Already-recoded input is returned unchanged so the block can be re-run.
  if (is.factor(x)) {
    return(x)
  }
  factor(x, levels = c(2, 1), labels = c("Female", "Male"))
}

recode_frstlang <- function(x) {
  # Already-recoded input is returned unchanged so the block can be re-run.
  if (is.factor(x)) {
    return(x)
  }
  factor(x, levels = c(1, 2), labels = c("English", "Other"))
}

MBA.df$sex <- recode_sex(MBA.df$sex)
MBA.df$frstlang <- recode_frstlang(MBA.df$frstlang)
placed.df$sex <- recode_sex(placed.df$sex)
placed.df$frstlang <- recode_frstlang(placed.df$frstlang)

Regression Analysis

# Model formula: salary as a linear function of the listed predictors.
m <- salary ~ age + work_yrs + s_avg + f_avg + gmat_qpc + gmat_vpc + sex + frstlang + satis
# Ordinary least-squares fit on placed.df only (rows with salary > 1000).
fit <- lm(m, data = placed.df) 
# Coefficient table, residual summary and overall fit statistics.
summary(fit)
## 
## Call:
## lm(formula = m, data = placed.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -25490  -7943  -1508   5760  78864 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    49477      31790    1.56    0.123  
## age             1942       1138    1.71    0.091 .
## work_yrs         645       1146    0.56    0.575  
## s_avg           3384       5021    0.67    0.502  
## f_avg          -1093       3813   -0.29    0.775  
## gmat_qpc         118        121    0.97    0.333  
## gmat_vpc        -129        104   -1.24    0.217  
## sex            -3998       3591   -1.11    0.268  
## frstlang        8324       7367    1.13    0.261  
## satis          -1848       2052   -0.90    0.370  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15600 on 93 degrees of freedom
## Multiple R-squared:  0.307,  Adjusted R-squared:  0.24 
## F-statistic: 4.57 on 9 and 93 DF,  p-value: 5.29e-05

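The regression is fit on placed students only. The model as a whole is significant (F = 4.57 on 9 and 93 DF, p = 5.29e-05) and explains about 31% of the variance in starting salary (adjusted R-squared 0.24), but no individual coefficient is significant at the 5% level; age comes closest (p = 0.091). On this evidence none of work experience, average GPA for the Spring and Fall semesters, GMAT percentiles, sex, first language or satisfaction with the degree can be singled out as affecting starting salary. This, however, is in contrast with the impressions obtained from the plots that we observed earlier.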