R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown, see <http://rmarkdown.rstudio.com>.

When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Load the MBA starting-salaries data set from the working directory.
# The file name is a single literal, so no paste() is needed to build it.
mba.df <- read.csv("MBA Starting Salaries Data.csv")

# Open the spreadsheet viewer only in an interactive session; there is no one
# to look at it while the document is being knitted.
if (interactive()) {
  View(mba.df)
}

# Per-column summary statistics.
summary(mba.df)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0
# psych supplies describe(), which reports n, mean, sd, median, skew, etc.
# for every column of the data frame.
library(psych)
psych::describe(x = mba.df)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45
# Recode the numeric category codes as labelled factors:
#   sex:      1 -> "Male",    2 -> "Female"
#   frstlang: 1 -> "English", 2 -> "Other"
#
# attach() is intentionally not used: the visible code always refers to
# columns through the data frame, and an attached copy would not reflect the
# modifications made to mba.df here.
#
# Levels are listed so that the resulting order is Female/Male and
# English/Other, i.e. the same alphabetical order factor() would derive from
# the label strings, which keeps model reference levels unchanged.
# The is.factor() guards make the step safe to re-run on recoded data.
if (!is.factor(mba.df$sex)) {
  mba.df$sex <- factor(
    mba.df$sex,
    levels = c(2, 1),
    labels = c("Female", "Male")
  )
}
if (!is.factor(mba.df$frstlang)) {
  mba.df$frstlang <- factor(
    mba.df$frstlang,
    levels = c(1, 2),
    labels = c("English", "Other")
  )
}

# Confirm the column types after recoding.
str(mba.df)
## 'data.frame':    274 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 24 25 25 25 25 ...
##  $ sex     : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 2 2 1 2 2 ...
##  $ gmat_tot: int  620 610 670 570 710 640 610 650 630 680 ...
##  $ gmat_qpc: int  77 90 99 56 93 82 89 88 79 99 ...
##  $ gmat_vpc: int  87 71 78 81 98 89 74 89 91 81 ...
##  $ gmat_tpc: int  87 87 95 75 98 91 87 92 89 96 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 2 2 2 ...
##  $ frstlang: Factor w/ 2 levels "English","Other": 1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 999 0 0 0 999 998 ...
##  $ satis   : int  7 6 6 7 5 6 5 6 4 998 ...
# Graduates with a reported salary. The threshold of 1000 excludes the 0, 998
# and 999 values that appear in the salary column.
# NOTE(review): 998/999 look like survey "no answer" codes -- confirm against
# the data dictionary.
# which() drops rows where the comparison is NA instead of returning NA rows.
placed.df <- mba.df[which(mba.df$salary > 1000), ]

# Graduates recorded with a salary of exactly 0.
notPlaced.df <- mba.df[which(mba.df$salary == 0), ]

# The data viewer is only useful in an interactive session.
if (interactive()) {
  View(placed.df)
  View(notPlaced.df)
}

# Mean starting salary among the placed graduates.
avgSalary <- mean(placed.df$salary)
avgSalary
## [1] 103030.7
library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Print a random sample of rows from the placed subset (10 is some()'s
# default sample size for data frames).
car::some(placed.df, n = 10)
##     age    sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 35   22 Female      660       90       92       94   3.5  3.75       1
## 44   25   Male      650       79       93       93   3.3  3.50       1
## 51   26   Male      570       68       74       75   3.8  3.50       1
## 64   27   Male      600       67       84       83   3.5  3.00       1
## 121  24   Male      500       78       30       52   3.0  2.75       2
## 125  28 Female      580       83       58       79   3.1  3.00       2
## 135  29   Male      670       91       93       95   3.1  3.00       2
## 204  30   Male      670       83       97       96   2.8  2.75       3
## 266  24 Female      600       77       78       84   2.6  3.00       4
## 274  40 Female      500       60       45       51   2.5  2.75       4
##     work_yrs frstlang salary satis
## 35         1  English  85000     5
## 44         1  English  96000     7
## 51         3  English 105000     6
## 64         3  English 120000     5
## 121        2  English  96500     6
## 125        5    Other  99000     6
## 135        6  English 112000     6
## 204        6  English 102500     5
## 266        2  English 100000     6
## 274       15    Other 220000     6
library(car)

# The data viewer is only useful in an interactive session.
if (interactive()) {
  View(placed.df)
}

# First five describe() columns only: vars, n, mean, sd, median.
describe(placed.df)[, 1:5]
##           vars   n      mean       sd   median
## age          1 103     26.78     3.27 2.60e+01
## sex*         2 103      1.70     0.46 2.00e+00
## gmat_tot     3 103    616.02    50.69 6.20e+02
## gmat_qpc     4 103     79.73    13.39 8.20e+01
## gmat_vpc     5 103     78.56    16.14 8.10e+01
## gmat_tpc     6 103     84.52    11.01 8.70e+01
## s_avg        7 103      3.09     0.38 3.10e+00
## f_avg        8 103      3.09     0.49 3.25e+00
## quarter      9 103      2.26     1.12 2.00e+00
## work_yrs    10 103      3.68     3.01 3.00e+00
## frstlang*   11 103      1.07     0.25 1.00e+00
## salary      12 103 103030.74 17868.80 1.00e+05
## satis       13 103      5.88     0.78 6.00e+00
library(lattice)

# Salary distribution for graduates with a reported salary.
histogram(
  ~salary,
  data = placed.df,
  main = "Distribution of Starting Salary",
  xlab = "Starting Salary",
  col = "red"
)

# Same plot for the subset whose recorded salary is 0.
histogram(
  ~salary,
  data = notPlaced.df,
  main = "Distribution of Starting Salary",
  xlab = "Starting Salary",
  col = "red"
)

library(car)

# Salary against years of work experience.
# `horizontal` is a boxplot() argument, not a scatterplot() one; passing it
# here only triggered '"horizontal" is not a graphical parameter' warnings,
# so it has been dropped.
scatterplot(
  salary ~ work_yrs,
  data = placed.df,
  main = "Scatterplot of Salary with Work Experience",
  xlab = "Work Experience",
  ylab = "MBA's Starting Salaries"
)
## Warning in plot.window(...): "horizontal" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "horizontal" is not a graphical
## parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "horizontal"
## is not a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "horizontal"
## is not a graphical parameter
## Warning in box(...): "horizontal" is not a graphical parameter
## Warning in title(...): "horizontal" is not a graphical parameter

# Salary distribution per year of work experience, drawn horizontally:
# salary runs along the x axis and each work_yrs value gets its own row.
boxplot(
  salary ~ work_yrs,
  data = placed.df,
  main = "Distribution of Salary with Work Experience",
  ylab = "Work Experience",
  xlab = "MBA's Starting Salaries",
  horizontal = TRUE
)

library(car)

# Pairwise scatterplots of salary, GMAT total and the two grade averages.
# scatterplotMatrix() is the current name of the deprecated
# scatterplot.matrix().
scatterplotMatrix(
  ~ salary + gmat_tot + s_avg + f_avg,
  data = placed.df,
  main = "Salary versus other variables"
)
## Warning: 'scatterplot.matrix' is deprecated.
## Use 'scatterplotMatrix' instead.
## See help("Deprecated") and help("car-deprecated").

# Same scatterplot matrix, with points grouped by sex.
# scatterplotMatrix() is the current name of the deprecated
# scatterplot.matrix().
scatterplotMatrix(
  ~ salary + gmat_tot + s_avg + f_avg | sex,
  data = placed.df,
  main = "Salary versus other variables"
)
## Warning: 'scatterplot.matrix' is deprecated.
## Use 'scatterplotMatrix' instead.
## See help("Deprecated") and help("car-deprecated").

# Histogram of salary for the placed subset, drawn in blue.
histogram(
  ~salary,
  data = placed.df,
  main = "Frequency of Starting Salary",
  xlab = "Starting Salary",
  col = "blue"
)

# Cross-tabulate sex (rows) against age (columns) for the placed subset.
ageTable <- table(placed.df[["sex"]], placed.df[["age"]])
ageTable
##         
##          22 23 24 25 26 27 28 29 30 31 32 33 34 39 40
##   Female  1  2  5 10  5  1  3  1  2  0  0  0  0  0  1
##   Male    0  3 11 13  9 13  5  5  4  4  1  1  1  1  1
# Mean salary, work experience and age for each sex, over the full data set.
# NOTE(review): mba.df still contains the 0 / 998 / 999 salary values, so the
# salary means here include them -- confirm that is intended.
aggregate(
  cbind(salary, work_yrs, age) ~ sex,
  data = mba.df,
  FUN = mean
)
##      sex   salary work_yrs      age
## 1 Female 45121.07 3.808824 27.17647
## 2   Male 37013.62 3.893204 27.41748
# Mean salary and work experience for each age, over the full data set.
# NOTE(review): mba.df still contains the 0 / 998 / 999 salary values, so the
# salary means here include them -- confirm that is intended.
aggregate(
  cbind(salary, work_yrs) ~ age,
  data = mba.df,
  FUN = mean
)
##    age    salary  work_yrs
## 1   22  42500.00  1.000000
## 2   23  57282.00  1.750000
## 3   24  49342.24  1.727273
## 4   25  43395.55  2.264151
## 5   26  35982.07  2.875000
## 6   27  31499.37  3.130435
## 7   28  39809.00  4.666667
## 8   29  28067.95  4.500000
## 9   30  55291.25  5.583333
## 10  31  40599.40  5.800000
## 11  32  13662.25  5.625000
## 12  33 118000.00 10.000000
## 13  34  26250.00 11.500000
## 14  35      0.00  9.333333
## 15  36      0.00 12.500000
## 16  37      0.00  9.000000
## 17  39  56000.00 10.500000
## 18  40 183000.00 15.000000
## 19  42      0.00 13.000000
## 20  43      0.00 19.000000
## 21  48      0.00 22.000000
# Column-wise proportions: within each age, the share of each sex.
prop.table(ageTable, margin = 2)
##         
##                  22         23         24         25         26         27
##   Female 1.00000000 0.40000000 0.31250000 0.43478261 0.35714286 0.07142857
##   Male   0.00000000 0.60000000 0.68750000 0.56521739 0.64285714 0.92857143
##         
##                  28         29         30         31         32         33
##   Female 0.37500000 0.16666667 0.33333333 0.00000000 0.00000000 0.00000000
##   Male   0.62500000 0.83333333 0.66666667 1.00000000 1.00000000 1.00000000
##         
##                  34         39         40
##   Female 0.00000000 0.00000000 0.50000000
##   Male   1.00000000 1.00000000 0.50000000
# Linear regression of starting salary on work experience, sex, first
# language and satisfaction, fitted on the placed subset only.
demo <- lm(
  formula = salary ~ work_yrs + sex + frstlang + satis,
  data = placed.df
)

# Coefficient table, residual summary and fit statistics.
summary(demo)
## 
## Call:
## lm(formula = salary ~ work_yrs + sex + frstlang + satis, data = placed.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -30492  -8055  -1744   5362  80436 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   102214.0    11827.8   8.642 1.06e-13 ***
## work_yrs        2409.4      526.1   4.579 1.37e-05 ***
## sexMale         5949.5     3392.2   1.754   0.0826 .  
## frstlangOther  14675.7     6274.0   2.339   0.0214 *  
## satis          -2244.4     1988.4  -1.129   0.2618    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15580 on 98 degrees of freedom
## Multiple R-squared:  0.2695, Adjusted R-squared:  0.2397 
## F-statistic: 9.038 on 4 and 98 DF,  p-value: 2.953e-06