This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Load the MBA starting-salaries data from the working directory. The file
# name is a single literal, so it is passed to read.csv() directly.
mba.df <- read.csv("MBA Starting Salaries Data.csv")
# View() opens a data-viewer window; only do that in an interactive session so
# that knitting the document does not depend on a GUI being available.
if (interactive()) {
  View(mba.df)
}
# Per-column quartiles, mean, minimum and maximum.
summary(mba.df)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
# Attach psych; describe() is called unqualified again later in the document.
library("psych")
# Extended descriptive statistics (n, mean, sd, median, trimmed mean, mad,
# min, max, range, skew, kurtosis, se) for every column of the raw data.
psych::describe(mba.df)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
# attach(mba.df) has been removed: every later statement addresses columns
# through `mba.df$...` or a `data =` argument, and an attached copy would keep
# the old numeric codes after the recoding below.

# Recode the numeric codes as labelled factors in a single step per column.
# `levels` lists the codes in the order of the desired factor levels, so the
# resulting level order is "Female", "Male" and "English", "Other" (the same
# alphabetical order factor() would derive from the label strings).
mba.df$sex <- factor(
  mba.df$sex,
  levels = c(2, 1),
  labels = c("Female", "Male")
)
mba.df$frstlang <- factor(
  mba.df$frstlang,
  levels = c(1, 2),
  labels = c("English", "Other")
)
# Confirm the column types after recoding.
str(mba.df)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 2 2 1 2 2 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: Factor w/ 2 levels "English","Other": 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
# Split the data by the salary column.
# NOTE(review): 998 and 999 occur in `salary` and look like placeholder codes
# rather than amounts -- confirm against the data dictionary. The `> 1000`
# cut-off below excludes them together with the zeros.
# which() is kept so that rows with a missing salary are dropped rather than
# returned as all-NA rows.
placed.df <- mba.df[which(mba.df$salary > 1000), ]
notPlaced.df <- mba.df[which(mba.df$salary == 0), ]
# Data-viewer windows are only useful (and only reliably available) in an
# interactive session.
if (interactive()) {
  View(placed.df)
  View(notPlaced.df)
}
# Mean starting salary among rows with salary above 1000.
avgSalary <- mean(placed.df$salary)
avgSalary
## [1] 103030.7
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Show a sample of 10 rows of the placed subset (car::some(); 10 is its
# default sample size, spelled out here for clarity).
some(placed.df, n = 10)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 35 22 Female 660 90 92 94 3.5 3.75 1
## 44 25 Male 650 79 93 93 3.3 3.50 1
## 51 26 Male 570 68 74 75 3.8 3.50 1
## 64 27 Male 600 67 84 83 3.5 3.00 1
## 121 24 Male 500 78 30 52 3.0 2.75 2
## 125 28 Female 580 83 58 79 3.1 3.00 2
## 135 29 Male 670 91 93 95 3.1 3.00 2
## 204 30 Male 670 83 97 96 2.8 2.75 3
## 266 24 Female 600 77 78 84 2.6 3.00 4
## 274 40 Female 500 60 45 51 2.5 2.75 4
## work_yrs frstlang salary satis
## 35 1 English 85000 5
## 44 1 English 96000 7
## 51 3 English 105000 6
## 64 3 English 120000 5
## 121 2 English 96500 6
## 125 5 Other 99000 6
## 135 6 English 112000 6
## 204 6 English 102500 5
## 266 2 English 100000 6
## 274 15 Other 220000 6
# describe() comes from psych, and car is already attached earlier in this
# document, so no further library() call is needed here.
# Only open the data viewer when running interactively.
if (interactive()) {
  View(placed.df)
}
# First five columns of the describe() table: vars, n, mean, sd, median.
describe(placed.df)[, 1:5]
## vars n mean sd median
## age 1 103 26.78 3.27 2.60e+01
## sex* 2 103 1.70 0.46 2.00e+00
## gmat_tot 3 103 616.02 50.69 6.20e+02
## gmat_qpc 4 103 79.73 13.39 8.20e+01
## gmat_vpc 5 103 78.56 16.14 8.10e+01
## gmat_tpc 6 103 84.52 11.01 8.70e+01
## s_avg 7 103 3.09 0.38 3.10e+00
## f_avg 8 103 3.09 0.49 3.25e+00
## quarter 9 103 2.26 1.12 2.00e+00
## work_yrs 10 103 3.68 3.01 3.00e+00
## frstlang* 11 103 1.07 0.25 1.00e+00
## salary 12 103 103030.74 17868.80 1.00e+05
## satis 13 103 5.88 0.78 6.00e+00
library(lattice)
# Histogram of salary for the placed subset (salary > 1000).
histogram(
  ~ salary,
  data = placed.df,
  main = "Distribution of Starting Salary (Placed)",
  xlab = "Starting Salary",
  col = "red"
)
# Histogram of salary for the not-placed subset (salary == 0). The title now
# names the subset so the two plots can be told apart in the report.
histogram(
  ~ salary,
  data = notPlaced.df,
  main = "Distribution of Starting Salary (Not Placed)",
  xlab = "Starting Salary",
  col = "red"
)
# car (attached earlier in this document) provides scatterplot().
# `horizontal` is a boxplot() argument, not a scatterplot() one; passing it
# here only produced '"horizontal" is not a graphical parameter' warnings, so
# it has been removed from this call.
scatterplot(
  salary ~ work_yrs,
  data = placed.df,
  main = "Scatterplot of Salary with Work Experience",
  xlab = "Work Experience",
  ylab = "MBA's Starting Salaries"
)
# Salary distribution per year of work experience. With horizontal = TRUE the
# boxes lie along the x axis, so salary is labelled on x and the work_yrs
# groups on y.
boxplot(
  salary ~ work_yrs,
  data = placed.df,
  main = "Distribution of Salary with Work Experience",
  ylab = "Work Experience",
  xlab = "MBA's Starting Salaries",
  horizontal = TRUE
)
# car (attached earlier in this document) provides scatterplotMatrix(), the
# replacement for the deprecated scatterplot.matrix().
# Pairwise scatterplots of salary, GMAT total and the two grade averages for
# the placed subset.
scatterplotMatrix(
  ~ salary + gmat_tot + s_avg + f_avg,
  data = placed.df,
  main = "Salary versus other variables"
)
# Same matrix with the points grouped by sex.
scatterplotMatrix(
  ~ salary + gmat_tot + s_avg + f_avg | sex,
  data = placed.df,
  main = "Salary versus other variables"
)
# Frequency histogram of salary for the placed subset. lattice's histogram()
# plots percentages by default; type = "count" makes the y axis show the
# frequencies that the title promises.
histogram(
  ~ salary,
  data = placed.df,
  type = "count",
  main = "Frequency of Starting Salary",
  xlab = "Starting Salary",
  col = "blue"
)
# Cross-tabulate sex (rows) against age (columns) for the placed subset.
ageTable <- table(placed.df[["sex"]], placed.df[["age"]])
print(ageTable)
##
## 22 23 24 25 26 27 28 29 30 31 32 33 34 39 40
## Female 1 2 5 10 5 1 3 1 2 0 0 0 0 0 1
## Male 0 3 11 13 9 13 5 5 4 4 1 1 1 1 1
# Mean salary, work experience and age for each sex, over the full data set.
# NOTE(review): mba.df still contains salary values of 0, 998 and 999, which
# the placed/not-placed split treats as non-salaries; the salary means here
# include them. Consider data = placed.df if real salaries are intended.
aggregate(
  cbind(salary, work_yrs, age) ~ sex,
  data = mba.df,
  FUN = mean
)
## sex salary work_yrs age
## 1 Female 45121.07 3.808824 27.17647
## 2 Male 37013.62 3.893204 27.41748
# Mean salary and work experience for each age, over the full data set.
# NOTE(review): mba.df still contains salary values of 0, 998 and 999, so the
# salary means include them. Consider data = placed.df if real salaries are
# intended.
aggregate(
  cbind(salary, work_yrs) ~ age,
  data = mba.df,
  FUN = mean
)
## age salary work_yrs
## 1 22 42500.00 1.000000
## 2 23 57282.00 1.750000
## 3 24 49342.24 1.727273
## 4 25 43395.55 2.264151
## 5 26 35982.07 2.875000
## 6 27 31499.37 3.130435
## 7 28 39809.00 4.666667
## 8 29 28067.95 4.500000
## 9 30 55291.25 5.583333
## 10 31 40599.40 5.800000
## 11 32 13662.25 5.625000
## 12 33 118000.00 10.000000
## 13 34 26250.00 11.500000
## 14 35 0.00 9.333333
## 15 36 0.00 12.500000
## 16 37 0.00 9.000000
## 17 39 56000.00 10.500000
## 18 40 183000.00 15.000000
## 19 42 0.00 13.000000
## 20 43 0.00 19.000000
## 21 48 0.00 22.000000
# Column-wise proportions: within each age, the share of each sex.
prop.table(ageTable, margin = 2)
##
## 22 23 24 25 26 27
## Female 1.00000000 0.40000000 0.31250000 0.43478261 0.35714286 0.07142857
## Male 0.00000000 0.60000000 0.68750000 0.56521739 0.64285714 0.92857143
##
## 28 29 30 31 32 33
## Female 0.37500000 0.16666667 0.33333333 0.00000000 0.00000000 0.00000000
## Male 0.62500000 0.83333333 0.66666667 1.00000000 1.00000000 1.00000000
##
## 34 39 40
## Female 0.00000000 0.00000000 0.50000000
## Male 1.00000000 1.00000000 0.50000000
# Linear model of starting salary on work experience, sex, first language and
# satisfaction, fitted on the placed subset.
demo <- lm(
  formula = salary ~ work_yrs + sex + frstlang + satis,
  data = placed.df
)
# Coefficient table, residual summary and fit statistics.
summary(demo)
##
## Call:
## lm(formula = salary ~ work_yrs + sex + frstlang + satis, data = placed.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30492 -8055 -1744 5362 80436
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 102214.0 11827.8 8.642 1.06e-13 ***
## work_yrs 2409.4 526.1 4.579 1.37e-05 ***
## sexMale 5949.5 3392.2 1.754 0.0826 .
## frstlangOther 14675.7 6274.0 2.339 0.0214 *
## satis -2244.4 1988.4 -1.129 0.2618
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15580 on 98 degrees of freedom
## Multiple R-squared: 0.2695, Adjusted R-squared: 0.2397
## F-statistic: 9.038 on 4 and 98 DF, p-value: 2.953e-06