mba=read.csv("MBA Starting Salaries Data.csv")
library(psych)
describe(mba)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
str(mba)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : int 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: int 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
mba$frstlang[mba$frstlang == 2] = 'Other'
mba$frstlang[mba$frstlang == 1] = 'Eng'
mba$sex[mba$sex == 1] = 'M'
mba$sex[mba$sex == 2] = 'F'
mba$frstlang = factor(mba$frstlang)
mba$sex = factor(mba$sex)
str(mba)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : Factor w/ 2 levels "F","M": 1 2 2 2 1 2 2 1 2 2 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: Factor w/ 2 levels "Eng","Other": 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
placed = mba[which(mba$salary > 0), ]
View(placed)
notplaced=mba[which(mba$salary==0), ]
library(lattice)
histogram(~salary, data = placed, xlab="Starting Salary", col='blue' )

aggregate(cbind(salary, work_yrs, age) ~ sex,
data = placed, mean)
## sex salary work_yrs age
## 1 F 68182.96 3.244444 26.17778
## 2 M 54854.72 3.611511 26.99281
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
scatterplot(salary ~ work_yrs ,data=placed, xlab="Work Experience", ylab="MBA Starting Salaries", horizontal=TRUE)
## Warning in plot.window(...): "horizontal" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "horizontal" is not a graphical
## parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "horizontal"
## is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "horizontal"
## is not a graphical parameter
## Warning in box(...): "horizontal" is not a graphical parameter
## Warning in title(...): "horizontal" is not a graphical parameter

boxplot(salary ~ work_yrs ,data=placed,ylab="Work Experience", xlab="MBA Starting Salaries", horizontal=TRUE)

scatterplot(salary ~ sex ,data=mba, ylab="Work Experience", xlab="Starting Salaries", horizontal=FALSE)

scatterplot(salary ~ gmat_tot , data=placed,xlab="GMAT Total", ylab="Salary",labels=row.names(placed))

scatterplot(salary ~ gmat_tot |frstlang, data=placed,
xlab="GMAT Total", ylab="Salary",
labels=row.names(placed))

scatterplot(salary ~ gmat_tot |sex, data=placed,
xlab="GMAT Total", ylab="Salary",
labels=row.names(placed))

scatterplot(salary ~ gmat_tot |satis, data=placed,
xlab="GMAT Total", ylab="Salary",
labels=row.names(placed))
## Warning in smoother(.x[subs], .y[subs], col = col[i], log.x =
## logged("x"), : could not fit smooth
## Warning in smoother(.x[subs], .y[subs], col = col[i], log.x =
## logged("x"), : could not fit smooth
## Warning in smoother(.x[subs], .y[subs], col = col[i], log.x =
## logged("x"), : could not fit smooth
## Warning in smoother(.x[subs], .y[subs], col = col[i], log.x =
## logged("x"), : could not fit smooth

scatterplot.matrix(~salary+s_avg+gmat_tot, data=placed)
## Warning: 'scatterplot.matrix' is deprecated.
## Use 'scatterplotMatrix' instead.
## See help("Deprecated") and help("car-deprecated").

mod1 = lm(salary ~ work_yrs + s_avg + gmat_tot + sex + frstlang + satis , data = placed)
summary(mod1)
##
## Call:
## lm(formula = salary ~ work_yrs + s_avg + gmat_tot + sex + frstlang +
## satis, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -96610 -12069 9177 24468 133347
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 69714.585 40393.827 1.726 0.08612 .
## work_yrs 1962.803 1192.425 1.646 0.10153
## s_avg 23786.577 7834.309 3.036 0.00276 **
## gmat_tot -104.718 54.670 -1.915 0.05705 .
## sexM -5960.444 6853.894 -0.870 0.38567
## frstlangOther -19178.846 8827.964 -2.173 0.03115 *
## satis -72.048 6.793 -10.606 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39010 on 177 degrees of freedom
## Multiple R-squared: 0.4661, Adjusted R-squared: 0.448
## F-statistic: 25.76 on 6 and 177 DF, p-value: < 2.2e-16
mod2 = lm(salary ~ work_yrs + sex +frstlang +satis, data = placed)
summary(mod2)
##
## Call:
## lm(formula = salary ~ work_yrs + sex + frstlang + satis, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -90235 -6145 9521 24271 123037
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 77570.41 7398.84 10.484 <2e-16 ***
## work_yrs 2700.83 1202.03 2.247 0.0259 *
## sexM -10204.55 6911.96 -1.476 0.1416
## frstlangOther -20680.37 8835.08 -2.341 0.0203 *
## satis -73.18 6.95 -10.529 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40050 on 179 degrees of freedom
## Multiple R-squared: 0.4309, Adjusted R-squared: 0.4182
## F-statistic: 33.88 on 4 and 179 DF, p-value: < 2.2e-16
mod3= lm(salary ~ work_yrs + s_avg + gmat_tot + sex + satis , data = placed)
summary(mod3)
##
## Call:
## lm(formula = salary ~ work_yrs + s_avg + gmat_tot + sex + satis,
## data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -92127 -8848 10355 25202 122965
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48820.072 39639.962 1.232 0.219728
## work_yrs 1757.467 1201.029 1.463 0.145149
## s_avg 26241.312 7832.985 3.350 0.000986 ***
## gmat_tot -86.620 54.593 -1.587 0.114370
## sexM -4901.693 6907.611 -0.710 0.478876
## satis -73.465 6.832 -10.753 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39420 on 178 degrees of freedom
## Multiple R-squared: 0.4519, Adjusted R-squared: 0.4365
## F-statistic: 29.35 on 5 and 178 DF, p-value: < 2.2e-16
mod4 <- lm(salary ~ work_yrs + gmat_tot + sex + frstlang , data = placed)
summary(mod4)
##
## Call:
## lm(formula = salary ~ work_yrs + gmat_tot + sex + frstlang, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -94178 -51200 20857 40147 123092
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 143515.31 45158.17 3.178 0.00175 **
## work_yrs 3701.92 1520.74 2.434 0.01590 *
## gmat_tot -133.27 69.99 -1.904 0.05850 .
## sexM -15021.45 8691.56 -1.728 0.08566 .
## frstlangOther -32223.83 11244.08 -2.866 0.00466 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50460 on 179 degrees of freedom
## Multiple R-squared: 0.09672, Adjusted R-squared: 0.07653
## F-statistic: 4.792 on 4 and 179 DF, p-value: 0.001074