# Load the MBA starting-salary survey into `salary` and print summary
# statistics for every column. The path is relative, so the CSV has to be
# in the current working directory.
salary <- read.csv("MBA Starting Salaries Data.csv")

library(psych)  # supplies describe()
describe(salary)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
# Horizontal box plots of two single columns: age, then total GMAT score.
boxplot(
  salary[["age"]],
  horizontal = TRUE,
  col = "Blue",
  main = "Age Distribution"
)
boxplot(
  salary[["gmat_tot"]],
  horizontal = TRUE,
  col = "RED",
  main = "Gmat Total Score"
)
# One figure with a horizontal box per GMAT percentile column
# (quantitative, verbal, overall). The ylab strings serve as the key that
# maps box numbers to columns.
# NOTE(review): `names =` would label each box directly; confirm whether the
# numbered key in ylab is the intended presentation.
boxplot(
  salary[["gmat_qpc"]],
  salary[["gmat_vpc"]],
  salary[["gmat_tpc"]],
  horizontal = TRUE,
  col = "Green",
  main = "Sectional Percentile",
  ylab = c(
    " 1 = Quantitative GMAT Percentile",
    "2= Verbal GMAT Percentile",
    "3= Overall GMAT Percentile"
  )
)
# Horizontal box plots of the spring (s_avg) and fall (f_avg) MBA averages,
# one box per column; the ylab strings are the key for the box numbers.
# Drawn exactly once: issuing the same call a second time with identical
# arguments only re-renders the same figure.
boxplot(
  salary$s_avg, salary$f_avg,
  main = "Average Marks in MBA",
  col = "Grey",
  ylab = c("1 = Spring MBA Average", "2= Fall MBA Average"),
  horizontal = TRUE
)
# Horizontal box plots of starting salary and of years of work experience.
# NOTE(review): the salary column is plotted unfiltered here; the rows with
# salary 0, 998 and 999 are only removed further down -- confirm that those
# values are meant to appear in this plot.
boxplot(
  salary[["salary"]],
  horizontal = TRUE,
  col = "Grey",
  main = "Starting Salary"
)
boxplot(
  salary[["work_yrs"]],
  horizontal = TRUE,
  col = "Gold",
  main = "Years of Work Experience"
)
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Pairwise scatterplot matrix of total GMAT score, spring and fall averages,
# work experience and salary, drawn from the full `salary` data frame.
scatterplotMatrix(
  ~ gmat_tot + s_avg + f_avg + work_yrs + salary,
  data = salary
)
library(corrgram)
# Correlogram over every column of `salary`, kept in the data frame's own
# column order (order = FALSE): shaded cells below the diagonal, pies above
# it, and min/max values on the diagonal.
# NOTE(review): this runs on the unfiltered data frame, so the 0/998/999
# salary values that are filtered out later still enter these correlations
# -- confirm that is intended.
corrgram(
  salary,
  main = "Corrgram of MBA Starting Salary intercorrelations",
  order = FALSE,
  diag.panel = panel.minmax,
  text.panel = panel.txt,
  upper.panel = panel.pie,
  lower.panel = panel.shade
)
# Build `placed`: the rows of `salary` whose salary is none of 0, 999 or 998.
# (Rows with salary == 0 are collected separately as `notplaced`;
# 998/999 are presumably survey codes rather than amounts -- TODO confirm.)
#
# The comparisons use numeric literals. Comparing the numeric column with
# strings such as "0" makes R coerce the whole column to character and
# compare text, which only works by accident of how the numbers are
# formatted. which() also drops rows where salary is NA.
#
# `pla` and `place` are kept as top-level names in case later code uses them.
pla <- salary[which(salary$salary != 0), ]
place <- pla[which(pla$salary != 999), ]
placed <- place[which(place$salary != 998), ]

# View() opens a data viewer, so only call it in an interactive session;
# this keeps the script usable from Rscript or a batch job.
if (interactive()) {
  View(placed)
}
# Linear regression of salary on GMAT total score and overall percentile,
# spring and fall averages, years of work experience and age, fitted to the
# `placed` rows only. The model summary is printed afterwards.
fit <- lm(
  salary ~ gmat_tot + gmat_tpc + s_avg + f_avg + work_yrs + age,
  data = placed
)
summary(fit)
##
## Call:
## lm(formula = salary ~ gmat_tot + gmat_tpc + s_avg + f_avg + work_yrs +
## age, data = placed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35226 -7222 -1719 5276 73074
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10199.4 38125.1 0.268 0.7896
## gmat_tot 169.7 123.8 1.370 0.1738
## gmat_tpc -896.5 567.6 -1.580 0.1175
## s_avg 1838.4 4885.9 0.376 0.7075
## f_avg -1470.5 3850.3 -0.382 0.7034
## work_yrs 317.7 1105.1 0.288 0.7743
## age 2307.6 1011.5 2.281 0.0247 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15710 on 96 degrees of freedom
## Multiple R-squared: 0.2729, Adjusted R-squared: 0.2275
## F-statistic: 6.006 on 6 and 96 DF, p-value: 2.306e-05
# This is the best model we could fit to the available data. According to it,
# age is statistically significant. Because the p-value of the F-statistic is
# very low, we reject the null hypothesis that the parameters are jointly not
# statistically significant. Individually, however, the t-test statistics of
# the remaining predictors are not significant.
# Build `notplaced`: the rows of `salary` whose salary equals 0.
# The comparison uses a numeric literal; comparing against the string "0"
# would coerce the numeric column to character first. which() drops rows
# where salary is NA.
notplaced <- salary[which(salary$salary == 0), ]

# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(notplaced)
}