store <- read.csv("~/MBA Starting Salaries Data.csv")
View(store)
library(psych)
describe(store)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
hist(store$age, breaks=20,col="blue",xlab="Age in years", main="Age distribution")
plot(store$sex,main = "Graph showing number of Males and Females",col="pink")
hist(store$work_yrs, breaks=20,col="blue",xlab="Work Experience in years", main="Work experience distribution")
hist(store$gmat_tot, breaks=40,col="blue",xlab="score out of 800", main="Gmat Score distribution")
plot(store$frstlang,main = "First Language Distribution",col="red")
newdata <- store[ which(store$satis<='7'), ]
hist(newdata$satis, breaks=5,col="magenta",xlab="Degree of Satisfaction,1=low 7=high", main="Satisfaction distribution")
newdata1 <- store[ which(store$salary !="998" & store$salary !="999"), ]
hist(newdata1$salary, breaks=10,col="magenta",xlab="starting salary", main="Salary distribution")
##Scatter Plots
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
scatterplot(salary ~age, data=newdata1,
spread=FALSE, smoother.args=list(lty=2),
main="Scatter plot of salary vs age",
xlab="age",
ylab="salary")
scatterplot(salary ~frstlang, data=newdata1,
main="Scatter plot of salary vs first language",
xlab="first language",
ylab="salary")
scatterplot(salary ~gmat_tot, data=newdata1,
main="Scatter plot of salary vs Gmat total",
xlab="Gmat score",
ylab="salary")
scatterplot(salary ~work_yrs, data=newdata1,
main="Scatter plot of salary vs Work exp.",
xlab="Work experience in years",
ylab="salary")
scatterplot(salary ~satis, data=newdata1,
main="Scatter plot of salary vs satisfaction",
xlab="Degree of satisfaction",
ylab="salary")
library(corrgram)
corrgram(newdata1, order=TRUE, lower.panel=panel.shade,
upper.panel=panel.pie, text.panel=panel.txt,
main="MBA starting salary analysis Correlogram")
chisq.test(store)
## Warning in chisq.test(store): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: store
## X-squared = 8451300, df = 3276, p-value < 2.2e-16
salary <-xtabs(~store$work_yrs)
salary
## store$work_yrs
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 15 16 18 22
## 3 24 82 56 43 21 12 9 7 2 2 2 2 1 2 3 1 2
salary <-xtabs(~store$sex)
salary
## store$sex
## 1 2
## 206 68
salary <-xtabs(~store$gmat_tot)
salary
## store$gmat_tot
## 450 460 480 500 510 520 530 540 550 560 570 580 590 600 610 620 630 640
## 2 1 1 3 2 1 5 5 8 21 18 15 9 20 18 20 22 12
## 650 660 670 680 690 700 710 720 730 740 750 760 790
## 16 14 17 12 4 5 10 4 2 4 1 1 1
m1 <- lm(salary ~ age + sex + gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc + work_yrs + satis,
data=store)
summary(m1)
##
## Call:
## lm(formula = salary ~ age + sex + gmat_tot + gmat_qpc + gmat_vpc +
## gmat_tpc + work_yrs + satis, data = store)
##
## Residuals:
## Min 1Q Median 3Q Max
## -64763 -45163 -3221 43605 186789
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 214629.575 65128.160 3.295 0.00112 **
## age -3948.105 1530.064 -2.580 0.01041 *
## sex 3730.050 6834.469 0.546 0.58568
## gmat_tot -299.779 209.269 -1.433 0.15318
## gmat_qpc 338.156 582.841 0.580 0.56228
## gmat_vpc 475.523 527.377 0.902 0.36805
## gmat_tpc 522.828 417.568 1.252 0.21164
## work_yrs 3391.302 1754.338 1.933 0.05429 .
## satis -47.715 7.856 -6.073 4.33e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 47600 on 265 degrees of freedom
## Multiple R-squared: 0.1528, Adjusted R-squared: 0.1272
## F-statistic: 5.974 on 8 and 265 DF, p-value: 4.728e-07