We do this by using the read.csv command
# Read the data
# The file name is a relative path, so the CSV is looked up in the current
# working directory.
MBA.df <- read.csv("MBA Starting Salaries Data.csv")
# Summarize the data
# MBA.df is intentionally not attach()ed: the calls in this script reach its
# columns through `MBA.df$...` or a `data =` argument, so attaching would only
# add a risk of name masking.
library(psych)
describe(MBA.df)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
# Data Types
# Show the storage type and the first few values of every column.
str(MBA.df)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : int 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: int 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
# Per-column minimum, quartiles, median, mean and maximum.
summary(MBA.df)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
# Mean and standard deviation of age, salary and work experience.
# NOTE(review): salary still holds the 0 / 998 / 999 codes at this point, so
# its mean and sd mix those codes with real salaries -- confirm this is the
# figure you want to report.
mean(MBA.df[["age"]])
## [1] 27.35766
sd(MBA.df[["age"]])
## [1] 3.710666
mean(MBA.df[["salary"]])
## [1] 39025.69
sd(MBA.df[["salary"]])
## [1] 50951.56
mean(MBA.df[["work_yrs"]])
## [1] 3.872263
sd(MBA.df[["work_yrs"]])
## [1] 3.232464
# Mean salary for each distinct number of years of work experience.
aggregate(salary ~ work_yrs, data = MBA.df, FUN = mean)
## work_yrs salary
## 1 0 31999.67
## 2 1 34677.08
## 3 2 45531.24
## 4 3 38494.21
## 5 4 27510.81
## 6 5 34476.10
## 7 6 62041.33
## 8 7 11221.78
## 9 8 60156.86
## 10 9 499.50
## 11 10 59000.00
## 12 11 0.00
## 13 12 0.00
## 14 13 0.00
## 15 15 183000.00
## 16 16 72333.33
## 17 18 0.00
## 18 22 0.00
# Salary spread for each level of work experience (base graphics).
boxplot(salary ~ work_yrs, data = MBA.df,
        main = "Effect of Work Experience on Salary",
        xlab = "Work Experience", ylab = "Starting Salary")
## Distribution of MBA's Starting Salary
library(lattice)
histogram(~ salary, data = MBA.df,
          main = "Distribution of MBA's Starting Salary",
          xlab = "Starting Salary")
# Reset the base-graphics layout to a single panel.
par(mfrow = c(1, 1))
# Box-and-whisker plot of salary for each gender code. lattice is already
# attached just above, so it is not loaded a second time.
bwplot(sex ~ salary, data = MBA.df, horizontal = TRUE,
       xlab = "Salary", ylab = "Gender",
       main = "Comparison of Salaries of Males and Females")
## => 3a. Draw scatter plots of Age vs. Salary and Gender vs. Salary
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Salary (y axis) against age (x axis). The formula is written response ~
# predictor so that it agrees with the xlab / ylab given below; previously the
# formula put salary on the x axis while the labels claimed the opposite.
scatterplot(salary ~ age, data = MBA.df,
            spread = FALSE, smoother.args = list(lty = 2), pch = 19,
            main = "Scatterplot of Age vs. Salary",
            xlab = "Age",
            ylab = "Salary")
# Salary (y axis) against the gender code (x axis), same correction.
scatterplot(salary ~ sex, data = MBA.df,
            spread = FALSE, smoother.args = list(lty = 2), pch = 19,
            main = "Scatterplot of Gender vs. Salary",
            xlab = "Gender",
            ylab = "Salary")
# Relationship between Salary and other variables
# => 4. Construct a Correlation Matrix for all variables in the dataset
# Print numbers with 2 significant digits from here on (session-wide option).
options(digits = 2)
# Kendall rank correlation between every pair of columns, using only rows
# that are complete across all columns.
cor(MBA.df, method = "kendall", use = "complete.obs")
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg
## age 1.0000 -0.0649 -0.0893 -0.143 0.0043 -0.0921 0.0746
## sex -0.0649 1.0000 -0.0315 -0.116 0.0632 -0.0292 0.1050
## gmat_tot -0.0893 -0.0315 1.0000 0.570 0.6028 0.9760 0.0757
## gmat_qpc -0.1429 -0.1160 0.5705 1.000 0.1358 0.5662 -0.0336
## gmat_vpc 0.0043 0.0632 0.6028 0.136 1.0000 0.5948 0.1461
## gmat_tpc -0.0921 -0.0292 0.9760 0.566 0.5948 1.0000 0.0698
## s_avg 0.0746 0.1050 0.0757 -0.034 0.1461 0.0698 1.0000
## f_avg 0.0235 0.1064 0.0980 0.019 0.1189 0.0921 0.5907
## quarter 0.0046 -0.1220 -0.0656 0.041 -0.1370 -0.0572 -0.7192
## work_yrs 0.6550 0.0039 -0.1466 -0.190 -0.0477 -0.1440 0.0621
## frstlang 0.0827 0.0015 -0.1049 0.126 -0.2535 -0.1007 -0.1214
## salary -0.0670 -0.0070 0.0015 -0.018 0.0065 0.0019 0.0547
## satis -0.1218 -0.0307 0.0797 -0.025 0.1085 0.0808 -0.0033
## f_avg quarter work_yrs frstlang salary satis
## age 0.0235 0.0046 0.6550 0.0827 -0.0670 -0.1218
## sex 0.1064 -0.1220 0.0039 0.0015 -0.0070 -0.0307
## gmat_tot 0.0980 -0.0656 -0.1466 -0.1049 0.0015 0.0797
## gmat_qpc 0.0192 0.0408 -0.1901 0.1255 -0.0183 -0.0249
## gmat_vpc 0.1189 -0.1370 -0.0477 -0.2535 0.0065 0.1085
## gmat_tpc 0.0921 -0.0572 -0.1440 -0.1007 0.0019 0.0808
## s_avg 0.5907 -0.7192 0.0621 -0.1214 0.0547 -0.0033
## f_avg 1.0000 -0.4800 -0.0038 -0.0700 0.0237 -0.0010
## quarter -0.4800 1.0000 -0.0222 0.0912 -0.1027 -0.0069
## work_yrs -0.0038 -0.0222 1.0000 -0.0012 -0.0091 -0.0502
## frstlang -0.0700 0.0912 -0.0012 1.0000 -0.0100 -0.0911
## salary 0.0237 -0.1027 -0.0091 -0.0100 1.0000 -0.0020
## satis -0.0010 -0.0069 -0.0502 -0.0911 -0.0020 1.0000
# Corrgram of all columns, kept in data-frame order: shaded cells below the
# diagonal, pie glyphs above it, min/max on the diagonal.
library(corrgram)
corrgram(MBA.df, order = FALSE,
         lower.panel = panel.shade,
         upper.panel = panel.pie,
         diag.panel = panel.minmax,
         text.panel = panel.txt,
         main = "Corrgram of MBA starting salaries intercorrelations")
# Scatter plot matrices. car was already attached before the first
# scatterplot() call, so it is not loaded again here.
scatterplotMatrix(MBA.df[, c("age", "sex", "work_yrs", "salary")],
                  spread = FALSE, smoother.args = list(lty = 2),
                  main = "Scatter Plot Matrix")
scatterplotMatrix(
  ~ age + sex + gmat_tot + s_avg + f_avg + work_yrs + salary + satis,
  data = MBA.df
)
## Take a subset of the dataset consisting of only those people who actually got a job.
# Rows with a salary above 1000 are treated as placed graduates.
placed.df <- MBA.df[which(MBA.df$salary > 1000), ]
# Going by the variable names, salary == 0 marks "not placed" and
# salary == 999 marks "salary not disclosed".
notPlaced.df <- MBA.df[which(MBA.df$salary == 0), ]
notDisclosedSalary.df <- MBA.df[which(MBA.df$salary == 999), ]
# Stack the three groups. Rows matching none of the three filters
# (for example salary == 998) are left out of clean.df.
clean.df <- rbind(placed.df, notDisclosedSalary.df, notPlaced.df)
# Placement flag stored as a factor with levels "FALSE" / "TRUE".
clean.df$GotPlaced <- factor(clean.df$salary > 1000)
# Count of rows per placement status.
t1 <- table(clean.df$GotPlaced)
t1
##
## FALSE TRUE
## 125 103
# Mean salary, work experience and age for each gender code.
aggregate(cbind(salary, work_yrs, age) ~ sex, data = MBA.df, FUN = mean)
## sex salary work_yrs age
## 1 1 37014 3.9 27
## 2 2 45121 3.8 27
# salary ~ sex draws one box per gender code along the x axis with salary on
# the y axis, so the axis labels are assigned accordingly (they were swapped).
boxplot(salary ~ sex, data = MBA.df,
        main = "Effect of Gender on Salary",
        xlab = "Gender", ylab = "Starting Salary")
# Placement status cross-tabulated with gender code.
t2 <- xtabs(~ GotPlaced + sex, data = clean.df)
t2
## sex
## GotPlaced 1 2
## FALSE 97 28
## TRUE 72 31
# Mean salary and mean work experience for each distinct age.
aggregate(cbind(salary, work_yrs) ~ age, data = MBA.df, FUN = mean)
## age salary work_yrs
## 1 22 42500 1.0
## 2 23 57282 1.8
## 3 24 49342 1.7
## 4 25 43396 2.3
## 5 26 35982 2.9
## 6 27 31499 3.1
## 7 28 39809 4.7
## 8 29 28068 4.5
## 9 30 55291 5.6
## 10 31 40599 5.8
## 11 32 13662 5.6
## 12 33 118000 10.0
## 13 34 26250 11.5
## 14 35 0 9.3
## 15 36 0 12.5
## 16 37 0 9.0
## 17 39 56000 10.5
## 18 40 183000 15.0
## 19 42 0 13.0
## 20 43 0 19.0
## 21 48 0 22.0
# Placement status cross-tabulated with age (row counts).
t3 <- xtabs(~ GotPlaced + age, data = clean.df)
t3
## age
## GotPlaced 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 39 40 42 43 48
## FALSE 1 3 15 15 15 21 9 16 5 4 7 0 3 3 2 1 1 0 1 2 1
## TRUE 1 5 16 23 14 14 8 6 6 4 1 1 1 0 0 0 1 2 0 0 0
# Same table as column proportions: share placed / not placed within each age.
prop.table(t3, margin = 2)
## age
## GotPlaced 22 23 24 25 26 27 28 29 30 31 32 33 34
## FALSE 0.50 0.38 0.48 0.39 0.52 0.60 0.53 0.73 0.45 0.50 0.88 0.00 0.75
## TRUE 0.50 0.62 0.52 0.61 0.48 0.40 0.47 0.27 0.55 0.50 0.12 1.00 0.25
## age
## GotPlaced 35 36 37 39 40 42 43 48
## FALSE 1.00 1.00 1.00 0.50 0.00 1.00 1.00 1.00
## TRUE 0.00 0.00 0.00 0.50 1.00 0.00 0.00 0.00
# Mean salary and mean work experience for each first-language code.
aggregate(cbind(salary, work_yrs) ~ frstlang, data = MBA.df, FUN = mean)
## frstlang salary work_yrs
## 1 1 40627 3.9
## 2 2 26915 3.6
# Placement status cross-tabulated with first-language code (row counts).
t4 <- xtabs(~ GotPlaced + frstlang, data = clean.df)
t4
## frstlang
## GotPlaced 1 2
## FALSE 108 17
## TRUE 96 7
# Column proportions: share placed / not placed within each language code.
prop.table(t4, margin = 2)
## frstlang
## GotPlaced 1 2
## FALSE 0.53 0.71
## TRUE 0.47 0.29
# Chi-squared test of independence between placement status and gender (t2).
chisq.test(t2)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: t2
## X-squared = 1, df = 1, p-value = 0.2
# Same test for placement status and first language (t4).
chisq.test(t4)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: t4
## X-squared = 2, df = 1, p-value = 0.1
# Same test for placement status and age (t3).
# NOTE(review): the t3 printout shows many age columns with counts of 0 or 1,
# so the chi-squared approximation is probably unreliable for this table --
# consider chisq.test(t3, simulate.p.value = TRUE) or pooling ages into bands.
chisq.test(t3)
##
## Pearson's Chi-squared test
##
## data: t3
## X-squared = 20, df = 20, p-value = 0.2
# Preparing for regression analysis
# Recode sex from its integer codes to labels (1 -> Male, 2 -> Female).
# Assigning a string into the integer column converts the whole column to
# character, so the second comparison matches the remaining "2" values as text;
# the order of these statements therefore matters.
MBA.df$sex[MBA.df$sex == 1] <- 'Male'
MBA.df$sex[MBA.df$sex == 2] <- 'Female'
MBA.df$sex <- factor(MBA.df$sex)
# Same recoding for first language (1 -> English, 2 -> Other).
MBA.df$frstlang[MBA.df$frstlang == 1] <- 'English'
MBA.df$frstlang[MBA.df$frstlang == 2] <- 'Other'
MBA.df$frstlang <- factor(MBA.df$frstlang)
# NOTE(review): placed.df and clean.df were subset from MBA.df before this
# point, so they still hold the integer codes -- confirm that is intended.
# Regression Analysis
# Re-derive the placed subset from the recoded MBA.df. placed.df was first
# extracted while sex and frstlang were still integer codes, so without this
# step lm() would treat them as numeric predictors instead of factors.
placed.df <- MBA.df[which(MBA.df$salary > 1000), ]
# Starting salary modelled on age, experience, grades, GMAT percentiles,
# gender, first language and satisfaction, for placed graduates only.
m <- salary ~ age + work_yrs + s_avg + f_avg + gmat_qpc + gmat_vpc +
  sex + frstlang + satis
fit <- lm(m, data = placed.df)
# With factor predictors the coefficient rows are labelled by level
# (sexMale, frstlangOther) rather than sex / frstlang.
summary(fit)
##
## Call:
## lm(formula = m, data = placed.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25490 -7943 -1508 5760 78864
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49477 31790 1.56 0.123
## age 1942 1138 1.71 0.091 .
## work_yrs 645 1146 0.56 0.575
## s_avg 3384 5021 0.67 0.502
## f_avg -1093 3813 -0.29 0.775
## gmat_qpc 118 121 0.97 0.333
## gmat_vpc -129 104 -1.24 0.217
## sex -3998 3591 -1.11 0.268
## frstlang 8324 7367 1.13 0.261
## satis -1848 2052 -0.90 0.370
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15600 on 93 degrees of freedom
## Multiple R-squared: 0.307, Adjusted R-squared: 0.24
## F-statistic: 4.57 on 9 and 93 DF, p-value: 5.29e-05