# Load the MBA starting-salary survey (path is relative to the working directory).
MBA.df <- read.csv("MBA Starting Salaries Data.csv")
# NOTE(review): attach() puts a snapshot of the raw columns on the search path.
# Later statements that use bare column names (e.g. cor(age, work_yrs)) read
# this snapshot, not a subset or a recoded copy of MBA.df.
attach(MBA.df)
dim(MBA.df)
## [1] 274 13
library(psych)
# Show only the first five summary columns: vars, n, mean, sd, median.
describe(MBA.df)[, 1:5]
## vars n mean sd median
## age 1 274 27.36 3.71 27
## sex 2 274 1.25 0.43 1
## gmat_tot 3 274 619.45 57.54 620
## gmat_qpc 4 274 80.64 14.87 83
## gmat_vpc 5 274 78.32 16.86 81
## gmat_tpc 6 274 84.20 14.02 87
## s_avg 7 274 3.03 0.38 3
## f_avg 8 274 3.06 0.53 3
## quarter 9 274 2.48 1.11 2
## work_yrs 10 274 3.87 3.23 3
## frstlang 11 274 1.12 0.32 1
## salary 12 274 39025.69 50951.56 999
## satis 13 274 172.18 371.61 6
# Inspect the column types of the raw data before any recoding.
str(MBA.df)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : int 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: int 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
# Recode the integer-coded categorical columns as labelled factors:
#   sex:      1 = Male,    2 = Female
#   frstlang: 1 = English, 2 = Other
# Each column is mapped in a single factor() call, so it never passes through
# a mixed integer/character state. Levels are listed in alphabetical label
# order ("Female" before "Male", "English" before "Other"), which keeps the
# same reference level that a default factor() of the label strings produces.
# Any code other than 1 or 2 becomes NA instead of turning into a stray level.
MBA.df$sex <- factor(MBA.df$sex, levels = c(2, 1), labels = c("Female", "Male"))
MBA.df$frstlang <- factor(MBA.df$frstlang, levels = c(1, 2),
                          labels = c("English", "Other"))
str(MBA.df) # Verify the data types: sex and frstlang should be Factor variables
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 2 2 1 2 2 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: Factor w/ 2 levels "English","Other": 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
# Split the data into four data frames, based on the special codes used in
# the "salary" column:
#   998    : did not answer the survey
#   999    : answered the survey but did not disclose salary
#   0      : not yet placed
#   > 1000 : answered the survey, got placed and disclosed salary
# View() opens a data viewer, so it is only called in interactive sessions;
# this keeps the script runnable non-interactively (e.g. via Rscript).

# MBAs who got placed and who disclosed their salaries
placed.df <- MBA.df[which(MBA.df$salary > 1000), ]
if (interactive()) View(placed.df)
# MBAs who were not placed
notPlaced.df <- MBA.df[which(MBA.df$salary == 0), ]
if (interactive()) View(notPlaced.df)
# MBAs who were placed but did not disclose their salary
notDisclosedSalary.df <- MBA.df[which(MBA.df$salary == 999), ]
if (interactive()) View(notDisclosedSalary.df)
# MBAs who did not answer the survey
notAnsweredSurvey.df <- MBA.df[which(MBA.df$salary == 998), ]
if (interactive()) View(notAnsweredSurvey.df)
# Mean salary of the students who were placed and disclosed their salary
avgSalary <- mean(placed.df$salary)
avgSalary
## [1] 103030.7
# Use that mean as the salary of every student who did not disclose one
# (notDisclosedSalary.df)
notDisclosedSalary.df$salary <- avgSalary
# allPlaced.df: every student who was placed -- those who disclosed a salary
# (placed.df) followed by those who did not (notDisclosedSalary.df), the
# latter carrying the sample mean assigned above
allPlaced.df <- rbind(placed.df, notDisclosedSalary.df)
library(psych)
describe(allPlaced.df)[, 1:5]
## vars n mean sd median
## age 1 138 26.96 3.05 26.0
## sex* 2 138 1.74 0.44 2.0
## gmat_tot 3 138 619.28 53.47 620.0
## gmat_qpc 4 138 81.10 13.59 83.5
## gmat_vpc 5 138 77.99 17.10 81.5
## gmat_tpc 6 138 84.48 13.08 87.0
## s_avg 7 138 3.03 0.38 3.0
## f_avg 8 138 3.06 0.46 3.0
## quarter 9 138 2.43 1.15 2.0
## work_yrs 10 138 3.67 2.75 3.0
## frstlang* 11 138 1.12 0.32 1.0
## salary 12 138 103030.74 15418.25 103030.7
## satis 13 138 5.53 1.11 6.0
# View() needs an interactive data viewer; skip it in batch runs.
if (interactive()) View(allPlaced.df)
library(lattice)
# Salary histogram for students who disclosed a salary
histogram(~salary, data = placed.df,
          main = "Distribution of Starting Salary", xlab = "Starting Salary",
          col = "grey")
# Same histogram including the students whose salary was set to the mean
histogram(~salary, data = allPlaced.df,
          main = "Distribution of Starting Salary", xlab = "Starting Salary",
          col = "grey")
# Mean salary, work experience and age by sex (disclosed salaries only)
aggregate(cbind(salary, work_yrs, age) ~ sex,
          data = placed.df, mean)
## sex salary work_yrs age
## 1 Female 98524.39 3.258065 26.06452
## 2 Male 104970.97 3.861111 27.08333
# Same means over all placed students (mean-imputed salaries included)
aggregate(cbind(salary, work_yrs, age) ~ sex,
          data = allPlaced.df, mean)
## sex salary work_yrs age
## 1 Female 99150.27 3.277778 26.13889
## 2 Male 104400.32 3.803922 27.24510
# Salary against years of work experience (students with disclosed salaries)
library(car)
# NOTE(review): `horizontal` is forwarded through `...`; confirm that
# scatterplot() actually uses it.
scatterplot(
  salary ~ work_yrs,
  data = placed.df,
  main = "Scatterplot of Salary with Work Experience",
  xlab = "Work Experience",
  ylab = "MBA's Starting Salaries",
  horizontal = TRUE
)
# One horizontal box per work-experience value
boxplot(
  salary ~ work_yrs,
  data = placed.df,
  main = "Distribution of Salary with Work Experience",
  ylab = "Work Experience",
  xlab = "MBA's Starting Salaries",
  horizontal = TRUE
)
library(lattice)
histogram(
  ~salary,
  data = placed.df,
  main = "Frequency of Starting Salary",
  xlab = "Starting Salary",
  col = "grey"
)
# Mean salary for each number of years of work experience
salaryWorkEx <- aggregate(salary ~ work_yrs, data = placed.df, mean)
salaryWorkEx
## work_yrs salary
## 1 0 95000.00
## 2 1 103532.00
## 3 2 97673.68
## 4 3 101652.86
## 5 4 105454.55
## 6 5 103142.86
## 7 6 105928.57
## 8 7 98000.00
## 9 8 105025.00
## 10 10 118000.00
## 11 15 183000.00
## 12 16 108500.00
# Salary against total GMAT score, points labelled by row name
scatterplot(
  salary ~ gmat_tot,
  data = placed.df,
  xlab = "GMAT Total",
  ylab = "Salary",
  main = "Comparison of Salary with Total GMAT score",
  labels = row.names(placed.df)
)
# Same plot, conditioned on sex
scatterplot(
  salary ~ gmat_tot | sex,
  data = placed.df,
  xlab = "GMAT Total",
  ylab = "Salary",
  main = "Comparison of Salary with Total GMAT score",
  labels = row.names(placed.df)
)
# Horizontal boxplot of salary for each distinct GMAT total
boxplot(
  salary ~ gmat_tot,
  data = placed.df,
  ylab = "GMAT Total",
  xlab = "Salary",
  main = "Comparison of Salary with Total GMAT score",
  horizontal = TRUE,
  labels = row.names(placed.df)
)
library(car)
# scatterplotMatrix() is the name used for this function elsewhere in this
# script; the old dotted spelling scatterplot.matrix() is replaced so that
# both call sites agree and the code does not depend on a legacy alias.
scatterplotMatrix(~ salary + gmat_tot + s_avg + f_avg, data = placed.df,
                  main = "Salary versus other variables")
# Same matrix, conditioned on sex
scatterplotMatrix(~ salary + gmat_tot + s_avg + f_avg | sex, data = placed.df,
                  main = "Salary versus other variables")
# Cross-tabulation of sex by age for students with disclosed salaries
ageTable <- table(placed.df$sex, placed.df$age)
ageTable
##
## 22 23 24 25 26 27 28 29 30 31 32 33 34 39 40
## Female 1 2 5 10 5 1 3 1 2 0 0 0 0 0 1
## Male 0 3 11 13 9 13 5 5 4 4 1 1 1 1 1
# Means by sex over the FULL data set.
# NOTE(review): MBA.df$salary still contains the special codes 0 / 998 / 999
# here, so the "salary" column of this result is not a true average salary.
aggregate(cbind(salary, work_yrs, age) ~ sex,
data = MBA.df, mean)
## sex salary work_yrs age
## 1 Female 45121.07 3.808824 27.17647
## 2 Male 37013.62 3.893204 27.41748
# Means by age over the FULL data set.
# NOTE(review): salary includes the special codes 0 / 998 / 999, so ages where
# nobody disclosed a salary show values such as 0 rather than a real average.
aggregate(cbind(salary, work_yrs) ~ age, data = MBA.df, mean)
## age salary work_yrs
## 1 22 42500.00 1.000000
## 2 23 57282.00 1.750000
## 3 24 49342.24 1.727273
## 4 25 43395.55 2.264151
## 5 26 35982.07 2.875000
## 6 27 31499.37 3.130435
## 7 28 39809.00 4.666667
## 8 29 28067.95 4.500000
## 9 30 55291.25 5.583333
## 10 31 40599.40 5.800000
## 11 32 13662.25 5.625000
## 12 33 118000.00 10.000000
## 13 34 26250.00 11.500000
## 14 35 0.00 9.333333
## 15 36 0.00 12.500000
## 16 37 0.00 9.000000
## 17 39 56000.00 10.500000
## 18 40 183000.00 15.000000
## 19 42 0.00 13.000000
## 20 43 0.00 19.000000
## 21 48 0.00 22.000000
# Means by satisfaction score over the FULL data set.
# NOTE(review): satis itself carries the code 998 (it appears as its own
# group in the result), and salary includes the codes 0 / 998 / 999.
aggregate(cbind(salary, work_yrs) ~ satis , data = MBA.df, mean)
## satis salary work_yrs
## 1 1 999.000 3.000000
## 2 2 999.000 2.000000
## 3 3 19799.200 4.200000
## 4 4 6293.412 2.941176
## 5 5 40476.311 4.243243
## 6 6 54383.536 4.185567
## 7 7 65718.152 3.727273
## 8 998 998.000 3.086957
# Vertical boxplots (horizontal = FALSE): the grouping variable is on the
# x axis and salary is on the y axis, so the axis labels are set accordingly.
# NOTE(review): MBA.df$salary still contains the special codes 0 / 998 / 999.
boxplot(salary ~ work_yrs, data = MBA.df,
        main = "Effect of Work Experience on Salary",
        xlab = "Work Experience", ylab = "MBA's Starting Salaries",
        horizontal = FALSE)
boxplot(salary ~ sex, data = MBA.df,
        main = "Effect of Gender on Salary",
        xlab = "Gender", ylab = "MBA's Starting Salaries",
        horizontal = FALSE)
library(lattice)
histogram(~salary, data = MBA.df,
          main = "Distribution of MBA's Starting Salary",
          xlab = "MBA's Starting Salary", col = "grey")
# Students whose placement outcome is known: placed with a disclosed salary,
# placed without a disclosed salary, and not placed.
knownMBA.df <- rbind(placed.df, notDisclosedSalary.df, notPlaced.df)
# View() needs an interactive data viewer; skip it in batch runs.
if (interactive()) View(knownMBA.df)
# A student counts as placed when salary > 1000. Rows coming from
# notDisclosedSalary.df meet this only because their salary was overwritten
# with avgSalary earlier in the script.
knownMBA.df$GotPlaced <- knownMBA.df$salary > 1000
if (interactive()) View(knownMBA.df)
knownMBA.df$GotPlaced <- factor(knownMBA.df$GotPlaced)
str(knownMBA.df)
## 'data.frame': 228 obs. of 14 variables:
## $ age : int 22 27 25 25 27 28 24 25 25 25 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 1 1 1 2 1 2 1 1 2 ...
## $ gmat_tot : int 660 700 680 650 710 620 670 560 530 650 ...
## $ gmat_qpc : int 90 94 87 82 96 52 84 52 50 79 ...
## $ gmat_vpc : int 92 98 96 91 96 98 96 81 62 93 ...
## $ gmat_tpc : int 94 98 96 93 98 87 95 72 61 93 ...
## $ s_avg : num 3.5 3.3 3.5 3.4 3.3 3.4 3.3 3.3 3.6 3.3 ...
## $ f_avg : num 3.75 3.25 2.67 3.25 3.5 3.75 3.25 3.5 3.67 3.5 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs : int 1 2 2 3 2 5 0 1 3 1 ...
## $ frstlang : Factor w/ 2 levels "English","Other": 1 1 1 1 1 1 1 1 1 1 ...
## $ salary : num 85000 85000 86000 88000 92000 93000 95000 95000 95000 96000 ...
## $ satis : int 5 6 5 7 6 5 4 5 3 7 ...
## $ GotPlaced: Factor w/ 2 levels "FALSE","TRUE": 2 2 2 2 2 2 2 2 2 2 ...
# Count how many students with a known outcome were placed (TRUE) or not
# placed (FALSE).
allplaced <- table(knownMBA.df$GotPlaced == 'TRUE')
allplaced
##
## FALSE TRUE
## 90 138
# Contingency table: placement status (rows) by sex (columns).
placedbySex <- xtabs(~ knownMBA.df$GotPlaced + knownMBA.df$sex , data=knownMBA.df)
placedbySex
## knownMBA.df$sex
## knownMBA.df$GotPlaced Female Male
## FALSE 23 67
## TRUE 36 102
# Same table with row and column totals.
addmargins(placedbySex)
## knownMBA.df$sex
## knownMBA.df$GotPlaced Female Male Sum
## FALSE 23 67 90
## TRUE 36 102 138
## Sum 59 169 228
# Column proportions (margin 2): share placed / not placed within each sex.
prop.table(placedbySex, 2)
## knownMBA.df$sex
## knownMBA.df$GotPlaced Female Male
## FALSE 0.3898305 0.3964497
## TRUE 0.6101695 0.6035503
# Contingency table: placement status (rows) by first language (columns).
placedbyLanguage <- xtabs(~ knownMBA.df$GotPlaced + knownMBA.df$frstlang, data=knownMBA.df)
placedbyLanguage
## knownMBA.df$frstlang
## knownMBA.df$GotPlaced English Other
## FALSE 82 8
## TRUE 122 16
# Same table with row and column totals.
addmargins(placedbyLanguage)
## knownMBA.df$frstlang
## knownMBA.df$GotPlaced English Other Sum
## FALSE 82 8 90
## TRUE 122 16 138
## Sum 204 24 228
# Column proportions (margin 2): share placed / not placed within each
# first-language group.
prop.table(placedbyLanguage, 2)
## knownMBA.df$frstlang
## knownMBA.df$GotPlaced English Other
## FALSE 0.4019608 0.3333333
## TRUE 0.5980392 0.6666667
# Chi-squared test on the placement-status-by-sex contingency table.
chisq.test(placedbySex)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: placedbySex
## X-squared = 3.5816e-30, df = 1, p-value = 1
# Since the p-value (1) is greater than 0.05, we fail to reject the null hypothesis that the percentage of females who got placed and the percentage of males who got placed are equal. So, from the Chi-Square test, there is no evidence that placement rates differ by sex, and it cannot be concluded that the hypothesis H1 is true.
## H2: The percentage of people placed whose first language is English is higher than the percentage of people placed whose first language is not English
# Chi-squared test on the placement-status-by-first-language contingency table.
chisq.test(placedbyLanguage)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: placedbyLanguage
## X-squared = 0.18479, df = 1, p-value = 0.6673
# Since the p-value (0.6673) is greater than 0.05, we fail to reject the null hypothesis that the percentage of people who got placed whose first language is English and the percentage of people who got placed whose first language is not English are equal. So, from the Chi-Square test, there is no evidence that placement rates differ by first language, and it cannot be concluded that the hypothesis H2 is true.
# MODEL SELECTION
library(corrplot)
colnames(placed.df)
## [1] "age" "sex" "gmat_tot" "gmat_qpc" "gmat_vpc" "gmat_tpc"
## [7] "s_avg" "f_avg" "quarter" "work_yrs" "frstlang" "salary"
## [13] "satis"
# Correlation matrix of the numeric predictors (the factor columns sex and
# frstlang, and the response salary, are left out)
dataColumns <- placed.df[, c(
  "age", "work_yrs",
  "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
  "s_avg", "f_avg", "quarter",
  "satis"
)]
N <- cor(dataColumns)
corrplot(N, method = "circle")
# Same matrix, printed to two decimals
res <- N
round(res, 2)
## age work_yrs gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg
## age 1.00 0.88 -0.08 -0.17 0.02 -0.10 0.16 -0.22
## work_yrs 0.88 1.00 -0.12 -0.18 -0.03 -0.13 0.16 -0.22
## gmat_tot -0.08 -0.12 1.00 0.67 0.78 0.97 0.17 0.12
## gmat_qpc -0.17 -0.18 0.67 1.00 0.09 0.66 0.02 0.10
## gmat_vpc 0.02 -0.03 0.78 0.09 1.00 0.78 0.16 0.02
## gmat_tpc -0.10 -0.13 0.97 0.66 0.78 1.00 0.14 0.07
## s_avg 0.16 0.16 0.17 0.02 0.16 0.14 1.00 0.45
## f_avg -0.22 -0.22 0.12 0.10 0.02 0.07 0.45 1.00
## quarter -0.13 -0.13 -0.11 0.01 -0.13 -0.10 -0.84 -0.43
## satis 0.11 0.06 0.06 0.00 0.15 0.12 -0.14 -0.12
## quarter satis
## age -0.13 0.11
## work_yrs -0.13 0.06
## gmat_tot -0.11 0.06
## gmat_qpc 0.01 0.00
## gmat_vpc -0.13 0.15
## gmat_tpc -0.10 0.12
## s_avg -0.84 -0.14
## f_avg -0.43 -0.12
## quarter 1.00 0.23
## satis 0.23 1.00
# MBA PERFORMANCE
# Correlations among the in-programme performance measures:
# s_avg, f_avg and quarter
mbaPerformance <- placed.df[, c("s_avg", "f_avg", "quarter")]
N <- cor(mbaPerformance)
corrplot(N, method = "circle")
# Same matrix, printed to two decimals
res <- N
round(res, 2)
## s_avg f_avg quarter
## s_avg 1.00 0.45 -0.84
## f_avg 0.45 1.00 -0.43
## quarter -0.84 -0.43 1.00
# The overall performance quartile (quarter) is highly correlated with the Spring (s_avg) and Fall (f_avg) GPA
# We will include 's_avg' and 'f_avg' in our regression, but exclude 'quarter' from our regression.
# Identifying DEPENDENT and INDEPENDENT Variables
# The DEPENDENT variable is "salary"
# Identifying INDEPENDENT Variables:
#1a. Variables related to GMAT are highly correlated: "gmat_tot" "gmat_qpc" "gmat_vpc" "gmat_tpc"
#1b. Variables related to MBA performance are highly correlated: "s_avg" "f_avg" "quarter"
#1c. Variables related to age and work experience are highly correlated: "age", "work_yrs"
#1d. Other variables: "sex" , "frstlang" ; "satis"
# 1e. GMAT
# The GMAT-related columns are strongly correlated with each other
gmat <- placed.df[, c("gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc")]
res <- cor(gmat)
round(res, digits = 2)
## gmat_tot gmat_qpc gmat_vpc gmat_tpc
## gmat_tot 1.00 0.67 0.78 0.97
## gmat_qpc 0.67 1.00 0.09 0.66
## gmat_vpc 0.78 0.09 1.00 0.78
## gmat_tpc 0.97 0.66 0.78 1.00
library(corrplot)
M <- cor(gmat)
corrplot(M, method = "circle")
# However, GMAT verbal and quantitative scores are very weakly correlated.
# The columns are taken from placed.df explicitly: the bare names gmat_qpc and
# gmat_vpc would resolve to the attached copy of the full survey data rather
# than to the placed-students subset used in the rest of this section.
cor(placed.df$gmat_qpc, placed.df$gmat_vpc)
## Same quantity as the gmat_qpc / gmat_vpc cell of the matrix above
## (0.09 after rounding).
# Therefore, in our regression we will include gmat_qpc and gmat_vpc, but exclude "gmat_tot" and "gmat_tpc"
# 1f. MBA PERFORMANCE
# Correlations among the in-programme performance measures:
# s_avg, f_avg and quarter
mbaPerformance <- placed.df[, c("s_avg", "f_avg", "quarter")]
N <- cor(mbaPerformance)
corrplot(N, method = "circle")
# Same matrix, printed to two decimals
res <- N
round(res, 2)
## s_avg f_avg quarter
## s_avg 1.00 0.45 -0.84
## f_avg 0.45 1.00 -0.43
## quarter -0.84 -0.43 1.00
# The overall performance quartile (quarter) is highly correlated with the Spring (s_avg) and Fall (f_avg) GPA
# We will include 's_avg' and 'f_avg' in our regression, but exclude 'quarter' from our regression.
# 1g. WORK EXPERIENCE AND AGE
# The variables 'age' and 'work_yrs' are highly correlated. The older the person, the greater the work experience.
# The columns are taken from placed.df explicitly: the bare names age and
# work_yrs would resolve to the attached copy of the full survey data rather
# than to the placed-students subset that the regression is fitted on.
cor(placed.df$age, placed.df$work_yrs)
## Same quantity as the age / work_yrs cell of the full correlation matrix
## printed earlier for placed.df (0.88 after rounding).
# Therefore we will include 'work_yrs' in our regression, but exclude 'age' from our regression
# SUMMARY OF MODEL SELECTION
# Given the above discussion, the independent variables we will include in
# the regression are
# {work_yrs, s_avg, f_avg, gmat_qpc, gmat_vpc, sex, frstlang, satis}.
# The correlation matrix below covers salary plus the numeric ones only;
# the factors sex and frstlang are not part of `columns`.
columns <- c(
  "salary", "work_yrs", "gmat_qpc", "gmat_vpc", "s_avg", "f_avg", "satis"
)
placedVariables <- placed.df[, columns]
res <- cor(placedVariables)
round(res, digits = 2)
## salary work_yrs gmat_qpc gmat_vpc s_avg f_avg satis
## salary 1.00 0.45 0.01 -0.14 0.10 -0.11 -0.04
## work_yrs 0.45 1.00 -0.18 -0.03 0.16 -0.22 0.06
## gmat_qpc 0.01 -0.18 1.00 0.09 0.02 0.10 0.00
## gmat_vpc -0.14 -0.03 0.09 1.00 0.16 0.02 0.15
## s_avg 0.10 0.16 0.02 0.16 1.00 0.45 -0.14
## f_avg -0.11 -0.22 0.10 0.02 0.45 1.00 -0.12
## satis -0.04 0.06 0.00 0.15 -0.14 -0.12 1.00
library(corrplot)
# Correlation plot of salary and the selected numeric predictors
M <- cor(placed.df[, columns])
corrplot(M, method = "circle")
library(car)
# Pairwise scatterplots: salary with the GPA and satisfaction variables
scatterplotMatrix(
  ~ salary + s_avg + f_avg + satis,
  data = placed.df,
  main = "Salary versus MBA Performance and MBA Satisfaction"
)
library(car)
# Pairwise scatterplots: salary with work experience and GMAT percentiles
scatterplotMatrix(
  ~ salary + work_yrs + gmat_qpc + gmat_vpc,
  data = placed.df,
  main = "Salary versus Work Experience; GMAT Performance"
)
# Full candidate model: salary regressed on all eight selected predictors,
# fitted on students with disclosed salaries
Model1 <- salary ~ work_yrs + s_avg + f_avg + gmat_qpc + gmat_vpc +
  sex + frstlang + satis
fit1 <- lm(formula = Model1, data = placed.df)
summary(fit1)
##
## Call:
## lm(formula = Model1, data = placed.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29800 -7822 -1742 4869 82341
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 90136.22 21739.22 4.146 7.4e-05 ***
## work_yrs 2331.12 585.99 3.978 0.000137 ***
## s_avg 4659.05 5015.66 0.929 0.355320
## f_avg -1698.83 3834.70 -0.443 0.658773
## gmat_qpc 98.72 121.85 0.810 0.419884
## gmat_vpc -95.80 102.99 -0.930 0.354699
## sexMale 5289.24 3545.91 1.492 0.139140
## frstlangOther 13994.76 6641.66 2.107 0.037770 *
## satis -1671.20 2070.62 -0.807 0.421643
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15740 on 94 degrees of freedom
## Multiple R-squared: 0.285, Adjusted R-squared: 0.2241
## F-statistic: 4.683 on 8 and 94 DF, p-value: 7.574e-05
# Subset selection over the Model1 predictors, keeping the single best
# subset of each size (nbest = 1).
library(leaps)
## Warning: package 'leaps' was built under R version 3.4.3
leap <- regsubsets(Model1, data = placed.df, nbest=1)
summary(leap)
## Subset selection object
## Call: regsubsets.formula(Model1, data = placed.df, nbest = 1)
## 8 Variables (and intercept)
## Forced in Forced out
## work_yrs FALSE FALSE
## s_avg FALSE FALSE
## f_avg FALSE FALSE
## gmat_qpc FALSE FALSE
## gmat_vpc FALSE FALSE
## sexMale FALSE FALSE
## frstlangOther FALSE FALSE
## satis FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## work_yrs s_avg f_avg gmat_qpc gmat_vpc sexMale frstlangOther
## 1 ( 1 ) "*" " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " "*"
## 3 ( 1 ) "*" " " " " " " " " "*" "*"
## 4 ( 1 ) "*" " " " " " " " " "*" "*"
## 5 ( 1 ) "*" "*" " " " " " " "*" "*"
## 6 ( 1 ) "*" "*" " " " " "*" "*" "*"
## 7 ( 1 ) "*" "*" " " "*" "*" "*" "*"
## 8 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## satis
## 1 ( 1 ) " "
## 2 ( 1 ) " "
## 3 ( 1 ) " "
## 4 ( 1 ) "*"
## 5 ( 1 ) "*"
## 6 ( 1 ) "*"
## 7 ( 1 ) "*"
## 8 ( 1 ) "*"
# Plot the selected subsets using the adjusted R-squared scale.
plot(leap, scale="adjr2")
# Reduced model: work experience, sex, first language and satisfaction.
# Not included: age, s_avg, f_avg, quarter, gmat_qpc, gmat_vpc, gmat_tot,
# gmat_tpc.
Model2 <- salary ~ work_yrs + sex + frstlang + satis
fit2 <- lm(formula = Model2, data = placed.df)
summary(fit2)
##
## Call:
## lm(formula = Model2, data = placed.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30492 -8055 -1744 5362 80436
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 102214.0 11827.8 8.642 1.06e-13 ***
## work_yrs 2409.4 526.1 4.579 1.37e-05 ***
## sexMale 5949.5 3392.2 1.754 0.0826 .
## frstlangOther 14675.7 6274.0 2.339 0.0214 *
## satis -2244.4 1988.4 -1.129 0.2618
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15580 on 98 degrees of freedom
## Multiple R-squared: 0.2695, Adjusted R-squared: 0.2397
## F-statistic: 9.038 on 4 and 98 DF, p-value: 2.953e-06
# Subset selection over the Model2 predictors, keeping the single best
# subset of each size (nbest = 1).
library(leaps)
leap <- regsubsets(Model2, data = placed.df, nbest=1)
summary(leap)
## Subset selection object
## Call: regsubsets.formula(Model2, data = placed.df, nbest = 1)
## 4 Variables (and intercept)
## Forced in Forced out
## work_yrs FALSE FALSE
## sexMale FALSE FALSE
## frstlangOther FALSE FALSE
## satis FALSE FALSE
## 1 subsets of each size up to 4
## Selection Algorithm: exhaustive
## work_yrs sexMale frstlangOther satis
## 1 ( 1 ) "*" " " " " " "
## 2 ( 1 ) "*" " " "*" " "
## 3 ( 1 ) "*" "*" "*" " "
## 4 ( 1 ) "*" "*" "*" "*"
# Plot the selected subsets of the reduced model using the adjusted
# R-squared scale.
plot(leap, scale="adjr2")