# Load the MBA starting-salaries survey into `mba.df` and take a first look.
# The file name used to end in a stray space ("...Data.csv "), which makes
# read.csv() look for a file whose name really ends in a space.
# `sep = ","` is already read.csv()'s default, so it is not repeated.
mba.df <- read.csv("Mba Starting Salaries Data.csv")
View(mba.df)     # open the raw data in the spreadsheet-style viewer
summary(mba.df)  # per-column min / quartiles / mean / max
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
# Column types as read from the CSV (all integer or numeric at this point).
# A leftover Markdown code fence used to precede this call; it is not valid R
# and prevented the script from being parsed.
str(mba.df)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : int 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: int 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
# `sex` and `frstlang` are read as integer codes 1/2. Show how often each code
# occurs, then store both columns as factors so later summaries treat them as
# categories rather than numbers.
# table() gives the group sizes directly instead of printing every matching
# element one by one.
table(mba.df$sex)
##
## 1 2
## 206 68
mba.df$sex <- factor(mba.df$sex)

table(mba.df$frstlang)
##
## 1 2
## 242 32
mba.df$frstlang <- factor(mba.df$frstlang)

# Confirm that both columns are now factors with levels "1" and "2".
str(mba.df)
## 'data.frame': 274 obs. of 13 variables:
## $ age : int 23 24 24 24 24 24 25 25 25 25 ...
## $ sex : Factor w/ 2 levels "1","2": 2 1 1 1 2 1 1 2 1 1 ...
## $ gmat_tot: int 620 610 670 570 710 640 610 650 630 680 ...
## $ gmat_qpc: int 77 90 99 56 93 82 89 88 79 99 ...
## $ gmat_vpc: int 87 71 78 81 98 89 74 89 91 81 ...
## $ gmat_tpc: int 87 87 95 75 98 91 87 92 89 96 ...
## $ s_avg : num 3.4 3.5 3.3 3.3 3.6 3.9 3.4 3.3 3.3 3.45 ...
## $ f_avg : num 3 4 3.25 2.67 3.75 3.75 3.5 3.75 3.25 3.67 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs: int 2 2 2 1 2 2 2 2 2 2 ...
## $ frstlang: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 2 1 ...
## $ salary : int 0 0 0 0 999 0 0 0 999 998 ...
## $ satis : int 7 6 6 7 5 6 5 6 4 998 ...
# Split the respondents by the value stored in `salary`:
#   > 1000 -> placed.df              (an actual salary figure)
#   == 0   -> notPlaced.df
#   == 999 -> notDisclosedSalary.df
#   == 998 -> notAnsweredSurvey.df
# which() converts each logical mask to row indices before subsetting.
placed.df <- mba.df[which(mba.df[["salary"]] > 1000), ]
View(placed.df)
notPlaced.df <- mba.df[which(mba.df[["salary"]] == 0), ]
View(notPlaced.df)
notDisclosedSalary.df <- mba.df[which(mba.df[["salary"]] == 999), ]
View(notDisclosedSalary.df)
notAnsweredSurvey.df <- mba.df[which(mba.df[["salary"]] == 998), ]
View(notAnsweredSurvey.df)

# Row counts of the four groups; the total is printed as a sanity check
# against the number of rows in mba.df.
# NOTE(review): the total is stored in a variable called `c`, which shadows
# the name of base::c -- consider renaming once all uses are known.
c1 <- nrow(placed.df)
c2 <- nrow(notPlaced.df)
c3 <- nrow(notDisclosedSalary.df)
c4 <- nrow(notAnsweredSurvey.df)
c <- c1 + c2 + c3 + c4
c[1]
## [1] 274
head(placed.df)
## age sex gmat_tot gmat_qpc gmat_vpc gmat_tpc s_avg f_avg quarter
## 35 22 2 660 90 92 94 3.5 3.75 1
## 36 27 2 700 94 98 98 3.3 3.25 1
## 37 25 2 680 87 96 96 3.5 2.67 1
## 38 25 2 650 82 91 93 3.4 3.25 1
## 39 27 1 710 96 96 98 3.3 3.50 1
## 40 28 2 620 52 98 87 3.4 3.75 1
## work_yrs frstlang salary satis
## 35 1 1 85000 5
## 36 2 1 85000 6
## 37 2 1 86000 5
## 38 3 1 88000 7
## 39 2 1 92000 6
## 40 5 1 93000 5
# Mean salary across the rows that carry an actual salary figure.
avgSalary <- mean(placed.df[["salary"]])
avgSalary
## [1] 103030.7
# Overwrite the 999 code in notDisclosedSalary.df with that mean, then stack
# those rows under placed.df. From here on notDisclosedSalary.df holds the
# filled-in value, not the original code.
notDisclosedSalary.df$salary <- avgSalary
allPlaced.df <- rbind(placed.df, notDisclosedSalary.df)
library(psych)
# Keep only the first five columns of the describe() summary.
describe(allPlaced.df)[, 1:5]
## vars n mean sd median
## age 1 138 26.96 3.05 26.0
## sex* 2 138 1.26 0.44 1.0
## gmat_tot 3 138 619.28 53.47 620.0
## gmat_qpc 4 138 81.10 13.59 83.5
## gmat_vpc 5 138 77.99 17.10 81.5
## gmat_tpc 6 138 84.48 13.08 87.0
## s_avg 7 138 3.03 0.38 3.0
## f_avg 8 138 3.06 0.46 3.0
## quarter 9 138 2.43 1.15 2.0
## work_yrs 10 138 3.67 2.75 3.0
## frstlang* 11 138 1.12 0.32 1.0
## salary 12 138 103030.74 15418.25 103030.7
## satis 13 138 5.53 1.11 6.0
View(allPlaced.df)
library(lattice)

# Salary histograms: first the rows with an actual salary figure, then the
# same rows plus the mean-filled ones.
histogram(
  ~ salary,
  data = placed.df,
  main = "Distribution of Starting Salary",
  xlab = "Starting Salary",
  col = "grey"
)
histogram(
  ~ salary,
  data = allPlaced.df,
  main = "Distribution of Starting Salary",
  xlab = "Starting Salary",
  col = "grey"
)

# Mean salary, work experience and age per level of `sex`, for each of the
# two data sets.
aggregate(cbind(salary, work_yrs, age) ~ sex, data = placed.df, FUN = mean)
## sex salary work_yrs age
## 1 1 104970.97 3.861111 27.08333
## 2 2 98524.39 3.258065 26.06452
aggregate(cbind(salary, work_yrs, age) ~ sex, data = allPlaced.df, FUN = mean)
## sex salary work_yrs age
## 1 1 104400.32 3.803922 27.24510
## 2 2 99150.27 3.277778 26.13889
# Salary spread per year of work experience. The boxes are drawn horizontally,
# so salary runs along the x-axis and work experience along the y-axis.
boxplot(
  salary ~ work_yrs,
  data = placed.df,
  main = "Distribution of Salary with Work Experience",
  ylab = "Work Experience",
  xlab = "MBA's Starting Salaries",
  horizontal = TRUE
)
library(lattice)
histogram(
  ~ salary,
  data = placed.df,
  main = "Frequency of Starting Salary",
  xlab = "Starting Salary",
  col = "grey"
)
# Mean salary for each distinct value of work_yrs.
salaryWorkEx <- aggregate(salary ~ work_yrs, data = placed.df, FUN = mean)
salaryWorkEx
## work_yrs salary
## 1 0 95000.00
## 2 1 103532.00
## 3 2 97673.68
## 4 3 101652.86
## 5 4 105454.55
## 6 5 103142.86
## 7 6 105928.57
## 8 7 98000.00
## 9 8 105025.00
## 10 10 118000.00
## 11 15 183000.00
## 12 16 108500.00
# List the available columns before choosing variables to plot.
names(placed.df)
## [1] "age" "sex" "gmat_tot" "gmat_qpc" "gmat_vpc" "gmat_tpc"
## [7] "s_avg" "f_avg" "quarter" "work_yrs" "frstlang" "salary"
## [13] "satis"
# car is attached here for the scatterplot matrices that follow.
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Pairwise scatterplots of salary against GMAT total and the two grade
# averages, first for all rows of placed.df and then grouped by `sex`.
# scatterplotMatrix() is the current name of car's deprecated
# scatterplot.matrix(), which only emitted a deprecation warning.
scatterplotMatrix(
  ~ salary + gmat_tot + s_avg + f_avg,
  data = placed.df,
  main = "Salary versus other variables"
)
scatterplotMatrix(
  ~ salary + gmat_tot + s_avg + f_avg | sex,
  data = placed.df,
  main = "Salary versus other variables"
)
# Cross-tabulate sex (rows) against age (columns) for placed.df.
ageTable <- table(placed.df[["sex"]], placed.df[["age"]])
ageTable
##
## 22 23 24 25 26 27 28 29 30 31 32 33 34 39 40
## 1 0 3 11 13 9 13 5 5 4 4 1 1 1 1 1
## 2 1 2 5 10 5 1 3 1 2 0 0 0 0 0 1
# Group means over the FULL data set.
# NOTE(review): mba.df$salary still contains 0 and the codes 998/999, so the
# "salary" means below average those in as if they were amounts.
aggregate(cbind(salary, work_yrs, age) ~ sex, data = mba.df, FUN = mean)
## sex salary work_yrs age
## 1 1 37013.62 3.893204 27.41748
## 2 2 45121.07 3.808824 27.17647
aggregate(cbind(salary, work_yrs) ~ age, data = mba.df, FUN = mean)
## age salary work_yrs
## 1 22 42500.00 1.000000
## 2 23 57282.00 1.750000
## 3 24 49342.24 1.727273
## 4 25 43395.55 2.264151
## 5 26 35982.07 2.875000
## 6 27 31499.37 3.130435
## 7 28 39809.00 4.666667
## 8 29 28067.95 4.500000
## 9 30 55291.25 5.583333
## 10 31 40599.40 5.800000
## 11 32 13662.25 5.625000
## 12 33 118000.00 10.000000
## 13 34 26250.00 11.500000
## 14 35 0.00 9.333333
## 15 36 0.00 12.500000
## 16 37 0.00 9.000000
## 17 39 56000.00 10.500000
## 18 40 183000.00 15.000000
## 19 42 0.00 13.000000
## 20 43 0.00 19.000000
## 21 48 0.00 22.000000
# Mean salary and work experience per satisfaction score, full data set.
# NOTE(review): both `salary` (0, 998, 999) and `satis` (998) still carry
# their special codes here, so they show up as ordinary values and groups.
aggregate(cbind(salary, work_yrs) ~ satis, data = mba.df, FUN = mean)
## satis salary work_yrs
## 1 1 999.000 3.000000
## 2 2 999.000 2.000000
## 3 3 19799.200 4.200000
## 4 4 6293.412 2.941176
## 5 5 40476.311 4.243243
## 6 6 54383.536 4.185567
## 7 7 65718.152 3.727273
## 8 998 998.000 3.086957
# Salary by work experience and by sex over the full data set.
# With horizontal = FALSE the grouping variable is on the x-axis and salary is
# on the y-axis, so the labels are assigned accordingly.
# NOTE(review): mba.df$salary still contains 0 and the codes 998/999; these
# plots include them as if they were salary amounts.
boxplot(
  salary ~ work_yrs,
  data = mba.df,
  main = "Effect of Work Experience on Salary",
  xlab = "Work Experience",
  ylab = "MBA's Starting Salaries",
  horizontal = FALSE
)
boxplot(
  salary ~ sex,
  data = mba.df,
  main = "Effect of Gender on Salary",
  xlab = "Gender",
  ylab = "MBA's Starting Salaries",
  horizontal = FALSE
)

# Salary histogram for the full data set. It is drawn once: the same call used
# to be repeated three times and produced three identical plots.
library(lattice)
histogram(
  ~ salary,
  data = mba.df,
  main = "Distribution of MBA's Starting Salary",
  xlab = "MBA's Starting Salary",
  col = "grey"
)
# Stack the three groups whose outcome is known (this leaves out
# notAnsweredSurvey.df) and flag each row by whether its salary exceeds 1000.
# Rows from notDisclosedSalary.df pass that test only if their salary has
# already been replaced by a value above 1000; the raw code 999 would not.
knownmba.df <- rbind(placed.df, notDisclosedSalary.df, notPlaced.df)
View(knownmba.df)
knownmba.df$GotPlaced <- knownmba.df[["salary"]] > 1000
View(knownmba.df)
# Store the flag as a two-level factor (FALSE / TRUE).
knownmba.df$GotPlaced <- factor(knownmba.df[["GotPlaced"]])
str(knownmba.df)
## 'data.frame': 228 obs. of 14 variables:
## $ age : int 22 27 25 25 27 28 24 25 25 25 ...
## $ sex : Factor w/ 2 levels "1","2": 2 2 2 2 1 2 1 2 2 1 ...
## $ gmat_tot : int 660 700 680 650 710 620 670 560 530 650 ...
## $ gmat_qpc : int 90 94 87 82 96 52 84 52 50 79 ...
## $ gmat_vpc : int 92 98 96 91 96 98 96 81 62 93 ...
## $ gmat_tpc : int 94 98 96 93 98 87 95 72 61 93 ...
## $ s_avg : num 3.5 3.3 3.5 3.4 3.3 3.4 3.3 3.3 3.6 3.3 ...
## $ f_avg : num 3.75 3.25 2.67 3.25 3.5 3.75 3.25 3.5 3.67 3.5 ...
## $ quarter : int 1 1 1 1 1 1 1 1 1 1 ...
## $ work_yrs : int 1 2 2 3 2 5 0 1 3 1 ...
## $ frstlang : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ salary : num 85000 85000 86000 88000 92000 93000 95000 95000 95000 96000 ...
## $ satis : int 5 6 5 7 6 5 4 5 3 7 ...
## $ GotPlaced: Factor w/ 2 levels "FALSE","TRUE": 2 2 2 2 2 2 2 2 2 2 ...
# Contingency table of placement status (rows) by sex (columns), followed by
# the same table with totals and with column-wise proportions.
placedbySex <- xtabs(
  ~ knownmba.df$GotPlaced + knownmba.df$sex,
  data = knownmba.df
)
placedbySex
## knownmba.df$sex
## knownmba.df$GotPlaced 1 2
## FALSE 67 23
## TRUE 102 36
addmargins(placedbySex)
## knownmba.df$sex
## knownmba.df$GotPlaced 1 2 Sum
## FALSE 67 23 90
## TRUE 102 36 138
## Sum 169 59 228
# margin = 2: each column sums to 1.
prop.table(placedbySex, margin = 2)
## knownmba.df$sex
## knownmba.df$GotPlaced 1 2
## FALSE 0.3964497 0.3898305
## TRUE 0.6035503 0.6101695
# Contingency table of placement status (rows) by first-language code
# (columns), followed by totals and column-wise proportions.
placedbyLanguage <- xtabs(
  ~ knownmba.df$GotPlaced + knownmba.df$frstlang,
  data = knownmba.df
)
placedbyLanguage
## knownmba.df$frstlang
## knownmba.df$GotPlaced 1 2
## FALSE 82 8
## TRUE 122 16
addmargins(placedbyLanguage)
## knownmba.df$frstlang
## knownmba.df$GotPlaced 1 2 Sum
## FALSE 82 8 90
## TRUE 122 16 138
## Sum 204 24 228
# margin = 2: each column sums to 1.
prop.table(placedbyLanguage, margin = 2)
## knownmba.df$frstlang
## knownmba.df$GotPlaced 1 2
## FALSE 0.4019608 0.3333333
## TRUE 0.5980392 0.6666667
# Chi-squared tests on the two 2x2 placement tables (placement by sex, then
# placement by first language). The leftover Markdown code fences that used to
# follow each result were not valid R and have been removed.
chisq.test(placedbySex)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: placedbySex
## X-squared = 3.5816e-30, df = 1, p-value = 1
chisq.test(placedbyLanguage)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: placedbyLanguage
## X-squared = 0.18479, df = 1, p-value = 0.6673