# Load the MBA starting-salary data set.
# NOTE(review): this absolute Windows path only resolves on the original
# author's machine; consider a project-relative path.
mba_salary <- read.csv(
  file.path(
    "C:/Users/kogentix/Downloads/Internship_docs/week4",
    "MBA Starting Salaries Data.csv"
  )
)

# `job` keeps only the rows whose salary is a real figure, i.e. not one of the
# values 0, 998 or 999 (presumably placeholder codes for "no salary reported";
# confirm against the data dictionary). The comparison is done on numbers
# rather than on string literals so it does not depend on how R formats a
# number when coercing it to character. Rows with a missing salary are
# dropped, exactly as the previous which()-based filter did.
job <- mba_salary[
  !is.na(mba_salary$salary) & !(mba_salary$salary %in% c(0, 998, 999)),
]
#summary(mba_salary)
# Histogram of student age.
hist(
  mba_salary$age,
  main = "Age Distribution",
  xlab = "Age in years",
  col = "aliceblue"
)

# Recode sex (1/2) into a labelled factor in place, then draw its bar chart.
mba_salary$sex <- factor(
  mba_salary$sex,
  levels = c(1, 2),
  labels = c("Male", "Female")
)
plot(mba_salary$sex, main = "Gender distribution")

# Histogram of the total GMAT score, using about 20 bins.
hist(
  mba_salary$gmat_tot,
  breaks = 20,
  main = "Distribution of GMAT scores",
  xlab = "GMAT total score"
)

# Histogram of years of work experience, using about 20 bins.
hist(
  mba_salary$work_yrs,
  breaks = 20,
  main = "Work Experience",
  xlab = "Years of work experience"
)

# Recode first language (1/2) into a labelled factor in place, then draw its
# bar chart.
mba_salary$frstlang <- factor(
  mba_salary$frstlang,
  levels = c(1, 2),
  labels = c("English", "Others")
)
plot(mba_salary$frstlang, main = "Language Distribution")

# Satisfaction histogram, restricted to ratings on the 1-7 scale named in the
# axis label. The bound is a number, not the string '7': comparing a numeric
# column with a string makes R compare character representations
# lexicographically (e.g. "10" <= "7" is TRUE), which only works by accident.
data <- mba_salary[which(mba_salary$satis <= 7), ]
hist(
  data$satis,
  breaks = 5,
  xlab = "Degree of Satisfaction (1=low,7=high)",
  main = "Satisfaction distribution"
)

# Salary histogram without the 998/999 values (salaries of 0 are kept here,
# unlike in `job`). Numeric comparison for the same reason as above; which()
# still drops rows with a missing salary.
data1 <- mba_salary[
  which(mba_salary$salary != 998 & mba_salary$salary != 999),
]
hist(
  data1$salary,
  breaks = 5,
  xlab = "starting salary",
  main = "Salary distribution"
)

# Horizontal boxplots, one per variable: the three GMAT percentile columns,
# the spring and fall MBA averages, and the quartile ranking.
boxplot(
  mba_salary$gmat_qpc,
  main = "Quant Percentile Distribution",
  xlab = "GMAT QPC",
  horizontal = TRUE
)

boxplot(
  mba_salary$gmat_vpc,
  main = "Verbal Percentile Distribution",
  xlab = "GMAT VPC",
  horizontal = TRUE
)

boxplot(
  mba_salary$gmat_tpc,
  main = "Overall Percentile Distribution",
  xlab = "GMAT TPC",
  horizontal = TRUE
)

boxplot(
  mba_salary$s_avg,
  main = "Spring MBA Avg Dist.",
  xlab = "s_avg",
  horizontal = TRUE
)

boxplot(
  mba_salary$f_avg,
  main = "Fall MBA Avg Dist.",
  xlab = "f_avg",
  horizontal = TRUE
)

# Fixed: this plot is titled "Quartile Ranking" but previously re-drew
# gmat_qpc, duplicating the first boxplot.
# NOTE(review): assumes the quartile column is named `quarter`; confirm
# against the CSV header.
boxplot(
  mba_salary$quarter,
  main = "Quartile Ranking Dist.",
  xlab = "Quart",
  horizontal = TRUE
)

library(car)
# Scatterplot matrix of salary against the other variables, drawn from `job`.
pairs(
  ~ salary + sex + age + gmat_tpc + frstlang + satis + work_yrs,
  data = job,
  main = "Salary vs. other variables"
)

# Work experience against total GMAT percentile, with the least-squares
# regression line of work_yrs on gmat_tpc drawn on top.
plot(
  x = mba_salary$gmat_tpc,
  y = mba_salary$work_yrs,
  xlab = "GMAT Percentile",
  ylab = "Work Experience",
  main = "Total GMAT Percentile vs. Work Experience"
)
experience_fit <- lm(work_yrs ~ gmat_tpc, data = mba_salary)
abline(experience_fit)

library(corrgram)
## Warning: package 'corrgram' was built under R version 3.4.3
# Correlogram of `job`: shaded cells below the diagonal, pie glyphs above it,
# variable names on the diagonal, with variables reordered (order = TRUE).
corrgram(
  job,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "MBA starting salary analysis"
)

# Covariance matrix of the selected columns of `job`. `x` and `y` hold the
# same selection, so cov(x, y) is the covariance of that set with itself.
x <- job[
  ,
  c(
    "age", "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
    "s_avg", "f_avg", "work_yrs", "salary"
  )
]
y <- x
cov(x, y)
## age gmat_tot gmat_qpc gmat_vpc
## age 10.7045498 -13.054445 -7.22796497 9.505045e-01
## gmat_tot -13.0544451 2569.293737 452.14258519 6.386360e+02
## gmat_qpc -7.2279650 452.142585 179.18027794 2.045850e+01
## gmat_vpc 0.9505045 638.636018 20.45849990 2.606602e+02
## gmat_tpc -3.4602132 539.362269 97.03607462 1.393882e+02
## s_avg 0.1938587 3.299562 0.07838473 9.694594e-01
## f_avg -0.3462517 3.027432 0.64252142 1.803303e-01
## work_yrs 8.6728536 -18.738816 -7.36245955 -1.366838e+00
## salary 29210.5193223 -82124.485056 3382.43784504 -3.964803e+04
## gmat_tpc s_avg f_avg work_yrs
## age -3.460213e+00 0.19385875 -0.34625167 8.6728536
## gmat_tot 5.393623e+02 3.29956215 3.02743194 -18.7388159
## gmat_qpc 9.703607e+01 0.07838473 0.64252142 -7.3624595
## gmat_vpc 1.393882e+02 0.96945936 0.18033029 -1.3668380
## gmat_tpc 1.211342e+02 0.58062916 0.37850562 -4.3892062
## s_avg 5.806292e-01 0.14325138 0.08231046 0.1860480
## f_avg 3.785056e-01 0.08231046 0.23786375 -0.3176271
## work_yrs -4.389206e+00 0.18604797 -0.31762707 9.0630116
## salary -2.596339e+04 688.02042071 -924.11288026 24458.1995050
## salary
## age 2.921052e+04
## gmat_tot -8.212449e+04
## gmat_qpc 3.382438e+03
## gmat_vpc -3.964803e+04
## gmat_tpc -2.596339e+04
## s_avg 6.880204e+02
## f_avg -9.241129e+02
## work_yrs 2.445820e+04
## salary 3.192940e+08
# Rebuild `job` from the current mba_salary, keeping only rows whose salary is
# not 0, 998 or 999 (presumably placeholder codes; confirm against the data
# dictionary). The comparison is done on numbers rather than string literals
# so it does not depend on number-to-character coercion; rows with a missing
# salary are dropped, as the previous which()-based filter did.
job <- mba_salary[
  !is.na(mba_salary$salary) & !(mba_salary$salary %in% c(0, 998, 999)),
]

# Contingency table counting rows for each salary/age combination.
xtabs(~salary+age,data=job)
## age
## salary 22 23 24 25 26 27 28 29 30 31 32 33 34 39 40
## 64000 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## 77000 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 78256 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 82000 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 85000 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0
## 86000 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
## 88000 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 88500 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 90000 0 0 0 2 0 1 0 0 0 0 0 0 0 0 0
## 92000 0 0 0 2 0 1 0 0 0 0 0 0 0 0 0
## 93000 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0
## 95000 0 0 1 5 0 0 0 1 0 0 0 0 0 0 0
## 96000 0 0 1 1 2 0 0 0 0 0 0 0 0 0 0
## 96500 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## 97000 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0
## 98000 0 1 3 2 1 1 1 1 0 0 0 0 0 0 0
## 99000 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 100000 0 1 4 1 1 1 0 0 0 1 0 0 0 0 0
## 100400 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 101000 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
## 101100 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 101600 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 102500 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## 103000 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 104000 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0
## 105000 0 1 1 2 3 1 0 0 1 1 0 0 1 0 0
## 106000 0 0 0 0 0 0 0 1 2 0 0 0 0 0 0
## 107000 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 107300 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 107500 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 108000 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0
## 110000 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 112000 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0
## 115000 0 0 1 1 0 3 0 0 0 0 0 0 0 0 0
## 118000 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 120000 0 0 0 0 0 1 1 0 2 0 0 0 0 0 0
## 126710 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 130000 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 145800 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## 146000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 162000 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 220000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
# Pearson chi-squared test of independence between age and salary in `job`,
# with every distinct age and salary value treated as its own category.
# NOTE(review): the resulting table is very sparse, which is what triggers R's
# "Chi-squared approximation may be incorrect" warning; consider binning the
# values or using a correlation test instead.
chisq.test(job$age,job$salary)
## Warning in chisq.test(job$age, job$salary): Chi-squared approximation may
## be incorrect
##
## Pearson's Chi-squared test
##
## data: job$age and job$salary
## X-squared = 717.62, df = 574, p-value = 3.929e-05
# Welch two-sample t-test comparing the mean of age with the mean of salary.
# NOTE(review): this runs on the unfiltered mba_salary, so salary values of
# 0, 998 and 999 (excluded from `job`) are included, and it compares two
# variables measured in different units; confirm this is the intended test.
t.test(mba_salary$age,mba_salary$salary)
##
## Welch Two Sample t-test
##
## data: mba_salary$age and mba_salary$salary
## t = -12.67, df = 273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -45058.15 -32938.51
## sample estimates:
## mean of x mean of y
## 27.35766 39025.68978
# Linear regression of salary on the four GMAT measures, fitted on `job`;
# summary() prints the coefficient table and fit statistics.
# NOTE(review): gmat_tot and gmat_tpc presumably describe the same overall
# score, so the predictors may be strongly collinear; verify before
# interpreting individual coefficients.
fit <- lm(salary ~gmat_tot+gmat_qpc+gmat_vpc+gmat_tpc, data = job)
summary(fit)
##
## Call:
## lm(formula = salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc,
## data = job)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40370 -8250 -2164 5253 100097
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 109539.54 48054.24 2.279 0.0248 *
## gmat_tot 55.01 181.71 0.303 0.7627
## gmat_qpc 718.40 541.90 1.326 0.1880
## gmat_vpc 546.10 543.85 1.004 0.3178
## gmat_tpc -1663.16 801.57 -2.075 0.0406 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17670 on 98 degrees of freedom
## Multiple R-squared: 0.06089, Adjusted R-squared: 0.02256
## F-statistic: 1.589 on 4 and 98 DF, p-value: 0.1834