# Load the MBA starting-salary survey from a local CSV export.
# NOTE(review): the path is machine-specific; the script only runs where this
# file exists.
mba_salary <- read.csv(
  "C:/Users/kogentix/Downloads/Internship_docs/week4/MBA Starting Salaries Data.csv"
)

# `job` keeps only the rows whose salary is none of 0, 998 or 999 (presumably
# placeholder codes rather than real salaries -- confirm against the data
# dictionary). Rows with a missing salary are dropped too. The codes are
# compared as numbers, not as strings.
job <- mba_salary[
  !is.na(mba_salary$salary) & !(mba_salary$salary %in% c(0, 998, 999)),
]
#summary(mba_salary)
# Univariate distributions ----

# Age histogram.
hist(
  mba_salary$age,
  col = "aliceblue",
  xlab = "Age in years",
  main = "Age Distribution"
)

# Recode sex (codes 1/2) as a labelled factor in place, then plot its counts.
mba_salary$sex <- factor(
  mba_salary$sex,
  levels = c(1, 2),
  labels = c("Male", "Female")
)
plot(mba_salary$sex, main = "Gender distribution")

# GMAT total score histogram.
hist(
  mba_salary$gmat_tot,
  breaks = 20,
  xlab = "GMAT total score",
  main = "Distribution of GMAT scores"
)

# Years of work experience histogram.
hist(
  mba_salary$work_yrs,
  breaks = 20,
  xlab = "Years of work experience",
  main = "Work Experience"
)

# Recode first language (codes 1/2) as a labelled factor in place, then plot
# its counts.
mba_salary$frstlang <- factor(
  mba_salary$frstlang,
  levels = c(1, 2),
  labels = c("English", "Others")
)
plot(mba_salary$frstlang, main = "Language Distribution")

# Satisfaction histogram restricted to answers of at most 7.
# The bound is numeric on purpose: comparing against the string "7" makes R
# compare the values as text, so a value such as 10 would sort before "7" and
# slip through the filter.
data <- mba_salary[which(mba_salary$satis <= 7), ]
hist(
  data$satis,
  breaks = 5,
  xlab = "Degree of Satisfaction (1=low,7=high)",
  main = "Satisfaction  distribution"
)

# Salary histogram without the rows whose salary is 998 or 999. Rows with a
# salary of 0 are kept here, unlike in `job`. which() also drops rows whose
# salary is missing.
data1 <- mba_salary[
  which(mba_salary$salary != 998 & mba_salary$salary != 999),
]
hist(
  data1$salary,
  breaks = 5,
  xlab = "starting salary",
  main = "Salary  distribution"
)

# Horizontal box plots, one per entry of the three parallel vectors below
# (column name, plot title, x-axis label), drawn in the order listed.
# NOTE(review): the last entry plots gmat_qpc again under a "Quartile Ranking"
# title -- it looks like a different column was intended; confirm.
invisible(Map(
  function(column, title, label) {
    boxplot(
      mba_salary[[column]],
      main = title,
      xlab = label,
      horizontal = TRUE
    )
  },
  c("gmat_qpc", "gmat_vpc", "gmat_tpc", "s_avg", "f_avg", "gmat_qpc"),
  c(
    "Quant Percentile Distribution",
    "Verbal Percentile Distribution",
    "Overall Percentile Distribution",
    "Spring MBA Avg Dist.",
    "Fall MBA Avg Dist.",
    "Quartile Ranking Dist."
  ),
  c("GMAT QPC", "GMAT VPC", "GMAT TPC", "s_avg", "f_avg", "Quart")
))

library(car)    

# Scatter-plot matrix of salary and six other variables, drawn from the
# filtered `job` rows.
pairs(
  ~ salary + sex + age + gmat_tpc + frstlang + satis + work_yrs,
  data = job,
  main = "Salary vs. other variables"
)

# Work experience against total GMAT percentile (all rows of mba_salary),
# with the least-squares regression line drawn on top.
plot(
  x = mba_salary$gmat_tpc,
  y = mba_salary$work_yrs,
  main = "Total GMAT Percentile vs. Work Experience",
  xlab = "GMAT Percentile",
  ylab = "Work Experience"
)
abline(lm(work_yrs ~ gmat_tpc, data = mba_salary))

library(corrgram)
## Warning: package 'corrgram' was built under R version 3.4.3

# Correlogram of the `job` data frame: shaded cells in the lower triangle,
# pie glyphs in the upper triangle, variable names on the diagonal.
# order = TRUE asks corrgram to reorder the variables.
corrgram(
  job,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "MBA starting salary analysis"
)

# Covariance matrix of the selected `job` columns. `x` and `y` are the same
# selection, so cov(x, y) is the covariance of every column with every other.
y <- x <- job[
  ,
  c(
    "age", "gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
    "s_avg", "f_avg", "work_yrs", "salary"
  )
]
cov(x, y)
##                    age      gmat_tot      gmat_qpc      gmat_vpc
## age         10.7045498    -13.054445   -7.22796497  9.505045e-01
## gmat_tot   -13.0544451   2569.293737  452.14258519  6.386360e+02
## gmat_qpc    -7.2279650    452.142585  179.18027794  2.045850e+01
## gmat_vpc     0.9505045    638.636018   20.45849990  2.606602e+02
## gmat_tpc    -3.4602132    539.362269   97.03607462  1.393882e+02
## s_avg        0.1938587      3.299562    0.07838473  9.694594e-01
## f_avg       -0.3462517      3.027432    0.64252142  1.803303e-01
## work_yrs     8.6728536    -18.738816   -7.36245955 -1.366838e+00
## salary   29210.5193223 -82124.485056 3382.43784504 -3.964803e+04
##               gmat_tpc        s_avg         f_avg      work_yrs
## age      -3.460213e+00   0.19385875   -0.34625167     8.6728536
## gmat_tot  5.393623e+02   3.29956215    3.02743194   -18.7388159
## gmat_qpc  9.703607e+01   0.07838473    0.64252142    -7.3624595
## gmat_vpc  1.393882e+02   0.96945936    0.18033029    -1.3668380
## gmat_tpc  1.211342e+02   0.58062916    0.37850562    -4.3892062
## s_avg     5.806292e-01   0.14325138    0.08231046     0.1860480
## f_avg     3.785056e-01   0.08231046    0.23786375    -0.3176271
## work_yrs -4.389206e+00   0.18604797   -0.31762707     9.0630116
## salary   -2.596339e+04 688.02042071 -924.11288026 24458.1995050
##                 salary
## age       2.921052e+04
## gmat_tot -8.212449e+04
## gmat_qpc  3.382438e+03
## gmat_vpc -3.964803e+04
## gmat_tpc -2.596339e+04
## s_avg     6.880204e+02
## f_avg    -9.241129e+02
## work_yrs  2.445820e+04
## salary    3.192940e+08
# Re-derive `job` from mba_salary as it stands at this point (sex and frstlang
# have been recoded to factors since `job` was first built). Rows whose salary
# is 0, 998, 999 or missing are excluded.
job <- mba_salary[
  with(mba_salary, !is.na(salary) & !(salary %in% c(0, 998, 999))),
]

# Counts of respondents for every salary / age combination.
xtabs(~ salary + age, data = job)
##         age
## salary   22 23 24 25 26 27 28 29 30 31 32 33 34 39 40
##   64000   0  0  1  0  0  0  0  0  0  0  0  0  0  0  0
##   77000   0  1  0  0  0  0  0  0  0  0  0  0  0  0  0
##   78256   0  1  0  0  0  0  0  0  0  0  0  0  0  0  0
##   82000   0  0  0  0  1  0  0  0  0  0  0  0  0  0  0
##   85000   1  0  0  1  1  1  0  0  0  0  0  0  0  0  0
##   86000   0  0  0  1  1  0  0  0  0  0  0  0  0  0  0
##   88000   0  0  0  1  0  0  0  0  0  0  0  0  0  0  0
##   88500   0  0  0  0  0  1  0  0  0  0  0  0  0  0  0
##   90000   0  0  0  2  0  1  0  0  0  0  0  0  0  0  0
##   92000   0  0  0  2  0  1  0  0  0  0  0  0  0  0  0
##   93000   0  0  0  1  0  0  1  0  0  1  0  0  0  0  0
##   95000   0  0  1  5  0  0  0  1  0  0  0  0  0  0  0
##   96000   0  0  1  1  2  0  0  0  0  0  0  0  0  0  0
##   96500   0  0  1  0  0  0  0  0  0  0  0  0  0  0  0
##   97000   0  0  0  0  0  1  1  0  0  0  0  0  0  0  0
##   98000   0  1  3  2  1  1  1  1  0  0  0  0  0  0  0
##   99000   0  0  0  0  0  0  1  0  0  0  0  0  0  0  0
##   100000  0  1  4  1  1  1  0  0  0  1  0  0  0  0  0
##   100400  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0
##   101000  0  0  1  1  0  0  0  0  0  0  0  0  0  0  0
##   101100  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0
##   101600  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0
##   102500  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0
##   103000  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0
##   104000  0  0  0  0  0  0  1  0  0  1  0  0  0  0  0
##   105000  0  1  1  2  3  1  0  0  1  1  0  0  1  0  0
##   106000  0  0  0  0  0  0  0  1  2  0  0  0  0  0  0
##   107000  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0
##   107300  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0
##   107500  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0
##   108000  0  0  0  1  0  0  1  0  0  0  0  0  0  0  0
##   110000  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0
##   112000  0  0  1  0  0  0  0  1  0  0  0  0  0  1  0
##   115000  0  0  1  1  0  3  0  0  0  0  0  0  0  0  0
##   118000  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0
##   120000  0  0  0  0  0  1  1  0  2  0  0  0  0  0  0
##   126710  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0
##   130000  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0
##   145800  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0
##   146000  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1
##   162000  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0
##   220000  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1
# Pearson chi-squared test between age and salary in `job`.
# NOTE(review): the recorded run below warns that the chi-squared approximation
# may be incorrect -- the salary x age table above consists almost entirely of
# zeros and ones -- so treat the reported p-value with caution.
chisq.test(job$age,job$salary)
## Warning in chisq.test(job$age, job$salary): Chi-squared approximation may
## be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  job$age and job$salary
## X-squared = 717.62, df = 574, p-value = 3.929e-05
# Welch two-sample t-test comparing the mean of age with the mean of salary.
# NOTE(review): this runs on the unfiltered mba_salary rather than `job`, so
# the salary column presumably still contains the 0/998/999 values that `job`
# excludes; it also compares two variables measured in different units.
# Confirm this is the intended test.
t.test(mba_salary$age,mba_salary$salary)
## 
##  Welch Two Sample t-test
## 
## data:  mba_salary$age and mba_salary$salary
## t = -12.67, df = 273, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -45058.15 -32938.51
## sample estimates:
##   mean of x   mean of y 
##    27.35766 39025.68978
# Linear regression of salary on the four GMAT measures, fitted on `job`;
# the coefficient table and fit statistics are printed by summary().
fit <- lm(
  salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc,
  data = job
)
summary(fit)
## 
## Call:
## lm(formula = salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc, 
##     data = job)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -40370  -8250  -2164   5253 100097 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 109539.54   48054.24   2.279   0.0248 *
## gmat_tot        55.01     181.71   0.303   0.7627  
## gmat_qpc       718.40     541.90   1.326   0.1880  
## gmat_vpc       546.10     543.85   1.004   0.3178  
## gmat_tpc     -1663.16     801.57  -2.075   0.0406 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17670 on 98 degrees of freedom
## Multiple R-squared:  0.06089,    Adjusted R-squared:  0.02256 
## F-statistic: 1.589 on 4 and 98 DF,  p-value: 0.1834