# Attach psych first so describe() is available, then read the survey data.
library(psych)
## Warning: package 'psych' was built under R version 3.4.3
MBA <- read.csv(
  file = "E:/Documents/internship-R/MBA Starting Salaries Data.csv"
)
# Per-column summary statistics (n, mean, sd, median, range, skew, ...).
describe(MBA)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45

DESCRIPTIVE PLOTS

# Age distribution of the respondents.
hist(MBA$age, breaks = 20, col = "red", xlab = "Age in years",
     main = "Age  distribution")

# sex is stored as a numeric code (1/2), so plot() on the raw column would
# draw an index scatter plot. Tabulate first so each bar is the count per code.
barplot(table(MBA$sex), main = "Graph showing number of Males and Females",
        col = "green")

# Years of work experience.
hist(MBA$work_yrs, breaks = 20, col = "grey",
     xlab = "Work Experience in years", main = "Work experience distribution")

# Total GMAT score.
hist(MBA$gmat_tot, breaks = 40, col = "dark green", xlab = "score out of 800",
     main = "Gmat Score distribution")

# frstlang is also a numeric code (1/2): show the count per code as bars.
barplot(table(MBA$frstlang), main = "First Language Distribution",
        col = "orange")

# Keep only ratings on the 1-7 scale (the column also holds the value 998).
# Compare against the number 7: comparing with the string '7' makes R coerce
# satis to text and compare alphabetically, which only works by accident.
newdata <- MBA[which(MBA$satis <= 7), ]
hist(newdata$satis, breaks = 5, col = "magenta",
     xlab = "Degree of Satisfaction,1=low 7=high",
     main = "Satisfaction  distribution")

SCATTER PLOTS

library(car)
## Warning: package 'car' was built under R version 3.4.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Scatterplot matrix of salary against the other variables of interest.
pairs(
  ~ salary + sex + age + gmat_tpc + frstlang + satis + work_yrs,
  data = MBA,
  main = "Comparision of Salary and other variables"
)

# Salary against age using car::scatterplot(); car is already attached above,
# so it is not re-attached here and the plot is drawn once.
scatterplot(salary ~ age, data = MBA,
            spread = FALSE, smoother.args = list(lty = 2),
            main = "Scatter plot of salary vs age",
            xlab = "age",
            ylab = "salary")

# Drop rows whose salary is 998 or 999 (presumably non-response codes rather
# than real salaries -- confirm against the data dictionary).
# Compare with numbers, not strings: a string operand makes R coerce the whole
# salary column to text before comparing.
newdata1 <- MBA[which(MBA$salary != 998 & MBA$salary != 999), ]
scatterplot(salary ~ work_yrs, data = newdata1,
            main = "Scatter plot of salary vs Work exp.",
            xlab = "Work experience in years",
            ylab = "salary")

library(corrgram)
## Warning: package 'corrgram' was built under R version 3.4.3
# Correlogram of the filtered data: shaded cells in the lower triangle, pie
# glyphs in the upper triangle; order = TRUE lets corrgram reorder variables.
corrgram(
  newdata1,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "MBA starting salary analysis Correlogram"
)

CONTINGENCY TABLES

# Mean salary for each distinct age, computed over all rows of MBA.
# NOTE(review): MBA still contains the 998/999 salary values that are filtered
# out for the work-experience scatterplot -- confirm they belong in these means.
aggregate(salary ~ age, data = MBA, FUN = mean)
##    age    salary
## 1   22  42500.00
## 2   23  57282.00
## 3   24  49342.24
## 4   25  43395.55
## 5   26  35982.07
## 6   27  31499.37
## 7   28  39809.00
## 8   29  28067.95
## 9   30  55291.25
## 10  31  40599.40
## 11  32  13662.25
## 12  33 118000.00
## 13  34  26250.00
## 14  35      0.00
## 15  36      0.00
## 16  37      0.00
## 17  39  56000.00
## 18  40 183000.00
## 19  42      0.00
## 20  43      0.00
## 21  48      0.00
# Pearson chi-squared test on the cross-tabulation of every distinct salary
# value against every distinct age. The table has far more cells than there
# are observations (df = 880, n = 274), hence the approximation warning below.
chisq.test(table(MBA$salary,MBA$age))
## Warning in chisq.test(table(MBA$salary, MBA$age)): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(MBA$salary, MBA$age)
## X-squared = 1114.2, df = 880, p-value = 1.178e-07

SALARY AND EXPERIENCE

# Mean salary for each distinct number of years of work experience.
# The formula references the columns as MBA$..., so the result's column names
# carry the "MBA$" prefix.
aggregate(MBA$salary~MBA$work_yrs,FUN=mean)
##    MBA$work_yrs MBA$salary
## 1             0   31999.67
## 2             1   34677.08
## 3             2   45531.24
## 4             3   38494.21
## 5             4   27510.81
## 6             5   34476.10
## 7             6   62041.33
## 8             7   11221.78
## 9             8   60156.86
## 10            9     499.50
## 11           10   59000.00
## 12           11       0.00
## 13           12       0.00
## 14           13       0.00
## 15           15  183000.00
## 16           16   72333.33
## 17           18       0.00
## 18           22       0.00
# Pearson chi-squared test on the cross-tabulation of every distinct salary
# value against every distinct work_yrs value. The table is sparse
# (df = 748, n = 274), hence the approximation warning below.
chisq.test(table(MBA$salary,MBA$work_yrs))
## Warning in chisq.test(table(MBA$salary, MBA$work_yrs)): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(MBA$salary, MBA$work_yrs)
## X-squared = 846.05, df = 748, p-value = 0.007162

SALARY AND GMAT SCORES

# Mean salary for each distinct total GMAT score.
# NOTE(review): several group means below are 998-999, i.e. groups made up
# only of the 998/999 salary values -- confirm those rows should be included.
aggregate(MBA$salary~MBA$gmat_tot,FUN=mean)
##    MBA$gmat_tot MBA$salary
## 1           450    499.000
## 2           460    998.000
## 3           480      0.000
## 4           500 105833.000
## 5           510      0.000
## 6           520  78256.000
## 7           530  39800.000
## 8           540  41600.000
## 9           550  42213.625
## 10          560  36047.238
## 11          570  40610.889
## 12          580  53466.333
## 13          590  21999.333
## 14          600  48849.350
## 15          610  26944.000
## 16          620  62664.800
## 17          630  38885.636
## 18          640   9582.667
## 19          650  44562.125
## 20          660  33456.500
## 21          670  41793.471
## 22          680  51332.917
## 23          690    998.500
## 24          700  73400.000
## 25          710  40699.700
## 26          720  21499.500
## 27          730    499.500
## 28          740    748.750
## 29          750      0.000
## 30          760      0.000
## 31          790    999.000
# Pearson chi-squared test on the cross-tabulation of every distinct salary
# value against every distinct gmat_tot value. The table is sparse
# (df = 1320, n = 274), hence the approximation warning below.
chisq.test(table(MBA$salary,MBA$gmat_tot))
## Warning in chisq.test(table(MBA$salary, MBA$gmat_tot)): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(MBA$salary, MBA$gmat_tot)
## X-squared = 1267.7, df = 1320, p-value = 0.8456

REGRESSION ANALYSIS
MODEL 1

# Model 1: linear regression of salary on gmat_tot, gmat_qpc, gmat_vpc and
# gmat_tpc, fitted on all rows of MBA.
# NOTE(review): MBA still includes the 998/999 salary values that are filtered
# out for the work-experience scatterplot -- confirm they belong in the fit.
mod1 <- lm(salary ~gmat_tot+gmat_qpc+gmat_vpc+gmat_tpc, data = MBA)
# Print the coefficient table, R-squared and the overall F-test.
summary(mod1)
## 
## Call:
## lm(formula = salary ~ gmat_tot + gmat_qpc + gmat_vpc + gmat_tpc, 
##     data = MBA)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -48199 -41195 -33034  56735 182897 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 141539.0    59303.9   2.387   0.0177 *
## gmat_tot      -369.7      222.7  -1.660   0.0980 .
## gmat_qpc       465.7      615.2   0.757   0.4497  
## gmat_vpc       573.4      563.0   1.018   0.3094  
## gmat_tpc       523.2      443.0   1.181   0.2386  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50900 on 269 degrees of freedom
## Multiple R-squared:  0.01651,    Adjusted R-squared:  0.001889 
## F-statistic: 1.129 on 4 and 269 DF,  p-value: 0.343

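The overall F-test p-value (0.343) is greater than 0.05, so this model as a whole is not statistically significant and is not a good model. The multiple R-squared of 0.01651 means the GMAT predictors explain only about 1.7% of the variance in salary, and the residual standard error is 50900.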

MODEL 2

# Model 2: linear regression of salary on satis, work_yrs and frstlang,
# fitted on all rows of MBA.
# NOTE(review): satis ranges up to 998 and salary includes 998/999 in MBA --
# confirm these values are meant to enter the fit as ordinary numbers.
mod2<- lm(salary ~satis+work_yrs+frstlang, data = MBA)
# Print the coefficient table, R-squared and the overall F-test.
summary(mod2)
## 
## Call:
## lm(formula = salary ~ satis + work_yrs + frstlang, data = MBA)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -49117 -47235  -2246  49225 187005 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 59455.437  11221.844   5.298 2.43e-07 ***
## satis         -45.735      7.912  -5.780 2.05e-08 ***
## work_yrs     -458.980    907.095  -0.506    0.613    
## frstlang    -9650.834   9087.065  -1.062    0.289    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48150 on 270 degrees of freedom
## Multiple R-squared:  0.1168, Adjusted R-squared:  0.107 
## F-statistic: 11.91 on 3 and 270 DF,  p-value: 2.4e-07

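The overall F-test p-value (2.4e-07) is less than 0.05, so this model is statistically significant and is better than model 1. The multiple R-squared of 0.1168 means the predictors explain about 11.7% of the variance in salary; satis is the only individually significant predictor. The residual standard error is 48150.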

MODEL 3

# Model 3: linear regression of salary on age and sex, fitted on all rows of
# MBA. sex enters as its numeric code (1/2), not as a factor.
mod3<- lm(salary ~age+sex, data =MBA)
# Print the coefficient table, R-squared and the overall F-test.
summary(mod3)
## 
## Call:
## lm(formula = salary ~ age + sex, data = MBA)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -48601 -38030 -35529  54440 185565 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  51953.6    24832.5   2.092   0.0374 *
## age           -833.3      830.9  -1.003   0.3168  
## sex           7906.6     7124.9   1.110   0.2681  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50920 on 271 degrees of freedom
## Multiple R-squared:  0.008421,   Adjusted R-squared:  0.001103 
## F-statistic: 1.151 on 2 and 271 DF,  p-value: 0.3179

This shows that neither age nor sex is a significant predictor of MBA salary (p = 0.317 and p = 0.268). The overall p-value (0.3179) is also greater than 0.05, so this model is not a good model. The multiple R-squared of 0.008421 means the model explains less than 1% of the variance in salary, and the residual standard error is 50920.