# Load the MBA starting-salary survey and print per-column summary statistics
# (n, mean, sd, median, skew, ...) using psych::describe().
library(psych)

# The file name is already a single string, so it is passed to read.csv()
# directly; the path is resolved relative to the current working directory.
salary <- read.csv("MBA Starting Salaries Data.csv")
describe(salary)
##          vars   n     mean       sd median  trimmed     mad min    max
## age         1 274    27.36     3.71     27    26.76    2.97  22     48
## sex         2 274     1.25     0.43      1     1.19    0.00   1      2
## gmat_tot    3 274   619.45    57.54    620   618.86   59.30 450    790
## gmat_qpc    4 274    80.64    14.87     83    82.31   14.83  28     99
## gmat_vpc    5 274    78.32    16.86     81    80.33   14.83  16     99
## gmat_tpc    6 274    84.20    14.02     87    86.12   11.86   0     99
## s_avg       7 274     3.03     0.38      3     3.03    0.44   2      4
## f_avg       8 274     3.06     0.53      3     3.09    0.37   0      4
## quarter     9 274     2.48     1.11      2     2.47    1.48   1      4
## work_yrs   10 274     3.87     3.23      3     3.29    1.48   0     22
## frstlang   11 274     1.12     0.32      1     1.02    0.00   1      2
## salary     12 274 39025.69 50951.56    999 33607.86 1481.12   0 220000
## satis      13 274   172.18   371.61      6    91.50    1.48   1    998
##           range  skew kurtosis      se
## age          26  2.16     6.45    0.22
## sex           1  1.16    -0.66    0.03
## gmat_tot    340 -0.01     0.06    3.48
## gmat_qpc     71 -0.92     0.30    0.90
## gmat_vpc     83 -1.04     0.74    1.02
## gmat_tpc     99 -2.28     9.02    0.85
## s_avg         2 -0.06    -0.38    0.02
## f_avg         4 -2.08    10.85    0.03
## quarter       3  0.02    -1.35    0.07
## work_yrs     22  2.78     9.80    0.20
## frstlang      1  2.37     3.65    0.02
## salary   220000  0.70    -1.05 3078.10
## satis       997  1.77     1.13   22.45
# Horizontal box-and-whisker plot of the age column.
with(
  salary,
  boxplot(age, horizontal = TRUE, col = "Blue", main = "Age Distribution")
)

# Horizontal box-and-whisker plot of the total GMAT score.
with(
  salary,
  boxplot(gmat_tot, horizontal = TRUE, col = "RED", main = "Gmat Total Score")
)

# Side-by-side horizontal boxplots of the three GMAT percentile columns.
# Each box is labelled through `names` so the group axis identifies the
# series directly, instead of encoding a "1 = ..., 2 = ..." key in the axis
# title. With horizontal = TRUE the values run along the x-axis, so the
# measurement label goes in `xlab`.
boxplot(
  salary$gmat_qpc, salary$gmat_vpc, salary$gmat_tpc,
  names = c("Quantitative", "Verbal", "Overall"),
  main = "Sectional Percentile",
  xlab = "GMAT Percentile",
  col = "Green",
  horizontal = TRUE
)

# Side-by-side horizontal boxplots of the spring (s_avg) and fall (f_avg)
# MBA averages. Each box is labelled through `names`; with
# horizontal = TRUE the values run along the x-axis, so the measurement
# label goes in `xlab`. The plot is drawn once.
boxplot(
  salary$s_avg, salary$f_avg,
  names = c("Spring", "Fall"),
  main = "Average Marks in MBA",
  xlab = "MBA Average",
  col = "Grey",
  horizontal = TRUE
)

# Horizontal boxplot of starting salary.
# The values 0, 998 and 999 are excluded: the regression section of this
# script drops exactly these values before modelling salary, and leaving
# them in collapses the box onto the low end of the scale.
# NOTE(review): these look like "not placed / not disclosed" placeholder
# codes - confirm against the data dictionary.
boxplot(
  salary$salary[!salary$salary %in% c(0, 998, 999)],
  main = "Starting Salary",
  col = "Grey",
  horizontal = TRUE
)

# Horizontal box-and-whisker plot of years of work experience.
with(
  salary,
  boxplot(
    work_yrs,
    horizontal = TRUE,
    col = "Gold",
    main = "Years of Work Experience"
  )
)

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Pairwise scatterplot matrix (car) of GMAT total, spring/fall averages,
# work experience and salary, using every row of `salary`.
scatterplotMatrix(
  ~ gmat_tot + s_avg + f_avg + work_yrs + salary,
  data = salary
)

library(corrgram)
# Correlogram over all columns of `salary`, kept in their original column
# order (order = FALSE). Shaded cells fill the lower triangle, pie glyphs the
# upper triangle, and the diagonal shows each variable's name with its
# min/max.
corrgram(
  salary,
  main = "Corrgram of MBA Starting Salary intercorrelations",
  order = FALSE,
  upper.panel = panel.pie,
  lower.panel = panel.shade,
  text.panel = panel.txt,
  diag.panel = panel.minmax
)

# Build the subset of respondents with a usable salary figure and regress
# salary on GMAT scores, MBA averages, work experience and age.
#
# Rows whose salary is 0, 999 or 998 are removed in three steps; the
# intermediate frames `pla` and `place` are kept because later code may
# refer to them. The comparisons are numeric (the column is numeric), and
# missing salaries are dropped explicitly in the first step.
# NOTE(review): 0 / 998 / 999 appear to be placeholder codes rather than
# real salaries - confirm against the data dictionary.
pla <- salary[!is.na(salary$salary) & salary$salary != 0, ]
place <- pla[pla$salary != 999, ]
placed <- place[place$salary != 998, ]
View(placed)

# Ordinary least squares fit on the filtered rows only.
fit <- lm(
  salary ~ gmat_tot + gmat_tpc + s_avg + f_avg + work_yrs + age,
  data = placed
)
summary(fit)
## 
## Call:
## lm(formula = salary ~ gmat_tot + gmat_tpc + s_avg + f_avg + work_yrs + 
##     age, data = placed)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -35226  -7222  -1719   5276  73074 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  10199.4    38125.1   0.268   0.7896  
## gmat_tot       169.7      123.8   1.370   0.1738  
## gmat_tpc      -896.5      567.6  -1.580   0.1175  
## s_avg         1838.4     4885.9   0.376   0.7075  
## f_avg        -1470.5     3850.3  -0.382   0.7034  
## work_yrs       317.7     1105.1   0.288   0.7743  
## age           2307.6     1011.5   2.281   0.0247 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15710 on 96 degrees of freedom
## Multiple R-squared:  0.2729, Adjusted R-squared:  0.2275 
## F-statistic: 6.006 on 6 and 96 DF,  p-value: 2.306e-05

This is the best model we could fit to the available data. According to it, age is the only predictor that is statistically significant at the 5% level. Since the p-value of the F-statistic is very low, we reject the null hypothesis that all slope coefficients are jointly zero, i.e. the model as a whole is significant. The individual t-tests for the remaining predictors, however, are not significant.

# Rows whose recorded salary is exactly 0, opened in the data viewer.
# The comparison is numeric (the column is numeric); rows with a missing
# salary are excluded explicitly.
# NOTE(review): 0 presumably marks respondents who were not placed - confirm
# against the data dictionary.
notplaced <- salary[!is.na(salary$salary) & salary$salary == 0, ]
View(notplaced)