# Load the MBA starting-salaries data and take a first look at it.
# NOTE(review): setwd() to an absolute, machine-specific folder is kept so the
# session behaves exactly as before; a project-relative path would be more
# portable.
setwd("C:/Office/Week 4 Day 2")
# The file name is a plain literal and read.csv() already defaults to
# sep = ",", so neither paste() nor an explicit separator is needed.
MBAStartingSalariesData.df <- read.csv("MBA Starting Salaries Data.csv")

# Open the data in the viewer, then print per-column summary statistics.
View(MBAStartingSalariesData.df)
summary(MBAStartingSalariesData.df)
##       age             sex           gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.000   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.000   1st Qu.:580.0   1st Qu.:72.00  
##  Median :27.00   Median :1.000   Median :620.0   Median :83.00  
##  Mean   :27.36   Mean   :1.248   Mean   :619.5   Mean   :80.64  
##  3rd Qu.:29.00   3rd Qu.:1.000   3rd Qu.:660.0   3rd Qu.:93.00  
##  Max.   :48.00   Max.   :2.000   Max.   :790.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc        s_avg           f_avg      
##  Min.   :16.00   Min.   : 0.0   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:78.0   1st Qu.:2.708   1st Qu.:2.750  
##  Median :81.00   Median :87.0   Median :3.000   Median :3.000  
##  Mean   :78.32   Mean   :84.2   Mean   :3.025   Mean   :3.062  
##  3rd Qu.:91.00   3rd Qu.:94.0   3rd Qu.:3.300   3rd Qu.:3.250  
##  Max.   :99.00   Max.   :99.0   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.250   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median :   999  
##  Mean   :2.478   Mean   : 3.872   Mean   :1.117   Mean   : 39026  
##  3rd Qu.:3.000   3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 97000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :  1.0  
##  1st Qu.:  5.0  
##  Median :  6.0  
##  Mean   :172.2  
##  3rd Qu.:  7.0  
##  Max.   :998.0
# Exploratory plots and the covariance matrix of the raw data.
# Titles and axis labels are given explicitly so the figures show readable
# column names rather than deparsed `MBAStartingSalariesData.df$...` text.
# NOTE(review): salary still holds the codes 0, 998 and 999 that this script
# later uses to split the rows into groups, so these views mix coded values
# with real salaries.

# Distribution of gmat_tot within each value of sex.
boxplot(gmat_tot ~ sex, data = MBAStartingSalariesData.df,
        main = "GMAT total score by sex", xlab = "sex", ylab = "gmat_tot")

hist(MBAStartingSalariesData.df$salary,
     main = "Histogram of salary", xlab = "salary")

plot(MBAStartingSalariesData.df$age, MBAStartingSalariesData.df$salary,
     main = "Salary against age", xlab = "age", ylab = "salary")

# Variance-covariance matrix across all columns of the data frame.
var(MBAStartingSalariesData.df)
##                    age           sex      gmat_tot      gmat_qpc
## age       1.376904e+01 -4.513248e-02 -3.115879e+01 -1.192655e+01
## sex      -4.513248e-02  1.872677e-01 -1.328841e+00 -1.053769e+00
## gmat_tot -3.115879e+01 -1.328841e+00  3.310688e+03  6.200233e+02
## gmat_qpc -1.192655e+01 -1.053769e+00  6.200233e+02  2.210731e+02
## gmat_vpc -2.763643e+00  5.463758e-01  7.260006e+02  3.814826e+01
## gmat_tpc -8.839978e+00 -4.908960e-02  6.839911e+02  1.357997e+02
## s_avg     2.116874e-01  2.096227e-02  2.480257e+00 -1.691233e-01
## f_avg    -3.399348e-02  2.082698e-02  3.154688e+00  5.753854e-01
## quarter  -2.045935e-01 -6.414267e-02 -5.891153e+00  6.001979e-01
## work_yrs  1.029494e+01 -1.580172e-02 -3.391634e+01 -1.137186e+01
## frstlang  6.796610e-02  2.138980e-04 -2.499933e+00  6.646346e-01
## salary   -1.183042e+04  1.518264e+03 -1.611600e+05 -3.335823e+04
## satis    -1.763499e+02 -8.780808e+00  1.765263e+03  3.348371e+02
##               gmat_vpc     gmat_tpc         s_avg        f_avg
## age         -2.7636427   -8.8399775    0.21168739  -0.03399348
## sex          0.5463758   -0.0490896    0.02096227   0.02082698
## gmat_tot   726.0006417  683.9910698    2.48025721   3.15468838
## gmat_qpc    38.1482581  135.7996845   -0.16912329   0.57538542
## gmat_vpc   284.2481217  157.4932488    1.31357023   0.67207000
## gmat_tpc   157.4932488  196.6057057    0.62710008   0.58698618
## s_avg        1.3135702    0.6271001    0.14521760   0.11016898
## f_avg        0.6720700    0.5869862    0.11016898   0.27567237
## quarter     -3.2676666   -1.2923719   -0.32237213  -0.26080880
## work_yrs    -3.6181653   -7.8575172    0.15926392  -0.06628700
## frstlang    -2.1145691   -0.4663244   -0.01671372  -0.00626026
## salary   -5273.8523836 3522.7500067 2831.60098580 787.65597177
## satis      392.3562739  484.2466779   -4.62884495   2.12532927
##                quarter      work_yrs      frstlang        salary
## age      -2.045935e-01   10.29493864  6.796610e-02 -1.183042e+04
## sex      -6.414267e-02   -0.01580172  2.138980e-04  1.518264e+03
## gmat_tot -5.891153e+00  -33.91633914 -2.499933e+00 -1.611600e+05
## gmat_qpc  6.001979e-01  -11.37186171  6.646346e-01 -3.335823e+04
## gmat_vpc -3.267667e+00   -3.61816529 -2.114569e+00 -5.273852e+03
## gmat_tpc -1.292372e+00   -7.85751718 -4.663244e-01  3.522750e+03
## s_avg    -3.223721e-01    0.15926392 -1.671372e-02  2.831601e+03
## f_avg    -2.608088e-01   -0.06628700 -6.260260e-03  7.876560e+02
## quarter   1.232119e+00   -0.30866822  3.553381e-02 -9.296214e+03
## work_yrs -3.086682e-01   10.44882490 -2.898318e-02  1.486147e+03
## frstlang  3.553381e-02   -0.02898318  1.035266e-01 -1.419586e+03
## salary   -9.296214e+03 1486.14704152 -1.419586e+03  2.596062e+09
## satis    -5.227133e-03 -131.24080907  9.484532e+00 -6.347115e+06
##                  satis
## age      -1.763499e+02
## sex      -8.780808e+00
## gmat_tot  1.765263e+03
## gmat_qpc  3.348371e+02
## gmat_vpc  3.923563e+02
## gmat_tpc  4.842467e+02
## s_avg    -4.628845e+00
## f_avg     2.125329e+00
## quarter  -5.227133e-03
## work_yrs -1.312408e+02
## frstlang  9.484532e+00
## salary   -6.347115e+06
## satis     1.380974e+05
library(corrgram)
# Correlogram of every column: order = TRUE lets corrgram reorder the
# variables, cells below the diagonal are shaded and cells above it are drawn
# as pies. The argument list ends cleanly (no trailing empty argument).
corrgram(MBAStartingSalariesData.df,
         order = TRUE,
         lower.panel = panel.shade,
         upper.panel = panel.pie)

# Split the rows by how the salary column is coded. The group names below
# follow the variable names used in this script:
#   salary  > 999 -> placed.df
#   salary == 0   -> notplaced.df
#   salary == 999 -> undisclosedsalary.df
#   salary == 998 -> SurveyNotAnswered.df
# which() is used so rows with a missing salary are dropped instead of
# producing all-NA rows.
placed.df <- MBAStartingSalariesData.df[which(MBAStartingSalariesData.df$salary > 999), ]
notplaced.df <- MBAStartingSalariesData.df[which(MBAStartingSalariesData.df$salary == 0), ]
undisclosedsalary.df <- MBAStartingSalariesData.df[which(MBAStartingSalariesData.df$salary == 999), ]
SurveyNotAnswered.df <- MBAStartingSalariesData.df[which(MBAStartingSalariesData.df$salary == 998), ]
View(placed.df)
View(notplaced.df)
View(undisclosedsalary.df)
View(SurveyNotAnswered.df)

# Recombine the four groups row-wise. This has to run after the subsets
# exist, and it has to use rbind(): `+` on data frames is element-wise
# arithmetic and fails for data frames of different sizes.
MBAStartingSalaries.df <- rbind(placed.df, notplaced.df,
                                undisclosedsalary.df, SurveyNotAnswered.df)

# Cross-tabulate sex (rows) against salary (columns) for the rows in placed.df.
table(placed.df[["sex"]], placed.df[["salary"]])
##    
##     64000 77000 78256 82000 85000 86000 88000 88500 90000 92000 93000
##   1     0     1     0     0     1     0     0     1     3     2     2
##   2     1     0     1     1     3     2     1     0     0     1     1
##    
##     95000 96000 96500 97000 98000 99000 100000 100400 101000 101100 101600
##   1     4     3     1     2     6     0      4      1      0      1      1
##   2     3     1     0     0     4     1      5      0      2      0      0
##    
##     102500 103000 104000 105000 106000 107000 107300 107500 108000 110000
##   1      1      1      2     11      2      1      1      1      2      0
##   2      0      0      0      0      1      0      0      0      0      1
##    
##     112000 115000 118000 120000 126710 130000 145800 146000 162000 220000
##   1      3      5      1      3      1      1      1      1      1      0
##   2      0      0      0      1      0      0      0      0      0      1
# Chi-squared test of independence between sex and salary within placed.df.
# chisq.test() expects a contingency table (or two factors); passing the whole
# data frame would treat every cell of every column as a count.
# NOTE(review): most salary values occur only once or twice, so expected
# counts are very small and the "approximation may be incorrect" warning is
# likely to remain; consider binning salary or simulate.p.value = TRUE.
chisq.test(table(placed.df$sex, placed.df$salary))
# NOTE(review): the result previously recorded here (X-squared = 3620.8,
# df = 1224, p-value < 2.2e-16) came from chisq.test(placed.df) and does not
# describe this call; re-run the script to regenerate the output.
# Two-sample t-test (Welch, the t.test() default) comparing mean salary
# between the two values of sex within placed.df.
# Passing the whole data frame instead pools every value of every column into
# one sample and tests whether that pooled mean is zero.
t.test(salary ~ sex, data = placed.df)
# NOTE(review): the result previously recorded here (one-sample test,
# t = 10.492, df = 1338, mean of x = 7995.133) came from t.test(placed.df) and
# does not describe this call; re-run the script to regenerate the output.
# Linear regression of salary on the predictors listed in Model1, fitted to
# the rows in placed.df. The formula is stored first so that the fitted
# model's call refers to it by name.
Model1 <- salary ~
  work_yrs +
  s_avg + f_avg +
  gmat_qpc + gmat_vpc +
  sex + frstlang +
  satis
bestmodel <- lm(Model1, data = placed.df)
# Print coefficients, residual summary and fit statistics.
summary(bestmodel)
## 
## Call:
## lm(formula = Model1, data = placed.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -29800  -7822  -1742   4869  82341 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 86719.94   23350.43   3.714 0.000346 ***
## work_yrs     2331.12     585.99   3.978 0.000137 ***
## s_avg        4659.05    5015.66   0.929 0.355320    
## f_avg       -1698.83    3834.70  -0.443 0.658773    
## gmat_qpc       98.72     121.85   0.810 0.419884    
## gmat_vpc      -95.80     102.99  -0.930 0.354699    
## sex         -5289.24    3545.91  -1.492 0.139140    
## frstlang    13994.76    6641.66   2.107 0.037770 *  
## satis       -1671.20    2070.62  -0.807 0.421643    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15740 on 94 degrees of freedom
## Multiple R-squared:  0.285,  Adjusted R-squared:  0.2241 
## F-statistic: 4.683 on 8 and 94 DF,  p-value: 7.574e-05