# Read the Data

# Load the MBA starting-salaries CSV from the working directory, then open
# the raw table in the data viewer for a quick visual check.
mydata <- read.csv(file = "MBA Starting Salaries Data.csv")
View(mydata)

# Filtering the bad data out and Summarizing

# Keep only the rows whose salary and satisfaction values are real answers.
# 998 and 999 are treated as "bad data" codes in both columns
# (NOTE(review): confirm their exact meaning against the data dictionary).
# `%in%` is used instead of `!=` because `NA != 998` is NA, and an NA in a row
# index makes `[` return a phantom row filled with NA; `%in%` never returns NA.
filterdata <- mydata[
  !(mydata$salary %in% c(998, 999)) & !(mydata$satis %in% c(998, 999)),
]
# Per-column five-number summary (plus mean) of the retained rows.
summary(filterdata)
##       age             sex          gmat_tot        gmat_qpc    
##  Min.   :22.00   Min.   :1.00   Min.   :450.0   Min.   :28.00  
##  1st Qu.:25.00   1st Qu.:1.00   1st Qu.:570.0   1st Qu.:72.00  
##  Median :27.00   Median :1.00   Median :610.0   Median :82.00  
##  Mean   :27.59   Mean   :1.28   Mean   :615.2   Mean   :79.35  
##  3rd Qu.:29.00   3rd Qu.:2.00   3rd Qu.:650.0   3rd Qu.:91.00  
##  Max.   :48.00   Max.   :2.00   Max.   :760.0   Max.   :99.00  
##     gmat_vpc        gmat_tpc         s_avg           f_avg      
##  Min.   :22.00   Min.   : 0.00   Min.   :2.000   Min.   :0.000  
##  1st Qu.:71.00   1st Qu.:75.00   1st Qu.:2.800   1st Qu.:2.750  
##  Median :81.00   Median :87.00   Median :3.090   Median :3.000  
##  Mean   :78.13   Mean   :83.48   Mean   :3.064   Mean   :3.078  
##  3rd Qu.:91.00   3rd Qu.:93.00   3rd Qu.:3.300   3rd Qu.:3.330  
##  Max.   :99.00   Max.   :99.00   Max.   :4.000   Max.   :4.000  
##     quarter         work_yrs         frstlang         salary      
##  Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :     0  
##  1st Qu.:1.000   1st Qu.: 2.000   1st Qu.:1.000   1st Qu.:     0  
##  Median :2.000   Median : 3.000   Median :1.000   Median : 85000  
##  Mean   :2.394   Mean   : 4.104   Mean   :1.078   Mean   : 54985  
##  3rd Qu.:3.000   3rd Qu.: 5.000   3rd Qu.:1.000   3rd Qu.:100000  
##  Max.   :4.000   Max.   :22.000   Max.   :2.000   Max.   :220000  
##      satis      
##  Min.   :3.000  
##  1st Qu.:5.000  
##  Median :6.000  
##  Mean   :5.762  
##  3rd Qu.:6.000  
##  Max.   :7.000

Now we need to analyze the filtered data for four questions: 1. Do male students receive higher salaries than female students? 2. Does the GMAT score affect salaries? 3. Does having English as the first language have any impact on the salary? 4. Are MBA students satisfied with their course?

# Visualization of Data

Box plot of the salary received by male and female students

# Horizontal box plots of salary, one per sex code. boxplot() draws the groups
# in level order, so position 1 is sex == 1 and position 2 is sex == 2. The
# write-up in this report identifies code 1 as male and code 2 as female, so
# the tick labels must be supplied in that order (they were previously swapped).
boxplot(salary ~ sex, data = filterdata, horizontal = TRUE, yaxt = "n",
        ylab = "Sex", xlab = "salary",
        main = "Comparison of Salaries of Males and Females")
# yaxt = "n" suppressed the default numeric ticks; draw named ticks instead.
axis(side = 2, at = c(1, 2), labels = c("Males", "Females"))

1 is for male students while 2 is for female students. As the chart depicts, the box plot for female students lies above the box plot for male students, indicating that female students receive higher salaries than male students.

Bar plot of the GMAT score

# Count how many retained students have each total GMAT score and draw the
# counts as a bar chart.
counts <- table(filterdata[["gmat_tot"]])
barplot(height = counts, xlab = "gmattot", main = "gmattot")

Boxplot for salary of those having English as first language and those who don’t have English as First language

# Horizontal box plots of salary, one per first-language code. Position 1 is
# frstlang == 1 and position 2 is frstlang == 2. The write-up in this report
# identifies code 1 as "English is the first language" and code 2 as "not
# English", so the tick labels are given in that order (they were previously
# swapped).
boxplot(salary ~ frstlang, data = filterdata, horizontal = TRUE, yaxt = "n",
        ylab = "FrstLang", xlab = "Salary",
        main = "Comparison of Salaries of Students with English and Students without English")
# yaxt = "n" suppressed the default numeric ticks; draw named ticks instead.
axis(side = 2, at = c(1, 2), labels = c("English", "No English"))

1 is for those students who have English as their first language and 2 is for those students who do not. The box plot is higher for students who have English as their first language, but the difference is small, so we cannot say with confidence that having English as the first language leads to a higher salary.

Bar plot of the students' satisfaction

# Count how many retained students gave each satisfaction score and draw the
# counts as a bar chart.
counts <- table(filterdata[["satis"]])
barplot(height = counts, xlab = "satis", main = "satisfaction")

Most of the scores are within the higher score range so statistically the students seem to have appreciated their MBA course.

# Measuring Association

ScatterPlots for Salary And Other variables

library(car)
## Warning: package 'car' was built under R version 3.4.3
# Pairwise scatter plots of salary against the numeric background variables.
# spread = FALSE and the dashed (lty = 2) smoother are passed through to car
# exactly as before.
scatterplotMatrix(
  filterdata[, c("salary", "age", "gmat_tot", "work_yrs", "f_avg", "s_avg",
                 "gmat_qpc", "gmat_vpc", "gmat_tpc")],
  spread = FALSE,
  smoother.args = list(lty = 2),
  main = "Scatter Plot Matrix"
)

In the chart we can observe that Age and Salary, Rank and Salary, Yrs of Working Experience and Salary are well correlated but Fall MBA Average and Salary,Spring MBA Average and Salary, Quarter and Salary, Overall Percentile Score vs. Salary, Verbal score GMAT and Salary, Quantitative score GMAT and Salary, Total Score GMAT and Salary are weakly correlated.

# Variance Covariance Matrix

# x: the numeric explanatory columns; y: the salary column as a plain vector.
x <- filterdata[, c("gmat_tot", "gmat_qpc", "gmat_vpc", "gmat_tpc",
                    "s_avg", "f_avg", "work_yrs", "satis")]
y <- filterdata[["salary"]]
# Covariance of every column of x with salary (one row per column of x).
cov(x, y)
##                 [,1]
## gmat_tot   -170.8814
## gmat_qpc  22855.7178
## gmat_vpc   2901.3078
## gmat_tpc  43822.5292
## s_avg      1940.5276
## f_avg       244.3157
## work_yrs -10442.6267
## satis      6436.2945

# Corrgram Representation

library(corrgram)
## Warning: package 'corrgram' was built under R version 3.4.3
# Corrgram of all columns of filterdata in their original order: shaded cells
# below the diagonal, pie glyphs above it, and min/max values on the diagonal.
# The title previously referred to "store.df", a data set that is not used in
# this report; it now names the data actually plotted.
corrgram(filterdata, order = FALSE,
         lower.panel = panel.shade, upper.panel = panel.pie,
         diag.panel = panel.minmax, text.panel = panel.txt,
         main = "Corrgram of MBA starting salaries intercorrelations")

In the chart we can observe that Age and Salary, Rank and Salary, Yrs of Working Experience and Salary are well correlated but Fall MBA Average and Salary,Spring MBA Average and Salary, Quarter and Salary, Overall Percentile Score vs. Salary, Verbal score GMAT and Salary, Quantitative score GMAT and Salary, Total Score GMAT and Salary are weakly correlated.

# Contingency Tables

Between Salary & Sex

# Print numbers with 2 significant digits from here on (session-wide option).
options(digits = 2)
# Counts of students for every (sex code, salary value) combination.
table(filterdata[["sex"]], filterdata[["salary"]])
##    
##      0 64000 77000 78256 82000 85000 86000 88000 88500 90000 92000 93000
##   1 67     0     1     0     0     1     0     0     1     3     2     2
##   2 23     1     0     1     1     3     2     1     0     0     1     1
##    
##     95000 96000 96500 97000 98000 99000 100000 100400 101000 101100 101600
##   1     4     3     1     2     6     0      4      1      0      1      1
##   2     3     1     0     0     4     1      5      0      2      0      0
##    
##     102500 103000 104000 105000 106000 107000 107300 107500 108000 110000
##   1      1      1      2     11      2      1      1      1      2      0
##   2      0      0      0      0      1      0      0      0      0      1
##    
##     112000 115000 118000 120000 126710 130000 145800 146000 162000 220000
##   1      3      5      1      3      1      1      1      1      1      0
##   2      0      0      0      1      0      0      0      0      0      1

In the table, code 1 denotes male students and code 2 denotes female students.

Between Salary & Gmat Score

# Counts of students for every (salary value, total GMAT score) combination.
table(filterdata[["salary"]], filterdata[["gmat_tot"]])
##         
##          450 480 500 510 520 530 540 550 560 570 580 590 600 610 620 630
##   0        1   1   0   2   0   3   3   4   8   7   4   3   3   9   4   5
##   64000    0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0
##   77000    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   78256    0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0
##   82000    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   85000    0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
##   86000    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
##   88000    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   88500    0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
##   90000    0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   1
##   92000    0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
##   93000    0   0   0   0   0   0   1   0   0   0   0   0   0   1   1   0
##   95000    0   0   0   0   0   1   0   0   2   0   0   0   0   2   0   0
##   96000    0   0   0   0   0   0   0   0   1   0   0   1   1   0   0   0
##   96500    0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0
##   97000    0   0   0   0   0   0   0   0   0   0   1   0   0   0   1   0
##   98000    0   0   0   0   0   0   0   0   1   3   1   1   0   1   0   0
##   99000    0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0
##   100000   0   0   0   0   0   0   0   0   2   0   1   0   1   1   0   1
##   100400   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
##   101000   0   0   0   0   0   0   0   0   0   0   0   0   1   0   1   0
##   101100   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   101600   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
##   102500   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   103000   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
##   104000   0   0   0   0   0   1   0   0   1   0   0   0   0   0   0   0
##   105000   0   0   0   0   0   0   0   2   0   2   3   0   1   0   1   0
##   106000   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
##   107000   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
##   107300   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   107500   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
##   108000   0   0   0   0   0   0   0   0   0   1   0   0   1   0   0   0
##   110000   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   112000   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0
##   115000   0   0   0   0   0   0   1   0   0   1   0   0   0   0   1   1
##   118000   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
##   120000   0   0   0   0   0   0   0   0   0   0   0   0   2   0   0   0
##   126710   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0
##   130000   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   145800   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
##   146000   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
##   162000   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   220000   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0
##         
##          640 650 660 670 680 700 710 720 730 740 750 760
##   0        6   5   3   4   3   2   4   2   1   1   1   1
##   64000    0   0   0   0   0   0   0   0   0   0   0   0
##   77000    0   0   1   0   0   0   0   0   0   0   0   0
##   78256    0   0   0   0   0   0   0   0   0   0   0   0
##   82000    0   0   0   1   0   0   0   0   0   0   0   0
##   85000    0   0   1   0   0   1   0   1   0   0   0   0
##   86000    0   0   0   0   1   0   0   0   0   0   0   0
##   88000    0   1   0   0   0   0   0   0   0   0   0   0
##   88500    0   0   0   0   0   0   0   0   0   0   0   0
##   90000    0   1   0   0   0   0   0   0   0   0   0   0
##   92000    0   0   1   0   0   0   1   0   0   0   0   0
##   93000    0   0   0   0   0   0   0   0   0   0   0   0
##   95000    0   0   0   2   0   0   0   0   0   0   0   0
##   96000    0   1   0   0   0   0   0   0   0   0   0   0
##   96500    0   0   0   0   0   0   0   0   0   0   0   0
##   97000    0   0   0   0   0   0   0   0   0   0   0   0
##   98000    0   0   0   1   1   0   1   0   0   0   0   0
##   99000    0   0   0   0   0   0   0   0   0   0   0   0
##   100000   0   2   0   0   0   0   1   0   0   0   0   0
##   100400   0   0   0   0   0   0   0   0   0   0   0   0
##   101000   0   0   0   0   0   0   0   0   0   0   0   0
##   101100   0   0   1   0   0   0   0   0   0   0   0   0
##   101600   0   0   0   0   0   0   0   0   0   0   0   0
##   102500   0   0   0   1   0   0   0   0   0   0   0   0
##   103000   0   0   0   0   0   0   0   0   0   0   0   0
##   104000   0   0   0   0   0   0   0   0   0   0   0   0
##   105000   0   1   0   0   1   0   0   0   0   0   0   0
##   106000   0   0   0   0   2   0   0   0   0   0   0   0
##   107000   0   0   0   0   0   0   0   0   0   0   0   0
##   107300   0   0   1   0   0   0   0   0   0   0   0   0
##   107500   0   0   0   0   0   0   0   0   0   0   0   0
##   108000   0   0   0   0   0   0   0   0   0   0   0   0
##   110000   1   0   0   0   0   0   0   0   0   0   0   0
##   112000   0   0   0   1   1   0   0   0   0   0   0   0
##   115000   0   0   0   0   0   0   1   0   0   0   0   0
##   118000   0   0   0   0   0   0   0   0   0   0   0   0
##   120000   0   0   0   1   0   1   0   0   0   0   0   0
##   126710   0   0   0   0   0   0   0   0   0   0   0   0
##   130000   0   1   0   0   0   0   0   0   0   0   0   0
##   145800   0   0   0   0   0   0   0   0   0   0   0   0
##   146000   0   0   0   0   0   0   0   0   0   0   0   0
##   162000   0   0   0   0   0   1   0   0   0   0   0   0
##   220000   0   0   0   0   0   0   0   0   0   0   0   0

Between Salary & First Language English

# Counts of students for every (salary value, first-language code) combination.
table(filterdata[["salary"]], filterdata[["frstlang"]])
##         
##           1  2
##   0      82  8
##   64000   1  0
##   77000   1  0
##   78256   1  0
##   82000   1  0
##   85000   4  0
##   86000   2  0
##   88000   1  0
##   88500   1  0
##   90000   3  0
##   92000   3  0
##   93000   3  0
##   95000   7  0
##   96000   4  0
##   96500   1  0
##   97000   2  0
##   98000   8  2
##   99000   0  1
##   100000  9  0
##   100400  1  0
##   101000  2  0
##   101100  1  0
##   101600  1  0
##   102500  1  0
##   103000  1  0
##   104000  1  1
##   105000 11  0
##   106000  3  0
##   107000  1  0
##   107300  0  1
##   107500  1  0
##   108000  2  0
##   110000  1  0
##   112000  3  0
##   115000  5  0
##   118000  0  1
##   120000  4  0
##   126710  1  0
##   130000  1  0
##   145800  1  0
##   146000  1  0
##   162000  1  0
##   220000  0  1

In the table, code 1 is for students who have English as their first language and code 2 is for students who do not.

Between Salary and Working Years

# Counts of students for every (salary value, years of work) combination.
table(filterdata[["salary"]], filterdata[["work_yrs"]])
##         
##           0  1  2  3  4  5  6  7  8  9 10 11 12 13 15 16 18 22
##   0       1 12 22 14  9 12  2  5  2  1  1  2  2  1  0  1  1  2
##   64000   0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   77000   0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   78256   0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   82000   0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   85000   0  1  2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   86000   0  0  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   88000   0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   88500   0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   90000   0  0  2  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0
##   92000   0  0  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   93000   0  0  0  0  1  1  0  0  1  0  0  0  0  0  0  0  0  0
##   95000   1  1  2  2  0  1  0  0  0  0  0  0  0  0  0  0  0  0
##   96000   0  1  2  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0
##   96500   0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   97000   0  0  0  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0
##   98000   0  0  7  1  1  0  0  1  0  0  0  0  0  0  0  0  0  0
##   99000   0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0
##   100000  0  0  6  1  1  0  1  0  0  0  0  0  0  0  0  0  0  0
##   100400  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   101000  0  0  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   101100  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0
##   101600  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   102500  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0
##   103000  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   104000  0  0  0  0  2  0  0  0  0  0  0  0  0  0  0  0  0  0
##   105000  0  0  4  4  0  1  1  0  0  0  0  0  0  0  0  1  0  0
##   106000  0  0  0  0  0  0  2  0  1  0  0  0  0  0  0  0  0  0
##   107000  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   107300  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   107500  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   108000  0  0  0  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0
##   110000  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0
##   112000  0  0  1  0  0  0  1  0  0  0  0  0  0  0  0  1  0  0
##   115000  0  2  0  1  2  0  0  0  0  0  0  0  0  0  0  0  0  0
##   118000  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0
##   120000  0  0  0  1  0  2  0  0  1  0  0  0  0  0  0  0  0  0
##   126710  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   130000  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0
##   145800  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   146000  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0
##   162000  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
##   220000  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0

# Chi Square Test

To evaluate whether male students receive a higher salary than female students.

# Two-way count table: rows are sex codes, columns are distinct salary values.
salbysex <- xtabs(formula = ~ sex + salary, data = filterdata)
print(salbysex)
##    salary
## sex  0 64000 77000 78256 82000 85000 86000 88000 88500 90000 92000 93000
##   1 67     0     1     0     0     1     0     0     1     3     2     2
##   2 23     1     0     1     1     3     2     1     0     0     1     1
##    salary
## sex 95000 96000 96500 97000 98000 99000 100000 100400 101000 101100 101600
##   1     4     3     1     2     6     0      4      1      0      1      1
##   2     3     1     0     0     4     1      5      0      2      0      0
##    salary
## sex 102500 103000 104000 105000 106000 107000 107300 107500 108000 110000
##   1      1      1      2     11      2      1      1      1      2      0
##   2      0      0      0      0      1      0      0      0      0      1
##    salary
## sex 112000 115000 118000 120000 126710 130000 145800 146000 162000 220000
##   1      3      5      1      3      1      1      1      1      1      0
##   2      0      0      0      1      0      0      0      0      0      1
# Pearson chi-squared test of independence on the sex-by-salary count table.
# R warns here that the chi-squared approximation may be incorrect; the table
# has many salary columns containing only one or two students.
chisq.test(salbysex)
## Warning in chisq.test(salbysex): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  salbysex
## X-squared = 60, df = 40, p-value = 0.08

The p-value (0.08) is greater than 0.05, so we cannot reject the hypothesis that salary is independent of sex; the test gives no evidence that male students receive a higher salary than female students.

To evaluate whether students who have English as their first language receive a higher salary.

# Two-way count table: rows are first-language codes, columns are distinct
# salary values.
salbylang <- xtabs(formula = ~ frstlang + salary, data = filterdata)
print(salbylang)
##         salary
## frstlang  0 64000 77000 78256 82000 85000 86000 88000 88500 90000 92000
##        1 82     1     1     1     1     4     2     1     1     3     3
##        2  8     0     0     0     0     0     0     0     0     0     0
##         salary
## frstlang 93000 95000 96000 96500 97000 98000 99000 100000 100400 101000
##        1     3     7     4     1     2     8     0      9      1      2
##        2     0     0     0     0     0     2     1      0      0      0
##         salary
## frstlang 101100 101600 102500 103000 104000 105000 106000 107000 107300
##        1      1      1      1      1      1     11      3      1      0
##        2      0      0      0      0      1      0      0      0      1
##         salary
## frstlang 107500 108000 110000 112000 115000 118000 120000 126710 130000
##        1      1      2      1      3      5      0      4      1      1
##        2      0      0      0      0      0      1      0      0      0
##         salary
## frstlang 145800 146000 162000 220000
##        1      1      1      1      0
##        2      0      0      0      1
# Pearson chi-squared test of independence on the first-language-by-salary
# count table. R warns here that the chi-squared approximation may be
# incorrect; most salary columns contain very few students.
chisq.test(salbylang)
## Warning in chisq.test(salbylang): Chi-squared approximation may be
## incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  salbylang
## X-squared = 60, df = 40, p-value = 0.02

p-value < 0.05 so we can conclude that salary depends upon English as first language.

# t Test

# Welch two-sample t-test of salary BETWEEN the two sex groups.
# The earlier call, t.test(salary, sex), treated the sex codes themselves as a
# second sample and compared mean salary with the mean of the codes (about
# 1.3), which says nothing about a salary difference between the groups.
# The formula interface splits salary by sex instead. Passing `data =` also
# removes the need for attach(), which leaves a stale copy of the columns on
# the search path once filterdata is modified later in the script.
# NOTE(review): the console output recorded below this call was produced by
# the earlier two-vector form and must be regenerated.
t.test(salary ~ sex, data = filterdata)
## 
##  Welch Two Sample t-test
## 
## data:  salary and sex
## t = 10, df = 200, p-value <2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  47438 62530
## sample estimates:
## mean of x mean of y 
##   54985.3       1.3

p-value < 0.05, so we can conclude that female students receive a higher salary than male students.

# Regression Models

Converting sex and frstlang to factors

# Treat the two coded columns as categorical so the models below estimate a
# per-level effect instead of a numeric slope.
filterdata[c("frstlang", "sex")] <- lapply(
  filterdata[c("frstlang", "sex")],
  as.factor
)
# Show the resulting column types.
str(filterdata)
## 'data.frame':    193 obs. of  13 variables:
##  $ age     : int  23 24 24 24 24 25 25 27 27 28 ...
##  $ sex     : Factor w/ 2 levels "1","2": 2 1 1 1 1 1 2 1 1 2 ...
##  $ gmat_tot: int  620 610 670 570 640 610 650 740 750 540 ...
##  $ gmat_qpc: int  77 90 99 56 82 89 88 99 99 75 ...
##  $ gmat_vpc: int  87 71 78 81 89 74 89 96 98 50 ...
##  $ gmat_tpc: int  87 87 95 75 91 87 92 99 99 65 ...
##  $ s_avg   : num  3.4 3.5 3.3 3.3 3.9 3.4 3.3 3.5 3.4 3.6 ...
##  $ f_avg   : num  3 4 3.25 2.67 3.75 3.5 3.75 3.5 3.5 4 ...
##  $ quarter : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ work_yrs: int  2 2 2 1 2 2 2 3 1 5 ...
##  $ frstlang: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 2 1 ...
##  $ salary  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ satis   : int  7 6 6 7 6 5 6 6 5 5 ...

$$Salary = \beta_0 + \beta_1 \cdot sex$$
$$Salary = \beta_0 + \beta_1 \cdot gmat\_tot$$
$$Salary = \beta_0 + \beta_1 \cdot frstlang$$

Running the regressions

# Simple linear regression of salary on sex (factor), with its fit summary.
m1 <- lm(formula = salary ~ sex, data = filterdata)
summary(object = m1)
## 
## Call:
## lm(formula = salary ~ sex, data = filterdata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -56560 -54373  28440  45627 163440 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    54374       4519   12.03   <2e-16 ***
## sex2            2187       8544    0.26      0.8    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 53300 on 191 degrees of freedom
## Multiple R-squared:  0.000343,   Adjusted R-squared:  -0.00489 
## F-statistic: 0.0655 on 1 and 191 DF,  p-value: 0.798
# Simple linear regression of salary on total GMAT score, with its fit summary.
m2 <- lm(formula = salary ~ gmat_tot, data = filterdata)
summary(object = m2)
## 
## Call:
## lm(formula = salary ~ gmat_tot, data = filterdata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -54994 -54985  30019  45020 165009 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept)  5.50e+04   4.20e+04    1.31     0.19
## gmat_tot    -5.35e-02   6.80e+01    0.00     1.00
## 
## Residual standard error: 53300 on 191 degrees of freedom
## Multiple R-squared:  3.23e-09,   Adjusted R-squared:  -0.00524 
## F-statistic: 6.18e-07 on 1 and 191 DF,  p-value: 0.999
# Simple linear regression of salary on first language (factor), with its fit
# summary.
m3 <- lm(formula = salary ~ frstlang, data = filterdata)
summary(object = m3)
## 
## Call:
## lm(formula = salary ~ frstlang, data = filterdata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -56287 -54876  30124  45124 163713 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    54876       3994    13.7   <2e-16 ***
## frstlang2       1411      14327     0.1     0.92    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 53300 on 191 degrees of freedom
## Multiple R-squared:  5.08e-05,   Adjusted R-squared:  -0.00518 
## F-statistic: 0.0097 on 1 and 191 DF,  p-value: 0.922

Model 1: salary ~ sex Model 2: salary ~ gmat_tot + frstlang

# Full model with all three predictors; `fit1` is the same fitted model under
# the name used in the comparisons.
fit <- lm(formula = salary ~ sex + gmat_tot + frstlang, data = filterdata)
fit1 <- fit
# Reduced model keeping only sex.
fit2 <- lm(formula = salary ~ sex, data = filterdata)
# F-test comparing the full model (listed first) with the sex-only model.
anova(fit1, fit2)
## Analysis of Variance Table
## 
## Model 1: salary ~ sex + gmat_tot + frstlang
## Model 2: salary ~ sex
##   Res.Df      RSS Df Sum of Sq    F Pr(>F)
## 1    189 5.42e+11                         
## 2    191 5.42e+11 -2 -29923413 0.01   0.99
# Full model again, and a reduced model that drops sex.
fit1 <- lm(formula = salary ~ sex + gmat_tot + frstlang, data = filterdata)
fit3 <- lm(formula = salary ~ gmat_tot + frstlang, data = filterdata)
# F-test comparing the full model (listed first) with the model without sex.
anova(fit1, fit3)
## Analysis of Variance Table
## 
## Model 1: salary ~ sex + gmat_tot + frstlang
## Model 2: salary ~ gmat_tot + frstlang
##   Res.Df      RSS Df Sum of Sq    F Pr(>F)
## 1    189 5.42e+11                         
## 2    190 5.42e+11 -1 -1.88e+08 0.07    0.8
# Sequential ANOVA table for the full model: one row per term, in the order
# the terms appear in the formula.
anova(object = fit)
## Analysis of Variance Table
## 
## Response: salary
##            Df   Sum Sq  Mean Sq F value Pr(>F)
## sex         1 1.86e+08 1.86e+08    0.06   0.80
## gmat_tot    1 3.05e+05 3.05e+05    0.00   0.99
## frstlang    1 2.96e+07 2.96e+07    0.01   0.92
## Residuals 189 5.42e+11 2.87e+09

Model 1 best fits the model as it has the lowest p-value

# Contingency Table for placed and not placed

# Cross-tabulate "salary is non-zero" against "salary is positive" to count
# students without (FALSE/FALSE) and with (TRUE/TRUE) a reported salary.
# Salary is compared with the number 0: comparing against the string "0"
# makes R convert salary to text and compare as strings, which is only
# accidentally right. The resulting counts are unchanged.
table(filterdata$salary != 0, filterdata$salary > 0)
##        
##         FALSE TRUE
##   FALSE    90    0
##   TRUE      0  103

90 students didn’t get jobs while 103 did

# ChiSquare Test

# Same non-zero / positive salary cross-tabulation as above, stored for testing.
# Salary is compared with the number 0 rather than the string "0", which would
# force a text comparison; the counts are unchanged.
datajob <- table(filterdata$salary != 0, filterdata$salary > 0)
# Pearson chi-squared test on the 2x2 table.
# NOTE(review): both conditions describe the same split, so the table is
# diagonal and the test is expected to be trivially significant.
chisq.test(datajob)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  datajob
## X-squared = 200, df = 1, p-value <2e-16

# CHALLENGE PART

# Logistic regression with sex (two-level factor) as the response and salary
# as the only predictor, followed by its fit summary.
logitMod <- glm(
  formula = sex ~ salary,
  family = binomial(link = "logit"),
  data = filterdata
)
summary(object = logitMod)
## 
## Call:
## glm(formula = sex ~ salary, family = binomial(link = "logit"), 
##     data = filterdata)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.839  -0.822  -0.795   1.580   1.615  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.89e-01   2.33e-01   -4.24  2.2e-05 ***
## salary       7.78e-07   3.02e-06    0.26      0.8    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 228.80  on 192  degrees of freedom
## Residual deviance: 228.74  on 191  degrees of freedom
## AIC: 232.7
## 
## Number of Fisher Scoring iterations: 4
# Logistic regression with first language (two-level factor) as the response
# and salary as the only predictor, followed by its fit summary.
logitMod1 <- glm(
  formula = frstlang ~ salary,
  family = binomial(link = "logit"),
  data = filterdata
)
summary(object = logitMod1)
## 
## Call:
## glm(formula = frstlang ~ salary, family = binomial(link = "logit"), 
##     data = filterdata)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.413  -0.407  -0.397  -0.397   2.272  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.50e+00   3.92e-01   -6.39  1.7e-10 ***
## salary       5.02e-07   5.07e-06    0.10     0.92    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 105.44  on 192  degrees of freedom
## Residual deviance: 105.43  on 191  degrees of freedom
## AIC: 109.4
## 
## Number of Fisher Scoring iterations: 5