This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Load the raw MBA starting-salary survey data from the working directory.
dataset <- read.csv("MBA Starting Salaries Data.csv")
# View() opens a data viewer; only do that in an interactive session so that
# non-interactive runs (e.g. knitting) do not depend on a GUI being available.
if (interactive()) {
  View(dataset)
}
# NOTE(review): attach() is kept only for backward compatibility. Nothing in
# the visible code relies on bare column names -- confirm and consider
# removing it.
attach(dataset)
# Per-column minimum, quartiles, median, mean and maximum.
summary(dataset)
## age sex gmat_tot gmat_qpc
## Min. :22.00 Min. :1.000 Min. :450.0 Min. :28.00
## 1st Qu.:25.00 1st Qu.:1.000 1st Qu.:580.0 1st Qu.:72.00
## Median :27.00 Median :1.000 Median :620.0 Median :83.00
## Mean :27.36 Mean :1.248 Mean :619.5 Mean :80.64
## 3rd Qu.:29.00 3rd Qu.:1.000 3rd Qu.:660.0 3rd Qu.:93.00
## Max. :48.00 Max. :2.000 Max. :790.0 Max. :99.00
## gmat_vpc gmat_tpc s_avg f_avg
## Min. :16.00 Min. : 0.0 Min. :2.000 Min. :0.000
## 1st Qu.:71.00 1st Qu.:78.0 1st Qu.:2.708 1st Qu.:2.750
## Median :81.00 Median :87.0 Median :3.000 Median :3.000
## Mean :78.32 Mean :84.2 Mean :3.025 Mean :3.062
## 3rd Qu.:91.00 3rd Qu.:94.0 3rd Qu.:3.300 3rd Qu.:3.250
## Max. :99.00 Max. :99.0 Max. :4.000 Max. :4.000
## quarter work_yrs frstlang salary
## Min. :1.000 Min. : 0.000 Min. :1.000 Min. : 0
## 1st Qu.:1.250 1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 0
## Median :2.000 Median : 3.000 Median :1.000 Median : 999
## Mean :2.478 Mean : 3.872 Mean :1.117 Mean : 39026
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:1.000 3rd Qu.: 97000
## Max. :4.000 Max. :22.000 Max. :2.000 Max. :220000
## satis
## Min. : 1.0
## 1st Qu.: 5.0
## Median : 6.0
## Mean :172.2
## 3rd Qu.: 7.0
## Max. :998.0
# Extended descriptive statistics for every column (n, mean, sd, median,
# trimmed mean, mad, min, max, range, skew, kurtosis, se) via psych.
library(psych)
describe(x = dataset)
## vars n mean sd median trimmed mad min max
## age 1 274 27.36 3.71 27 26.76 2.97 22 48
## sex 2 274 1.25 0.43 1 1.19 0.00 1 2
## gmat_tot 3 274 619.45 57.54 620 618.86 59.30 450 790
## gmat_qpc 4 274 80.64 14.87 83 82.31 14.83 28 99
## gmat_vpc 5 274 78.32 16.86 81 80.33 14.83 16 99
## gmat_tpc 6 274 84.20 14.02 87 86.12 11.86 0 99
## s_avg 7 274 3.03 0.38 3 3.03 0.44 2 4
## f_avg 8 274 3.06 0.53 3 3.09 0.37 0 4
## quarter 9 274 2.48 1.11 2 2.47 1.48 1 4
## work_yrs 10 274 3.87 3.23 3 3.29 1.48 0 22
## frstlang 11 274 1.12 0.32 1 1.02 0.00 1 2
## salary 12 274 39025.69 50951.56 999 33607.86 1481.12 0 220000
## satis 13 274 172.18 371.61 6 91.50 1.48 1 998
## range skew kurtosis se
## age 26 2.16 6.45 0.22
## sex 1 1.16 -0.66 0.03
## gmat_tot 340 -0.01 0.06 3.48
## gmat_qpc 71 -0.92 0.30 0.90
## gmat_vpc 83 -1.04 0.74 1.02
## gmat_tpc 99 -2.28 9.02 0.85
## s_avg 2 -0.06 -0.38 0.02
## f_avg 4 -2.08 10.85 0.03
## quarter 3 0.02 -1.35 0.07
## work_yrs 22 2.78 9.80 0.20
## frstlang 1 2.37 3.65 0.02
## salary 220000 0.70 -1.05 3078.10
## satis 997 1.77 1.13 22.45
# Keep only rows with a usable salary figure: drop salary == 0 (treated as
# "not placed" later in this file) and the 998 / 999 placeholder codes.
# Rows with a missing salary are dropped as well, as before.
has_salary <- !is.na(dataset$salary) & !(dataset$salary %in% c(0, 998, 999))
dataset2 <- dataset[has_salary, ]
# Only open the data viewer when someone is there to look at it.
if (interactive()) {
  View(dataset2)
}

# Split into training (80%) and test (20%) sets; the seed makes the split
# reproducible.
library(caTools)
set.seed(123)
split <- sample.split(dataset2$salary, SplitRatio = 0.8)
training_set <- dataset2[split, ]
test_set <- dataset2[!split, ]
if (interactive()) {
  View(training_set)
  View(test_set)
}

# Pairwise correlations between all columns of the filtered data.
cor(dataset2)
## age sex gmat_tot gmat_qpc gmat_vpc
## age 1.00000000 -0.14352927 -0.07871678 -0.165039057 0.01799420
## sex -0.14352927 1.00000000 -0.01955548 -0.147099027 0.05341428
## gmat_tot -0.07871678 -0.01955548 1.00000000 0.666382266 0.78038546
## gmat_qpc -0.16503906 -0.14709903 0.66638227 1.000000000 0.09466541
## gmat_vpc 0.01799420 0.05341428 0.78038546 0.094665411 1.00000000
## gmat_tpc -0.09609156 -0.04686981 0.96680810 0.658650025 0.78443167
## s_avg 0.15654954 0.08079985 0.17198874 0.015471662 0.15865101
## f_avg -0.21699191 0.16572186 0.12246257 0.098418869 0.02290167
## quarter -0.12568145 -0.02139041 -0.10578964 0.012648346 -0.12862079
## work_yrs 0.88052470 -0.09233003 -0.12280018 -0.182701263 -0.02812182
## frstlang 0.35026743 0.07512009 -0.13164323 0.014198516 -0.21835333
## salary 0.49964284 -0.16628869 -0.09067141 0.014141299 -0.13743230
## satis 0.10832308 -0.09199534 0.06474206 -0.003984632 0.14863481
## gmat_tpc s_avg f_avg quarter work_yrs
## age -0.09609156 0.15654954 -0.21699191 -0.12568145 0.88052470
## sex -0.04686981 0.08079985 0.16572186 -0.02139041 -0.09233003
## gmat_tot 0.96680810 0.17198874 0.12246257 -0.10578964 -0.12280018
## gmat_qpc 0.65865003 0.01547166 0.09841887 0.01264835 -0.18270126
## gmat_vpc 0.78443167 0.15865101 0.02290167 -0.12862079 -0.02812182
## gmat_tpc 1.00000000 0.13938500 0.07051391 -0.09955033 -0.13246963
## s_avg 0.13938500 1.00000000 0.44590413 -0.84038355 0.16328236
## f_avg 0.07051391 0.44590413 1.00000000 -0.43144819 -0.21633018
## quarter -0.09955033 -0.84038355 -0.43144819 1.00000000 -0.12896722
## work_yrs -0.13246963 0.16328236 -0.21633018 -0.12896722 1.00000000
## frstlang -0.16437561 -0.13788905 -0.05061394 0.10955726 0.19627277
## salary -0.13201783 0.10173175 -0.10603897 -0.12848526 0.45466634
## satis 0.11630842 -0.14356557 -0.11773304 0.22511985 0.06299926
## frstlang salary satis
## age 0.35026743 0.49964284 0.108323083
## sex 0.07512009 -0.16628869 -0.091995338
## gmat_tot -0.13164323 -0.09067141 0.064742057
## gmat_qpc 0.01419852 0.01414130 -0.003984632
## gmat_vpc -0.21835333 -0.13743230 0.148634805
## gmat_tpc -0.16437561 -0.13201783 0.116308417
## s_avg -0.13788905 0.10173175 -0.143565573
## f_avg -0.05061394 -0.10603897 -0.117733043
## quarter 0.10955726 -0.12848526 0.225119851
## work_yrs 0.19627277 0.45466634 0.062999256
## frstlang 1.00000000 0.26701953 0.089834769
## salary 0.26701953 1.00000000 -0.040050600
## satis 0.08983477 -0.04005060 1.000000000
# Visualise the correlation matrix as a correlogram: shaded lower panel,
# no upper panel.
library(corrgram)
corrgram(
  dataset2,
  order = NULL,
  lower.panel = panel.shade,
  upper.panel = NULL,
  text.panel = panel.txt,
  main = "Mba Starting Salaries"
)
# In the correlation matrix, age, work_yrs and frstlang show the strongest
# positive correlation with salary, so the first regression model is built
# on these three variables.
# NOTE(review): attach() puts dataset2's columns on the search path and masks
# the ones attached earlier; the models in the visible code pass `data =`
# explicitly, so this looks unnecessary -- confirm before removing.
attach(dataset2)
## The following objects are masked from dataset:
##
## age, f_avg, frstlang, gmat_qpc, gmat_tot, gmat_tpc, gmat_vpc,
## quarter, s_avg, salary, satis, sex, work_yrs
# First model: salary explained by age, work_yrs and frstlang.
# Fit on training_set only, so the rows in test_set stay unseen and the later
# predictions on test_set are a genuine hold-out check (fitting on all of
# dataset2 would leak the test rows into the model).
# The printed summary that follows in this document was produced by an
# earlier fit on all of dataset2 -- re-knit to refresh it.
regressor <- lm(salary ~ age + work_yrs + frstlang, data = training_set)
summary(regressor)
##
## Call:
## lm(formula = salary ~ age + work_yrs + frstlang, data = dataset2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31941 -9139 -1086 4793 75526
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40492.7 23417.5 1.729 0.0869 .
## age 1892.0 1075.9 1.759 0.0818 .
## work_yrs 747.2 1116.9 0.669 0.5050
## frstlang 8546.9 6728.1 1.270 0.2069
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15570 on 99 degrees of freedom
## Multiple R-squared: 0.2626, Adjusted R-squared: 0.2403
## F-statistic: 11.75 on 3 and 99 DF, p-value: 1.188e-06
# The three-predictor model does not fit well, so try a regression on every
# other column as a predictor.
# Fit on training_set only, so the rows in test_set stay unseen by the model
# that is later evaluated on them.
# The printed summary that follows in this document was produced by an
# earlier fit on all of dataset2 -- re-knit to refresh it.
regressor2 <- lm(salary ~ ., data = training_set)
summary(regressor2)
##
## Call:
## lm(formula = salary ~ ., data = dataset2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26489 -7983 -373 5923 70602
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 78005.66 52981.93 1.472 0.1444
## age 1750.65 1130.92 1.548 0.1251
## sex -3584.07 3595.85 -0.997 0.3216
## gmat_tot 16.19 178.85 0.090 0.9281
## gmat_qpc 796.55 496.78 1.603 0.1123
## gmat_vpc 546.31 501.97 1.088 0.2794
## gmat_tpc -1457.09 714.94 -2.038 0.0445 *
## s_avg -931.53 8240.31 -0.113 0.9102
## f_avg -2222.82 3894.57 -0.571 0.5696
## quarter -2336.56 2721.89 -0.858 0.3929
## work_yrs 749.66 1135.90 0.660 0.5110
## frstlang 7719.42 7373.27 1.047 0.2979
## satis -1086.54 2157.76 -0.504 0.6158
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15430 on 90 degrees of freedom
## Multiple R-squared: 0.3422, Adjusted R-squared: 0.2545
## F-statistic: 3.902 on 12 and 90 DF, p-value: 8.086e-05
# Final candidate: using a 0.05 significance level, keep gmat_tpc and age.
# Fit on training_set only, so the rows in test_set stay unseen by the model
# that is later evaluated on them.
# The printed summary that follows in this document was produced by an
# earlier fit on all of dataset2 -- re-knit to refresh it.
regressor3 <- lm(salary ~ gmat_tpc + age, data = training_set)
summary(regressor3)
##
## Call:
## lm(formula = salary ~ gmat_tpc + age, data = dataset2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33439 -8093 -1819 4692 76859
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42789.6 18247.8 2.345 0.021 *
## gmat_tpc -137.7 140.6 -0.979 0.330
## age 2684.3 473.0 5.675 1.36e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15560 on 100 degrees of freedom
## Multiple R-squared: 0.2568, Adjusted R-squared: 0.2419
## F-statistic: 17.27 on 2 and 100 DF, p-value: 3.6e-07
# Predict salary for the rows in test_set with each of the three models.
y_pred1 <- predict(regressor, newdata = test_set)
y_pred2 <- predict(regressor2, newdata = test_set)
y_pred3 <- predict(regressor3, newdata = test_set)
# View() opens a data viewer; only do that in an interactive session so that
# non-interactive runs (e.g. knitting) do not depend on a GUI being available.
if (interactive()) {
  View(y_pred1)
  View(y_pred2)
  View(y_pred3)
}
# Logistic Regression: predict whether a student was placed.
# Start again from the raw data and drop only the 998 / 999 salary placeholder
# codes; rows with salary == 0 are kept because they become the "not placed"
# class below.
dataset3 <- dataset[which(dataset$salary != 999 & dataset$salary != 998), ]
if (interactive()) {
  View(dataset3)
}
# Binary target: 0 when salary is 0, 1 otherwise.
dataset3$Placed_B <- ifelse(dataset3$salary == 0, 0, 1)
# Drop salary (by name rather than by column position) so that the target is
# judged on the independent variables only.
dataset3$salary <- NULL
# Every remaining column except the target is a predictor.
# NOTE(review): the raw summary shows satis values up to 998 -- verify that no
# placeholder codes remain among the predictors.
predictors <- setdiff(names(dataset3), "Placed_B")

# Splitting the dataset into the Training set (75%) and Test set (25%)
# install.packages('caTools')
library(caTools)
set.seed(123)
split <- sample.split(dataset3$Placed_B, SplitRatio = 0.75)
training_set <- subset(dataset3, split == TRUE)
test_set <- subset(dataset3, split == FALSE)

# Feature Scaling
# Standardise the test set with the TRAINING set's means and standard
# deviations. Scaling the test set with its own statistics would put its
# features on a different scale from the one the classifier was fitted on.
scaled_train <- scale(training_set[, predictors])
training_set[, predictors] <- scaled_train
test_set[, predictors] <- scale(
  test_set[, predictors],
  center = attr(scaled_train, "scaled:center"),
  scale = attr(scaled_train, "scaled:scale")
)

# Fitting Logistic Regression to the Training set
classifier <- glm(Placed_B ~ ., family = binomial, data = training_set)

# Predicting the Test set results: placement probability, then a 0/1 class
# using a 0.5 cut-off.
prob_pred <- predict(
  classifier,
  type = "response",
  newdata = test_set[, predictors]
)
y_pred <- ifelse(prob_pred > 0.5, 1, 0)

# Making the Confusion Matrix (rows: actual class, columns: predicted placed?).
# Fixing the factor levels keeps both columns even if only one class is
# predicted.
# The confusion matrix printed below in this document predates the scaling
# fix above -- re-knit to refresh it.
cm <- table(
  test_set$Placed_B,
  factor(y_pred > 0.5, levels = c(FALSE, TRUE))
)
cm
##
## FALSE TRUE
## 0 12 10
## 1 9 17