This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
##      speed           dist
##  Min.   : 4.0   Min.   :  2.00
##  1st Qu.:12.0   1st Qu.: 26.00
##  Median :15.0   Median : 36.00
##  Mean   :15.4   Mean   : 42.98
##  3rd Qu.:19.0   3rd Qu.: 56.00
##  Max.   :25.0   Max.   :120.00
You can also embed plots, for example:
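For instance, a plotting chunk might contain nothing more than the single call below; with echo = FALSE in the chunk header, only the resulting figure appears in the knitted document (the pressure dataset is just an illustrative choice):
plot(pressure)  # pressure is a built-in R dataset; any plotting call works here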
#Raj Kapoor
#mr.rajkapoor393@gmail.com
#NIT Rourkela
dataset = read.csv('MBA Starting Salaries Data.csv')
View(dataset)
#removing placeholder/erroneous salary values (0, 998 and 999) from the dataset
dataset2 = dataset[which(dataset$salary != 0 & dataset$salary != 999 & dataset$salary != 998), ]
View(dataset2)
#dataset2 has 103 entries
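As a quick sanity check on the filter above (assuming the placeholder codes are exactly 0, 998 and 999):
sum(dataset$salary %in% c(0, 998, 999))  # number of rows carrying a placeholder code
nrow(dataset2)                           # 103 rows remain after filtering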
#Splitting dataset2 into training and test sets
library(caTools)
set.seed(123)
split = sample.split(dataset2$salary, SplitRatio = 0.8)
training_set = subset(dataset2, split == TRUE)
test_set = subset(dataset2, split == FALSE)
View(training_set)
View(test_set)
cor(dataset2)
## age sex gmat_tot gmat_qpc gmat_vpc
## age 1.00000000 -0.14352927 -0.07871678 -0.165039057 0.01799420
## sex -0.14352927 1.00000000 -0.01955548 -0.147099027 0.05341428
## gmat_tot -0.07871678 -0.01955548 1.00000000 0.666382266 0.78038546
## gmat_qpc -0.16503906 -0.14709903 0.66638227 1.000000000 0.09466541
## gmat_vpc 0.01799420 0.05341428 0.78038546 0.094665411 1.00000000
## gmat_tpc -0.09609156 -0.04686981 0.96680810 0.658650025 0.78443167
## s_avg 0.15654954 0.08079985 0.17198874 0.015471662 0.15865101
## f_avg -0.21699191 0.16572186 0.12246257 0.098418869 0.02290167
## quarter -0.12568145 -0.02139041 -0.10578964 0.012648346 -0.12862079
## work_yrs 0.88052470 -0.09233003 -0.12280018 -0.182701263 -0.02812182
## frstlang 0.35026743 0.07512009 -0.13164323 0.014198516 -0.21835333
## salary 0.49964284 -0.16628869 -0.09067141 0.014141299 -0.13743230
## satis 0.10832308 -0.09199534 0.06474206 -0.003984632 0.14863481
## gmat_tpc s_avg f_avg quarter work_yrs
## age -0.09609156 0.15654954 -0.21699191 -0.12568145 0.88052470
## sex -0.04686981 0.08079985 0.16572186 -0.02139041 -0.09233003
## gmat_tot 0.96680810 0.17198874 0.12246257 -0.10578964 -0.12280018
## gmat_qpc 0.65865003 0.01547166 0.09841887 0.01264835 -0.18270126
## gmat_vpc 0.78443167 0.15865101 0.02290167 -0.12862079 -0.02812182
## gmat_tpc 1.00000000 0.13938500 0.07051391 -0.09955033 -0.13246963
## s_avg 0.13938500 1.00000000 0.44590413 -0.84038355 0.16328236
## f_avg 0.07051391 0.44590413 1.00000000 -0.43144819 -0.21633018
## quarter -0.09955033 -0.84038355 -0.43144819 1.00000000 -0.12896722
## work_yrs -0.13246963 0.16328236 -0.21633018 -0.12896722 1.00000000
## frstlang -0.16437561 -0.13788905 -0.05061394 0.10955726 0.19627277
## salary -0.13201783 0.10173175 -0.10603897 -0.12848526 0.45466634
## satis 0.11630842 -0.14356557 -0.11773304 0.22511985 0.06299926
## frstlang salary satis
## age 0.35026743 0.49964284 0.108323083
## sex 0.07512009 -0.16628869 -0.091995338
## gmat_tot -0.13164323 -0.09067141 0.064742057
## gmat_qpc 0.01419852 0.01414130 -0.003984632
## gmat_vpc -0.21835333 -0.13743230 0.148634805
## gmat_tpc -0.16437561 -0.13201783 0.116308417
## s_avg -0.13788905 0.10173175 -0.143565573
## f_avg -0.05061394 -0.10603897 -0.117733043
## quarter 0.10955726 -0.12848526 0.225119851
## work_yrs 0.19627277 0.45466634 0.062999256
## frstlang 1.00000000 0.26701953 0.089834769
## salary 0.26701953 1.00000000 -0.040050600
## satis 0.08983477 -0.04005060 1.000000000
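The wide matrix is easier to scan if the variables are ranked by their correlation with salary; the one-liner below is only a reading aid:
sort(cor(dataset2)[, "salary"], decreasing = TRUE)  # correlations with salary, largest first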
#Visualizing the correlogram with the corrgram package
library(corrgram)
corrgram(dataset2, order=NULL, lower.panel=panel.shade,
         upper.panel=NULL, text.panel=panel.txt,
         main="MBA Starting Salaries")
#We observe that age, work_yrs and frstlang have the strongest correlations with salary,
#so we build a regression model using these predictors
attach(dataset2)
regressor = lm(formula = salary ~ age + work_yrs + frstlang,
               data = dataset2)
summary(regressor)
##
## Call:
## lm(formula = salary ~ age + work_yrs + frstlang, data = dataset2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31941 -9139 -1086 4793 75526
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40492.7 23417.5 1.729 0.0869 .
## age 1892.0 1075.9 1.759 0.0818 .
## work_yrs 747.2 1116.9 0.669 0.5050
## frstlang 8546.9 6728.1 1.270 0.2069
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15570 on 99 degrees of freedom
## Multiple R-squared: 0.2626, Adjusted R-squared: 0.2403
## F-statistic: 11.75 on 3 and 99 DF, p-value: 1.188e-06
#the fit is weak (adjusted R-squared of only about 0.24), so trying a regression on all features
regressor2 = lm(formula = salary ~ .,
                data = dataset2)
summary(regressor2)
##
## Call:
## lm(formula = salary ~ ., data = dataset2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26489 -7983 -373 5923 70602
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 78005.66 52981.93 1.472 0.1444
## age 1750.65 1130.92 1.548 0.1251
## sex -3584.07 3595.85 -0.997 0.3216
## gmat_tot 16.19 178.85 0.090 0.9281
## gmat_qpc 796.55 496.78 1.603 0.1123
## gmat_vpc 546.31 501.97 1.088 0.2794
## gmat_tpc -1457.09 714.94 -2.038 0.0445 *
## s_avg -931.53 8240.31 -0.113 0.9102
## f_avg -2222.82 3894.57 -0.571 0.5696
## quarter -2336.56 2721.89 -0.858 0.3929
## work_yrs 749.66 1135.90 0.660 0.5110
## frstlang 7719.42 7373.27 1.047 0.2979
## satis -1086.54 2157.76 -0.504 0.6158
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15430 on 90 degrees of freedom
## Multiple R-squared: 0.3422, Adjusted R-squared: 0.2545
## F-statistic: 3.902 on 12 and 90 DF, p-value: 8.086e-05
#gmat_tpc comes out with a p-value of about 0.04
#so, at a 0.05 significance level, we fit a model with gmat_tpc and age
regressor3 = lm(formula = salary ~ gmat_tpc + age,
                data = dataset2)
summary(regressor3)
##
## Call:
## lm(formula = salary ~ gmat_tpc + age, data = dataset2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33439 -8093 -1819 4692 76859
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42789.6 18247.8 2.345 0.021 *
## gmat_tpc -137.7 140.6 -0.979 0.330
## age 2684.3 473.0 5.675 1.36e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15560 on 100 degrees of freedom
## Multiple R-squared: 0.2568, Adjusted R-squared: 0.2419
## F-statistic: 17.27 on 2 and 100 DF, p-value: 3.6e-07
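Since all three regressors were fit on dataset2, their AIC values offer one quick way to compare the fits before scoring them on the test set (lower is better):
AIC(regressor, regressor2, regressor3)  # compares the three lm fits by information criterion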
#predicting on the test set with regressors 1, 2 and 3
y_pred1 = predict(regressor, newdata = test_set)
y_pred2 = predict(regressor2, newdata = test_set)
y_pred3 = predict(regressor3, newdata = test_set)
View(y_pred1)
View(y_pred2)
View(y_pred3)
#for row 39 of the test set, the actual salary was 92000
#regressor2 performed best of the three
#but further improvements are needed
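A more quantitative comparison is each model's root-mean-squared error on the held-out test set; rmse below is a small helper defined only for this check:
rmse = function(actual, predicted) sqrt(mean((actual - predicted)^2))  # helper: RMSE of predictions
rmse(test_set$salary, y_pred1)
rmse(test_set$salary, y_pred2)
rmse(test_set$salary, y_pred3)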
#proceeding to the second part of the task
# Logistic Regression
# Importing the dataset again, removing only the 998 and 999 placeholder values (salary = 0 now means "not placed")
dataset3 = dataset[which(dataset$salary != 999 & dataset$salary != 998), ]
View(dataset3)
#adding a new field indicating whether a student was placed (salary > 0) or not
dataset3$Placed_B <- ifelse(dataset3$salary == 0, 0, 1)
#removing the salary field so that the classification is judged on the independent parameters only
dataset3 <- dataset3[-c(12)]
# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(dataset3$Placed_B, SplitRatio = 0.75)
training_set = subset(dataset3, split == TRUE)
test_set = subset(dataset3, split == FALSE)
# Feature Scaling
training_set[,1:12] = scale(training_set[,1:12])
test_set[,1:12] = scale(test_set[,1:12])
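Note that the two scale() calls above standardize the training and test sets with their own means and standard deviations. An alternative sketch (train_scaled and test_scaled are illustrative names) reuses the training-set parameters when scaling the test set:
train_scaled = scale(training_set[, 1:12])                    # keeps the centering/scaling attributes
test_scaled  = scale(test_set[, 1:12],
                     center = attr(train_scaled, "scaled:center"),
                     scale  = attr(train_scaled, "scaled:scale"))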
# Fitting Logistic Regression to the Training set
classifier = glm(formula = Placed_B ~ .,
                 family = binomial,
                 data = training_set)
# Predicting the Test set results
prob_pred = predict(classifier, type = 'response', newdata = test_set[,1:12])
y_pred = ifelse(prob_pred > 0.5, 1, 0)
# Making the Confusion Matrix
cm = table(test_set[, 13], y_pred > 0.5)  # y_pred is already 0/1, so the comparison just relabels it as FALSE/TRUE
cm
##
## FALSE TRUE
## 0 12 10
## 1 9 17
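From this confusion matrix the overall test-set accuracy is (12 + 17) / 48, roughly 0.60; as a quick added check:
sum(diag(cm)) / sum(cm)  # proportion of correct predictions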
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.