#Add some functions to use for analysis from server
source("http://bigblue.depaul.edu/jlee141/econdata/R/func_lib.R")
#Import Data
Placement_data <- read.csv("/Users/rosenguyen/Documents/Classes/ECO520/Placement_Data_Full_Class.csv")
#library
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#Descriptive Analytics
str(Placement_data)
## 'data.frame': 21285 obs. of 15 variables:
## $ sl_no : int 1 2 3 4 5 6 7 8 9 10 ...
## $ gender : chr "M" "M" "M" "M" ...
## $ ssc_p : num 67 79.3 65 56 85.8 ...
## $ ssc_b : chr "Others" "Central" "Central" "Central" ...
## $ hsc_p : num 91 78.3 68 52 73.6 ...
## $ hsc_b : chr "Others" "Others" "Central" "Central" ...
## $ hsc_s : chr "Commerce" "Science" "Arts" "Science" ...
## $ degree_p : num 58 77.5 64 52 73.3 ...
## $ degree_t : chr "Sci&Tech" "Sci&Tech" "Comm&Mgmt" "Sci&Tech" ...
## $ workex : chr "No" "Yes" "No" "No" ...
## $ etest_p : num 55 86.5 75 66 96.8 ...
## $ specialisation: chr "Mkt&HR" "Mkt&Fin" "Mkt&Fin" "Mkt&HR" ...
## $ mba_p : num 58.8 66.3 57.8 59.4 55.5 ...
## $ status : chr "Placed" "Placed" "Placed" "Not Placed" ...
## $ salary : int 270000 200000 250000 NA 425000 NA NA 252000 231000 NA ...
head(Placement_data)
## sl_no gender ssc_p ssc_b hsc_p hsc_b hsc_s degree_p degree_t workex
## 1 1 M 67.00 Others 91.00 Others Commerce 58.00 Sci&Tech No
## 2 2 M 79.33 Central 78.33 Others Science 77.48 Sci&Tech Yes
## 3 3 M 65.00 Central 68.00 Central Arts 64.00 Comm&Mgmt No
## 4 4 M 56.00 Central 52.00 Central Science 52.00 Sci&Tech No
## 5 5 M 85.80 Central 73.60 Central Commerce 73.30 Comm&Mgmt No
## 6 6 M 55.00 Others 49.80 Others Science 67.25 Sci&Tech Yes
## etest_p specialisation mba_p status salary
## 1 55.0 Mkt&HR 58.80 Placed 270000
## 2 86.5 Mkt&Fin 66.28 Placed 200000
## 3 75.0 Mkt&Fin 57.80 Placed 250000
## 4 66.0 Mkt&HR 59.43 Not Placed NA
## 5 96.8 Mkt&Fin 55.50 Placed 425000
## 6 55.0 Mkt&Fin 51.58 Not Placed NA
tail(Placement_data)
## sl_no gender ssc_p ssc_b hsc_p hsc_b hsc_s degree_p degree_t
## 21280 21280 M 62.0 Central 72 Central Commerce 65.0 Comm&Mgmt
## 21281 21281 M 80.6 Others 82 Others Commerce 77.6 Comm&Mgmt
## 21282 21282 M 58.0 Others 60 Others Science 72.0 Sci&Tech
## 21283 21283 M 67.0 Others 67 Others Commerce 73.0 Comm&Mgmt
## 21284 21284 F 74.0 Others 66 Others Commerce 58.0 Comm&Mgmt
## 21285 21285 M 62.0 Central 58 Others Science 53.0 Comm&Mgmt
## workex etest_p specialisation mba_p status salary
## 21280 No 67 Mkt&Fin 56.49 Placed 216000
## 21281 No 91 Mkt&Fin 74.49 Placed 400000
## 21282 No 74 Mkt&Fin 53.62 Placed 275000
## 21283 Yes 59 Mkt&Fin 69.72 Placed 295000
## 21284 No 70 Mkt&HR 60.23 Placed 204000
## 21285 No 89 Mkt&HR 60.22 Not Placed NA
summary(Placement_data)
## sl_no gender ssc_p ssc_b
## Min. : 1 Length:21285 Min. :40.89 Length:21285
## 1st Qu.: 5322 Class :character 1st Qu.:60.40 Class :character
## Median :10643 Mode :character Median :67.00 Mode :character
## Mean :10643 Mean :67.30
## 3rd Qu.:15964 3rd Qu.:76.00
## Max. :21285 Max. :89.40
##
## hsc_p hsc_b hsc_s degree_p
## Min. :37.00 Length:21285 Length:21285 Min. :50.00
## 1st Qu.:60.80 Class :character Class :character 1st Qu.:61.00
## Median :65.00 Mode :character Mode :character Median :66.00
## Mean :66.33 Mean :66.37
## 3rd Qu.:73.00 3rd Qu.:72.00
## Max. :97.70 Max. :91.00
##
## degree_t workex etest_p specialisation
## Length:21285 Length:21285 Min. :50.0 Length:21285
## Class :character Class :character 1st Qu.:60.0 Class :character
## Mode :character Mode :character Median :71.0 Mode :character
## Mean :72.1
## 3rd Qu.:84.0
## Max. :98.0
##
## mba_p status salary
## Min. :51.21 Length:21285 Min. :200000
## 1st Qu.:57.90 Class :character 1st Qu.:240000
## Median :62.00 Mode :character Median :265000
## Mean :62.28 Mean :288655
## 3rd Qu.:66.28 3rd Qu.:300000
## Max. :77.89 Max. :940000
## NA's :6633
#find missing values
Placement <- subset(Placement_data, is.na(salary) | salary < 900000)
#data without unnecessary variables
Placement <- subset(Placement, select=-c(sl_no))
#Categorical Variables
table(Placement$gender)
##
## F M
## 7524 13662
table(Placement$ssc_b)
##
## Central Others
## 11385 9801
table(Placement$hsc_b)
##
## Central Others
## 8217 12969
table(Placement$hsc_s)
##
## Arts Commerce Science
## 1089 11088 9009
table(Placement$degree_t)
##
## Comm&Mgmt Others Sci&Tech
## 14256 1089 5841
table(Placement$workex)
##
## No Yes
## 13959 7227
table(Placement$specialisation)
##
## Mkt&Fin Mkt&HR
## 11781 9405
table(Placement$status)
##
## Not Placed Placed
## 6633 14553
#Convert Categorical Variables into Dummy Variables
Placement$gender <- ifelse(Placement$gender=="F",1,0)
Placement$ssc_b <- ifelse(Placement$ssc_b =="Central",1,0)
Placement$hsc_b <- ifelse(Placement$hsc_b =="Central",1,0)
Placement$workex <- ifelse(Placement$workex =="Yes",1,0)
Placement$specialisation <- ifelse(Placement$specialisation =="Mkt&Fin",1,0)
Placement$status <- ifelse(Placement$status =="Placed",1,0)
Placement$hsc_s <- ifelse(Placement$hsc_s == "Arts", 1, ifelse(Placement$hsc_s == "Commerce", 2, 0))
Placement$degree_t <- ifelse(Placement$degree_t == "Comm&Mgmt", 1, ifelse(Placement$degree_t == "Sci&Tech",2,0))
str(Placement)
## 'data.frame': 21186 obs. of 14 variables:
## $ gender : num 0 0 0 0 0 0 1 0 0 0 ...
## $ ssc_p : num 67 79.3 65 56 85.8 ...
## $ ssc_b : num 0 1 1 1 1 0 0 1 1 1 ...
## $ hsc_p : num 91 78.3 68 52 73.6 ...
## $ hsc_b : num 0 0 1 1 1 0 0 1 1 1 ...
## $ hsc_s : num 2 0 1 0 2 0 2 0 2 2 ...
## $ degree_p : num 58 77.5 64 52 73.3 ...
## $ degree_t : num 2 2 1 2 1 2 1 2 1 1 ...
## $ workex : num 0 1 0 0 0 1 0 1 0 0 ...
## $ etest_p : num 55 86.5 75 66 96.8 ...
## $ specialisation: num 0 1 1 0 1 1 1 1 1 1 ...
## $ mba_p : num 58.8 66.3 57.8 59.4 55.5 ...
## $ status : num 1 1 1 0 1 0 0 1 1 0 ...
## $ salary : int 270000 200000 250000 NA 425000 NA NA 252000 231000 NA ...
#factor variables
Placement$gender <- as.factor(Placement$gender)
Placement$ssc_b <- as.factor(Placement$ssc_b)
Placement$hsc_b <- as.factor(Placement$hsc_b)
Placement$hsc_s <- as.factor(Placement$hsc_s)
Placement$degree_t <- as.factor(Placement$degree_t)
Placement$workex <- as.factor(Placement$workex)
Placement$specialisation <- as.factor(Placement$specialisation)
str(Placement)
## 'data.frame': 21186 obs. of 14 variables:
## $ gender : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
## $ ssc_p : num 67 79.3 65 56 85.8 ...
## $ ssc_b : Factor w/ 2 levels "0","1": 1 2 2 2 2 1 1 2 2 2 ...
## $ hsc_p : num 91 78.3 68 52 73.6 ...
## $ hsc_b : Factor w/ 2 levels "0","1": 1 1 2 2 2 1 1 2 2 2 ...
## $ hsc_s : Factor w/ 3 levels "0","1","2": 3 1 2 1 3 1 3 1 3 3 ...
## $ degree_p : num 58 77.5 64 52 73.3 ...
## $ degree_t : Factor w/ 3 levels "0","1","2": 3 3 2 3 2 3 2 3 2 2 ...
## $ workex : Factor w/ 2 levels "0","1": 1 2 1 1 1 2 1 2 1 1 ...
## $ etest_p : num 55 86.5 75 66 96.8 ...
## $ specialisation: Factor w/ 2 levels "0","1": 1 2 2 1 2 2 2 2 2 2 ...
## $ mba_p : num 58.8 66.3 57.8 59.4 55.5 ...
## $ status : num 1 1 1 0 1 0 0 1 1 0 ...
## $ salary : int 270000 200000 250000 NA 425000 NA NA 252000 231000 NA ...
#Dummy Variables
table(Placement$gender)
##
## 0 1
## 13662 7524
table(Placement$ssc_b)
##
## 0 1
## 9801 11385
table(Placement$hsc_b)
##
## 0 1
## 12969 8217
table(Placement$hsc_s)
##
## 0 1 2
## 9009 1089 11088
table(Placement$degree_t)
##
## 0 1 2
## 1089 14256 5841
table(Placement$workex)
##
## 0 1
## 13959 7227
table(Placement$specialisation)
##
## 0 1
## 9405 11781
table(Placement$status)
##
## 0 1
## 6633 14553
## Regression Models using Train Data set
## Model 1: salary= b0 + b1*degree_t + e
mod1 <- lm(salary ~ degree_t, data=Placement)
summary(mod1)
##
## Call:
## lm(formula = salary ~ degree_t, data = Placement)
##
## Residuals:
## Min 1Q Median 3Q Max
## -114610 -47079 -22079 27921 377921
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 280400 3324 84.347 <2e-16 ***
## degree_t1 -8321 3406 -2.443 0.0146 *
## degree_t2 34210 3521 9.715 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 73960 on 14550 degrees of freedom
## (6633 observations deleted due to missingness)
## Multiple R-squared: 0.06165, Adjusted R-squared: 0.06152
## F-statistic: 478 on 2 and 14550 DF, p-value: < 2.2e-16
## Model 2: using stepwise to remove all the insignificant variables
mod2 <- step(lm(salary ~., data=Placement, direction = "both"))
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'direction' will be disregarded
## Start: AIC=324734
## salary ~ gender + ssc_p + ssc_b + hsc_p + hsc_b + hsc_s + degree_p +
## degree_t + workex + etest_p + specialisation + mba_p + status
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'direction' will be disregarded
##
## Step: AIC=324734
## salary ~ gender + ssc_p + ssc_b + hsc_p + hsc_b + hsc_s + degree_p +
## degree_t + workex + etest_p + specialisation + mba_p
##
## Df Sum of Sq RSS AIC
## - workex 1 6.0710e+08 7.1262e+13 324732
## - ssc_b 1 8.2907e+08 7.1263e+13 324732
## <none> 7.1262e+13 324734
## - ssc_p 1 2.6572e+10 7.1288e+13 324737
## - hsc_b 1 3.2083e+10 7.1294e+13 324739
## - hsc_p 1 8.5063e+10 7.1347e+13 324749
## - etest_p 1 2.7960e+11 7.1541e+13 324789
## - specialisation 1 5.8806e+11 7.1850e+13 324852
## - degree_p 1 7.6820e+11 7.2030e+13 324888
## - hsc_s 2 1.0088e+12 7.2271e+13 324935
## - gender 1 1.3272e+12 7.2589e+13 325001
## - mba_p 1 2.3393e+12 7.3601e+13 325202
## - degree_t 2 2.6904e+12 7.3952e+13 325269
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'direction' will be disregarded
##
## Step: AIC=324732.2
## salary ~ gender + ssc_p + ssc_b + hsc_p + hsc_b + hsc_s + degree_p +
## degree_t + etest_p + specialisation + mba_p
##
## Df Sum of Sq RSS AIC
## - ssc_b 1 5.3392e+08 7.1263e+13 324730
## <none> 7.1262e+13 324732
## - ssc_p 1 2.6200e+10 7.1289e+13 324736
## - hsc_b 1 3.4862e+10 7.1297e+13 324737
## - hsc_p 1 8.4724e+10 7.1347e+13 324747
## - etest_p 1 2.7958e+11 7.1542e+13 324787
## - specialisation 1 6.0730e+11 7.1870e+13 324854
## - degree_p 1 7.7274e+11 7.2035e+13 324887
## - hsc_s 2 1.0102e+12 7.2273e+13 324933
## - gender 1 1.3468e+12 7.2609e+13 325003
## - mba_p 1 2.4394e+12 7.3702e+13 325220
## - degree_t 2 2.7659e+12 7.4028e+13 325282
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'direction' will be disregarded
##
## Step: AIC=324730.3
## salary ~ gender + ssc_p + hsc_p + hsc_b + hsc_s + degree_p +
## degree_t + etest_p + specialisation + mba_p
##
## Df Sum of Sq RSS AIC
## <none> 7.1263e+13 324730
## - ssc_p 1 2.5683e+10 7.1289e+13 324734
## - hsc_b 1 6.8714e+10 7.1332e+13 324742
## - hsc_p 1 8.7219e+10 7.1350e+13 324746
## - etest_p 1 2.8538e+11 7.1548e+13 324786
## - specialisation 1 6.0680e+11 7.1870e+13 324852
## - degree_p 1 7.7244e+11 7.2035e+13 324885
## - hsc_s 2 1.0098e+12 7.2273e+13 324931
## - gender 1 1.3467e+12 7.2610e+13 325001
## - mba_p 1 2.4400e+12 7.3703e+13 325218
## - degree_t 2 2.8752e+12 7.4138e+13 325302
summary(mod2)
##
## Call:
## lm(formula = salary ~ gender + ssc_p + hsc_p + hsc_b + hsc_s +
## degree_p + degree_t + etest_p + specialisation + mba_p, data = Placement,
## direction = "both")
##
## Residuals:
## Min 1Q Median 3Q Max
## -130249 -42859 -10959 21203 368032
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 180967.90 8520.80 21.238 < 2e-16 ***
## gender1 -22863.28 1379.26 -16.577 < 2e-16 ***
## ssc_p -192.78 84.22 -2.289 0.022086 *
## hsc_p 314.76 74.61 4.218 2.47e-05 ***
## hsc_b1 -4657.29 1243.82 -3.744 0.000182 ***
## hsc_s1 -35074.23 3567.11 -9.833 < 2e-16 ***
## hsc_s2 11479.99 1839.21 6.242 4.45e-10 ***
## degree_p -1376.97 109.68 -12.554 < 2e-16 ***
## degree_t1 -37181.75 3697.60 -10.056 < 2e-16 ***
## degree_t2 7928.16 3748.25 2.115 0.034433 *
## etest_p 375.86 49.26 7.631 2.48e-14 ***
## specialisation1 14545.55 1307.24 11.127 < 2e-16 ***
## mba_p 2891.09 129.57 22.312 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 70010 on 14540 degrees of freedom
## (6633 observations deleted due to missingness)
## Multiple R-squared: 0.1599, Adjusted R-squared: 0.1592
## F-statistic: 230.6 on 12 and 14540 DF, p-value: < 2.2e-16
## Residuals of 3 models
pe1 <- residuals(mod1, newdata=Placement)
pe2 <- residuals(mod2, newdata=Placement)
mse1 <- mean(pe1^2)
mse2 <- mean(pe2^2)
print(c(mse1,mse2))
## [1] 5469253894 4896785160
rmse1 <- mse1^0.5
rmse2 <- mse2^0.5
print(c(rmse1,rmse2))
## [1] 73954.40 69977.03
#Remove salary dependent variable
Placement_status<- subset(Placement, select=-c(salary))
#Split data set into train and test data
set.seed(2111845)
train_idx <- sample(nrow(Placement_status), nrow(Placement_status)*0.8)
train <- Placement_status[train_idx,]
test <- Placement_status[-train_idx,]
testy <- test$status
library(ROCR)
#Linear Probability Model
## Model 1: lpm0
lpm0 <- lm(status~ssc_p+hsc_p+degree_p+etest_p+mba_p, data=train)
summary(lpm0)
##
## Call:
## lm(formula = status ~ ssc_p + hsc_p + degree_p + etest_p + mba_p,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.85505 -0.20397 0.04614 0.24717 0.82302
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.8525993 0.0305319 -27.93 <2e-16 ***
## ssc_p 0.0203971 0.0003013 67.70 <2e-16 ***
## hsc_p 0.0110375 0.0002798 39.45 <2e-16 ***
## degree_p 0.0151205 0.0004261 35.48 <2e-16 ***
## etest_p -0.0020934 0.0001992 -10.51 <2e-16 ***
## mba_p -0.0227895 0.0004893 -46.57 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3267 on 16942 degrees of freedom
## Multiple R-squared: 0.5031, Adjusted R-squared: 0.5029
## F-statistic: 3430 on 5 and 16942 DF, p-value: < 2.2e-16
###lpm0 - Adjusted R-squared: 0.5017
yhat0 <- predict(lpm0, newdata = test)
auc_plot(yhat0, testy, "LPM")

###AUC=0.93214
##Model 2: lpm1
lpm1<- step(lm(status~., data=train), direction = "both")
## Start: AIC=-40096.86
## status ~ gender + ssc_p + ssc_b + hsc_p + hsc_b + hsc_s + degree_p +
## degree_t + workex + etest_p + specialisation + mba_p
##
## Df Sum of Sq RSS AIC
## <none> 1588.0 -40097
## - hsc_b 1 4.79 1592.8 -40048
## - specialisation 1 7.48 1595.5 -40019
## - ssc_b 1 10.14 1598.2 -39991
## - etest_p 1 19.50 1607.5 -39892
## - gender 1 22.17 1610.2 -39864
## - hsc_s 2 29.71 1617.8 -39787
## - degree_t 2 53.93 1642.0 -39535
## - workex 1 95.89 1683.9 -39105
## - hsc_p 1 114.08 1702.1 -38923
## - degree_p 1 126.25 1714.3 -38802
## - mba_p 1 179.34 1767.4 -38285
## - ssc_p 1 405.05 1993.1 -36248
summary(lpm1)
##
## Call:
## lm(formula = status ~ gender + ssc_p + ssc_b + hsc_p + hsc_b +
## hsc_s + degree_p + degree_t + workex + etest_p + specialisation +
## mba_p, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.73019 -0.21766 0.01742 0.21855 0.74377
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.9163307 0.0321154 -28.532 < 2e-16 ***
## gender1 -0.0843637 0.0054871 -15.375 < 2e-16 ***
## ssc_p 0.0202170 0.0003076 65.719 < 2e-16 ***
## ssc_b1 -0.0661458 0.0063607 -10.399 < 2e-16 ***
## hsc_p 0.0104585 0.0002999 34.878 < 2e-16 ***
## hsc_b1 0.0454550 0.0063619 7.145 9.37e-13 ***
## hsc_s1 0.0741773 0.0129975 5.707 1.17e-08 ***
## hsc_s2 -0.0939156 0.0071668 -13.104 < 2e-16 ***
## degree_p 0.0151988 0.0004142 36.690 < 2e-16 ***
## degree_t1 0.1488804 0.0120611 12.344 < 2e-16 ***
## degree_t2 -0.0212936 0.0125178 -1.701 0.089 .
## workex1 0.1680323 0.0052550 31.976 < 2e-16 ***
## etest_p -0.0028008 0.0001942 -14.420 < 2e-16 ***
## specialisation1 0.0469132 0.0052521 8.932 < 2e-16 ***
## mba_p -0.0215769 0.0004934 -43.729 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3062 on 16933 degrees of freedom
## Multiple R-squared: 0.5637, Adjusted R-squared: 0.5633
## F-statistic: 1563 on 14 and 16933 DF, p-value: < 2.2e-16
##lpm1 <- Adjusted R-squared: 0.5651
yhat1 <- predict(lpm1, newdata=test)
auc_plot(yhat1, testy, "LPM")

##AUC=0.95582
conf_table(yhat1, testy, "LPM")
## estname prob true_total truepos falsneg detection_rate false_total falspos
## 1 LPM 0.1 2897 2897 0 1 1341 1078
## 2 LPM 0.2 2897 2897 0 1 1341 905
## 3 LPM 0.3 2897 2857 40 0.9862 1341 642
## 4 LPM 0.4 2897 2798 99 0.9658 1341 411
## 5 LPM 0.5 2897 2712 185 0.9361 1341 240
## 6 LPM 0.6 2897 2598 299 0.8968 1341 181
## 7 LPM 0.7 2897 2167 730 0.748 1341 39
## 8 LPM 0.8 2897 1730 1167 0.5972 1341 0
## 9 LPM 0.9 2897 1265 1632 0.4367 1341 0
## trueneg false_pos_rate
## 1 263 0.8039
## 2 436 0.6749
## 3 699 0.4787
## 4 930 0.3065
## 5 1101 0.179
## 6 1160 0.135
## 7 1302 0.0291
## 8 1341 0
## 9 1341 0
##Final decision
dec_lpm <- ifelse(yhat1>=0.3, 1,0)
table(testy, dec_lpm)
## dec_lpm
## testy 0 1
## 0 699 642
## 1 40 2857
#Logistic Model
##Model 1: logit0
logit0 <- glm(status~ssc_p+hsc_p+degree_p+etest_p+mba_p, data=train, family= binomial(link = logit))
loghat0 <- predict(logit0, newdata = test, type="response")
###AUC = 0.93204
##Final decision
dec_logit <- ifelse(yhat1>=0.3, 1,0)
table(testy, dec_lpm)
## dec_lpm
## testy 0 1
## 0 699 642
## 1 40 2857
##Model 2: logit1
logit1 <- step(glm(status~., data = train, family = binomial(link=logit)), direction = "both")
## Start: AIC=7922.27
## status ~ gender + ssc_p + ssc_b + hsc_p + hsc_b + hsc_s + degree_p +
## degree_t + workex + etest_p + specialisation + mba_p
##
## Df Deviance AIC
## <none> 7892.3 7922.3
## - ssc_b 1 7898.0 7926.0
## - specialisation 1 7908.5 7936.5
## - hsc_b 1 7910.9 7938.9
## - etest_p 1 7935.4 7963.4
## - hsc_s 2 8025.7 8051.7
## - gender 1 8165.0 8193.0
## - degree_t 2 8316.5 8342.5
## - workex 1 8654.2 8682.2
## - hsc_p 1 8683.1 8711.1
## - degree_p 1 8940.3 8968.3
## - mba_p 1 9117.7 9145.7
## - ssc_p 1 11165.0 11193.0
loghat1 <- predict(logit1, newdata = test, type = "response")
auc_plot(loghat1, testy, "LOGIT")

###AUC = 0.95961
conf_table(loghat1, testy, "LOGIT")
## estname prob true_total truepos falsneg detection_rate false_total falspos
## 1 LOGIT 0.1 2897 2857 40 0.9862 1341 645
## 2 LOGIT 0.2 2897 2837 60 0.9793 1341 485
## 3 LOGIT 0.3 2897 2772 125 0.9569 1341 396
## 4 LOGIT 0.4 2897 2772 125 0.9569 1341 310
## 5 LOGIT 0.5 2897 2712 185 0.9361 1341 216
## 6 LOGIT 0.6 2897 2612 285 0.9016 1341 216
## 7 LOGIT 0.7 2897 2520 377 0.8699 1341 133
## 8 LOGIT 0.8 2897 2414 483 0.8333 1341 50
## 9 LOGIT 0.9 2897 2176 721 0.7511 1341 28
## trueneg false_pos_rate
## 1 696 0.481
## 2 856 0.3617
## 3 945 0.2953
## 4 1031 0.2312
## 5 1125 0.1611
## 6 1125 0.1611
## 7 1208 0.0992
## 8 1291 0.0373
## 9 1313 0.0209
#Final Decision
dec_logit <- ifelse(loghat1>=0.2,1,0)
table(testy, dec_logit)
## dec_logit
## testy 0 1
## 0 856 485
## 1 60 2837
#Random Forest Model
train$status <- as.factor(train$status)
test$status <- as.factor(test$status)
if(!(require(randomForest))) install.packages("randomForest")
## Loading required package: randomForest
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
rf1 <- randomForest(formula=status~., data=train, mtry=5, ntree=500)
summary(rf1)
## Length Class Mode
## call 5 -none- call
## type 1 -none- character
## predicted 16948 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 33896 matrix numeric
## oob.times 16948 -none- numeric
## classes 2 -none- character
## importance 12 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 16948 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
rfhat1 <- predict(rf1, newdata=test, type="prob")
rfhat1 <- rfhat1[,2]
auc_plot(rfhat1, testy, "RANDOMFOREST")

###AUC=1
conf_table(rfhat1, testy, "RANDOMFOREST")
## estname prob true_total truepos falsneg detection_rate false_total
## 1 RANDOMFOREST 0.1 2897 2897 0 1 1341
## 2 RANDOMFOREST 0.2 2897 2897 0 1 1341
## 3 RANDOMFOREST 0.3 2897 2897 0 1 1341
## 4 RANDOMFOREST 0.4 2897 2897 0 1 1341
## 5 RANDOMFOREST 0.5 2897 2897 0 1 1341
## 6 RANDOMFOREST 0.6 2897 2897 0 1 1341
## 7 RANDOMFOREST 0.7 2897 2897 0 1 1341
## 8 RANDOMFOREST 0.8 2897 2897 0 1 1341
## 9 RANDOMFOREST 0.9 2897 2897 0 1 1341
## falspos trueneg false_pos_rate
## 1 0 1341 0
## 2 0 1341 0
## 3 0 1341 0
## 4 0 1341 0
## 5 0 1341 0
## 6 0 1341 0
## 7 0 1341 0
## 8 0 1341 0
## 9 0 1341 0
###Final Decision
dec_rf <- ifelse(rfhat1 >= 0.1,1,0)
table(testy, dec_rf)
## dec_rf
## testy 0 1
## 0 1341 0
## 1 0 2897
#Neural Network Analysis
##install.packages(neuralnet)
library(neuralnet)
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:ROCR':
##
## prediction
## The following object is masked from 'package:dplyr':
##
## compute
Placement_status[] <- lapply(Placement_status, as.numeric)
str(Placement_status)
## 'data.frame': 21186 obs. of 13 variables:
## $ gender : num 1 1 1 1 1 1 2 1 1 1 ...
## $ ssc_p : num 67 79.3 65 56 85.8 ...
## $ ssc_b : num 1 2 2 2 2 1 1 2 2 2 ...
## $ hsc_p : num 91 78.3 68 52 73.6 ...
## $ hsc_b : num 1 1 2 2 2 1 1 2 2 2 ...
## $ hsc_s : num 3 1 2 1 3 1 3 1 3 3 ...
## $ degree_p : num 58 77.5 64 52 73.3 ...
## $ degree_t : num 3 3 2 3 2 3 2 3 2 2 ...
## $ workex : num 1 2 1 1 1 2 1 2 1 1 ...
## $ etest_p : num 55 86.5 75 66 96.8 ...
## $ specialisation: num 1 2 2 1 2 2 2 2 2 2 ...
## $ mba_p : num 58.8 66.3 57.8 59.4 55.5 ...
## $ status : num 1 1 1 0 1 0 0 1 1 0 ...
zplacement_status<- min_max_nor(Placement_status)
##split dataset
#Split Train and Test
zplacement_status_idx <- sample(nrow(zplacement_status), round(0.7*nrow(zplacement_status)))
ztrain <- zplacement_status[zplacement_status_idx,]
ztest <- zplacement_status[-zplacement_status_idx,]
ztesty <- ztest$status
##Model 1: nnet1
ztrain$status <- as.factor(ztrain$status)
nnet1 <- neuralnet(status~., data=ztrain, hidden=c(3), stepmax=1e+06)
plot(nnet1)
pred1 <- predict(nnet1, newdata= ztest)
pred1 <- pred1[,2]
auc_plot(pred1, ztesty, "Neural Network")
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:ROCR':
##
## prediction
## The following object is masked from 'package:dplyr':
##
## compute
###AUC= 0.9603
##Final Decision
conf_table(pred1, ztesty, "Neural Network")
## estname prob true_total truepos falsneg detection_rate false_total
## 1 Neural Network 0.1 4345 4298 47 0.9892 2011
## 2 Neural Network 0.2 4345 4298 47 0.9892 2011
## 3 Neural Network 0.3 4345 4298 47 0.9892 2011
## 4 Neural Network 0.4 4345 4298 47 0.9892 2011
## 5 Neural Network 0.5 4345 4298 47 0.9892 2011
## 6 Neural Network 0.6 4345 4298 47 0.9892 2011
## 7 Neural Network 0.7 4345 4298 47 0.9892 2011
## 8 Neural Network 0.8 4345 4298 47 0.9892 2011
## 9 Neural Network 0.9 4345 4298 47 0.9892 2011
## falspos trueneg false_pos_rate
## 1 211 1800 0.1049
## 2 211 1800 0.1049
## 3 211 1800 0.1049
## 4 211 1800 0.1049
## 5 211 1800 0.1049
## 6 211 1800 0.1049
## 7 211 1800 0.1049
## 8 211 1800 0.1049
## 9 211 1800 0.1049
dec_nn <- ifelse(pred1 >=0.1,1,0)
table(ztesty, dec_nn)
## dec_nn
## ztesty 0 1
## 0 1800 211
## 1 47 4298
#Combine and Compare Models
##Combine Graphs
par(mfrow=c(2,2))
plot(testy, yhat1)
plot(testy, loghat1)
plot(testy, rfhat1)
plot(ztesty, pred1)
##Find the model with the lowest error
fit_eval(yhat1, testy, "Linear Progression")
## n ME MAE MPE MAPE MSE RMSE
## Linear Progression 4238 0.001512145 0.249521 NaN Inf 0.09312148 0.3051581
fit_eval(loghat1, testy, "Logit")
## n ME MAE MPE MAPE MSE RMSE
## Logit 4238 0.004077048 0.1442842 -Inf Inf 0.07324195 0.2706325
fit_eval(rfhat1, testy, "Random Forest")
## n ME MAE MPE MAPE MSE RMSE
## Random Forest 4238 6.60689e-06 0.0001340255 NaN NaN 5.455403e-07 0.000738607
fit_eval(pred1, ztesty, "Neural Network")
## n ME MAE MPE MAPE MSE RMSE
## Neural Network 6356 -0.003743628 0.07977579 NaN Inf 0.03883906 0.1970763