#Add some functions to use for analysis from server
source("http://bigblue.depaul.edu/jlee141/econdata/R/func_lib.R") 

#Import Data
Placement_data <- read.csv("/Users/rosenguyen/Documents/Classes/ECO520/Placement_Data_Full_Class.csv") 
#library
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#Descriptive Analytics
str(Placement_data)
## 'data.frame':    21285 obs. of  15 variables:
##  $ sl_no         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ gender        : chr  "M" "M" "M" "M" ...
##  $ ssc_p         : num  67 79.3 65 56 85.8 ...
##  $ ssc_b         : chr  "Others" "Central" "Central" "Central" ...
##  $ hsc_p         : num  91 78.3 68 52 73.6 ...
##  $ hsc_b         : chr  "Others" "Others" "Central" "Central" ...
##  $ hsc_s         : chr  "Commerce" "Science" "Arts" "Science" ...
##  $ degree_p      : num  58 77.5 64 52 73.3 ...
##  $ degree_t      : chr  "Sci&Tech" "Sci&Tech" "Comm&Mgmt" "Sci&Tech" ...
##  $ workex        : chr  "No" "Yes" "No" "No" ...
##  $ etest_p       : num  55 86.5 75 66 96.8 ...
##  $ specialisation: chr  "Mkt&HR" "Mkt&Fin" "Mkt&Fin" "Mkt&HR" ...
##  $ mba_p         : num  58.8 66.3 57.8 59.4 55.5 ...
##  $ status        : chr  "Placed" "Placed" "Placed" "Not Placed" ...
##  $ salary        : int  270000 200000 250000 NA 425000 NA NA 252000 231000 NA ...
head(Placement_data)
##   sl_no gender ssc_p   ssc_b hsc_p   hsc_b    hsc_s degree_p  degree_t workex
## 1     1      M 67.00  Others 91.00  Others Commerce    58.00  Sci&Tech     No
## 2     2      M 79.33 Central 78.33  Others  Science    77.48  Sci&Tech    Yes
## 3     3      M 65.00 Central 68.00 Central     Arts    64.00 Comm&Mgmt     No
## 4     4      M 56.00 Central 52.00 Central  Science    52.00  Sci&Tech     No
## 5     5      M 85.80 Central 73.60 Central Commerce    73.30 Comm&Mgmt     No
## 6     6      M 55.00  Others 49.80  Others  Science    67.25  Sci&Tech    Yes
##   etest_p specialisation mba_p     status salary
## 1    55.0         Mkt&HR 58.80     Placed 270000
## 2    86.5        Mkt&Fin 66.28     Placed 200000
## 3    75.0        Mkt&Fin 57.80     Placed 250000
## 4    66.0         Mkt&HR 59.43 Not Placed     NA
## 5    96.8        Mkt&Fin 55.50     Placed 425000
## 6    55.0        Mkt&Fin 51.58 Not Placed     NA
tail(Placement_data)
##       sl_no gender ssc_p   ssc_b hsc_p   hsc_b    hsc_s degree_p  degree_t
## 21280 21280      M  62.0 Central    72 Central Commerce     65.0 Comm&Mgmt
## 21281 21281      M  80.6  Others    82  Others Commerce     77.6 Comm&Mgmt
## 21282 21282      M  58.0  Others    60  Others  Science     72.0  Sci&Tech
## 21283 21283      M  67.0  Others    67  Others Commerce     73.0 Comm&Mgmt
## 21284 21284      F  74.0  Others    66  Others Commerce     58.0 Comm&Mgmt
## 21285 21285      M  62.0 Central    58  Others  Science     53.0 Comm&Mgmt
##       workex etest_p specialisation mba_p     status salary
## 21280     No      67        Mkt&Fin 56.49     Placed 216000
## 21281     No      91        Mkt&Fin 74.49     Placed 400000
## 21282     No      74        Mkt&Fin 53.62     Placed 275000
## 21283    Yes      59        Mkt&Fin 69.72     Placed 295000
## 21284     No      70         Mkt&HR 60.23     Placed 204000
## 21285     No      89         Mkt&HR 60.22 Not Placed     NA
summary(Placement_data)
##      sl_no          gender              ssc_p          ssc_b          
##  Min.   :    1   Length:21285       Min.   :40.89   Length:21285      
##  1st Qu.: 5322   Class :character   1st Qu.:60.40   Class :character  
##  Median :10643   Mode  :character   Median :67.00   Mode  :character  
##  Mean   :10643                      Mean   :67.30                     
##  3rd Qu.:15964                      3rd Qu.:76.00                     
##  Max.   :21285                      Max.   :89.40                     
##                                                                       
##      hsc_p          hsc_b              hsc_s              degree_p    
##  Min.   :37.00   Length:21285       Length:21285       Min.   :50.00  
##  1st Qu.:60.80   Class :character   Class :character   1st Qu.:61.00  
##  Median :65.00   Mode  :character   Mode  :character   Median :66.00  
##  Mean   :66.33                                         Mean   :66.37  
##  3rd Qu.:73.00                                         3rd Qu.:72.00  
##  Max.   :97.70                                         Max.   :91.00  
##                                                                       
##    degree_t            workex             etest_p     specialisation    
##  Length:21285       Length:21285       Min.   :50.0   Length:21285      
##  Class :character   Class :character   1st Qu.:60.0   Class :character  
##  Mode  :character   Mode  :character   Median :71.0   Mode  :character  
##                                        Mean   :72.1                     
##                                        3rd Qu.:84.0                     
##                                        Max.   :98.0                     
##                                                                         
##      mba_p          status              salary      
##  Min.   :51.21   Length:21285       Min.   :200000  
##  1st Qu.:57.90   Class :character   1st Qu.:240000  
##  Median :62.00   Mode  :character   Median :265000  
##  Mean   :62.28                      Mean   :288655  
##  3rd Qu.:66.28                      3rd Qu.:300000  
##  Max.   :77.89                      Max.   :940000  
##                                     NA's   :6633
#find missing values
Placement <- subset(Placement_data, is.na(salary) | salary < 900000)

#data without unnecessary variables
Placement <- subset(Placement, select=-c(sl_no))

#Categorical Variables
table(Placement$gender)
## 
##     F     M 
##  7524 13662
table(Placement$ssc_b)
## 
## Central  Others 
##   11385    9801
table(Placement$hsc_b)
## 
## Central  Others 
##    8217   12969
table(Placement$hsc_s)
## 
##     Arts Commerce  Science 
##     1089    11088     9009
table(Placement$degree_t)
## 
## Comm&Mgmt    Others  Sci&Tech 
##     14256      1089      5841
table(Placement$workex)
## 
##    No   Yes 
## 13959  7227
table(Placement$specialisation)
## 
## Mkt&Fin  Mkt&HR 
##   11781    9405
table(Placement$status)
## 
## Not Placed     Placed 
##       6633      14553
#Convert Categorical Variables into Dummy Variables
Placement$gender <- ifelse(Placement$gender=="F",1,0)
Placement$ssc_b <- ifelse(Placement$ssc_b =="Central",1,0)
Placement$hsc_b <- ifelse(Placement$hsc_b =="Central",1,0)
Placement$workex <- ifelse(Placement$workex =="Yes",1,0)
Placement$specialisation <- ifelse(Placement$specialisation =="Mkt&Fin",1,0)
Placement$status <- ifelse(Placement$status =="Placed",1,0)
Placement$hsc_s <- ifelse(Placement$hsc_s == "Arts", 1, ifelse(Placement$hsc_s == "Commerce", 2, 0))
Placement$degree_t <- ifelse(Placement$degree_t == "Comm&Mgmt", 1, ifelse(Placement$degree_t == "Sci&Tech",2,0))

str(Placement)
## 'data.frame':    21186 obs. of  14 variables:
##  $ gender        : num  0 0 0 0 0 0 1 0 0 0 ...
##  $ ssc_p         : num  67 79.3 65 56 85.8 ...
##  $ ssc_b         : num  0 1 1 1 1 0 0 1 1 1 ...
##  $ hsc_p         : num  91 78.3 68 52 73.6 ...
##  $ hsc_b         : num  0 0 1 1 1 0 0 1 1 1 ...
##  $ hsc_s         : num  2 0 1 0 2 0 2 0 2 2 ...
##  $ degree_p      : num  58 77.5 64 52 73.3 ...
##  $ degree_t      : num  2 2 1 2 1 2 1 2 1 1 ...
##  $ workex        : num  0 1 0 0 0 1 0 1 0 0 ...
##  $ etest_p       : num  55 86.5 75 66 96.8 ...
##  $ specialisation: num  0 1 1 0 1 1 1 1 1 1 ...
##  $ mba_p         : num  58.8 66.3 57.8 59.4 55.5 ...
##  $ status        : num  1 1 1 0 1 0 0 1 1 0 ...
##  $ salary        : int  270000 200000 250000 NA 425000 NA NA 252000 231000 NA ...
#factor variables
Placement$gender <- as.factor(Placement$gender)
Placement$ssc_b <- as.factor(Placement$ssc_b)
Placement$hsc_b <- as.factor(Placement$hsc_b)
Placement$hsc_s <- as.factor(Placement$hsc_s)
Placement$degree_t <- as.factor(Placement$degree_t)
Placement$workex <- as.factor(Placement$workex)
Placement$specialisation <- as.factor(Placement$specialisation)
str(Placement)
## 'data.frame':    21186 obs. of  14 variables:
##  $ gender        : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
##  $ ssc_p         : num  67 79.3 65 56 85.8 ...
##  $ ssc_b         : Factor w/ 2 levels "0","1": 1 2 2 2 2 1 1 2 2 2 ...
##  $ hsc_p         : num  91 78.3 68 52 73.6 ...
##  $ hsc_b         : Factor w/ 2 levels "0","1": 1 1 2 2 2 1 1 2 2 2 ...
##  $ hsc_s         : Factor w/ 3 levels "0","1","2": 3 1 2 1 3 1 3 1 3 3 ...
##  $ degree_p      : num  58 77.5 64 52 73.3 ...
##  $ degree_t      : Factor w/ 3 levels "0","1","2": 3 3 2 3 2 3 2 3 2 2 ...
##  $ workex        : Factor w/ 2 levels "0","1": 1 2 1 1 1 2 1 2 1 1 ...
##  $ etest_p       : num  55 86.5 75 66 96.8 ...
##  $ specialisation: Factor w/ 2 levels "0","1": 1 2 2 1 2 2 2 2 2 2 ...
##  $ mba_p         : num  58.8 66.3 57.8 59.4 55.5 ...
##  $ status        : num  1 1 1 0 1 0 0 1 1 0 ...
##  $ salary        : int  270000 200000 250000 NA 425000 NA NA 252000 231000 NA ...
#Dummy Variables
table(Placement$gender)
## 
##     0     1 
## 13662  7524
table(Placement$ssc_b)
## 
##     0     1 
##  9801 11385
table(Placement$hsc_b)
## 
##     0     1 
## 12969  8217
table(Placement$hsc_s)
## 
##     0     1     2 
##  9009  1089 11088
table(Placement$degree_t)
## 
##     0     1     2 
##  1089 14256  5841
table(Placement$workex)
## 
##     0     1 
## 13959  7227
table(Placement$specialisation)
## 
##     0     1 
##  9405 11781
table(Placement$status)
## 
##     0     1 
##  6633 14553
##  Regression Models using Train Data set 
## Model 1: salary= b0 + b1*degree_t + e
mod1 <- lm(salary ~ degree_t, data=Placement)
summary(mod1)
## 
## Call:
## lm(formula = salary ~ degree_t, data = Placement)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -114610  -47079  -22079   27921  377921 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   280400       3324  84.347   <2e-16 ***
## degree_t1      -8321       3406  -2.443   0.0146 *  
## degree_t2      34210       3521   9.715   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 73960 on 14550 degrees of freedom
##   (6633 observations deleted due to missingness)
## Multiple R-squared:  0.06165,    Adjusted R-squared:  0.06152 
## F-statistic:   478 on 2 and 14550 DF,  p-value: < 2.2e-16
## Model 2: using stepwise to remove all the insignificant variables 
mod2 <- step(lm(salary ~., data=Placement, direction = "both"))
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'direction' will be disregarded
## Start:  AIC=324734
## salary ~ gender + ssc_p + ssc_b + hsc_p + hsc_b + hsc_s + degree_p + 
##     degree_t + workex + etest_p + specialisation + mba_p + status
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'direction' will be disregarded
## 
## Step:  AIC=324734
## salary ~ gender + ssc_p + ssc_b + hsc_p + hsc_b + hsc_s + degree_p + 
##     degree_t + workex + etest_p + specialisation + mba_p
## 
##                  Df  Sum of Sq        RSS    AIC
## - workex          1 6.0710e+08 7.1262e+13 324732
## - ssc_b           1 8.2907e+08 7.1263e+13 324732
## <none>                         7.1262e+13 324734
## - ssc_p           1 2.6572e+10 7.1288e+13 324737
## - hsc_b           1 3.2083e+10 7.1294e+13 324739
## - hsc_p           1 8.5063e+10 7.1347e+13 324749
## - etest_p         1 2.7960e+11 7.1541e+13 324789
## - specialisation  1 5.8806e+11 7.1850e+13 324852
## - degree_p        1 7.6820e+11 7.2030e+13 324888
## - hsc_s           2 1.0088e+12 7.2271e+13 324935
## - gender          1 1.3272e+12 7.2589e+13 325001
## - mba_p           1 2.3393e+12 7.3601e+13 325202
## - degree_t        2 2.6904e+12 7.3952e+13 325269
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'direction' will be disregarded
## 
## Step:  AIC=324732.2
## salary ~ gender + ssc_p + ssc_b + hsc_p + hsc_b + hsc_s + degree_p + 
##     degree_t + etest_p + specialisation + mba_p
## 
##                  Df  Sum of Sq        RSS    AIC
## - ssc_b           1 5.3392e+08 7.1263e+13 324730
## <none>                         7.1262e+13 324732
## - ssc_p           1 2.6200e+10 7.1289e+13 324736
## - hsc_b           1 3.4862e+10 7.1297e+13 324737
## - hsc_p           1 8.4724e+10 7.1347e+13 324747
## - etest_p         1 2.7958e+11 7.1542e+13 324787
## - specialisation  1 6.0730e+11 7.1870e+13 324854
## - degree_p        1 7.7274e+11 7.2035e+13 324887
## - hsc_s           2 1.0102e+12 7.2273e+13 324933
## - gender          1 1.3468e+12 7.2609e+13 325003
## - mba_p           1 2.4394e+12 7.3702e+13 325220
## - degree_t        2 2.7659e+12 7.4028e+13 325282
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'direction' will be disregarded
## 
## Step:  AIC=324730.3
## salary ~ gender + ssc_p + hsc_p + hsc_b + hsc_s + degree_p + 
##     degree_t + etest_p + specialisation + mba_p
## 
##                  Df  Sum of Sq        RSS    AIC
## <none>                         7.1263e+13 324730
## - ssc_p           1 2.5683e+10 7.1289e+13 324734
## - hsc_b           1 6.8714e+10 7.1332e+13 324742
## - hsc_p           1 8.7219e+10 7.1350e+13 324746
## - etest_p         1 2.8538e+11 7.1548e+13 324786
## - specialisation  1 6.0680e+11 7.1870e+13 324852
## - degree_p        1 7.7244e+11 7.2035e+13 324885
## - hsc_s           2 1.0098e+12 7.2273e+13 324931
## - gender          1 1.3467e+12 7.2610e+13 325001
## - mba_p           1 2.4400e+12 7.3703e+13 325218
## - degree_t        2 2.8752e+12 7.4138e+13 325302
summary(mod2)
## 
## Call:
## lm(formula = salary ~ gender + ssc_p + hsc_p + hsc_b + hsc_s + 
##     degree_p + degree_t + etest_p + specialisation + mba_p, data = Placement, 
##     direction = "both")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -130249  -42859  -10959   21203  368032 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     180967.90    8520.80  21.238  < 2e-16 ***
## gender1         -22863.28    1379.26 -16.577  < 2e-16 ***
## ssc_p             -192.78      84.22  -2.289 0.022086 *  
## hsc_p              314.76      74.61   4.218 2.47e-05 ***
## hsc_b1           -4657.29    1243.82  -3.744 0.000182 ***
## hsc_s1          -35074.23    3567.11  -9.833  < 2e-16 ***
## hsc_s2           11479.99    1839.21   6.242 4.45e-10 ***
## degree_p         -1376.97     109.68 -12.554  < 2e-16 ***
## degree_t1       -37181.75    3697.60 -10.056  < 2e-16 ***
## degree_t2         7928.16    3748.25   2.115 0.034433 *  
## etest_p            375.86      49.26   7.631 2.48e-14 ***
## specialisation1  14545.55    1307.24  11.127  < 2e-16 ***
## mba_p             2891.09     129.57  22.312  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 70010 on 14540 degrees of freedom
##   (6633 observations deleted due to missingness)
## Multiple R-squared:  0.1599, Adjusted R-squared:  0.1592 
## F-statistic: 230.6 on 12 and 14540 DF,  p-value: < 2.2e-16
## Residuals of 3 models 
pe1 <- residuals(mod1, newdata=Placement)
pe2 <- residuals(mod2, newdata=Placement)

mse1 <- mean(pe1^2)
mse2 <- mean(pe2^2)
print(c(mse1,mse2))
## [1] 5469253894 4896785160
rmse1 <- mse1^0.5
rmse2 <- mse2^0.5
print(c(rmse1,rmse2))
## [1] 73954.40 69977.03
#Remove salary dependent variable 
Placement_status<- subset(Placement, select=-c(salary))

#Split data set into train and test data
set.seed(2111845)
train_idx <- sample(nrow(Placement_status), nrow(Placement_status)*0.8)
train <- Placement_status[train_idx,]
test <- Placement_status[-train_idx,]
testy <- test$status

library(ROCR)
#Linear Probability Model
## Model 1: lpm0
lpm0 <- lm(status~ssc_p+hsc_p+degree_p+etest_p+mba_p, data=train)
summary(lpm0)
## 
## Call:
## lm(formula = status ~ ssc_p + hsc_p + degree_p + etest_p + mba_p, 
##     data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.85505 -0.20397  0.04614  0.24717  0.82302 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.8525993  0.0305319  -27.93   <2e-16 ***
## ssc_p        0.0203971  0.0003013   67.70   <2e-16 ***
## hsc_p        0.0110375  0.0002798   39.45   <2e-16 ***
## degree_p     0.0151205  0.0004261   35.48   <2e-16 ***
## etest_p     -0.0020934  0.0001992  -10.51   <2e-16 ***
## mba_p       -0.0227895  0.0004893  -46.57   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3267 on 16942 degrees of freedom
## Multiple R-squared:  0.5031, Adjusted R-squared:  0.5029 
## F-statistic:  3430 on 5 and 16942 DF,  p-value: < 2.2e-16
###lpm0 - Adjusted R-squared:  0.5017 
yhat0 <- predict(lpm0, newdata = test)
auc_plot(yhat0, testy, "LPM")

###AUC=0.93214

##Model 2: lpm1
lpm1<- step(lm(status~., data=train), direction = "both")
## Start:  AIC=-40096.86
## status ~ gender + ssc_p + ssc_b + hsc_p + hsc_b + hsc_s + degree_p + 
##     degree_t + workex + etest_p + specialisation + mba_p
## 
##                  Df Sum of Sq    RSS    AIC
## <none>                        1588.0 -40097
## - hsc_b           1      4.79 1592.8 -40048
## - specialisation  1      7.48 1595.5 -40019
## - ssc_b           1     10.14 1598.2 -39991
## - etest_p         1     19.50 1607.5 -39892
## - gender          1     22.17 1610.2 -39864
## - hsc_s           2     29.71 1617.8 -39787
## - degree_t        2     53.93 1642.0 -39535
## - workex          1     95.89 1683.9 -39105
## - hsc_p           1    114.08 1702.1 -38923
## - degree_p        1    126.25 1714.3 -38802
## - mba_p           1    179.34 1767.4 -38285
## - ssc_p           1    405.05 1993.1 -36248
summary(lpm1)
## 
## Call:
## lm(formula = status ~ gender + ssc_p + ssc_b + hsc_p + hsc_b + 
##     hsc_s + degree_p + degree_t + workex + etest_p + specialisation + 
##     mba_p, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.73019 -0.21766  0.01742  0.21855  0.74377 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.9163307  0.0321154 -28.532  < 2e-16 ***
## gender1         -0.0843637  0.0054871 -15.375  < 2e-16 ***
## ssc_p            0.0202170  0.0003076  65.719  < 2e-16 ***
## ssc_b1          -0.0661458  0.0063607 -10.399  < 2e-16 ***
## hsc_p            0.0104585  0.0002999  34.878  < 2e-16 ***
## hsc_b1           0.0454550  0.0063619   7.145 9.37e-13 ***
## hsc_s1           0.0741773  0.0129975   5.707 1.17e-08 ***
## hsc_s2          -0.0939156  0.0071668 -13.104  < 2e-16 ***
## degree_p         0.0151988  0.0004142  36.690  < 2e-16 ***
## degree_t1        0.1488804  0.0120611  12.344  < 2e-16 ***
## degree_t2       -0.0212936  0.0125178  -1.701    0.089 .  
## workex1          0.1680323  0.0052550  31.976  < 2e-16 ***
## etest_p         -0.0028008  0.0001942 -14.420  < 2e-16 ***
## specialisation1  0.0469132  0.0052521   8.932  < 2e-16 ***
## mba_p           -0.0215769  0.0004934 -43.729  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3062 on 16933 degrees of freedom
## Multiple R-squared:  0.5637, Adjusted R-squared:  0.5633 
## F-statistic:  1563 on 14 and 16933 DF,  p-value: < 2.2e-16
##lpm1 <- Adjusted R-squared:  0.5651
yhat1 <- predict(lpm1, newdata=test)
auc_plot(yhat1, testy, "LPM")

##AUC=0.95582
conf_table(yhat1, testy, "LPM")
##   estname prob true_total truepos falsneg detection_rate false_total falspos
## 1     LPM  0.1       2897    2897       0              1        1341    1078
## 2     LPM  0.2       2897    2897       0              1        1341     905
## 3     LPM  0.3       2897    2857      40         0.9862        1341     642
## 4     LPM  0.4       2897    2798      99         0.9658        1341     411
## 5     LPM  0.5       2897    2712     185         0.9361        1341     240
## 6     LPM  0.6       2897    2598     299         0.8968        1341     181
## 7     LPM  0.7       2897    2167     730          0.748        1341      39
## 8     LPM  0.8       2897    1730    1167         0.5972        1341       0
## 9     LPM  0.9       2897    1265    1632         0.4367        1341       0
##   trueneg false_pos_rate
## 1     263         0.8039
## 2     436         0.6749
## 3     699         0.4787
## 4     930         0.3065
## 5    1101          0.179
## 6    1160          0.135
## 7    1302         0.0291
## 8    1341              0
## 9    1341              0
##Final decision
dec_lpm <- ifelse(yhat1>=0.3, 1,0)
table(testy, dec_lpm)
##      dec_lpm
## testy    0    1
##     0  699  642
##     1   40 2857
#Logistic Model

##Model 1: logit0
logit0 <- glm(status~ssc_p+hsc_p+degree_p+etest_p+mba_p, data=train, family= binomial(link = logit))
loghat0 <- predict(logit0, newdata = test, type="response")
###AUC = 0.93204

##Final decision
dec_logit <- ifelse(yhat1>=0.3, 1,0)
table(testy, dec_lpm)
##      dec_lpm
## testy    0    1
##     0  699  642
##     1   40 2857
##Model 2: logit1
logit1 <- step(glm(status~., data = train, family = binomial(link=logit)), direction = "both")
## Start:  AIC=7922.27
## status ~ gender + ssc_p + ssc_b + hsc_p + hsc_b + hsc_s + degree_p + 
##     degree_t + workex + etest_p + specialisation + mba_p
## 
##                  Df Deviance     AIC
## <none>                7892.3  7922.3
## - ssc_b           1   7898.0  7926.0
## - specialisation  1   7908.5  7936.5
## - hsc_b           1   7910.9  7938.9
## - etest_p         1   7935.4  7963.4
## - hsc_s           2   8025.7  8051.7
## - gender          1   8165.0  8193.0
## - degree_t        2   8316.5  8342.5
## - workex          1   8654.2  8682.2
## - hsc_p           1   8683.1  8711.1
## - degree_p        1   8940.3  8968.3
## - mba_p           1   9117.7  9145.7
## - ssc_p           1  11165.0 11193.0
loghat1 <- predict(logit1, newdata = test, type = "response")
auc_plot(loghat1, testy, "LOGIT")

###AUC = 0.95961

conf_table(loghat1, testy, "LOGIT")
##   estname prob true_total truepos falsneg detection_rate false_total falspos
## 1   LOGIT  0.1       2897    2857      40         0.9862        1341     645
## 2   LOGIT  0.2       2897    2837      60         0.9793        1341     485
## 3   LOGIT  0.3       2897    2772     125         0.9569        1341     396
## 4   LOGIT  0.4       2897    2772     125         0.9569        1341     310
## 5   LOGIT  0.5       2897    2712     185         0.9361        1341     216
## 6   LOGIT  0.6       2897    2612     285         0.9016        1341     216
## 7   LOGIT  0.7       2897    2520     377         0.8699        1341     133
## 8   LOGIT  0.8       2897    2414     483         0.8333        1341      50
## 9   LOGIT  0.9       2897    2176     721         0.7511        1341      28
##   trueneg false_pos_rate
## 1     696          0.481
## 2     856         0.3617
## 3     945         0.2953
## 4    1031         0.2312
## 5    1125         0.1611
## 6    1125         0.1611
## 7    1208         0.0992
## 8    1291         0.0373
## 9    1313         0.0209
#Final Decision
dec_logit <- ifelse(loghat1>=0.2,1,0)
table(testy, dec_logit)
##      dec_logit
## testy    0    1
##     0  856  485
##     1   60 2837
#Random Forest Model
train$status <- as.factor(train$status)
test$status <- as.factor(test$status)
if(!(require(randomForest))) install.packages("randomForest")
## Loading required package: randomForest
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
rf1 <- randomForest(formula=status~., data=train, mtry=5, ntree=500)
summary(rf1)
##                 Length Class  Mode     
## call                5  -none- call     
## type                1  -none- character
## predicted       16948  factor numeric  
## err.rate         1500  -none- numeric  
## confusion           6  -none- numeric  
## votes           33896  matrix numeric  
## oob.times       16948  -none- numeric  
## classes             2  -none- character
## importance         12  -none- numeric  
## importanceSD        0  -none- NULL     
## localImportance     0  -none- NULL     
## proximity           0  -none- NULL     
## ntree               1  -none- numeric  
## mtry                1  -none- numeric  
## forest             14  -none- list     
## y               16948  factor numeric  
## test                0  -none- NULL     
## inbag               0  -none- NULL     
## terms               3  terms  call
rfhat1 <- predict(rf1, newdata=test, type="prob")
rfhat1 <- rfhat1[,2]
auc_plot(rfhat1, testy, "RANDOMFOREST")

###AUC=1
conf_table(rfhat1, testy, "RANDOMFOREST")
##        estname prob true_total truepos falsneg detection_rate false_total
## 1 RANDOMFOREST  0.1       2897    2897       0              1        1341
## 2 RANDOMFOREST  0.2       2897    2897       0              1        1341
## 3 RANDOMFOREST  0.3       2897    2897       0              1        1341
## 4 RANDOMFOREST  0.4       2897    2897       0              1        1341
## 5 RANDOMFOREST  0.5       2897    2897       0              1        1341
## 6 RANDOMFOREST  0.6       2897    2897       0              1        1341
## 7 RANDOMFOREST  0.7       2897    2897       0              1        1341
## 8 RANDOMFOREST  0.8       2897    2897       0              1        1341
## 9 RANDOMFOREST  0.9       2897    2897       0              1        1341
##   falspos trueneg false_pos_rate
## 1       0    1341              0
## 2       0    1341              0
## 3       0    1341              0
## 4       0    1341              0
## 5       0    1341              0
## 6       0    1341              0
## 7       0    1341              0
## 8       0    1341              0
## 9       0    1341              0
###Final Decision
dec_rf <- ifelse(rfhat1 >= 0.1,1,0)
table(testy, dec_rf)
##      dec_rf
## testy    0    1
##     0 1341    0
##     1    0 2897
#Neural Network Analysis
##install.packages(neuralnet)
library(neuralnet)
## 
## Attaching package: 'neuralnet'
## The following object is masked from 'package:ROCR':
## 
##     prediction
## The following object is masked from 'package:dplyr':
## 
##     compute
Placement_status[] <- lapply(Placement_status, as.numeric)
str(Placement_status)
## 'data.frame':    21186 obs. of  13 variables:
##  $ gender        : num  1 1 1 1 1 1 2 1 1 1 ...
##  $ ssc_p         : num  67 79.3 65 56 85.8 ...
##  $ ssc_b         : num  1 2 2 2 2 1 1 2 2 2 ...
##  $ hsc_p         : num  91 78.3 68 52 73.6 ...
##  $ hsc_b         : num  1 1 2 2 2 1 1 2 2 2 ...
##  $ hsc_s         : num  3 1 2 1 3 1 3 1 3 3 ...
##  $ degree_p      : num  58 77.5 64 52 73.3 ...
##  $ degree_t      : num  3 3 2 3 2 3 2 3 2 2 ...
##  $ workex        : num  1 2 1 1 1 2 1 2 1 1 ...
##  $ etest_p       : num  55 86.5 75 66 96.8 ...
##  $ specialisation: num  1 2 2 1 2 2 2 2 2 2 ...
##  $ mba_p         : num  58.8 66.3 57.8 59.4 55.5 ...
##  $ status        : num  1 1 1 0 1 0 0 1 1 0 ...
zplacement_status<- min_max_nor(Placement_status)

##split dataset
#Split Train and Test
zplacement_status_idx <- sample(nrow(zplacement_status), round(0.7*nrow(zplacement_status)))
ztrain <- zplacement_status[zplacement_status_idx,]
ztest <- zplacement_status[-zplacement_status_idx,]
ztesty <- ztest$status

##Model 1: nnet1
ztrain$status <- as.factor(ztrain$status)
nnet1 <- neuralnet(status~., data=ztrain, hidden=c(3), stepmax=1e+06)
plot(nnet1)
pred1 <- predict(nnet1, newdata= ztest)
pred1 <- pred1[,2]
auc_plot(pred1, ztesty, "Neural Network")
## 
## Attaching package: 'neuralnet'
## The following object is masked from 'package:ROCR':
## 
##     prediction
## The following object is masked from 'package:dplyr':
## 
##     compute
###AUC= 0.9603

##Final Decision
conf_table(pred1, ztesty, "Neural Network")
##          estname prob true_total truepos falsneg detection_rate false_total
## 1 Neural Network  0.1       4345    4298      47         0.9892        2011
## 2 Neural Network  0.2       4345    4298      47         0.9892        2011
## 3 Neural Network  0.3       4345    4298      47         0.9892        2011
## 4 Neural Network  0.4       4345    4298      47         0.9892        2011
## 5 Neural Network  0.5       4345    4298      47         0.9892        2011
## 6 Neural Network  0.6       4345    4298      47         0.9892        2011
## 7 Neural Network  0.7       4345    4298      47         0.9892        2011
## 8 Neural Network  0.8       4345    4298      47         0.9892        2011
## 9 Neural Network  0.9       4345    4298      47         0.9892        2011
##   falspos trueneg false_pos_rate
## 1     211    1800         0.1049
## 2     211    1800         0.1049
## 3     211    1800         0.1049
## 4     211    1800         0.1049
## 5     211    1800         0.1049
## 6     211    1800         0.1049
## 7     211    1800         0.1049
## 8     211    1800         0.1049
## 9     211    1800         0.1049
dec_nn <- ifelse(pred1 >=0.1,1,0)
table(ztesty, dec_nn)
##       dec_nn
## ztesty    0    1
##      0 1800  211
##      1   47 4298
#Combine and Compare Models

##Combine Graphs
par(mfrow=c(2,2))
plot(testy, yhat1)
plot(testy, loghat1)
plot(testy, rfhat1)
plot(ztesty, pred1)

##Find the model with the lowest error
fit_eval(yhat1, testy, "Linear Progression")
##                       n          ME      MAE MPE MAPE        MSE      RMSE
## Linear Progression 4238 0.001512145 0.249521 NaN  Inf 0.09312148 0.3051581
fit_eval(loghat1, testy, "Logit")
##          n          ME       MAE  MPE MAPE        MSE      RMSE
## Logit 4238 0.004077048 0.1442842 -Inf  Inf 0.07324195 0.2706325
fit_eval(rfhat1, testy, "Random Forest")
##                  n          ME          MAE MPE MAPE          MSE        RMSE
## Random Forest 4238 6.60689e-06 0.0001340255 NaN  NaN 5.455403e-07 0.000738607
fit_eval(pred1, ztesty, "Neural Network")
##                   n           ME        MAE MPE MAPE        MSE      RMSE
## Neural Network 6356 -0.003743628 0.07977579 NaN  Inf 0.03883906 0.1970763