chpater-13.knit

library(fastDummies)

## Warning: package 'fastDummies' was built under R version 4.2.1

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(dplyr)
library(rpart)
library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(lift)
library(adabag)

## Warning: package 'adabag' was built under R version 4.2.1

## Loading required package: foreach

## 
## Attaching package: 'foreach'

## The following objects are masked from 'package:purrr':
## 
##     accumulate, when

## Loading required package: doParallel

## Warning: package 'doParallel' was built under R version 4.2.1

## Loading required package: iterators

## Loading required package: parallel

library(randomForest)

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

library(clusterSim)

## Warning: package 'clusterSim' was built under R version 4.2.1

## Loading required package: cluster

## Loading required package: MASS

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

Load the Data

ebay = read.csv('C:/Users/Public/eBayAuctions.csv')
head(ebay)

##           Category currency sellerRating Duration endDay ClosePrice OpenPrice
## 1 Music/Movie/Game       US         3249        5    Mon       0.01      0.01
## 2 Music/Movie/Game       US         3249        5    Mon       0.01      0.01
## 3 Music/Movie/Game       US         3249        5    Mon       0.01      0.01
## 4 Music/Movie/Game       US         3249        5    Mon       0.01      0.01
## 5 Music/Movie/Game       US         3249        5    Mon       0.01      0.01
## 6 Music/Movie/Game       US         3249        5    Mon       0.01      0.01
##   Competitive.
## 1            0
## 2            0
## 3            0
## 4            0
## 5            0
## 6            0

Convert the Categorical data into Numeric data

ebay= ebay[,-1]
data_dummies = dummy_cols(ebay, remove_first_dummy = TRUE,
 select_columns =colnames(select_if(ebay,

is.character)),remove_selected_columns = TRUE)
data_dummies$Competitive.= as.factor(data_dummies$Competitive.)

head(data_dummies)

##   sellerRating Duration ClosePrice OpenPrice Competitive. currency_GBP
## 1         3249        5       0.01      0.01            0            0
## 2         3249        5       0.01      0.01            0            0
## 3         3249        5       0.01      0.01            0            0
## 4         3249        5       0.01      0.01            0            0
## 5         3249        5       0.01      0.01            0            0
## 6         3249        5       0.01      0.01            0            0
##   currency_US endDay_Mon endDay_Sat endDay_Sun endDay_Thu endDay_Tue endDay_Wed
## 1           1          1          0          0          0          0          0
## 2           1          1          0          0          0          0          0
## 3           1          1          0          0          0          0          0
## 4           1          1          0          0          0          0          0
## 5           1          1          0          0          0          0          0
## 6           1          1          0          0          0          0          0

Splitting the data into train and test

n = nrow(data_dummies)
set.seed(101)
trainIndex = sample(1:n, size = round(0.7*n), replace=FALSE)
train = data_dummies[trainIndex ,]
test = data_dummies[-trainIndex ,]

fit = rpart(Competitive. ~., data = train)
printcp(fit)

## 
## Classification tree:
## rpart(formula = Competitive. ~ ., data = train)
## 
## Variables actually used in tree construction:
## [1] ClosePrice   OpenPrice    sellerRating
## 
## Root node error: 645/1380 = 0.46739
## 
## n= 1380 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.243411      0   1.00000 1.00000 0.028736
## 2 0.112403      1   0.75659 0.75814 0.027548
## 3 0.083721      3   0.53178 0.54109 0.025035
## 4 0.043411      4   0.44806 0.45891 0.023641
## 5 0.041860      5   0.40465 0.43721 0.023223
## 6 0.010000      7   0.32093 0.33488 0.020927

fit.pruned = prune(fit, cp = 0.010000)
pred <- predict(fit.pruned, test[,-5],type = "class")
confusionMatrix(table(test$Competitive.,pred))

## Confusion Matrix and Statistics
## 
##    pred
##       0   1
##   0 237  24
##   1  68 263
##                                           
##                Accuracy : 0.8446          
##                  95% CI : (0.8129, 0.8728)
##     No Information Rate : 0.5152          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6903          
##                                           
##  Mcnemar's Test P-Value : 7.358e-06       
##                                           
##             Sensitivity : 0.7770          
##             Specificity : 0.9164          
##          Pos Pred Value : 0.9080          
##          Neg Pred Value : 0.7946          
##              Prevalence : 0.5152          
##          Detection Rate : 0.4003          
##    Detection Prevalence : 0.4409          
##       Balanced Accuracy : 0.8467          
##                                           
##        'Positive' Class : 0               
##

TopDecileLift(pred, test$Competitive.)

## [1] 1.789

ebay_boosting= boosting(Competitive. ~., data=train)
pred = predict(ebay_boosting, test)
print(pred$confusion)

##                Observed Class
## Predicted Class   0   1
##               0 246  30
##               1  15 301

confusionMatrix(pred$confusion)

## Confusion Matrix and Statistics
## 
##                Observed Class
## Predicted Class   0   1
##               0 246  30
##               1  15 301
##                                          
##                Accuracy : 0.924          
##                  95% CI : (0.8996, 0.944)
##     No Information Rate : 0.5591         
##     P-Value [Acc > NIR] : < 2e-16        
##                                          
##                   Kappa : 0.8467         
##                                          
##  Mcnemar's Test P-Value : 0.03689        
##                                          
##             Sensitivity : 0.9425         
##             Specificity : 0.9094         
##          Pos Pred Value : 0.8913         
##          Neg Pred Value : 0.9525         
##              Prevalence : 0.4409         
##          Detection Rate : 0.4155         
##    Detection Prevalence : 0.4662         
##       Balanced Accuracy : 0.9259         
##                                          
##        'Positive' Class : 0              
##

TopDecileLift(as.numeric(pred$class), test$Competitive.)

## [1] 1.789

ebay_bagging <- bagging(Competitive.~., data=train)
pred = predict(ebay_bagging, test)
print(pred$confusion)

##                Observed Class
## Predicted Class   0   1
##               0 243  62
##               1  18 269

confusionMatrix(pred$confusion)

## Confusion Matrix and Statistics
## 
##                Observed Class
## Predicted Class   0   1
##               0 243  62
##               1  18 269
##                                           
##                Accuracy : 0.8649          
##                  95% CI : (0.8347, 0.8914)
##     No Information Rate : 0.5591          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7307          
##                                           
##  Mcnemar's Test P-Value : 1.528e-06       
##                                           
##             Sensitivity : 0.9310          
##             Specificity : 0.8127          
##          Pos Pred Value : 0.7967          
##          Neg Pred Value : 0.9373          
##              Prevalence : 0.4409          
##          Detection Rate : 0.4105          
##    Detection Prevalence : 0.5152          
##       Balanced Accuracy : 0.8719          
##                                           
##        'Positive' Class : 0               
##

TopDecileLift(as.numeric(pred$class), test$Competitive.)

## [1] 1.789

ebay.rf <- randomForest(Competitive.~., data=train, mtry=4)
pred = predict(ebay.rf, test)
confusionMatrix(table(test$Competitive.,pred))

## Confusion Matrix and Statistics
## 
##    pred
##       0   1
##   0 240  21
##   1  44 287
##                                           
##                Accuracy : 0.8902          
##                  95% CI : (0.8622, 0.9142)
##     No Information Rate : 0.5203          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7793          
##                                           
##  Mcnemar's Test P-Value : 0.006357        
##                                           
##             Sensitivity : 0.8451          
##             Specificity : 0.9318          
##          Pos Pred Value : 0.9195          
##          Neg Pred Value : 0.8671          
##              Prevalence : 0.4797          
##          Detection Rate : 0.4054          
##    Detection Prevalence : 0.4409          
##       Balanced Accuracy : 0.8884          
##                                           
##        'Positive' Class : 0               
##

TopDecileLift(pred, test$Competitive.)

## [1] 1.789

flights = read.csv('C:/Users/Public/FlightDelays.csv')
head(flights)

##   CRS_DEP_TIME CARRIER DEP_TIME DEST DISTANCE    FL_DATE FL_NUM ORIGIN Weather
## 1         1455      OH     1455  JFK      184 01/01/2004   5935    BWI       0
## 2         1640      DH     1640  JFK      213 01/01/2004   6155    DCA       0
## 3         1245      DH     1245  LGA      229 01/01/2004   7208    IAD       0
## 4         1715      DH     1709  LGA      229 01/01/2004   7215    IAD       0
## 5         1039      DH     1035  LGA      229 01/01/2004   7792    IAD       0
## 6          840      DH      839  JFK      228 01/01/2004   7800    IAD       0
##   DAY_WEEK DAY_OF_MONTH TAIL_NUM Flight.Status
## 1        4            1   N940CA        ontime
## 2        4            1   N405FJ        ontime
## 3        4            1   N695BR        ontime
## 4        4            1   N662BR        ontime
## 5        4            1   N698BR        ontime
## 6        4            1   N687BR        ontime

Transforming variable day of week info a categorical variable.

str(flights)

## 'data.frame':    2201 obs. of  13 variables:
##  $ CRS_DEP_TIME : int  1455 1640 1245 1715 1039 840 1240 1645 1715 2120 ...
##  $ CARRIER      : chr  "OH" "DH" "DH" "DH" ...
##  $ DEP_TIME     : int  1455 1640 1245 1709 1035 839 1243 1644 1710 2129 ...
##  $ DEST         : chr  "JFK" "JFK" "LGA" "LGA" ...
##  $ DISTANCE     : int  184 213 229 229 229 228 228 228 228 228 ...
##  $ FL_DATE      : chr  "01/01/2004" "01/01/2004" "01/01/2004" "01/01/2004" ...
##  $ FL_NUM       : int  5935 6155 7208 7215 7792 7800 7806 7810 7812 7814 ...
##  $ ORIGIN       : chr  "BWI" "DCA" "IAD" "IAD" ...
##  $ Weather      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ DAY_WEEK     : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ DAY_OF_MONTH : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ TAIL_NUM     : chr  "N940CA" "N405FJ" "N695BR" "N662BR" ...
##  $ Flight.Status: chr  "ontime" "ontime" "ontime" "ontime" ...

flights$DAY_WEEK= as.factor(flights$DAY_WEEK)

Used cut() to bin the scheduled departure time into eight bins.

flights$DEP_TIME=cut(flights$DEP_TIME, b = 8,labels = c(1,2,3,4,5,6,7,8))

Removing the unwanted data from the data and Converting the Output to Factor

flights = flights[,-c(6,7,12)]
flights$Flight.Status = ifelse(flights$Flight.Status== 'delayed',1,0)

Creating Dummies and Standardizing the data

flights_dummies = dummy_cols(flights, select_columns = c("CARRIER",
"DEST",'ORIGIN','DAY_WEEK'),
 remove_first_dummy = TRUE,remove_selected_columns = TRUE)
flight_stand = data.Normalization(flights_dummies[,-
c(6)],type="n1",normalization="column")

## Warning in data.Normalization(flights_dummies[, -c(6)], type = "n1",
## normalization = "column"): Data not numeric, normalization not applicable

flight_stand$Flight.Status = as.factor(flights_dummies$Flight.Status)
#Splitting the data into train and test
n = nrow(flight_stand)
set.seed(101)
trainIndex = sample(1:n, size = round(0.7*n), replace=FALSE)
train = flight_stand[trainIndex ,]
test = flight_stand[-trainIndex ,]

Traning the CART model

fit = rpart(Flight.Status ~., data = train)
printcp(fit)

## 
## Classification tree:
## rpart(formula = Flight.Status ~ ., data = train)
## 
## Variables actually used in tree construction:
## [1] CRS_DEP_TIME DEP_TIME     Weather     
## 
## Root node error: 300/1541 = 0.19468
## 
## n= 1541 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.073333      0   1.00000 1.00000 0.051811
## 2 0.036667      1   0.92667 0.91667 0.050103
## 3 0.010000      5   0.74667 0.75667 0.046375

fit.pruned = prune(fit, cp = 0.010000)
pred <- predict(fit.pruned, test,type = "class")
confusionMatrix(table(test$Flight.Status,pred))

## Confusion Matrix and Statistics
## 
##    pred
##       0   1
##   0 523   9
##   1  91  37
##                                          
##                Accuracy : 0.8485         
##                  95% CI : (0.8188, 0.875)
##     No Information Rate : 0.9303         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.3596         
##                                          
##  Mcnemar's Test P-Value : 5.496e-16      
##                                          
##             Sensitivity : 0.8518         
##             Specificity : 0.8043         
##          Pos Pred Value : 0.9831         
##          Neg Pred Value : 0.2891         
##              Prevalence : 0.9303         
##          Detection Rate : 0.7924         
##    Detection Prevalence : 0.8061         
##       Balanced Accuracy : 0.8281         
##                                          
##        'Positive' Class : 0              
##

flights_boosting= boosting(Flight.Status~ ., data=train)
pred = predict(flights_boosting, test)
print(pred$confusion)

##                Observed Class
## Predicted Class   0   1
##               0 503  75
##               1  29  53

confusionMatrix(pred$confusion)

## Confusion Matrix and Statistics
## 
##                Observed Class
## Predicted Class   0   1
##               0 503  75
##               1  29  53
##                                           
##                Accuracy : 0.8424          
##                  95% CI : (0.8123, 0.8694)
##     No Information Rate : 0.8061          
##     P-Value [Acc > NIR] : 0.009091        
##                                           
##                   Kappa : 0.4164          
##                                           
##  Mcnemar's Test P-Value : 1.021e-05       
##                                           
##             Sensitivity : 0.9455          
##             Specificity : 0.4141          
##          Pos Pred Value : 0.8702          
##          Neg Pred Value : 0.6463          
##              Prevalence : 0.8061          
##          Detection Rate : 0.7621          
##    Detection Prevalence : 0.8758          
##       Balanced Accuracy : 0.6798          
##                                           
##        'Positive' Class : 0               
##